{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15410322498768686, "eval_steps": 500, "global_step": 153000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.036053104172773e-06, "grad_norm": 184.37385834066, "learning_rate": 5.035956731059767e-09, "loss": 2.9192, "mean_token_accuracy": 0.3965517282485962, "step": 5 }, { "epoch": 1.0072106208345546e-05, "grad_norm": 115.51465348321524, "learning_rate": 1.0071913462119533e-08, "loss": 3.1229, "mean_token_accuracy": 0.3206896513700485, "step": 10 }, { "epoch": 1.5108159312518318e-05, "grad_norm": 128.14684544029438, "learning_rate": 1.51078701931793e-08, "loss": 2.8221, "mean_token_accuracy": 0.42068964838981626, "step": 15 }, { "epoch": 2.014421241669109e-05, "grad_norm": 93.98992739112428, "learning_rate": 2.0143826924239066e-08, "loss": 2.9034, "mean_token_accuracy": 0.42413793206214906, "step": 20 }, { "epoch": 2.5180265520863864e-05, "grad_norm": 120.66703121286582, "learning_rate": 2.5179783655298835e-08, "loss": 2.7341, "mean_token_accuracy": 0.420689657330513, "step": 25 }, { "epoch": 3.0216318625036637e-05, "grad_norm": 108.72624899473382, "learning_rate": 3.02157403863586e-08, "loss": 2.9094, "mean_token_accuracy": 0.3896551728248596, "step": 30 }, { "epoch": 3.525237172920941e-05, "grad_norm": 135.17281815093503, "learning_rate": 3.5251697117418364e-08, "loss": 3.3604, "mean_token_accuracy": 0.3733817219734192, "step": 35 }, { "epoch": 4.028842483338218e-05, "grad_norm": 121.9632914213751, "learning_rate": 4.028765384847813e-08, "loss": 3.2009, "mean_token_accuracy": 0.3379310369491577, "step": 40 }, { "epoch": 4.532447793755496e-05, "grad_norm": 97.98992561195891, "learning_rate": 4.53236105795379e-08, "loss": 2.7912, "mean_token_accuracy": 0.4034482717514038, "step": 45 }, { "epoch": 5.036053104172773e-05, "grad_norm": 142.3603859690338, "learning_rate": 5.035956731059767e-08, "loss": 2.9498, "mean_token_accuracy": 0.3827586233615875, "step": 50 }, { "epoch": 5.5396584145900504e-05, "grad_norm": 135.23102796458127, "learning_rate": 5.5395524041657433e-08, "loss": 2.9546, "mean_token_accuracy": 0.38620689511299133, "step": 55 }, { "epoch": 6.043263725007327e-05, "grad_norm": 114.50257152228085, "learning_rate": 6.04314807727172e-08, "loss": 2.4589, "mean_token_accuracy": 0.46551724076271056, "step": 60 }, { "epoch": 6.546869035424605e-05, "grad_norm": 106.87696604255558, "learning_rate": 6.546743750377697e-08, "loss": 2.3529, "mean_token_accuracy": 0.482758617401123, "step": 65 }, { "epoch": 7.050474345841883e-05, "grad_norm": 144.7875008882521, "learning_rate": 7.050339423483673e-08, "loss": 2.7852, "mean_token_accuracy": 0.41724138259887694, "step": 70 }, { "epoch": 7.554079656259159e-05, "grad_norm": 150.28386750658944, "learning_rate": 7.553935096589651e-08, "loss": 2.9964, "mean_token_accuracy": 0.3990147829055786, "step": 75 }, { "epoch": 8.057684966676436e-05, "grad_norm": 128.590739084924, "learning_rate": 8.057530769695627e-08, "loss": 2.6091, "mean_token_accuracy": 0.441379314661026, "step": 80 }, { "epoch": 8.561290277093714e-05, "grad_norm": 136.77950873458363, "learning_rate": 8.561126442801605e-08, "loss": 3.0939, "mean_token_accuracy": 0.3793103456497192, "step": 85 }, { "epoch": 9.064895587510992e-05, "grad_norm": 154.18804147239845, "learning_rate": 9.06472211590758e-08, "loss": 3.4264, "mean_token_accuracy": 0.346158492565155, "step": 90 }, { "epoch": 9.568500897928268e-05, "grad_norm": 116.13046764073876, "learning_rate": 9.568317789013557e-08, "loss": 2.7618, "mean_token_accuracy": 0.36896551251411436, "step": 95 }, { "epoch": 0.00010072106208345546, "grad_norm": 110.77374107676607, "learning_rate": 1.0071913462119534e-07, "loss": 3.0722, "mean_token_accuracy": 0.36551723480224607, "step": 100 }, { "epoch": 0.00010575711518762823, "grad_norm": 113.01719127713102, "learning_rate": 1.057550913522551e-07, "loss": 2.698, "mean_token_accuracy": 0.3724137842655182, "step": 105 }, { "epoch": 0.00011079316829180101, "grad_norm": 142.57442968544424, "learning_rate": 1.1079104808331487e-07, "loss": 2.4708, "mean_token_accuracy": 0.47586206197738645, "step": 110 }, { "epoch": 0.00011582922139597377, "grad_norm": 115.01977070578374, "learning_rate": 1.1582700481437464e-07, "loss": 2.3308, "mean_token_accuracy": 0.44137930274009707, "step": 115 }, { "epoch": 0.00012086527450014655, "grad_norm": 123.76974341799016, "learning_rate": 1.208629615454344e-07, "loss": 2.5578, "mean_token_accuracy": 0.4724137902259827, "step": 120 }, { "epoch": 0.0001259013276043193, "grad_norm": 152.76032381601127, "learning_rate": 1.258989182764942e-07, "loss": 2.7359, "mean_token_accuracy": 0.4068965494632721, "step": 125 }, { "epoch": 0.0001309373807084921, "grad_norm": 118.64261820647128, "learning_rate": 1.3093487500755394e-07, "loss": 2.7449, "mean_token_accuracy": 0.44482758045196535, "step": 130 }, { "epoch": 0.00013597343381266486, "grad_norm": 151.93036554078935, "learning_rate": 1.359708317386137e-07, "loss": 2.8709, "mean_token_accuracy": 0.3896551728248596, "step": 135 }, { "epoch": 0.00014100948691683765, "grad_norm": 101.1616090153162, "learning_rate": 1.4100678846967345e-07, "loss": 2.7185, "mean_token_accuracy": 0.4379310250282288, "step": 140 }, { "epoch": 0.00014604554002101041, "grad_norm": 125.01068204666701, "learning_rate": 1.4604274520073324e-07, "loss": 2.8559, "mean_token_accuracy": 0.47241378426551817, "step": 145 }, { "epoch": 0.00015108159312518318, "grad_norm": 134.74781524363345, "learning_rate": 1.5107870193179302e-07, "loss": 2.6456, "mean_token_accuracy": 0.4068965554237366, "step": 150 }, { "epoch": 0.00015611764622935597, "grad_norm": 153.65307950744636, "learning_rate": 1.5611465866285278e-07, "loss": 2.6937, "mean_token_accuracy": 0.40689656138420105, "step": 155 }, { "epoch": 0.00016115369933352873, "grad_norm": 119.37977441865058, "learning_rate": 1.6115061539391253e-07, "loss": 2.8349, "mean_token_accuracy": 0.4, "step": 160 }, { "epoch": 0.0001661897524377015, "grad_norm": 158.93124979095967, "learning_rate": 1.6618657212497231e-07, "loss": 2.9299, "mean_token_accuracy": 0.4137930989265442, "step": 165 }, { "epoch": 0.00017122580554187428, "grad_norm": 135.82887677202035, "learning_rate": 1.712225288560321e-07, "loss": 2.9838, "mean_token_accuracy": 0.37586206793785093, "step": 170 }, { "epoch": 0.00017626185864604704, "grad_norm": 112.26795998385566, "learning_rate": 1.7625848558709185e-07, "loss": 2.4926, "mean_token_accuracy": 0.4586206912994385, "step": 175 }, { "epoch": 0.00018129791175021983, "grad_norm": 76.3531078505134, "learning_rate": 1.812944423181516e-07, "loss": 2.4358, "mean_token_accuracy": 0.4931034505367279, "step": 180 }, { "epoch": 0.0001863339648543926, "grad_norm": 142.39390058617943, "learning_rate": 1.8633039904921136e-07, "loss": 2.6134, "mean_token_accuracy": 0.4551724076271057, "step": 185 }, { "epoch": 0.00019137001795856536, "grad_norm": 114.87053401612461, "learning_rate": 1.9136635578027115e-07, "loss": 2.7383, "mean_token_accuracy": 0.458620685338974, "step": 190 }, { "epoch": 0.00019640607106273815, "grad_norm": 174.51701190472923, "learning_rate": 1.9640231251133093e-07, "loss": 3.0416, "mean_token_accuracy": 0.3551724135875702, "step": 195 }, { "epoch": 0.0002014421241669109, "grad_norm": 87.92382514620128, "learning_rate": 2.0143826924239068e-07, "loss": 2.6535, "mean_token_accuracy": 0.44827585816383364, "step": 200 }, { "epoch": 0.0002064781772710837, "grad_norm": 125.3923304530879, "learning_rate": 2.0647422597345044e-07, "loss": 3.395, "mean_token_accuracy": 0.3034482717514038, "step": 205 }, { "epoch": 0.00021151423037525646, "grad_norm": 189.93261250839308, "learning_rate": 2.115101827045102e-07, "loss": 2.6287, "mean_token_accuracy": 0.41379310488700866, "step": 210 }, { "epoch": 0.00021655028347942923, "grad_norm": 147.48631644542024, "learning_rate": 2.1654613943556998e-07, "loss": 2.9504, "mean_token_accuracy": 0.3875982999801636, "step": 215 }, { "epoch": 0.00022158633658360202, "grad_norm": 176.65054130210027, "learning_rate": 2.2158209616662973e-07, "loss": 2.9528, "mean_token_accuracy": 0.36551723480224607, "step": 220 }, { "epoch": 0.00022662238968777478, "grad_norm": 170.4652299488915, "learning_rate": 2.266180528976895e-07, "loss": 2.3855, "mean_token_accuracy": 0.4517241358757019, "step": 225 }, { "epoch": 0.00023165844279194754, "grad_norm": 120.54500147452612, "learning_rate": 2.3165400962874927e-07, "loss": 2.7001, "mean_token_accuracy": 0.4000000059604645, "step": 230 }, { "epoch": 0.00023669449589612033, "grad_norm": 113.6082665799816, "learning_rate": 2.3668996635980905e-07, "loss": 3.0279, "mean_token_accuracy": 0.3551724135875702, "step": 235 }, { "epoch": 0.0002417305490002931, "grad_norm": 93.23066527567585, "learning_rate": 2.417259230908688e-07, "loss": 2.7444, "mean_token_accuracy": 0.4172413766384125, "step": 240 }, { "epoch": 0.0002467666021044659, "grad_norm": 170.03465277438232, "learning_rate": 2.467618798219286e-07, "loss": 2.5591, "mean_token_accuracy": 0.42758620977401735, "step": 245 }, { "epoch": 0.0002518026552086386, "grad_norm": 97.79355591471369, "learning_rate": 2.517978365529884e-07, "loss": 2.6857, "mean_token_accuracy": 0.4448275864124298, "step": 250 }, { "epoch": 0.0002568387083128114, "grad_norm": 110.2269865945556, "learning_rate": 2.568337932840481e-07, "loss": 2.745, "mean_token_accuracy": 0.4310344815254211, "step": 255 }, { "epoch": 0.0002618747614169842, "grad_norm": 167.01819819201668, "learning_rate": 2.618697500151079e-07, "loss": 2.4903, "mean_token_accuracy": 0.4448275864124298, "step": 260 }, { "epoch": 0.000266910814521157, "grad_norm": 82.02620765743933, "learning_rate": 2.669057067461676e-07, "loss": 2.813, "mean_token_accuracy": 0.36896551251411436, "step": 265 }, { "epoch": 0.0002719468676253297, "grad_norm": 114.49835583281566, "learning_rate": 2.719416634772274e-07, "loss": 2.8973, "mean_token_accuracy": 0.4034482777118683, "step": 270 }, { "epoch": 0.0002769829207295025, "grad_norm": 143.21545063778842, "learning_rate": 2.769776202082872e-07, "loss": 2.9087, "mean_token_accuracy": 0.44482758045196535, "step": 275 }, { "epoch": 0.0002820189738336753, "grad_norm": 131.6447637316192, "learning_rate": 2.820135769393469e-07, "loss": 2.38, "mean_token_accuracy": 0.45517241954803467, "step": 280 }, { "epoch": 0.00028705502693784804, "grad_norm": 164.32375103652484, "learning_rate": 2.8704953367040674e-07, "loss": 2.6754, "mean_token_accuracy": 0.4620689690113068, "step": 285 }, { "epoch": 0.00029209108004202083, "grad_norm": 127.73209131432515, "learning_rate": 2.920854904014665e-07, "loss": 2.767, "mean_token_accuracy": 0.41379310488700866, "step": 290 }, { "epoch": 0.0002971271331461936, "grad_norm": 95.51532568889598, "learning_rate": 2.9712144713252626e-07, "loss": 2.7629, "mean_token_accuracy": 0.41034482717514037, "step": 295 }, { "epoch": 0.00030216318625036635, "grad_norm": 142.2580984331193, "learning_rate": 3.0215740386358604e-07, "loss": 3.0414, "mean_token_accuracy": 0.3551724135875702, "step": 300 }, { "epoch": 0.00030719923935453914, "grad_norm": 148.80577494007386, "learning_rate": 3.0719336059464577e-07, "loss": 2.9303, "mean_token_accuracy": 0.3931034505367279, "step": 305 }, { "epoch": 0.00031223529245871193, "grad_norm": 130.62579712147522, "learning_rate": 3.1222931732570555e-07, "loss": 2.6566, "mean_token_accuracy": 0.42413792610168455, "step": 310 }, { "epoch": 0.00031727134556288467, "grad_norm": 147.32065904244521, "learning_rate": 3.1726527405676533e-07, "loss": 2.6772, "mean_token_accuracy": 0.42758620977401735, "step": 315 }, { "epoch": 0.00032230739866705746, "grad_norm": 192.00664416812978, "learning_rate": 3.2230123078782506e-07, "loss": 2.7212, "mean_token_accuracy": 0.4255293428897858, "step": 320 }, { "epoch": 0.00032734345177123025, "grad_norm": 103.49225237823313, "learning_rate": 3.2733718751888484e-07, "loss": 2.926, "mean_token_accuracy": 0.34827586710453035, "step": 325 }, { "epoch": 0.000332379504875403, "grad_norm": 127.22193639149162, "learning_rate": 3.3237314424994463e-07, "loss": 2.8522, "mean_token_accuracy": 0.4, "step": 330 }, { "epoch": 0.0003374155579795758, "grad_norm": 116.89075063440451, "learning_rate": 3.374091009810044e-07, "loss": 2.603, "mean_token_accuracy": 0.4413793087005615, "step": 335 }, { "epoch": 0.00034245161108374856, "grad_norm": 102.90675650693785, "learning_rate": 3.424450577120642e-07, "loss": 2.4348, "mean_token_accuracy": 0.43793103098869324, "step": 340 }, { "epoch": 0.00034748766418792135, "grad_norm": 129.28524192411595, "learning_rate": 3.474810144431239e-07, "loss": 2.9245, "mean_token_accuracy": 0.3896551728248596, "step": 345 }, { "epoch": 0.0003525237172920941, "grad_norm": 118.97989649596283, "learning_rate": 3.525169711741837e-07, "loss": 2.4452, "mean_token_accuracy": 0.43448275327682495, "step": 350 }, { "epoch": 0.0003575597703962669, "grad_norm": 111.56592749408559, "learning_rate": 3.5755292790524343e-07, "loss": 2.9099, "mean_token_accuracy": 0.41639443039894103, "step": 355 }, { "epoch": 0.00036259582350043967, "grad_norm": 144.8197948477708, "learning_rate": 3.625888846363032e-07, "loss": 2.9199, "mean_token_accuracy": 0.3896551728248596, "step": 360 }, { "epoch": 0.0003676318766046124, "grad_norm": 95.28332303023947, "learning_rate": 3.67624841367363e-07, "loss": 3.0385, "mean_token_accuracy": 0.37241379618644715, "step": 365 }, { "epoch": 0.0003726679297087852, "grad_norm": 130.90212137646938, "learning_rate": 3.7266079809842273e-07, "loss": 2.4822, "mean_token_accuracy": 0.4517241418361664, "step": 370 }, { "epoch": 0.000377703982812958, "grad_norm": 84.3170716920256, "learning_rate": 3.776967548294825e-07, "loss": 2.4249, "mean_token_accuracy": 0.5034482657909394, "step": 375 }, { "epoch": 0.0003827400359171307, "grad_norm": 115.19065645002041, "learning_rate": 3.827327115605423e-07, "loss": 2.8054, "mean_token_accuracy": 0.4602540791034698, "step": 380 }, { "epoch": 0.0003877760890213035, "grad_norm": 171.33198789937248, "learning_rate": 3.8776866829160207e-07, "loss": 2.8631, "mean_token_accuracy": 0.3676108419895172, "step": 385 }, { "epoch": 0.0003928121421254763, "grad_norm": 136.27446247446824, "learning_rate": 3.9280462502266186e-07, "loss": 2.6387, "mean_token_accuracy": 0.45517241954803467, "step": 390 }, { "epoch": 0.00039784819522964903, "grad_norm": 129.7593181067108, "learning_rate": 3.978405817537216e-07, "loss": 2.6709, "mean_token_accuracy": 0.42413793206214906, "step": 395 }, { "epoch": 0.0004028842483338218, "grad_norm": 112.10185767125414, "learning_rate": 4.0287653848478137e-07, "loss": 2.7646, "mean_token_accuracy": 0.37586207389831544, "step": 400 }, { "epoch": 0.0004079203014379946, "grad_norm": 95.84334217571804, "learning_rate": 4.0791249521584115e-07, "loss": 2.7615, "mean_token_accuracy": 0.403448274731636, "step": 405 }, { "epoch": 0.0004129563545421674, "grad_norm": 106.4905683415504, "learning_rate": 4.129484519469009e-07, "loss": 2.7835, "mean_token_accuracy": 0.41161524653434756, "step": 410 }, { "epoch": 0.00041799240764634014, "grad_norm": 82.77934931657926, "learning_rate": 4.1798440867796066e-07, "loss": 2.5381, "mean_token_accuracy": 0.45517241954803467, "step": 415 }, { "epoch": 0.00042302846075051293, "grad_norm": 105.1284031779718, "learning_rate": 4.230203654090204e-07, "loss": 2.8011, "mean_token_accuracy": 0.3836055725812912, "step": 420 }, { "epoch": 0.0004280645138546857, "grad_norm": 101.16566563275299, "learning_rate": 4.2805632214008017e-07, "loss": 2.3296, "mean_token_accuracy": 0.4620689630508423, "step": 425 }, { "epoch": 0.00043310056695885845, "grad_norm": 95.80579434894857, "learning_rate": 4.3309227887113996e-07, "loss": 2.3242, "mean_token_accuracy": 0.46551724076271056, "step": 430 }, { "epoch": 0.00043813662006303124, "grad_norm": 141.8439665770817, "learning_rate": 4.381282356021997e-07, "loss": 2.6309, "mean_token_accuracy": 0.38275861740112305, "step": 435 }, { "epoch": 0.00044317267316720403, "grad_norm": 81.4978736073144, "learning_rate": 4.4316419233325947e-07, "loss": 2.4957, "mean_token_accuracy": 0.43448275327682495, "step": 440 }, { "epoch": 0.00044820872627137677, "grad_norm": 89.41859589037193, "learning_rate": 4.4820014906431925e-07, "loss": 2.6591, "mean_token_accuracy": 0.4379310429096222, "step": 445 }, { "epoch": 0.00045324477937554956, "grad_norm": 117.22822735881033, "learning_rate": 4.53236105795379e-07, "loss": 2.6265, "mean_token_accuracy": 0.42413793206214906, "step": 450 }, { "epoch": 0.00045828083247972235, "grad_norm": 117.772096619878, "learning_rate": 4.5827206252643876e-07, "loss": 2.4679, "mean_token_accuracy": 0.46896551847457885, "step": 455 }, { "epoch": 0.0004633168855838951, "grad_norm": 119.45972976735337, "learning_rate": 4.6330801925749854e-07, "loss": 2.7592, "mean_token_accuracy": 0.42183908224105837, "step": 460 }, { "epoch": 0.00046835293868806787, "grad_norm": 101.66925022479907, "learning_rate": 4.683439759885584e-07, "loss": 2.5975, "mean_token_accuracy": 0.44162561297416686, "step": 465 }, { "epoch": 0.00047338899179224066, "grad_norm": 74.51822544368427, "learning_rate": 4.733799327196181e-07, "loss": 2.2852, "mean_token_accuracy": 0.46551724672317507, "step": 470 }, { "epoch": 0.0004784250448964134, "grad_norm": 104.31649200353398, "learning_rate": 4.784158894506779e-07, "loss": 2.7482, "mean_token_accuracy": 0.3758620619773865, "step": 475 }, { "epoch": 0.0004834610980005862, "grad_norm": 70.35465134175018, "learning_rate": 4.834518461817376e-07, "loss": 2.4995, "mean_token_accuracy": 0.4379310369491577, "step": 480 }, { "epoch": 0.000488497151104759, "grad_norm": 97.30671099352266, "learning_rate": 4.884878029127974e-07, "loss": 2.7713, "mean_token_accuracy": 0.4172413766384125, "step": 485 }, { "epoch": 0.0004935332042089318, "grad_norm": 128.57769163039333, "learning_rate": 4.935237596438572e-07, "loss": 2.9798, "mean_token_accuracy": 0.3931034505367279, "step": 490 }, { "epoch": 0.0004985692573131046, "grad_norm": 109.78819543288185, "learning_rate": 4.98559716374917e-07, "loss": 2.5184, "mean_token_accuracy": 0.4206896543502808, "step": 495 }, { "epoch": 0.0005036053104172772, "grad_norm": 107.37855232859297, "learning_rate": 5.035956731059767e-07, "loss": 2.7295, "mean_token_accuracy": 0.39310345649719236, "step": 500 }, { "epoch": 0.00050864136352145, "grad_norm": 77.71522135810928, "learning_rate": 5.086316298370364e-07, "loss": 2.5853, "mean_token_accuracy": 0.4517241358757019, "step": 505 }, { "epoch": 0.0005136774166256228, "grad_norm": 135.20387787149846, "learning_rate": 5.136675865680962e-07, "loss": 2.5211, "mean_token_accuracy": 0.42758620977401735, "step": 510 }, { "epoch": 0.0005187134697297956, "grad_norm": 116.21072332002251, "learning_rate": 5.18703543299156e-07, "loss": 2.7668, "mean_token_accuracy": 0.4068965554237366, "step": 515 }, { "epoch": 0.0005237495228339684, "grad_norm": 96.95603099242913, "learning_rate": 5.237395000302158e-07, "loss": 2.6718, "mean_token_accuracy": 0.4172413766384125, "step": 520 }, { "epoch": 0.0005287855759381412, "grad_norm": 100.72935932881842, "learning_rate": 5.287754567612756e-07, "loss": 3.1407, "mean_token_accuracy": 0.3827586233615875, "step": 525 }, { "epoch": 0.000533821629042314, "grad_norm": 125.46755545312327, "learning_rate": 5.338114134923352e-07, "loss": 2.5017, "mean_token_accuracy": 0.4310344815254211, "step": 530 }, { "epoch": 0.0005388576821464867, "grad_norm": 110.32098076381747, "learning_rate": 5.38847370223395e-07, "loss": 2.5591, "mean_token_accuracy": 0.4310344815254211, "step": 535 }, { "epoch": 0.0005438937352506594, "grad_norm": 98.3318094080258, "learning_rate": 5.438833269544548e-07, "loss": 2.7035, "mean_token_accuracy": 0.4, "step": 540 }, { "epoch": 0.0005489297883548322, "grad_norm": 68.26146745661678, "learning_rate": 5.489192836855146e-07, "loss": 2.8647, "mean_token_accuracy": 0.41476104259490965, "step": 545 }, { "epoch": 0.000553965841459005, "grad_norm": 78.42414525742541, "learning_rate": 5.539552404165744e-07, "loss": 2.635, "mean_token_accuracy": 0.4310344815254211, "step": 550 }, { "epoch": 0.0005590018945631778, "grad_norm": 91.61441788019096, "learning_rate": 5.589911971476341e-07, "loss": 2.5186, "mean_token_accuracy": 0.441379314661026, "step": 555 }, { "epoch": 0.0005640379476673506, "grad_norm": 106.20995303402762, "learning_rate": 5.640271538786938e-07, "loss": 2.6926, "mean_token_accuracy": 0.42413793206214906, "step": 560 }, { "epoch": 0.0005690740007715233, "grad_norm": 141.215242800728, "learning_rate": 5.690631106097537e-07, "loss": 2.7865, "mean_token_accuracy": 0.4010889232158661, "step": 565 }, { "epoch": 0.0005741100538756961, "grad_norm": 85.06108014144998, "learning_rate": 5.740990673408135e-07, "loss": 2.7034, "mean_token_accuracy": 0.4517241358757019, "step": 570 }, { "epoch": 0.0005791461069798689, "grad_norm": 108.0575295666833, "learning_rate": 5.791350240718733e-07, "loss": 2.6878, "mean_token_accuracy": 0.4517241358757019, "step": 575 }, { "epoch": 0.0005841821600840417, "grad_norm": 75.89057427381948, "learning_rate": 5.84170980802933e-07, "loss": 2.3963, "mean_token_accuracy": 0.4517241358757019, "step": 580 }, { "epoch": 0.0005892182131882144, "grad_norm": 101.216650291623, "learning_rate": 5.892069375339927e-07, "loss": 2.2118, "mean_token_accuracy": 0.5068965554237366, "step": 585 }, { "epoch": 0.0005942542662923872, "grad_norm": 88.03001336056393, "learning_rate": 5.942428942650525e-07, "loss": 2.5335, "mean_token_accuracy": 0.47404718995094297, "step": 590 }, { "epoch": 0.00059929031939656, "grad_norm": 92.11349880726202, "learning_rate": 5.992788509961123e-07, "loss": 2.1097, "mean_token_accuracy": 0.46551724076271056, "step": 595 }, { "epoch": 0.0006043263725007327, "grad_norm": 121.245838725664, "learning_rate": 6.043148077271721e-07, "loss": 2.3172, "mean_token_accuracy": 0.46896552443504336, "step": 600 }, { "epoch": 0.0006093624256049055, "grad_norm": 79.52855727841695, "learning_rate": 6.093507644582319e-07, "loss": 2.3817, "mean_token_accuracy": 0.46696914434432985, "step": 605 }, { "epoch": 0.0006143984787090783, "grad_norm": 67.81969591475001, "learning_rate": 6.143867211892915e-07, "loss": 2.4404, "mean_token_accuracy": 0.46551724076271056, "step": 610 }, { "epoch": 0.0006194345318132511, "grad_norm": 92.77669389751524, "learning_rate": 6.194226779203513e-07, "loss": 2.5152, "mean_token_accuracy": 0.4068965494632721, "step": 615 }, { "epoch": 0.0006244705849174239, "grad_norm": 80.48286293005252, "learning_rate": 6.244586346514111e-07, "loss": 2.4309, "mean_token_accuracy": 0.47241379618644713, "step": 620 }, { "epoch": 0.0006295066380215967, "grad_norm": 61.75806798712446, "learning_rate": 6.294945913824709e-07, "loss": 2.5416, "mean_token_accuracy": 0.4551724135875702, "step": 625 }, { "epoch": 0.0006345426911257693, "grad_norm": 74.23387017595724, "learning_rate": 6.345305481135307e-07, "loss": 2.6247, "mean_token_accuracy": 0.4689655125141144, "step": 630 }, { "epoch": 0.0006395787442299421, "grad_norm": 76.06760280009239, "learning_rate": 6.395665048445903e-07, "loss": 2.8092, "mean_token_accuracy": 0.4000000059604645, "step": 635 }, { "epoch": 0.0006446147973341149, "grad_norm": 85.24232343466204, "learning_rate": 6.446024615756501e-07, "loss": 2.8278, "mean_token_accuracy": 0.3862069010734558, "step": 640 }, { "epoch": 0.0006496508504382877, "grad_norm": 86.6529097753573, "learning_rate": 6.496384183067099e-07, "loss": 2.5016, "mean_token_accuracy": 0.4413793206214905, "step": 645 }, { "epoch": 0.0006546869035424605, "grad_norm": 104.60639124238119, "learning_rate": 6.546743750377697e-07, "loss": 2.4112, "mean_token_accuracy": 0.4655172348022461, "step": 650 }, { "epoch": 0.0006597229566466333, "grad_norm": 93.64425274786298, "learning_rate": 6.597103317688295e-07, "loss": 2.3852, "mean_token_accuracy": 0.4206896543502808, "step": 655 }, { "epoch": 0.000664759009750806, "grad_norm": 76.01933409259676, "learning_rate": 6.647462884998893e-07, "loss": 2.6791, "mean_token_accuracy": 0.3999999940395355, "step": 660 }, { "epoch": 0.0006697950628549788, "grad_norm": 98.87426350115594, "learning_rate": 6.69782245230949e-07, "loss": 2.3945, "mean_token_accuracy": 0.4448275864124298, "step": 665 }, { "epoch": 0.0006748311159591515, "grad_norm": 92.22357358011968, "learning_rate": 6.748182019620088e-07, "loss": 2.3475, "mean_token_accuracy": 0.4620689690113068, "step": 670 }, { "epoch": 0.0006798671690633243, "grad_norm": 81.95174569212841, "learning_rate": 6.798541586930686e-07, "loss": 2.4461, "mean_token_accuracy": 0.4413793087005615, "step": 675 }, { "epoch": 0.0006849032221674971, "grad_norm": 79.96710306620447, "learning_rate": 6.848901154241284e-07, "loss": 2.4317, "mean_token_accuracy": 0.4517241358757019, "step": 680 }, { "epoch": 0.0006899392752716699, "grad_norm": 103.32052424746628, "learning_rate": 6.899260721551881e-07, "loss": 2.2136, "mean_token_accuracy": 0.458620685338974, "step": 685 }, { "epoch": 0.0006949753283758427, "grad_norm": 86.42014498307215, "learning_rate": 6.949620288862478e-07, "loss": 2.579, "mean_token_accuracy": 0.43448275327682495, "step": 690 }, { "epoch": 0.0007000113814800154, "grad_norm": 74.84087535799382, "learning_rate": 6.999979856173076e-07, "loss": 2.6747, "mean_token_accuracy": 0.38275861740112305, "step": 695 }, { "epoch": 0.0007050474345841882, "grad_norm": 116.48024543484408, "learning_rate": 7.050339423483674e-07, "loss": 2.5092, "mean_token_accuracy": 0.4103448331356049, "step": 700 }, { "epoch": 0.000710083487688361, "grad_norm": 93.84051280549203, "learning_rate": 7.100698990794272e-07, "loss": 2.3428, "mean_token_accuracy": 0.4689655125141144, "step": 705 }, { "epoch": 0.0007151195407925338, "grad_norm": 176.4810165342915, "learning_rate": 7.151058558104869e-07, "loss": 2.4544, "mean_token_accuracy": 0.4620689630508423, "step": 710 }, { "epoch": 0.0007201555938967065, "grad_norm": 123.49408801166076, "learning_rate": 7.201418125415466e-07, "loss": 2.5885, "mean_token_accuracy": 0.4413793087005615, "step": 715 }, { "epoch": 0.0007251916470008793, "grad_norm": 83.05349987822244, "learning_rate": 7.251777692726064e-07, "loss": 2.7422, "mean_token_accuracy": 0.42413793206214906, "step": 720 }, { "epoch": 0.000730227700105052, "grad_norm": 83.15795900002738, "learning_rate": 7.302137260036662e-07, "loss": 2.2589, "mean_token_accuracy": 0.4482758641242981, "step": 725 }, { "epoch": 0.0007352637532092248, "grad_norm": 75.12910461884285, "learning_rate": 7.35249682734726e-07, "loss": 2.4144, "mean_token_accuracy": 0.4793103516101837, "step": 730 }, { "epoch": 0.0007402998063133976, "grad_norm": 76.6862144802982, "learning_rate": 7.402856394657858e-07, "loss": 2.7284, "mean_token_accuracy": 0.41379310488700866, "step": 735 }, { "epoch": 0.0007453358594175704, "grad_norm": 79.02963844112384, "learning_rate": 7.453215961968455e-07, "loss": 2.4321, "mean_token_accuracy": 0.4862068951129913, "step": 740 }, { "epoch": 0.0007503719125217432, "grad_norm": 89.21830756888185, "learning_rate": 7.503575529279052e-07, "loss": 2.5776, "mean_token_accuracy": 0.42758620977401735, "step": 745 }, { "epoch": 0.000755407965625916, "grad_norm": 119.60579380554891, "learning_rate": 7.55393509658965e-07, "loss": 2.3662, "mean_token_accuracy": 0.44827585220336913, "step": 750 }, { "epoch": 0.0007604440187300888, "grad_norm": 120.12614404675867, "learning_rate": 7.604294663900248e-07, "loss": 2.609, "mean_token_accuracy": 0.4068965494632721, "step": 755 }, { "epoch": 0.0007654800718342614, "grad_norm": 77.52867628278035, "learning_rate": 7.654654231210846e-07, "loss": 2.4938, "mean_token_accuracy": 0.4137930989265442, "step": 760 }, { "epoch": 0.0007705161249384342, "grad_norm": 112.22966822304785, "learning_rate": 7.705013798521443e-07, "loss": 2.3313, "mean_token_accuracy": 0.46551724672317507, "step": 765 }, { "epoch": 0.000775552178042607, "grad_norm": 131.61596142722587, "learning_rate": 7.755373365832041e-07, "loss": 2.3593, "mean_token_accuracy": 0.4310344815254211, "step": 770 }, { "epoch": 0.0007805882311467798, "grad_norm": 79.68065032041677, "learning_rate": 7.805732933142639e-07, "loss": 2.4274, "mean_token_accuracy": 0.4517241358757019, "step": 775 }, { "epoch": 0.0007856242842509526, "grad_norm": 114.9038039336646, "learning_rate": 7.856092500453237e-07, "loss": 2.2574, "mean_token_accuracy": 0.4413793087005615, "step": 780 }, { "epoch": 0.0007906603373551254, "grad_norm": 119.73169033830584, "learning_rate": 7.906452067763835e-07, "loss": 2.335, "mean_token_accuracy": 0.47586206197738645, "step": 785 }, { "epoch": 0.0007956963904592981, "grad_norm": 92.57948321194462, "learning_rate": 7.956811635074432e-07, "loss": 2.7936, "mean_token_accuracy": 0.4, "step": 790 }, { "epoch": 0.0008007324435634709, "grad_norm": 101.27560151020234, "learning_rate": 8.00717120238503e-07, "loss": 2.5011, "mean_token_accuracy": 0.43103447556495667, "step": 795 }, { "epoch": 0.0008057684966676436, "grad_norm": 96.97006473089812, "learning_rate": 8.057530769695627e-07, "loss": 2.4138, "mean_token_accuracy": 0.4241379380226135, "step": 800 }, { "epoch": 0.0008108045497718164, "grad_norm": 102.94666468476578, "learning_rate": 8.107890337006225e-07, "loss": 2.4822, "mean_token_accuracy": 0.3999999940395355, "step": 805 }, { "epoch": 0.0008158406028759892, "grad_norm": 75.38572226991253, "learning_rate": 8.158249904316823e-07, "loss": 2.7287, "mean_token_accuracy": 0.379310342669487, "step": 810 }, { "epoch": 0.000820876655980162, "grad_norm": 84.97815107088769, "learning_rate": 8.20860947162742e-07, "loss": 2.3224, "mean_token_accuracy": 0.4517241299152374, "step": 815 }, { "epoch": 0.0008259127090843348, "grad_norm": 70.05670008505929, "learning_rate": 8.258969038938018e-07, "loss": 2.4871, "mean_token_accuracy": 0.42758620977401735, "step": 820 }, { "epoch": 0.0008309487621885075, "grad_norm": 76.18238155734613, "learning_rate": 8.309328606248615e-07, "loss": 2.187, "mean_token_accuracy": 0.47586206793785096, "step": 825 }, { "epoch": 0.0008359848152926803, "grad_norm": 63.141514681604384, "learning_rate": 8.359688173559213e-07, "loss": 2.3036, "mean_token_accuracy": 0.4551724135875702, "step": 830 }, { "epoch": 0.0008410208683968531, "grad_norm": 109.9644745805576, "learning_rate": 8.410047740869811e-07, "loss": 2.5717, "mean_token_accuracy": 0.4103448301553726, "step": 835 }, { "epoch": 0.0008460569215010259, "grad_norm": 87.77664891234033, "learning_rate": 8.460407308180408e-07, "loss": 2.4026, "mean_token_accuracy": 0.4517241418361664, "step": 840 }, { "epoch": 0.0008510929746051986, "grad_norm": 69.01201408024284, "learning_rate": 8.510766875491006e-07, "loss": 2.3477, "mean_token_accuracy": 0.4758620738983154, "step": 845 }, { "epoch": 0.0008561290277093714, "grad_norm": 121.60129783717338, "learning_rate": 8.561126442801603e-07, "loss": 1.9042, "mean_token_accuracy": 0.5482758581638336, "step": 850 }, { "epoch": 0.0008611650808135441, "grad_norm": 121.32099991086666, "learning_rate": 8.611486010112201e-07, "loss": 2.1015, "mean_token_accuracy": 0.4931034505367279, "step": 855 }, { "epoch": 0.0008662011339177169, "grad_norm": 104.05189548327019, "learning_rate": 8.661845577422799e-07, "loss": 2.1571, "mean_token_accuracy": 0.49655172824859617, "step": 860 }, { "epoch": 0.0008712371870218897, "grad_norm": 125.16861668453716, "learning_rate": 8.712205144733397e-07, "loss": 2.1423, "mean_token_accuracy": 0.4965517222881317, "step": 865 }, { "epoch": 0.0008762732401260625, "grad_norm": 105.3064722604332, "learning_rate": 8.762564712043994e-07, "loss": 2.1473, "mean_token_accuracy": 0.5000000059604645, "step": 870 }, { "epoch": 0.0008813092932302353, "grad_norm": 70.464647210329, "learning_rate": 8.812924279354592e-07, "loss": 1.8264, "mean_token_accuracy": 0.5344827532768249, "step": 875 }, { "epoch": 0.0008863453463344081, "grad_norm": 77.82747467870017, "learning_rate": 8.863283846665189e-07, "loss": 2.22, "mean_token_accuracy": 0.48965518474578856, "step": 880 }, { "epoch": 0.0008913813994385809, "grad_norm": 70.69831654174507, "learning_rate": 8.913643413975787e-07, "loss": 2.2467, "mean_token_accuracy": 0.45862067937850953, "step": 885 }, { "epoch": 0.0008964174525427535, "grad_norm": 67.69527822701062, "learning_rate": 8.964002981286385e-07, "loss": 2.2231, "mean_token_accuracy": 0.4551724076271057, "step": 890 }, { "epoch": 0.0009014535056469263, "grad_norm": 78.5600702087927, "learning_rate": 9.014362548596982e-07, "loss": 2.2696, "mean_token_accuracy": 0.4379310369491577, "step": 895 }, { "epoch": 0.0009064895587510991, "grad_norm": 90.37824983572487, "learning_rate": 9.06472211590758e-07, "loss": 2.3338, "mean_token_accuracy": 0.4396854221820831, "step": 900 }, { "epoch": 0.0009115256118552719, "grad_norm": 83.8315104397229, "learning_rate": 9.115081683218177e-07, "loss": 2.5784, "mean_token_accuracy": 0.42068966031074523, "step": 905 }, { "epoch": 0.0009165616649594447, "grad_norm": 73.61129192415399, "learning_rate": 9.165441250528775e-07, "loss": 2.3417, "mean_token_accuracy": 0.42413793206214906, "step": 910 }, { "epoch": 0.0009215977180636175, "grad_norm": 87.10247355700956, "learning_rate": 9.215800817839373e-07, "loss": 2.766, "mean_token_accuracy": 0.36896551847457887, "step": 915 }, { "epoch": 0.0009266337711677902, "grad_norm": 89.69322759711844, "learning_rate": 9.266160385149971e-07, "loss": 2.4475, "mean_token_accuracy": 0.42413792610168455, "step": 920 }, { "epoch": 0.000931669824271963, "grad_norm": 78.14797469167598, "learning_rate": 9.31651995246057e-07, "loss": 2.4458, "mean_token_accuracy": 0.4448275864124298, "step": 925 }, { "epoch": 0.0009367058773761357, "grad_norm": 85.58270905685941, "learning_rate": 9.366879519771168e-07, "loss": 2.8264, "mean_token_accuracy": 0.3655172437429428, "step": 930 }, { "epoch": 0.0009417419304803085, "grad_norm": 80.21212458809858, "learning_rate": 9.417239087081765e-07, "loss": 2.2188, "mean_token_accuracy": 0.4689655125141144, "step": 935 }, { "epoch": 0.0009467779835844813, "grad_norm": 80.47822616708802, "learning_rate": 9.467598654392362e-07, "loss": 2.4089, "mean_token_accuracy": 0.4551724135875702, "step": 940 }, { "epoch": 0.0009518140366886541, "grad_norm": 69.37473453455668, "learning_rate": 9.51795822170296e-07, "loss": 2.1651, "mean_token_accuracy": 0.4620689630508423, "step": 945 }, { "epoch": 0.0009568500897928268, "grad_norm": 64.46855922972864, "learning_rate": 9.568317789013559e-07, "loss": 2.3544, "mean_token_accuracy": 0.4068965494632721, "step": 950 }, { "epoch": 0.0009618861428969996, "grad_norm": 97.94578556230171, "learning_rate": 9.618677356324156e-07, "loss": 2.1219, "mean_token_accuracy": 0.4896551728248596, "step": 955 }, { "epoch": 0.0009669221960011724, "grad_norm": 82.31751501537289, "learning_rate": 9.669036923634752e-07, "loss": 1.9896, "mean_token_accuracy": 0.5241379261016845, "step": 960 }, { "epoch": 0.0009719582491053452, "grad_norm": 90.64651861642238, "learning_rate": 9.719396490945351e-07, "loss": 2.3545, "mean_token_accuracy": 0.4275862157344818, "step": 965 }, { "epoch": 0.000976994302209518, "grad_norm": 73.91083962121589, "learning_rate": 9.769756058255948e-07, "loss": 2.0212, "mean_token_accuracy": 0.5068965494632721, "step": 970 }, { "epoch": 0.0009820303553136906, "grad_norm": 98.13645531687172, "learning_rate": 9.820115625566547e-07, "loss": 2.2457, "mean_token_accuracy": 0.4586206912994385, "step": 975 }, { "epoch": 0.0009870664084178635, "grad_norm": 105.73782207242293, "learning_rate": 9.870475192877144e-07, "loss": 2.6174, "mean_token_accuracy": 0.4448275864124298, "step": 980 }, { "epoch": 0.0009921024615220362, "grad_norm": 94.53904273037824, "learning_rate": 9.92083476018774e-07, "loss": 2.4942, "mean_token_accuracy": 0.4379310369491577, "step": 985 }, { "epoch": 0.0009971385146262091, "grad_norm": 95.10209112496764, "learning_rate": 9.97119432749834e-07, "loss": 2.2514, "mean_token_accuracy": 0.458620685338974, "step": 990 }, { "epoch": 0.0010021745677303818, "grad_norm": 94.55687912818004, "learning_rate": 1.0021553894808936e-06, "loss": 2.6231, "mean_token_accuracy": 0.4068965554237366, "step": 995 }, { "epoch": 0.0010072106208345545, "grad_norm": 87.29290646053556, "learning_rate": 1.0071913462119535e-06, "loss": 2.1165, "mean_token_accuracy": 0.4310344696044922, "step": 1000 }, { "epoch": 0.0010122466739387274, "grad_norm": 82.80634468832311, "learning_rate": 1.0122273029430132e-06, "loss": 2.8514, "mean_token_accuracy": 0.38620689511299133, "step": 1005 }, { "epoch": 0.0010172827270429, "grad_norm": 98.21734279194256, "learning_rate": 1.0172632596740729e-06, "loss": 2.1538, "mean_token_accuracy": 0.46358137130737304, "step": 1010 }, { "epoch": 0.001022318780147073, "grad_norm": 81.24226898033034, "learning_rate": 1.0222992164051327e-06, "loss": 2.2706, "mean_token_accuracy": 0.4206896543502808, "step": 1015 }, { "epoch": 0.0010273548332512456, "grad_norm": 89.83676190967354, "learning_rate": 1.0273351731361924e-06, "loss": 2.2762, "mean_token_accuracy": 0.4551724076271057, "step": 1020 }, { "epoch": 0.0010323908863554185, "grad_norm": 84.52068018737582, "learning_rate": 1.0323711298672523e-06, "loss": 2.1873, "mean_token_accuracy": 0.5087719261646271, "step": 1025 }, { "epoch": 0.0010374269394595912, "grad_norm": 75.93525632067038, "learning_rate": 1.037407086598312e-06, "loss": 2.3937, "mean_token_accuracy": 0.4344827592372894, "step": 1030 }, { "epoch": 0.001042462992563764, "grad_norm": 89.77920301537786, "learning_rate": 1.0424430433293717e-06, "loss": 2.5033, "mean_token_accuracy": 0.4241379380226135, "step": 1035 }, { "epoch": 0.0010474990456679368, "grad_norm": 70.79544822302483, "learning_rate": 1.0474790000604315e-06, "loss": 2.2906, "mean_token_accuracy": 0.38275861740112305, "step": 1040 }, { "epoch": 0.0010525350987721095, "grad_norm": 54.96460134605188, "learning_rate": 1.0525149567914912e-06, "loss": 2.3459, "mean_token_accuracy": 0.4310344815254211, "step": 1045 }, { "epoch": 0.0010575711518762824, "grad_norm": 125.77867336302904, "learning_rate": 1.0575509135225511e-06, "loss": 2.497, "mean_token_accuracy": 0.43448275327682495, "step": 1050 }, { "epoch": 0.001062607204980455, "grad_norm": 64.89975992149849, "learning_rate": 1.0625868702536108e-06, "loss": 2.478, "mean_token_accuracy": 0.441379314661026, "step": 1055 }, { "epoch": 0.001067643258084628, "grad_norm": 94.86657410197256, "learning_rate": 1.0676228269846705e-06, "loss": 2.6097, "mean_token_accuracy": 0.4137930989265442, "step": 1060 }, { "epoch": 0.0010726793111888006, "grad_norm": 69.48415418506345, "learning_rate": 1.0726587837157304e-06, "loss": 2.3105, "mean_token_accuracy": 0.4379310429096222, "step": 1065 }, { "epoch": 0.0010777153642929733, "grad_norm": 77.21060146011098, "learning_rate": 1.07769474044679e-06, "loss": 2.0062, "mean_token_accuracy": 0.482758617401123, "step": 1070 }, { "epoch": 0.0010827514173971462, "grad_norm": 116.27805213569609, "learning_rate": 1.08273069717785e-06, "loss": 2.4718, "mean_token_accuracy": 0.44827585816383364, "step": 1075 }, { "epoch": 0.001087787470501319, "grad_norm": 88.34489248480183, "learning_rate": 1.0877666539089096e-06, "loss": 2.4136, "mean_token_accuracy": 0.4413793087005615, "step": 1080 }, { "epoch": 0.0010928235236054918, "grad_norm": 95.08995116339224, "learning_rate": 1.0928026106399695e-06, "loss": 2.1384, "mean_token_accuracy": 0.4862069010734558, "step": 1085 }, { "epoch": 0.0010978595767096645, "grad_norm": 93.39381170238919, "learning_rate": 1.0978385673710292e-06, "loss": 2.4491, "mean_token_accuracy": 0.4655172526836395, "step": 1090 }, { "epoch": 0.0011028956298138372, "grad_norm": 89.49662934117966, "learning_rate": 1.1028745241020888e-06, "loss": 2.4618, "mean_token_accuracy": 0.4344827651977539, "step": 1095 }, { "epoch": 0.00110793168291801, "grad_norm": 79.1104342235396, "learning_rate": 1.1079104808331487e-06, "loss": 2.3571, "mean_token_accuracy": 0.45517240166664125, "step": 1100 }, { "epoch": 0.0011129677360221827, "grad_norm": 98.20149378743784, "learning_rate": 1.1129464375642084e-06, "loss": 2.484, "mean_token_accuracy": 0.4448275864124298, "step": 1105 }, { "epoch": 0.0011180037891263556, "grad_norm": 79.82266104144223, "learning_rate": 1.1179823942952683e-06, "loss": 2.0848, "mean_token_accuracy": 0.5068965435028077, "step": 1110 }, { "epoch": 0.0011230398422305283, "grad_norm": 85.3229008140252, "learning_rate": 1.123018351026328e-06, "loss": 2.3575, "mean_token_accuracy": 0.43103447556495667, "step": 1115 }, { "epoch": 0.0011280758953347012, "grad_norm": 87.4265057563504, "learning_rate": 1.1280543077573876e-06, "loss": 2.4454, "mean_token_accuracy": 0.4206896543502808, "step": 1120 }, { "epoch": 0.001133111948438874, "grad_norm": 92.87773421724492, "learning_rate": 1.1330902644884477e-06, "loss": 2.0481, "mean_token_accuracy": 0.4724137902259827, "step": 1125 }, { "epoch": 0.0011381480015430466, "grad_norm": 101.75831315850915, "learning_rate": 1.1381262212195074e-06, "loss": 2.1305, "mean_token_accuracy": 0.4896551609039307, "step": 1130 }, { "epoch": 0.0011431840546472195, "grad_norm": 70.03599602938513, "learning_rate": 1.143162177950567e-06, "loss": 2.0372, "mean_token_accuracy": 0.5034482717514038, "step": 1135 }, { "epoch": 0.0011482201077513922, "grad_norm": 98.2055206900503, "learning_rate": 1.148198134681627e-06, "loss": 2.3925, "mean_token_accuracy": 0.4551724135875702, "step": 1140 }, { "epoch": 0.001153256160855565, "grad_norm": 92.20626916972, "learning_rate": 1.1532340914126867e-06, "loss": 2.4617, "mean_token_accuracy": 0.39655172228813174, "step": 1145 }, { "epoch": 0.0011582922139597377, "grad_norm": 85.11082646832499, "learning_rate": 1.1582700481437465e-06, "loss": 2.0419, "mean_token_accuracy": 0.510344821214676, "step": 1150 }, { "epoch": 0.0011633282670639106, "grad_norm": 63.86010897517878, "learning_rate": 1.1633060048748062e-06, "loss": 2.381, "mean_token_accuracy": 0.41724138259887694, "step": 1155 }, { "epoch": 0.0011683643201680833, "grad_norm": 125.54070223178105, "learning_rate": 1.168341961605866e-06, "loss": 2.2377, "mean_token_accuracy": 0.4655172288417816, "step": 1160 }, { "epoch": 0.001173400373272256, "grad_norm": 90.16110689488018, "learning_rate": 1.1733779183369258e-06, "loss": 2.2894, "mean_token_accuracy": 0.482758617401123, "step": 1165 }, { "epoch": 0.001178436426376429, "grad_norm": 83.87535543555006, "learning_rate": 1.1784138750679855e-06, "loss": 2.6102, "mean_token_accuracy": 0.4121597111225128, "step": 1170 }, { "epoch": 0.0011834724794806016, "grad_norm": 100.54991805051277, "learning_rate": 1.1834498317990453e-06, "loss": 2.1402, "mean_token_accuracy": 0.5172413766384125, "step": 1175 }, { "epoch": 0.0011885085325847745, "grad_norm": 105.51429755094078, "learning_rate": 1.188485788530105e-06, "loss": 2.1268, "mean_token_accuracy": 0.47931033968925474, "step": 1180 }, { "epoch": 0.0011935445856889472, "grad_norm": 81.31925779452442, "learning_rate": 1.193521745261165e-06, "loss": 2.3923, "mean_token_accuracy": 0.43103448748588563, "step": 1185 }, { "epoch": 0.00119858063879312, "grad_norm": 75.45651145037678, "learning_rate": 1.1985577019922246e-06, "loss": 2.0322, "mean_token_accuracy": 0.4862069010734558, "step": 1190 }, { "epoch": 0.0012036166918972927, "grad_norm": 146.180295202791, "learning_rate": 1.2035936587232843e-06, "loss": 2.2846, "mean_token_accuracy": 0.49679802656173705, "step": 1195 }, { "epoch": 0.0012086527450014654, "grad_norm": 90.76549548259376, "learning_rate": 1.2086296154543442e-06, "loss": 2.4331, "mean_token_accuracy": 0.44827585816383364, "step": 1200 }, { "epoch": 0.0012136887981056383, "grad_norm": 70.80672854832602, "learning_rate": 1.2136655721854038e-06, "loss": 2.0853, "mean_token_accuracy": 0.46551724076271056, "step": 1205 }, { "epoch": 0.001218724851209811, "grad_norm": 75.90940406667318, "learning_rate": 1.2187015289164637e-06, "loss": 2.4316, "mean_token_accuracy": 0.41034482717514037, "step": 1210 }, { "epoch": 0.0012237609043139839, "grad_norm": 74.55984293879484, "learning_rate": 1.2237374856475234e-06, "loss": 2.2343, "mean_token_accuracy": 0.4724137902259827, "step": 1215 }, { "epoch": 0.0012287969574181566, "grad_norm": 61.851756927426095, "learning_rate": 1.228773442378583e-06, "loss": 2.1658, "mean_token_accuracy": 0.47241380214691164, "step": 1220 }, { "epoch": 0.0012338330105223293, "grad_norm": 86.12593022767761, "learning_rate": 1.233809399109643e-06, "loss": 2.2726, "mean_token_accuracy": 0.4482758641242981, "step": 1225 }, { "epoch": 0.0012388690636265022, "grad_norm": 80.10808653391787, "learning_rate": 1.2388453558407026e-06, "loss": 2.4066, "mean_token_accuracy": 0.41440886855125425, "step": 1230 }, { "epoch": 0.0012439051167306748, "grad_norm": 100.8825454342411, "learning_rate": 1.2438813125717625e-06, "loss": 2.357, "mean_token_accuracy": 0.4417487621307373, "step": 1235 }, { "epoch": 0.0012489411698348477, "grad_norm": 98.5626113712527, "learning_rate": 1.2489172693028222e-06, "loss": 2.5658, "mean_token_accuracy": 0.42758620977401735, "step": 1240 }, { "epoch": 0.0012539772229390204, "grad_norm": 79.78350555896583, "learning_rate": 1.2539532260338819e-06, "loss": 2.3719, "mean_token_accuracy": 0.42758620977401735, "step": 1245 }, { "epoch": 0.0012590132760431933, "grad_norm": 92.1415609730985, "learning_rate": 1.2589891827649418e-06, "loss": 2.1144, "mean_token_accuracy": 0.4724137902259827, "step": 1250 }, { "epoch": 0.001264049329147366, "grad_norm": 80.28929122349953, "learning_rate": 1.2640251394960014e-06, "loss": 2.1751, "mean_token_accuracy": 0.45517241954803467, "step": 1255 }, { "epoch": 0.0012690853822515387, "grad_norm": 98.53427546574116, "learning_rate": 1.2690610962270613e-06, "loss": 2.4729, "mean_token_accuracy": 0.4344827592372894, "step": 1260 }, { "epoch": 0.0012741214353557116, "grad_norm": 89.22919365213846, "learning_rate": 1.274097052958121e-06, "loss": 2.3517, "mean_token_accuracy": 0.42758620381355283, "step": 1265 }, { "epoch": 0.0012791574884598843, "grad_norm": 66.32414561947962, "learning_rate": 1.2791330096891807e-06, "loss": 2.1336, "mean_token_accuracy": 0.4379310369491577, "step": 1270 }, { "epoch": 0.0012841935415640572, "grad_norm": 79.24123706737915, "learning_rate": 1.2841689664202406e-06, "loss": 2.3614, "mean_token_accuracy": 0.4068965494632721, "step": 1275 }, { "epoch": 0.0012892295946682298, "grad_norm": 91.26346669185378, "learning_rate": 1.2892049231513002e-06, "loss": 2.5666, "mean_token_accuracy": 0.4103448331356049, "step": 1280 }, { "epoch": 0.0012942656477724027, "grad_norm": 123.23330494528199, "learning_rate": 1.2942408798823601e-06, "loss": 2.1011, "mean_token_accuracy": 0.5297640681266784, "step": 1285 }, { "epoch": 0.0012993017008765754, "grad_norm": 57.70042962841069, "learning_rate": 1.2992768366134198e-06, "loss": 2.5687, "mean_token_accuracy": 0.4206896543502808, "step": 1290 }, { "epoch": 0.001304337753980748, "grad_norm": 96.58903314675543, "learning_rate": 1.3043127933444795e-06, "loss": 2.4582, "mean_token_accuracy": 0.43448275327682495, "step": 1295 }, { "epoch": 0.001309373807084921, "grad_norm": 72.08930163850195, "learning_rate": 1.3093487500755394e-06, "loss": 2.2915, "mean_token_accuracy": 0.4689655125141144, "step": 1300 }, { "epoch": 0.0013144098601890937, "grad_norm": 105.52358898170489, "learning_rate": 1.314384706806599e-06, "loss": 2.063, "mean_token_accuracy": 0.5034482777118683, "step": 1305 }, { "epoch": 0.0013194459132932666, "grad_norm": 85.14372468467099, "learning_rate": 1.319420663537659e-06, "loss": 2.3354, "mean_token_accuracy": 0.4172413766384125, "step": 1310 }, { "epoch": 0.0013244819663974393, "grad_norm": 97.53967066326068, "learning_rate": 1.3244566202687186e-06, "loss": 2.4497, "mean_token_accuracy": 0.4413793087005615, "step": 1315 }, { "epoch": 0.001329518019501612, "grad_norm": 86.35446091083718, "learning_rate": 1.3294925769997785e-06, "loss": 2.63, "mean_token_accuracy": 0.3931034505367279, "step": 1320 }, { "epoch": 0.0013345540726057848, "grad_norm": 126.78052960926874, "learning_rate": 1.3345285337308382e-06, "loss": 2.1806, "mean_token_accuracy": 0.4620689570903778, "step": 1325 }, { "epoch": 0.0013395901257099575, "grad_norm": 94.40356981733358, "learning_rate": 1.339564490461898e-06, "loss": 2.175, "mean_token_accuracy": 0.4846340000629425, "step": 1330 }, { "epoch": 0.0013446261788141304, "grad_norm": 80.673031220175, "learning_rate": 1.344600447192958e-06, "loss": 2.3372, "mean_token_accuracy": 0.42758620381355283, "step": 1335 }, { "epoch": 0.001349662231918303, "grad_norm": 75.00444691515132, "learning_rate": 1.3496364039240176e-06, "loss": 2.0388, "mean_token_accuracy": 0.517241382598877, "step": 1340 }, { "epoch": 0.001354698285022476, "grad_norm": 64.32829062489732, "learning_rate": 1.3546723606550773e-06, "loss": 2.2268, "mean_token_accuracy": 0.41724138259887694, "step": 1345 }, { "epoch": 0.0013597343381266487, "grad_norm": 75.68791544527417, "learning_rate": 1.3597083173861372e-06, "loss": 2.0087, "mean_token_accuracy": 0.4620689690113068, "step": 1350 }, { "epoch": 0.0013647703912308214, "grad_norm": 82.44388548224241, "learning_rate": 1.3647442741171969e-06, "loss": 2.1741, "mean_token_accuracy": 0.47586206793785096, "step": 1355 }, { "epoch": 0.0013698064443349943, "grad_norm": 102.22951578421268, "learning_rate": 1.3697802308482568e-06, "loss": 2.2479, "mean_token_accuracy": 0.441379314661026, "step": 1360 }, { "epoch": 0.001374842497439167, "grad_norm": 90.29348760683527, "learning_rate": 1.3748161875793164e-06, "loss": 2.2465, "mean_token_accuracy": 0.4620689630508423, "step": 1365 }, { "epoch": 0.0013798785505433398, "grad_norm": 82.6765889619488, "learning_rate": 1.3798521443103761e-06, "loss": 2.1641, "mean_token_accuracy": 0.45862067937850953, "step": 1370 }, { "epoch": 0.0013849146036475125, "grad_norm": 79.0657462090166, "learning_rate": 1.384888101041436e-06, "loss": 2.502, "mean_token_accuracy": 0.42758620381355283, "step": 1375 }, { "epoch": 0.0013899506567516854, "grad_norm": 72.71052100869652, "learning_rate": 1.3899240577724957e-06, "loss": 2.638, "mean_token_accuracy": 0.39655172228813174, "step": 1380 }, { "epoch": 0.001394986709855858, "grad_norm": 87.28279095020855, "learning_rate": 1.3949600145035556e-06, "loss": 2.3732, "mean_token_accuracy": 0.41724138259887694, "step": 1385 }, { "epoch": 0.0014000227629600308, "grad_norm": 108.05874347851243, "learning_rate": 1.3999959712346152e-06, "loss": 2.283, "mean_token_accuracy": 0.44482759237289426, "step": 1390 }, { "epoch": 0.0014050588160642037, "grad_norm": 91.61751858243969, "learning_rate": 1.405031927965675e-06, "loss": 2.5354, "mean_token_accuracy": 0.38620689511299133, "step": 1395 }, { "epoch": 0.0014100948691683764, "grad_norm": 81.13759435226928, "learning_rate": 1.4100678846967348e-06, "loss": 2.1121, "mean_token_accuracy": 0.47586206793785096, "step": 1400 }, { "epoch": 0.0014151309222725492, "grad_norm": 67.64855921366826, "learning_rate": 1.4151038414277945e-06, "loss": 2.1707, "mean_token_accuracy": 0.4586206912994385, "step": 1405 }, { "epoch": 0.001420166975376722, "grad_norm": 92.82286441771318, "learning_rate": 1.4201397981588544e-06, "loss": 2.2161, "mean_token_accuracy": 0.39655172228813174, "step": 1410 }, { "epoch": 0.0014252030284808948, "grad_norm": 80.09402502047881, "learning_rate": 1.425175754889914e-06, "loss": 2.3469, "mean_token_accuracy": 0.417241370677948, "step": 1415 }, { "epoch": 0.0014302390815850675, "grad_norm": 96.60022128767807, "learning_rate": 1.4302117116209737e-06, "loss": 2.1765, "mean_token_accuracy": 0.4620689690113068, "step": 1420 }, { "epoch": 0.0014352751346892402, "grad_norm": 92.51314836577201, "learning_rate": 1.4352476683520336e-06, "loss": 2.3332, "mean_token_accuracy": 0.4344827651977539, "step": 1425 }, { "epoch": 0.001440311187793413, "grad_norm": 76.5267903608808, "learning_rate": 1.4402836250830933e-06, "loss": 2.454, "mean_token_accuracy": 0.40816696882247927, "step": 1430 }, { "epoch": 0.0014453472408975858, "grad_norm": 112.99849516326708, "learning_rate": 1.4453195818141532e-06, "loss": 2.6772, "mean_token_accuracy": 0.4, "step": 1435 }, { "epoch": 0.0014503832940017587, "grad_norm": 78.57110578607713, "learning_rate": 1.4503555385452129e-06, "loss": 2.3715, "mean_token_accuracy": 0.4586206912994385, "step": 1440 }, { "epoch": 0.0014554193471059314, "grad_norm": 92.1189167634099, "learning_rate": 1.4553914952762727e-06, "loss": 2.3398, "mean_token_accuracy": 0.441379314661026, "step": 1445 }, { "epoch": 0.001460455400210104, "grad_norm": 85.60064817375593, "learning_rate": 1.4604274520073324e-06, "loss": 2.0117, "mean_token_accuracy": 0.5172413766384125, "step": 1450 }, { "epoch": 0.001465491453314277, "grad_norm": 116.31359990676597, "learning_rate": 1.465463408738392e-06, "loss": 2.3036, "mean_token_accuracy": 0.4103448212146759, "step": 1455 }, { "epoch": 0.0014705275064184496, "grad_norm": 106.96774675136832, "learning_rate": 1.470499365469452e-06, "loss": 2.4191, "mean_token_accuracy": 0.3896551728248596, "step": 1460 }, { "epoch": 0.0014755635595226225, "grad_norm": 89.51959518282786, "learning_rate": 1.4755353222005117e-06, "loss": 2.3593, "mean_token_accuracy": 0.4517241358757019, "step": 1465 }, { "epoch": 0.0014805996126267952, "grad_norm": 82.50721156898831, "learning_rate": 1.4805712789315716e-06, "loss": 2.279, "mean_token_accuracy": 0.4655172288417816, "step": 1470 }, { "epoch": 0.001485635665730968, "grad_norm": 90.02670440792986, "learning_rate": 1.4856072356626312e-06, "loss": 2.0858, "mean_token_accuracy": 0.5021778523921967, "step": 1475 }, { "epoch": 0.0014906717188351408, "grad_norm": 83.33805010048364, "learning_rate": 1.490643192393691e-06, "loss": 2.4329, "mean_token_accuracy": 0.4620689690113068, "step": 1480 }, { "epoch": 0.0014957077719393135, "grad_norm": 97.84951238368177, "learning_rate": 1.4956791491247508e-06, "loss": 2.1651, "mean_token_accuracy": 0.4620689690113068, "step": 1485 }, { "epoch": 0.0015007438250434863, "grad_norm": 114.56671182964665, "learning_rate": 1.5007151058558105e-06, "loss": 2.4274, "mean_token_accuracy": 0.4655172288417816, "step": 1490 }, { "epoch": 0.001505779878147659, "grad_norm": 92.29710095724172, "learning_rate": 1.5057510625868704e-06, "loss": 2.0575, "mean_token_accuracy": 0.4689655125141144, "step": 1495 }, { "epoch": 0.001510815931251832, "grad_norm": 81.63875752328688, "learning_rate": 1.51078701931793e-06, "loss": 2.228, "mean_token_accuracy": 0.4655172348022461, "step": 1500 }, { "epoch": 0.0015158519843560046, "grad_norm": 95.38223895272955, "learning_rate": 1.5158229760489897e-06, "loss": 2.5547, "mean_token_accuracy": 0.417241370677948, "step": 1505 }, { "epoch": 0.0015208880374601775, "grad_norm": 86.54306236708071, "learning_rate": 1.5208589327800496e-06, "loss": 2.2331, "mean_token_accuracy": 0.4689655125141144, "step": 1510 }, { "epoch": 0.0015259240905643502, "grad_norm": 105.14984086161786, "learning_rate": 1.5258948895111093e-06, "loss": 2.3451, "mean_token_accuracy": 0.4551724076271057, "step": 1515 }, { "epoch": 0.0015309601436685229, "grad_norm": 69.71878729515852, "learning_rate": 1.5309308462421692e-06, "loss": 2.0062, "mean_token_accuracy": 0.5, "step": 1520 }, { "epoch": 0.0015359961967726958, "grad_norm": 72.75790316207774, "learning_rate": 1.5359668029732288e-06, "loss": 2.2077, "mean_token_accuracy": 0.4517241358757019, "step": 1525 }, { "epoch": 0.0015410322498768685, "grad_norm": 105.32959226149646, "learning_rate": 1.5410027597042885e-06, "loss": 2.3033, "mean_token_accuracy": 0.4517241299152374, "step": 1530 }, { "epoch": 0.0015460683029810413, "grad_norm": 81.31609279829296, "learning_rate": 1.5460387164353486e-06, "loss": 2.4319, "mean_token_accuracy": 0.46551724076271056, "step": 1535 }, { "epoch": 0.001551104356085214, "grad_norm": 120.60692135671961, "learning_rate": 1.5510746731664083e-06, "loss": 2.1975, "mean_token_accuracy": 0.5068965554237366, "step": 1540 }, { "epoch": 0.001556140409189387, "grad_norm": 66.07699669915323, "learning_rate": 1.556110629897468e-06, "loss": 2.6908, "mean_token_accuracy": 0.4068965554237366, "step": 1545 }, { "epoch": 0.0015611764622935596, "grad_norm": 94.284505799871, "learning_rate": 1.5611465866285279e-06, "loss": 2.2898, "mean_token_accuracy": 0.4482758641242981, "step": 1550 }, { "epoch": 0.0015662125153977323, "grad_norm": 101.87430211498186, "learning_rate": 1.5661825433595875e-06, "loss": 2.5544, "mean_token_accuracy": 0.4068965494632721, "step": 1555 }, { "epoch": 0.0015712485685019052, "grad_norm": 64.9453871123965, "learning_rate": 1.5712185000906474e-06, "loss": 2.1962, "mean_token_accuracy": 0.506896561384201, "step": 1560 }, { "epoch": 0.0015762846216060779, "grad_norm": 102.87728343758079, "learning_rate": 1.576254456821707e-06, "loss": 2.2039, "mean_token_accuracy": 0.482758617401123, "step": 1565 }, { "epoch": 0.0015813206747102508, "grad_norm": 57.98800839039422, "learning_rate": 1.581290413552767e-06, "loss": 2.187, "mean_token_accuracy": 0.45862067937850953, "step": 1570 }, { "epoch": 0.0015863567278144235, "grad_norm": 83.2383170154284, "learning_rate": 1.5863263702838267e-06, "loss": 2.505, "mean_token_accuracy": 0.4310344815254211, "step": 1575 }, { "epoch": 0.0015913927809185961, "grad_norm": 81.55813491742742, "learning_rate": 1.5913623270148863e-06, "loss": 2.3187, "mean_token_accuracy": 0.4482758641242981, "step": 1580 }, { "epoch": 0.001596428834022769, "grad_norm": 79.37737258693716, "learning_rate": 1.5963982837459462e-06, "loss": 2.1579, "mean_token_accuracy": 0.47586206197738645, "step": 1585 }, { "epoch": 0.0016014648871269417, "grad_norm": 65.85649458113443, "learning_rate": 1.601434240477006e-06, "loss": 2.4545, "mean_token_accuracy": 0.4462794899940491, "step": 1590 }, { "epoch": 0.0016065009402311146, "grad_norm": 86.55837076754545, "learning_rate": 1.6064701972080658e-06, "loss": 2.2032, "mean_token_accuracy": 0.5206896483898162, "step": 1595 }, { "epoch": 0.0016115369933352873, "grad_norm": 67.68460300678399, "learning_rate": 1.6115061539391255e-06, "loss": 2.3538, "mean_token_accuracy": 0.43103447556495667, "step": 1600 }, { "epoch": 0.0016165730464394602, "grad_norm": 97.73554671726215, "learning_rate": 1.6165421106701851e-06, "loss": 2.3681, "mean_token_accuracy": 0.41379310488700866, "step": 1605 }, { "epoch": 0.0016216090995436329, "grad_norm": 89.9015968600415, "learning_rate": 1.621578067401245e-06, "loss": 2.1631, "mean_token_accuracy": 0.4931034505367279, "step": 1610 }, { "epoch": 0.0016266451526478056, "grad_norm": 79.46924592264445, "learning_rate": 1.6266140241323047e-06, "loss": 2.3175, "mean_token_accuracy": 0.43103448748588563, "step": 1615 }, { "epoch": 0.0016316812057519784, "grad_norm": 73.13835634105239, "learning_rate": 1.6316499808633646e-06, "loss": 2.4491, "mean_token_accuracy": 0.47241380214691164, "step": 1620 }, { "epoch": 0.0016367172588561511, "grad_norm": 75.7165498923317, "learning_rate": 1.6366859375944243e-06, "loss": 2.2476, "mean_token_accuracy": 0.47241378426551817, "step": 1625 }, { "epoch": 0.001641753311960324, "grad_norm": 99.96607029375616, "learning_rate": 1.641721894325484e-06, "loss": 2.0968, "mean_token_accuracy": 0.5034482717514038, "step": 1630 }, { "epoch": 0.0016467893650644967, "grad_norm": 79.96250185977877, "learning_rate": 1.6467578510565438e-06, "loss": 2.3436, "mean_token_accuracy": 0.4344827473163605, "step": 1635 }, { "epoch": 0.0016518254181686696, "grad_norm": 95.68949071193957, "learning_rate": 1.6517938077876035e-06, "loss": 2.1151, "mean_token_accuracy": 0.4551724135875702, "step": 1640 }, { "epoch": 0.0016568614712728423, "grad_norm": 95.36760843473598, "learning_rate": 1.6568297645186634e-06, "loss": 2.5977, "mean_token_accuracy": 0.4172413766384125, "step": 1645 }, { "epoch": 0.001661897524377015, "grad_norm": 86.1813138303892, "learning_rate": 1.661865721249723e-06, "loss": 2.0947, "mean_token_accuracy": 0.4947973370552063, "step": 1650 }, { "epoch": 0.0016669335774811879, "grad_norm": 97.68301957398029, "learning_rate": 1.6669016779807828e-06, "loss": 2.3633, "mean_token_accuracy": 0.4258318305015564, "step": 1655 }, { "epoch": 0.0016719696305853606, "grad_norm": 90.99186538858493, "learning_rate": 1.6719376347118426e-06, "loss": 2.5167, "mean_token_accuracy": 0.38620689511299133, "step": 1660 }, { "epoch": 0.0016770056836895334, "grad_norm": 80.15738544479304, "learning_rate": 1.6769735914429023e-06, "loss": 2.1744, "mean_token_accuracy": 0.46551724076271056, "step": 1665 }, { "epoch": 0.0016820417367937061, "grad_norm": 62.455297579323094, "learning_rate": 1.6820095481739622e-06, "loss": 1.7986, "mean_token_accuracy": 0.5172413766384125, "step": 1670 }, { "epoch": 0.0016870777898978788, "grad_norm": 93.5258823881037, "learning_rate": 1.6870455049050219e-06, "loss": 2.2099, "mean_token_accuracy": 0.4620689630508423, "step": 1675 }, { "epoch": 0.0016921138430020517, "grad_norm": 81.85947049962068, "learning_rate": 1.6920814616360816e-06, "loss": 2.2992, "mean_token_accuracy": 0.4344827592372894, "step": 1680 }, { "epoch": 0.0016971498961062244, "grad_norm": 83.11223935967024, "learning_rate": 1.6971174183671415e-06, "loss": 2.2199, "mean_token_accuracy": 0.4517241418361664, "step": 1685 }, { "epoch": 0.0017021859492103973, "grad_norm": 72.37325634818649, "learning_rate": 1.7021533750982011e-06, "loss": 1.9433, "mean_token_accuracy": 0.517241370677948, "step": 1690 }, { "epoch": 0.00170722200231457, "grad_norm": 62.67619225503342, "learning_rate": 1.707189331829261e-06, "loss": 2.4231, "mean_token_accuracy": 0.41881426572799685, "step": 1695 }, { "epoch": 0.0017122580554187429, "grad_norm": 76.68126850923235, "learning_rate": 1.7122252885603207e-06, "loss": 2.2686, "mean_token_accuracy": 0.46206897497177124, "step": 1700 }, { "epoch": 0.0017172941085229155, "grad_norm": 117.7889306707762, "learning_rate": 1.7172612452913806e-06, "loss": 2.3816, "mean_token_accuracy": 0.4517241358757019, "step": 1705 }, { "epoch": 0.0017223301616270882, "grad_norm": 84.14255536308399, "learning_rate": 1.7222972020224403e-06, "loss": 2.6233, "mean_token_accuracy": 0.4241379380226135, "step": 1710 }, { "epoch": 0.0017273662147312611, "grad_norm": 69.09697402332309, "learning_rate": 1.7273331587535e-06, "loss": 2.2852, "mean_token_accuracy": 0.4620689630508423, "step": 1715 }, { "epoch": 0.0017324022678354338, "grad_norm": 78.92314362178588, "learning_rate": 1.7323691154845598e-06, "loss": 2.3568, "mean_token_accuracy": 0.4551724076271057, "step": 1720 }, { "epoch": 0.0017374383209396067, "grad_norm": 139.17619606608554, "learning_rate": 1.7374050722156195e-06, "loss": 2.4275, "mean_token_accuracy": 0.4620689690113068, "step": 1725 }, { "epoch": 0.0017424743740437794, "grad_norm": 90.20116612955967, "learning_rate": 1.7424410289466794e-06, "loss": 2.1705, "mean_token_accuracy": 0.4896551728248596, "step": 1730 }, { "epoch": 0.0017475104271479523, "grad_norm": 99.60120723489602, "learning_rate": 1.747476985677739e-06, "loss": 2.6302, "mean_token_accuracy": 0.4413793087005615, "step": 1735 }, { "epoch": 0.001752546480252125, "grad_norm": 77.20807801933192, "learning_rate": 1.7525129424087987e-06, "loss": 1.9387, "mean_token_accuracy": 0.5103448331356049, "step": 1740 }, { "epoch": 0.0017575825333562977, "grad_norm": 76.41242622887216, "learning_rate": 1.7575488991398586e-06, "loss": 2.2007, "mean_token_accuracy": 0.4482758641242981, "step": 1745 }, { "epoch": 0.0017626185864604705, "grad_norm": 80.8139352793975, "learning_rate": 1.7625848558709183e-06, "loss": 2.1226, "mean_token_accuracy": 0.44827587008476255, "step": 1750 }, { "epoch": 0.0017676546395646432, "grad_norm": 76.4179824339492, "learning_rate": 1.7676208126019782e-06, "loss": 2.0023, "mean_token_accuracy": 0.5344827592372894, "step": 1755 }, { "epoch": 0.0017726906926688161, "grad_norm": 95.62054558276478, "learning_rate": 1.7726567693330379e-06, "loss": 1.972, "mean_token_accuracy": 0.5056261241436004, "step": 1760 }, { "epoch": 0.0017777267457729888, "grad_norm": 72.46220370172047, "learning_rate": 1.7776927260640975e-06, "loss": 2.3215, "mean_token_accuracy": 0.4325468838214874, "step": 1765 }, { "epoch": 0.0017827627988771617, "grad_norm": 128.46683836513, "learning_rate": 1.7827286827951574e-06, "loss": 2.3107, "mean_token_accuracy": 0.4517241299152374, "step": 1770 }, { "epoch": 0.0017877988519813344, "grad_norm": 112.44825568435205, "learning_rate": 1.7877646395262171e-06, "loss": 1.9738, "mean_token_accuracy": 0.5294010818004609, "step": 1775 }, { "epoch": 0.001792834905085507, "grad_norm": 105.88635733073585, "learning_rate": 1.792800596257277e-06, "loss": 2.4159, "mean_token_accuracy": 0.42758620977401735, "step": 1780 }, { "epoch": 0.00179787095818968, "grad_norm": 76.56275729629095, "learning_rate": 1.7978365529883367e-06, "loss": 2.1934, "mean_token_accuracy": 0.4517241358757019, "step": 1785 }, { "epoch": 0.0018029070112938526, "grad_norm": 96.91990849635508, "learning_rate": 1.8028725097193964e-06, "loss": 2.3044, "mean_token_accuracy": 0.47931034564971925, "step": 1790 }, { "epoch": 0.0018079430643980255, "grad_norm": 89.37010754407544, "learning_rate": 1.8079084664504562e-06, "loss": 2.1361, "mean_token_accuracy": 0.4965517222881317, "step": 1795 }, { "epoch": 0.0018129791175021982, "grad_norm": 73.72513138969829, "learning_rate": 1.812944423181516e-06, "loss": 2.2718, "mean_token_accuracy": 0.4137930989265442, "step": 1800 }, { "epoch": 0.001818015170606371, "grad_norm": 92.87287243238539, "learning_rate": 1.8179803799125758e-06, "loss": 2.5586, "mean_token_accuracy": 0.4310344815254211, "step": 1805 }, { "epoch": 0.0018230512237105438, "grad_norm": 82.8126099115831, "learning_rate": 1.8230163366436355e-06, "loss": 1.9987, "mean_token_accuracy": 0.47398667931556704, "step": 1810 }, { "epoch": 0.0018280872768147165, "grad_norm": 74.56083754345453, "learning_rate": 1.8280522933746952e-06, "loss": 2.2487, "mean_token_accuracy": 0.4551724076271057, "step": 1815 }, { "epoch": 0.0018331233299188894, "grad_norm": 77.35069789149611, "learning_rate": 1.833088250105755e-06, "loss": 2.3571, "mean_token_accuracy": 0.41379311084747317, "step": 1820 }, { "epoch": 0.001838159383023062, "grad_norm": 74.50495153809906, "learning_rate": 1.8381242068368147e-06, "loss": 1.8451, "mean_token_accuracy": 0.5778325200080872, "step": 1825 }, { "epoch": 0.001843195436127235, "grad_norm": 65.97754985371876, "learning_rate": 1.8431601635678746e-06, "loss": 2.227, "mean_token_accuracy": 0.47586206793785096, "step": 1830 }, { "epoch": 0.0018482314892314076, "grad_norm": 90.9212629625924, "learning_rate": 1.8481961202989343e-06, "loss": 1.9323, "mean_token_accuracy": 0.5206896543502808, "step": 1835 }, { "epoch": 0.0018532675423355803, "grad_norm": 73.71458988607003, "learning_rate": 1.8532320770299942e-06, "loss": 2.3401, "mean_token_accuracy": 0.5034482777118683, "step": 1840 }, { "epoch": 0.0018583035954397532, "grad_norm": 67.14808135899057, "learning_rate": 1.8582680337610543e-06, "loss": 2.4888, "mean_token_accuracy": 0.4310344815254211, "step": 1845 }, { "epoch": 0.001863339648543926, "grad_norm": 109.35527297957105, "learning_rate": 1.863303990492114e-06, "loss": 2.3476, "mean_token_accuracy": 0.46551724076271056, "step": 1850 }, { "epoch": 0.0018683757016480988, "grad_norm": 104.65003734665592, "learning_rate": 1.8683399472231736e-06, "loss": 2.1779, "mean_token_accuracy": 0.4620689630508423, "step": 1855 }, { "epoch": 0.0018734117547522715, "grad_norm": 63.45899564844917, "learning_rate": 1.8733759039542335e-06, "loss": 2.1909, "mean_token_accuracy": 0.46896551847457885, "step": 1860 }, { "epoch": 0.0018784478078564444, "grad_norm": 98.1692066952957, "learning_rate": 1.8784118606852932e-06, "loss": 2.1605, "mean_token_accuracy": 0.49655172824859617, "step": 1865 }, { "epoch": 0.001883483860960617, "grad_norm": 72.12043589392583, "learning_rate": 1.883447817416353e-06, "loss": 1.9092, "mean_token_accuracy": 0.5310344755649566, "step": 1870 }, { "epoch": 0.0018885199140647897, "grad_norm": 63.38032620835685, "learning_rate": 1.8884837741474128e-06, "loss": 2.307, "mean_token_accuracy": 0.42413792610168455, "step": 1875 }, { "epoch": 0.0018935559671689626, "grad_norm": 97.51558259527754, "learning_rate": 1.8935197308784724e-06, "loss": 2.2036, "mean_token_accuracy": 0.43793103098869324, "step": 1880 }, { "epoch": 0.0018985920202731353, "grad_norm": 81.28538443821304, "learning_rate": 1.8985556876095323e-06, "loss": 2.4599, "mean_token_accuracy": 0.42758620977401735, "step": 1885 }, { "epoch": 0.0019036280733773082, "grad_norm": 93.11508609155473, "learning_rate": 1.903591644340592e-06, "loss": 2.2178, "mean_token_accuracy": 0.45862067937850953, "step": 1890 }, { "epoch": 0.001908664126481481, "grad_norm": 94.4600094375921, "learning_rate": 1.9086276010716517e-06, "loss": 2.3936, "mean_token_accuracy": 0.4172413766384125, "step": 1895 }, { "epoch": 0.0019137001795856536, "grad_norm": 107.17710083956324, "learning_rate": 1.9136635578027118e-06, "loss": 2.0258, "mean_token_accuracy": 0.5034482836723327, "step": 1900 }, { "epoch": 0.0019187362326898265, "grad_norm": 117.47541846574994, "learning_rate": 1.9186995145337714e-06, "loss": 2.5823, "mean_token_accuracy": 0.42413792610168455, "step": 1905 }, { "epoch": 0.0019237722857939992, "grad_norm": 66.79156739266057, "learning_rate": 1.923735471264831e-06, "loss": 2.2979, "mean_token_accuracy": 0.4517241418361664, "step": 1910 }, { "epoch": 0.001928808338898172, "grad_norm": 78.40004200556646, "learning_rate": 1.928771427995891e-06, "loss": 2.1359, "mean_token_accuracy": 0.4931034564971924, "step": 1915 }, { "epoch": 0.0019338443920023447, "grad_norm": 79.92592154394615, "learning_rate": 1.9338073847269505e-06, "loss": 1.9515, "mean_token_accuracy": 0.5430732011795044, "step": 1920 }, { "epoch": 0.0019388804451065176, "grad_norm": 79.20333698917693, "learning_rate": 1.9388433414580106e-06, "loss": 2.5004, "mean_token_accuracy": 0.37241379022598264, "step": 1925 }, { "epoch": 0.0019439164982106903, "grad_norm": 99.99097699670727, "learning_rate": 1.9438792981890703e-06, "loss": 2.4431, "mean_token_accuracy": 0.37586206793785093, "step": 1930 }, { "epoch": 0.001948952551314863, "grad_norm": 89.71904440600169, "learning_rate": 1.94891525492013e-06, "loss": 2.0776, "mean_token_accuracy": 0.47241379618644713, "step": 1935 }, { "epoch": 0.001953988604419036, "grad_norm": 75.47698662083619, "learning_rate": 1.9539512116511896e-06, "loss": 2.426, "mean_token_accuracy": 0.45517241954803467, "step": 1940 }, { "epoch": 0.001959024657523209, "grad_norm": 70.85028722040185, "learning_rate": 1.9589871683822493e-06, "loss": 2.4601, "mean_token_accuracy": 0.4379310429096222, "step": 1945 }, { "epoch": 0.0019640607106273813, "grad_norm": 93.65941348272948, "learning_rate": 1.9640231251133094e-06, "loss": 2.3995, "mean_token_accuracy": 0.4517241418361664, "step": 1950 }, { "epoch": 0.001969096763731554, "grad_norm": 80.73007515789497, "learning_rate": 1.969059081844369e-06, "loss": 2.4547, "mean_token_accuracy": 0.43103447556495667, "step": 1955 }, { "epoch": 0.001974132816835727, "grad_norm": 75.54402884891343, "learning_rate": 1.9740950385754287e-06, "loss": 2.1322, "mean_token_accuracy": 0.44137930274009707, "step": 1960 }, { "epoch": 0.0019791688699398995, "grad_norm": 112.67510206807452, "learning_rate": 1.9791309953064884e-06, "loss": 1.8569, "mean_token_accuracy": 0.5344827592372894, "step": 1965 }, { "epoch": 0.0019842049230440724, "grad_norm": 91.16433235557035, "learning_rate": 1.984166952037548e-06, "loss": 2.4891, "mean_token_accuracy": 0.4172413766384125, "step": 1970 }, { "epoch": 0.0019892409761482453, "grad_norm": 83.11854793895924, "learning_rate": 1.989202908768608e-06, "loss": 2.2815, "mean_token_accuracy": 0.4620689690113068, "step": 1975 }, { "epoch": 0.0019942770292524182, "grad_norm": 92.17564771902191, "learning_rate": 1.994238865499668e-06, "loss": 2.2691, "mean_token_accuracy": 0.4586206912994385, "step": 1980 }, { "epoch": 0.0019993130823565907, "grad_norm": 82.1739264613652, "learning_rate": 1.9992748222307275e-06, "loss": 2.4372, "mean_token_accuracy": 0.40344828069210054, "step": 1985 }, { "epoch": 0.0020043491354607636, "grad_norm": 77.13165118044004, "learning_rate": 2.0043107789617872e-06, "loss": 2.0082, "mean_token_accuracy": 0.5137930989265442, "step": 1990 }, { "epoch": 0.0020093851885649365, "grad_norm": 84.79058491562802, "learning_rate": 2.009346735692847e-06, "loss": 2.3669, "mean_token_accuracy": 0.45517241954803467, "step": 1995 }, { "epoch": 0.002014421241669109, "grad_norm": 80.37046342006643, "learning_rate": 2.014382692423907e-06, "loss": 1.8865, "mean_token_accuracy": 0.5241379320621491, "step": 2000 }, { "epoch": 0.002019457294773282, "grad_norm": 81.49479556557111, "learning_rate": 2.0194186491549667e-06, "loss": 2.4085, "mean_token_accuracy": 0.493103438615799, "step": 2005 }, { "epoch": 0.0020244933478774547, "grad_norm": 56.22427568650456, "learning_rate": 2.0244546058860263e-06, "loss": 2.5069, "mean_token_accuracy": 0.4034482777118683, "step": 2010 }, { "epoch": 0.0020295294009816276, "grad_norm": 94.38835039101788, "learning_rate": 2.029490562617086e-06, "loss": 2.2433, "mean_token_accuracy": 0.43103447556495667, "step": 2015 }, { "epoch": 0.0020345654540858, "grad_norm": 85.49904954261659, "learning_rate": 2.0345265193481457e-06, "loss": 2.1228, "mean_token_accuracy": 0.441379314661026, "step": 2020 }, { "epoch": 0.002039601507189973, "grad_norm": 90.15610942839515, "learning_rate": 2.039562476079206e-06, "loss": 2.217, "mean_token_accuracy": 0.46896551847457885, "step": 2025 }, { "epoch": 0.002044637560294146, "grad_norm": 66.39729329968833, "learning_rate": 2.0445984328102655e-06, "loss": 2.2885, "mean_token_accuracy": 0.4517241358757019, "step": 2030 }, { "epoch": 0.0020496736133983184, "grad_norm": 65.84731199639643, "learning_rate": 2.049634389541325e-06, "loss": 2.1231, "mean_token_accuracy": 0.47241378426551817, "step": 2035 }, { "epoch": 0.0020547096665024913, "grad_norm": 130.13767097405986, "learning_rate": 2.054670346272385e-06, "loss": 2.0289, "mean_token_accuracy": 0.5103448152542114, "step": 2040 }, { "epoch": 0.002059745719606664, "grad_norm": 54.395648527616, "learning_rate": 2.0597063030034445e-06, "loss": 2.2106, "mean_token_accuracy": 0.5137931048870087, "step": 2045 }, { "epoch": 0.002064781772710837, "grad_norm": 134.39841460779195, "learning_rate": 2.0647422597345046e-06, "loss": 2.413, "mean_token_accuracy": 0.44313369393348695, "step": 2050 }, { "epoch": 0.0020698178258150095, "grad_norm": 120.07876619471952, "learning_rate": 2.0697782164655643e-06, "loss": 2.5518, "mean_token_accuracy": 0.44482758045196535, "step": 2055 }, { "epoch": 0.0020748538789191824, "grad_norm": 72.39383002650447, "learning_rate": 2.074814173196624e-06, "loss": 2.2809, "mean_token_accuracy": 0.47931034564971925, "step": 2060 }, { "epoch": 0.0020798899320233553, "grad_norm": 97.62233220271177, "learning_rate": 2.0798501299276836e-06, "loss": 2.2048, "mean_token_accuracy": 0.43103448748588563, "step": 2065 }, { "epoch": 0.002084925985127528, "grad_norm": 94.22741555340882, "learning_rate": 2.0848860866587433e-06, "loss": 2.6581, "mean_token_accuracy": 0.3773139715194702, "step": 2070 }, { "epoch": 0.0020899620382317007, "grad_norm": 72.49275371459426, "learning_rate": 2.0899220433898034e-06, "loss": 2.3135, "mean_token_accuracy": 0.4517241358757019, "step": 2075 }, { "epoch": 0.0020949980913358736, "grad_norm": 92.5064895978771, "learning_rate": 2.094958000120863e-06, "loss": 1.944, "mean_token_accuracy": 0.5214285731315613, "step": 2080 }, { "epoch": 0.0021000341444400465, "grad_norm": 64.00100532892989, "learning_rate": 2.0999939568519228e-06, "loss": 2.033, "mean_token_accuracy": 0.5137931048870087, "step": 2085 }, { "epoch": 0.002105070197544219, "grad_norm": 87.93719120962002, "learning_rate": 2.1050299135829824e-06, "loss": 2.0544, "mean_token_accuracy": 0.5052631616592407, "step": 2090 }, { "epoch": 0.002110106250648392, "grad_norm": 73.45534817073921, "learning_rate": 2.110065870314042e-06, "loss": 2.2357, "mean_token_accuracy": 0.46896551847457885, "step": 2095 }, { "epoch": 0.0021151423037525647, "grad_norm": 94.14541146787158, "learning_rate": 2.1151018270451022e-06, "loss": 2.5262, "mean_token_accuracy": 0.42758620381355283, "step": 2100 }, { "epoch": 0.002120178356856737, "grad_norm": 56.35782310806002, "learning_rate": 2.120137783776162e-06, "loss": 2.2755, "mean_token_accuracy": 0.42758620977401735, "step": 2105 }, { "epoch": 0.00212521440996091, "grad_norm": 93.10910782240434, "learning_rate": 2.1251737405072216e-06, "loss": 2.3457, "mean_token_accuracy": 0.46896551847457885, "step": 2110 }, { "epoch": 0.002130250463065083, "grad_norm": 89.00218583542788, "learning_rate": 2.1302096972382812e-06, "loss": 2.0866, "mean_token_accuracy": 0.47241379618644713, "step": 2115 }, { "epoch": 0.002135286516169256, "grad_norm": 104.72491235268645, "learning_rate": 2.135245653969341e-06, "loss": 2.4817, "mean_token_accuracy": 0.44137930274009707, "step": 2120 }, { "epoch": 0.0021403225692734284, "grad_norm": 64.98428934631258, "learning_rate": 2.140281610700401e-06, "loss": 2.343, "mean_token_accuracy": 0.4241379380226135, "step": 2125 }, { "epoch": 0.0021453586223776013, "grad_norm": 73.36348079388735, "learning_rate": 2.1453175674314607e-06, "loss": 2.1639, "mean_token_accuracy": 0.482758617401123, "step": 2130 }, { "epoch": 0.002150394675481774, "grad_norm": 107.03010254220776, "learning_rate": 2.1503535241625204e-06, "loss": 2.5505, "mean_token_accuracy": 0.4586206912994385, "step": 2135 }, { "epoch": 0.0021554307285859466, "grad_norm": 66.03717059466969, "learning_rate": 2.15538948089358e-06, "loss": 2.0671, "mean_token_accuracy": 0.46551724076271056, "step": 2140 }, { "epoch": 0.0021604667816901195, "grad_norm": 79.19439165546005, "learning_rate": 2.16042543762464e-06, "loss": 2.1992, "mean_token_accuracy": 0.46896552443504336, "step": 2145 }, { "epoch": 0.0021655028347942924, "grad_norm": 82.24561213588349, "learning_rate": 2.1654613943557e-06, "loss": 2.1769, "mean_token_accuracy": 0.4620689630508423, "step": 2150 }, { "epoch": 0.0021705388878984653, "grad_norm": 76.36154367829377, "learning_rate": 2.1704973510867595e-06, "loss": 2.3223, "mean_token_accuracy": 0.44652147889137267, "step": 2155 }, { "epoch": 0.002175574941002638, "grad_norm": 108.17682750951424, "learning_rate": 2.175533307817819e-06, "loss": 2.4235, "mean_token_accuracy": 0.4551724076271057, "step": 2160 }, { "epoch": 0.0021806109941068107, "grad_norm": 77.20581243724298, "learning_rate": 2.180569264548879e-06, "loss": 2.1487, "mean_token_accuracy": 0.43448275327682495, "step": 2165 }, { "epoch": 0.0021856470472109836, "grad_norm": 108.63764512169979, "learning_rate": 2.185605221279939e-06, "loss": 2.113, "mean_token_accuracy": 0.47586206793785096, "step": 2170 }, { "epoch": 0.002190683100315156, "grad_norm": 127.56322936980933, "learning_rate": 2.1906411780109986e-06, "loss": 2.0053, "mean_token_accuracy": 0.49655172824859617, "step": 2175 }, { "epoch": 0.002195719153419329, "grad_norm": 55.45017324660117, "learning_rate": 2.1956771347420583e-06, "loss": 2.3794, "mean_token_accuracy": 0.42068964838981626, "step": 2180 }, { "epoch": 0.002200755206523502, "grad_norm": 85.81602701764474, "learning_rate": 2.200713091473118e-06, "loss": 2.4024, "mean_token_accuracy": 0.37586206793785093, "step": 2185 }, { "epoch": 0.0022057912596276743, "grad_norm": 88.05309187658263, "learning_rate": 2.2057490482041777e-06, "loss": 2.1531, "mean_token_accuracy": 0.45862067937850953, "step": 2190 }, { "epoch": 0.002210827312731847, "grad_norm": 87.3300889846787, "learning_rate": 2.2107850049352378e-06, "loss": 2.2589, "mean_token_accuracy": 0.4517241358757019, "step": 2195 }, { "epoch": 0.00221586336583602, "grad_norm": 80.27639809690658, "learning_rate": 2.2158209616662974e-06, "loss": 2.5973, "mean_token_accuracy": 0.3965517282485962, "step": 2200 }, { "epoch": 0.002220899418940193, "grad_norm": 78.04712385778724, "learning_rate": 2.220856918397357e-06, "loss": 2.4277, "mean_token_accuracy": 0.46896552443504336, "step": 2205 }, { "epoch": 0.0022259354720443655, "grad_norm": 92.64826813063166, "learning_rate": 2.225892875128417e-06, "loss": 2.1198, "mean_token_accuracy": 0.4896551728248596, "step": 2210 }, { "epoch": 0.0022309715251485384, "grad_norm": 89.20145562977099, "learning_rate": 2.2309288318594765e-06, "loss": 1.9321, "mean_token_accuracy": 0.4896551728248596, "step": 2215 }, { "epoch": 0.0022360075782527113, "grad_norm": 75.11550738769606, "learning_rate": 2.2359647885905366e-06, "loss": 2.4328, "mean_token_accuracy": 0.4379310250282288, "step": 2220 }, { "epoch": 0.0022410436313568837, "grad_norm": 83.96441877070305, "learning_rate": 2.2410007453215962e-06, "loss": 2.5839, "mean_token_accuracy": 0.37931033968925476, "step": 2225 }, { "epoch": 0.0022460796844610566, "grad_norm": 95.40399944514314, "learning_rate": 2.246036702052656e-06, "loss": 2.1122, "mean_token_accuracy": 0.482758617401123, "step": 2230 }, { "epoch": 0.0022511157375652295, "grad_norm": 74.50641584187392, "learning_rate": 2.2510726587837156e-06, "loss": 1.904, "mean_token_accuracy": 0.517241370677948, "step": 2235 }, { "epoch": 0.0022561517906694024, "grad_norm": 67.31716477555766, "learning_rate": 2.2561086155147753e-06, "loss": 2.0203, "mean_token_accuracy": 0.5, "step": 2240 }, { "epoch": 0.002261187843773575, "grad_norm": 64.81349773051566, "learning_rate": 2.2611445722458354e-06, "loss": 2.4687, "mean_token_accuracy": 0.4551724076271057, "step": 2245 }, { "epoch": 0.002266223896877748, "grad_norm": 109.77026567699987, "learning_rate": 2.2661805289768955e-06, "loss": 2.349, "mean_token_accuracy": 0.46551724076271056, "step": 2250 }, { "epoch": 0.0022712599499819207, "grad_norm": 120.97306916821232, "learning_rate": 2.271216485707955e-06, "loss": 2.5395, "mean_token_accuracy": 0.3620689630508423, "step": 2255 }, { "epoch": 0.002276296003086093, "grad_norm": 94.8291436898578, "learning_rate": 2.276252442439015e-06, "loss": 2.6694, "mean_token_accuracy": 0.41724138259887694, "step": 2260 }, { "epoch": 0.002281332056190266, "grad_norm": 86.98101902489451, "learning_rate": 2.2812883991700745e-06, "loss": 2.4048, "mean_token_accuracy": 0.4448275864124298, "step": 2265 }, { "epoch": 0.002286368109294439, "grad_norm": 95.56511129735804, "learning_rate": 2.286324355901134e-06, "loss": 2.1267, "mean_token_accuracy": 0.4689655065536499, "step": 2270 }, { "epoch": 0.002291404162398612, "grad_norm": 104.90206108988845, "learning_rate": 2.2913603126321943e-06, "loss": 2.3055, "mean_token_accuracy": 0.4206896543502808, "step": 2275 }, { "epoch": 0.0022964402155027843, "grad_norm": 74.27571117193528, "learning_rate": 2.296396269363254e-06, "loss": 1.9802, "mean_token_accuracy": 0.5000000059604645, "step": 2280 }, { "epoch": 0.002301476268606957, "grad_norm": 72.30702953692304, "learning_rate": 2.3014322260943136e-06, "loss": 2.2243, "mean_token_accuracy": 0.493103438615799, "step": 2285 }, { "epoch": 0.00230651232171113, "grad_norm": 84.27252913417315, "learning_rate": 2.3064681828253733e-06, "loss": 1.8297, "mean_token_accuracy": 0.4758620738983154, "step": 2290 }, { "epoch": 0.0023115483748153026, "grad_norm": 88.51176814283133, "learning_rate": 2.311504139556433e-06, "loss": 2.4405, "mean_token_accuracy": 0.47586206793785096, "step": 2295 }, { "epoch": 0.0023165844279194755, "grad_norm": 82.73835152115163, "learning_rate": 2.316540096287493e-06, "loss": 2.5402, "mean_token_accuracy": 0.4379310369491577, "step": 2300 }, { "epoch": 0.0023216204810236484, "grad_norm": 64.3044780421925, "learning_rate": 2.3215760530185528e-06, "loss": 2.1602, "mean_token_accuracy": 0.4448275864124298, "step": 2305 }, { "epoch": 0.0023266565341278213, "grad_norm": 74.1918294250614, "learning_rate": 2.3266120097496124e-06, "loss": 2.3048, "mean_token_accuracy": 0.4517241418361664, "step": 2310 }, { "epoch": 0.0023316925872319937, "grad_norm": 83.60794216618378, "learning_rate": 2.331647966480672e-06, "loss": 1.9677, "mean_token_accuracy": 0.4965517222881317, "step": 2315 }, { "epoch": 0.0023367286403361666, "grad_norm": 89.93528196412021, "learning_rate": 2.336683923211732e-06, "loss": 2.0785, "mean_token_accuracy": 0.47586207985877993, "step": 2320 }, { "epoch": 0.0023417646934403395, "grad_norm": 86.46648739040855, "learning_rate": 2.341719879942792e-06, "loss": 2.1121, "mean_token_accuracy": 0.4918330252170563, "step": 2325 }, { "epoch": 0.002346800746544512, "grad_norm": 67.5352938098029, "learning_rate": 2.3467558366738516e-06, "loss": 2.1478, "mean_token_accuracy": 0.49655171632766726, "step": 2330 }, { "epoch": 0.002351836799648685, "grad_norm": 108.5068336301013, "learning_rate": 2.3517917934049112e-06, "loss": 2.2149, "mean_token_accuracy": 0.4586206912994385, "step": 2335 }, { "epoch": 0.002356872852752858, "grad_norm": 103.22246022867125, "learning_rate": 2.356827750135971e-06, "loss": 2.0018, "mean_token_accuracy": 0.5137930989265442, "step": 2340 }, { "epoch": 0.0023619089058570307, "grad_norm": 94.32368257825978, "learning_rate": 2.3618637068670306e-06, "loss": 2.1515, "mean_token_accuracy": 0.4620689630508423, "step": 2345 }, { "epoch": 0.002366944958961203, "grad_norm": 99.54118280992225, "learning_rate": 2.3668996635980907e-06, "loss": 2.44, "mean_token_accuracy": 0.4, "step": 2350 }, { "epoch": 0.002371981012065376, "grad_norm": 70.83937677532117, "learning_rate": 2.3719356203291504e-06, "loss": 1.9913, "mean_token_accuracy": 0.4931034445762634, "step": 2355 }, { "epoch": 0.002377017065169549, "grad_norm": 102.43012653014074, "learning_rate": 2.37697157706021e-06, "loss": 2.5206, "mean_token_accuracy": 0.4068965554237366, "step": 2360 }, { "epoch": 0.0023820531182737214, "grad_norm": 86.88467641525777, "learning_rate": 2.3820075337912697e-06, "loss": 2.4241, "mean_token_accuracy": 0.43793103098869324, "step": 2365 }, { "epoch": 0.0023870891713778943, "grad_norm": 63.118065292996434, "learning_rate": 2.38704349052233e-06, "loss": 2.0229, "mean_token_accuracy": 0.47586206197738645, "step": 2370 }, { "epoch": 0.002392125224482067, "grad_norm": 70.01644381681285, "learning_rate": 2.3920794472533895e-06, "loss": 2.2602, "mean_token_accuracy": 0.47586206197738645, "step": 2375 }, { "epoch": 0.00239716127758624, "grad_norm": 97.41031614296067, "learning_rate": 2.397115403984449e-06, "loss": 2.4941, "mean_token_accuracy": 0.3896551728248596, "step": 2380 }, { "epoch": 0.0024021973306904126, "grad_norm": 75.60346659696802, "learning_rate": 2.402151360715509e-06, "loss": 2.0855, "mean_token_accuracy": 0.47586206793785096, "step": 2385 }, { "epoch": 0.0024072333837945855, "grad_norm": 83.90825897672056, "learning_rate": 2.4071873174465685e-06, "loss": 2.2922, "mean_token_accuracy": 0.4482758641242981, "step": 2390 }, { "epoch": 0.0024122694368987584, "grad_norm": 100.94223097937659, "learning_rate": 2.4122232741776286e-06, "loss": 2.1944, "mean_token_accuracy": 0.4413793087005615, "step": 2395 }, { "epoch": 0.002417305490002931, "grad_norm": 102.77293421672823, "learning_rate": 2.4172592309086883e-06, "loss": 2.3001, "mean_token_accuracy": 0.47241379618644713, "step": 2400 }, { "epoch": 0.0024223415431071037, "grad_norm": 86.84525134236557, "learning_rate": 2.422295187639748e-06, "loss": 2.4248, "mean_token_accuracy": 0.44137930274009707, "step": 2405 }, { "epoch": 0.0024273775962112766, "grad_norm": 96.97384741714902, "learning_rate": 2.4273311443708077e-06, "loss": 2.1911, "mean_token_accuracy": 0.4430732041597366, "step": 2410 }, { "epoch": 0.002432413649315449, "grad_norm": 72.50198571802704, "learning_rate": 2.4323671011018673e-06, "loss": 2.4365, "mean_token_accuracy": 0.4517241418361664, "step": 2415 }, { "epoch": 0.002437449702419622, "grad_norm": 70.69782391007874, "learning_rate": 2.4374030578329274e-06, "loss": 2.2345, "mean_token_accuracy": 0.5068965494632721, "step": 2420 }, { "epoch": 0.002442485755523795, "grad_norm": 60.86278051650412, "learning_rate": 2.442439014563987e-06, "loss": 2.1528, "mean_token_accuracy": 0.4689655125141144, "step": 2425 }, { "epoch": 0.0024475218086279678, "grad_norm": 106.27687998858838, "learning_rate": 2.447474971295047e-06, "loss": 2.2131, "mean_token_accuracy": 0.47241379618644713, "step": 2430 }, { "epoch": 0.0024525578617321402, "grad_norm": 77.03583959000423, "learning_rate": 2.4525109280261065e-06, "loss": 2.4299, "mean_token_accuracy": 0.44482758045196535, "step": 2435 }, { "epoch": 0.002457593914836313, "grad_norm": 86.48972855633426, "learning_rate": 2.457546884757166e-06, "loss": 2.4645, "mean_token_accuracy": 0.3931034505367279, "step": 2440 }, { "epoch": 0.002462629967940486, "grad_norm": 101.74116534462931, "learning_rate": 2.4625828414882262e-06, "loss": 2.2693, "mean_token_accuracy": 0.458620685338974, "step": 2445 }, { "epoch": 0.0024676660210446585, "grad_norm": 87.02022023865, "learning_rate": 2.467618798219286e-06, "loss": 2.14, "mean_token_accuracy": 0.4448275864124298, "step": 2450 }, { "epoch": 0.0024727020741488314, "grad_norm": 108.44561616595888, "learning_rate": 2.4726547549503456e-06, "loss": 2.0587, "mean_token_accuracy": 0.501875376701355, "step": 2455 }, { "epoch": 0.0024777381272530043, "grad_norm": 48.251676953606776, "learning_rate": 2.4776907116814053e-06, "loss": 2.0645, "mean_token_accuracy": 0.48965516686439514, "step": 2460 }, { "epoch": 0.002482774180357177, "grad_norm": 70.59690625746232, "learning_rate": 2.482726668412465e-06, "loss": 2.1256, "mean_token_accuracy": 0.4413793206214905, "step": 2465 }, { "epoch": 0.0024878102334613497, "grad_norm": 82.42145596417369, "learning_rate": 2.487762625143525e-06, "loss": 2.2615, "mean_token_accuracy": 0.4344827651977539, "step": 2470 }, { "epoch": 0.0024928462865655226, "grad_norm": 81.98411365204434, "learning_rate": 2.4927985818745847e-06, "loss": 2.2869, "mean_token_accuracy": 0.4689655065536499, "step": 2475 }, { "epoch": 0.0024978823396696955, "grad_norm": 70.04223610163596, "learning_rate": 2.4978345386056444e-06, "loss": 2.1946, "mean_token_accuracy": 0.43103448748588563, "step": 2480 }, { "epoch": 0.002502918392773868, "grad_norm": 101.49786400227609, "learning_rate": 2.502870495336704e-06, "loss": 2.0097, "mean_token_accuracy": 0.5137931048870087, "step": 2485 }, { "epoch": 0.002507954445878041, "grad_norm": 104.72451071576143, "learning_rate": 2.5079064520677638e-06, "loss": 2.4874, "mean_token_accuracy": 0.41034482717514037, "step": 2490 }, { "epoch": 0.0025129904989822137, "grad_norm": 99.01470523307447, "learning_rate": 2.512942408798824e-06, "loss": 2.2392, "mean_token_accuracy": 0.4517241418361664, "step": 2495 }, { "epoch": 0.0025180265520863866, "grad_norm": 88.32559689127825, "learning_rate": 2.5179783655298835e-06, "loss": 2.3482, "mean_token_accuracy": 0.4517241358757019, "step": 2500 }, { "epoch": 0.002523062605190559, "grad_norm": 89.02802254323824, "learning_rate": 2.523014322260943e-06, "loss": 2.2567, "mean_token_accuracy": 0.4482758641242981, "step": 2505 }, { "epoch": 0.002528098658294732, "grad_norm": 86.95813959236109, "learning_rate": 2.528050278992003e-06, "loss": 2.246, "mean_token_accuracy": 0.43103448748588563, "step": 2510 }, { "epoch": 0.002533134711398905, "grad_norm": 79.48097965215312, "learning_rate": 2.5330862357230626e-06, "loss": 2.2283, "mean_token_accuracy": 0.4551724135875702, "step": 2515 }, { "epoch": 0.0025381707645030773, "grad_norm": 58.9093877425727, "learning_rate": 2.5381221924541227e-06, "loss": 2.2861, "mean_token_accuracy": 0.4310344815254211, "step": 2520 }, { "epoch": 0.0025432068176072502, "grad_norm": 84.59294272409089, "learning_rate": 2.5431581491851823e-06, "loss": 2.1781, "mean_token_accuracy": 0.47241380214691164, "step": 2525 }, { "epoch": 0.002548242870711423, "grad_norm": 93.01054790297542, "learning_rate": 2.548194105916242e-06, "loss": 2.1546, "mean_token_accuracy": 0.46896552443504336, "step": 2530 }, { "epoch": 0.002553278923815596, "grad_norm": 78.08738277996378, "learning_rate": 2.5532300626473017e-06, "loss": 1.9295, "mean_token_accuracy": 0.5344827532768249, "step": 2535 }, { "epoch": 0.0025583149769197685, "grad_norm": 67.40543490089269, "learning_rate": 2.5582660193783614e-06, "loss": 2.248, "mean_token_accuracy": 0.47241379618644713, "step": 2540 }, { "epoch": 0.0025633510300239414, "grad_norm": 72.29670272302188, "learning_rate": 2.5633019761094215e-06, "loss": 2.0328, "mean_token_accuracy": 0.5241379261016845, "step": 2545 }, { "epoch": 0.0025683870831281143, "grad_norm": 58.53504161644367, "learning_rate": 2.568337932840481e-06, "loss": 2.1585, "mean_token_accuracy": 0.47586206793785096, "step": 2550 }, { "epoch": 0.0025734231362322868, "grad_norm": 80.11779051597156, "learning_rate": 2.573373889571541e-06, "loss": 2.0434, "mean_token_accuracy": 0.5206896483898162, "step": 2555 }, { "epoch": 0.0025784591893364597, "grad_norm": 71.36974839218286, "learning_rate": 2.5784098463026005e-06, "loss": 2.1484, "mean_token_accuracy": 0.4931034445762634, "step": 2560 }, { "epoch": 0.0025834952424406326, "grad_norm": 81.40961290959542, "learning_rate": 2.58344580303366e-06, "loss": 1.8759, "mean_token_accuracy": 0.501600980758667, "step": 2565 }, { "epoch": 0.0025885312955448055, "grad_norm": 66.22402706092257, "learning_rate": 2.5884817597647203e-06, "loss": 2.4082, "mean_token_accuracy": 0.47241379618644713, "step": 2570 }, { "epoch": 0.002593567348648978, "grad_norm": 80.19970229278616, "learning_rate": 2.59351771649578e-06, "loss": 2.0116, "mean_token_accuracy": 0.48965516686439514, "step": 2575 }, { "epoch": 0.002598603401753151, "grad_norm": 104.71192991518264, "learning_rate": 2.5985536732268396e-06, "loss": 2.1655, "mean_token_accuracy": 0.5034482657909394, "step": 2580 }, { "epoch": 0.0026036394548573237, "grad_norm": 68.27060019618828, "learning_rate": 2.6035896299578993e-06, "loss": 2.4255, "mean_token_accuracy": 0.4344827651977539, "step": 2585 }, { "epoch": 0.002608675507961496, "grad_norm": 70.59755907631495, "learning_rate": 2.608625586688959e-06, "loss": 2.0864, "mean_token_accuracy": 0.4896551728248596, "step": 2590 }, { "epoch": 0.002613711561065669, "grad_norm": 66.4695034393724, "learning_rate": 2.613661543420019e-06, "loss": 1.9887, "mean_token_accuracy": 0.48965516686439514, "step": 2595 }, { "epoch": 0.002618747614169842, "grad_norm": 85.86804752415499, "learning_rate": 2.6186975001510788e-06, "loss": 2.3009, "mean_token_accuracy": 0.4344827651977539, "step": 2600 }, { "epoch": 0.002623783667274015, "grad_norm": 58.1255458488033, "learning_rate": 2.6237334568821384e-06, "loss": 2.622, "mean_token_accuracy": 0.39655171930789945, "step": 2605 }, { "epoch": 0.0026288197203781873, "grad_norm": 119.02446585580975, "learning_rate": 2.628769413613198e-06, "loss": 2.0564, "mean_token_accuracy": 0.49999999403953554, "step": 2610 }, { "epoch": 0.0026338557734823602, "grad_norm": 82.92094424704854, "learning_rate": 2.6338053703442578e-06, "loss": 2.0456, "mean_token_accuracy": 0.4914700508117676, "step": 2615 }, { "epoch": 0.002638891826586533, "grad_norm": 55.66215764464568, "learning_rate": 2.638841327075318e-06, "loss": 2.072, "mean_token_accuracy": 0.5124016880989075, "step": 2620 }, { "epoch": 0.0026439278796907056, "grad_norm": 87.19697469509939, "learning_rate": 2.6438772838063776e-06, "loss": 2.3927, "mean_token_accuracy": 0.4172413766384125, "step": 2625 }, { "epoch": 0.0026489639327948785, "grad_norm": 78.62896459530597, "learning_rate": 2.6489132405374372e-06, "loss": 2.4046, "mean_token_accuracy": 0.4344827473163605, "step": 2630 }, { "epoch": 0.0026539999858990514, "grad_norm": 81.7209698382802, "learning_rate": 2.653949197268497e-06, "loss": 2.298, "mean_token_accuracy": 0.44482758045196535, "step": 2635 }, { "epoch": 0.002659036039003224, "grad_norm": 73.54194820010473, "learning_rate": 2.658985153999557e-06, "loss": 2.3581, "mean_token_accuracy": 0.42758620977401735, "step": 2640 }, { "epoch": 0.0026640720921073968, "grad_norm": 76.53589154258704, "learning_rate": 2.6640211107306167e-06, "loss": 1.9931, "mean_token_accuracy": 0.5103448331356049, "step": 2645 }, { "epoch": 0.0026691081452115697, "grad_norm": 70.3446493180254, "learning_rate": 2.6690570674616764e-06, "loss": 2.2345, "mean_token_accuracy": 0.44827585220336913, "step": 2650 }, { "epoch": 0.0026741441983157426, "grad_norm": 66.94775145030579, "learning_rate": 2.674093024192736e-06, "loss": 2.1199, "mean_token_accuracy": 0.46896551847457885, "step": 2655 }, { "epoch": 0.002679180251419915, "grad_norm": 82.4314149698713, "learning_rate": 2.679128980923796e-06, "loss": 2.4063, "mean_token_accuracy": 0.44482758045196535, "step": 2660 }, { "epoch": 0.002684216304524088, "grad_norm": 62.06480470534354, "learning_rate": 2.684164937654856e-06, "loss": 2.0691, "mean_token_accuracy": 0.5137930989265442, "step": 2665 }, { "epoch": 0.002689252357628261, "grad_norm": 66.70698711652226, "learning_rate": 2.689200894385916e-06, "loss": 2.2688, "mean_token_accuracy": 0.47586206793785096, "step": 2670 }, { "epoch": 0.0026942884107324333, "grad_norm": 92.30191251456944, "learning_rate": 2.6942368511169756e-06, "loss": 2.4153, "mean_token_accuracy": 0.4172413766384125, "step": 2675 }, { "epoch": 0.002699324463836606, "grad_norm": 82.37019415130442, "learning_rate": 2.6992728078480353e-06, "loss": 2.2139, "mean_token_accuracy": 0.4551724135875702, "step": 2680 }, { "epoch": 0.002704360516940779, "grad_norm": 86.38876965878977, "learning_rate": 2.704308764579095e-06, "loss": 1.9345, "mean_token_accuracy": 0.45728976726531984, "step": 2685 }, { "epoch": 0.002709396570044952, "grad_norm": 76.06336349944823, "learning_rate": 2.7093447213101546e-06, "loss": 2.243, "mean_token_accuracy": 0.4620689690113068, "step": 2690 }, { "epoch": 0.0027144326231491244, "grad_norm": 75.96416232968137, "learning_rate": 2.7143806780412147e-06, "loss": 2.0259, "mean_token_accuracy": 0.4945812880992889, "step": 2695 }, { "epoch": 0.0027194686762532973, "grad_norm": 69.82960493111311, "learning_rate": 2.7194166347722744e-06, "loss": 1.9928, "mean_token_accuracy": 0.5050211727619172, "step": 2700 }, { "epoch": 0.0027245047293574702, "grad_norm": 52.23875665766504, "learning_rate": 2.724452591503334e-06, "loss": 2.0432, "mean_token_accuracy": 0.47241380214691164, "step": 2705 }, { "epoch": 0.0027295407824616427, "grad_norm": 115.77684114149575, "learning_rate": 2.7294885482343938e-06, "loss": 2.3347, "mean_token_accuracy": 0.40889292359352114, "step": 2710 }, { "epoch": 0.0027345768355658156, "grad_norm": 84.14499349996761, "learning_rate": 2.7345245049654534e-06, "loss": 2.2263, "mean_token_accuracy": 0.44482758045196535, "step": 2715 }, { "epoch": 0.0027396128886699885, "grad_norm": 56.68820943966481, "learning_rate": 2.7395604616965135e-06, "loss": 2.2164, "mean_token_accuracy": 0.4813067138195038, "step": 2720 }, { "epoch": 0.0027446489417741614, "grad_norm": 87.70003380523275, "learning_rate": 2.744596418427573e-06, "loss": 2.2856, "mean_token_accuracy": 0.41034482717514037, "step": 2725 }, { "epoch": 0.002749684994878334, "grad_norm": 62.4706991286528, "learning_rate": 2.749632375158633e-06, "loss": 2.1508, "mean_token_accuracy": 0.46551724672317507, "step": 2730 }, { "epoch": 0.0027547210479825068, "grad_norm": 99.26586227429233, "learning_rate": 2.7546683318896926e-06, "loss": 2.4804, "mean_token_accuracy": 0.39310344457626345, "step": 2735 }, { "epoch": 0.0027597571010866797, "grad_norm": 73.53182483854552, "learning_rate": 2.7597042886207522e-06, "loss": 2.4294, "mean_token_accuracy": 0.4551724135875702, "step": 2740 }, { "epoch": 0.002764793154190852, "grad_norm": 82.72995814482483, "learning_rate": 2.7647402453518123e-06, "loss": 2.3659, "mean_token_accuracy": 0.4551724076271057, "step": 2745 }, { "epoch": 0.002769829207295025, "grad_norm": 64.62304256081194, "learning_rate": 2.769776202082872e-06, "loss": 2.4052, "mean_token_accuracy": 0.43793103098869324, "step": 2750 }, { "epoch": 0.002774865260399198, "grad_norm": 65.3219204617248, "learning_rate": 2.7748121588139317e-06, "loss": 2.5977, "mean_token_accuracy": 0.3758620619773865, "step": 2755 }, { "epoch": 0.002779901313503371, "grad_norm": 71.86885898318236, "learning_rate": 2.7798481155449914e-06, "loss": 2.1898, "mean_token_accuracy": 0.482758617401123, "step": 2760 }, { "epoch": 0.0027849373666075433, "grad_norm": 69.210397481664, "learning_rate": 2.784884072276051e-06, "loss": 2.0809, "mean_token_accuracy": 0.4551724135875702, "step": 2765 }, { "epoch": 0.002789973419711716, "grad_norm": 83.34684729851516, "learning_rate": 2.789920029007111e-06, "loss": 2.1819, "mean_token_accuracy": 0.5137931048870087, "step": 2770 }, { "epoch": 0.002795009472815889, "grad_norm": 79.64701795356842, "learning_rate": 2.794955985738171e-06, "loss": 1.9888, "mean_token_accuracy": 0.519177258014679, "step": 2775 }, { "epoch": 0.0028000455259200615, "grad_norm": 87.4166099000275, "learning_rate": 2.7999919424692305e-06, "loss": 2.2323, "mean_token_accuracy": 0.4620689630508423, "step": 2780 }, { "epoch": 0.0028050815790242344, "grad_norm": 92.27110314428752, "learning_rate": 2.80502789920029e-06, "loss": 2.2346, "mean_token_accuracy": 0.46551724076271056, "step": 2785 }, { "epoch": 0.0028101176321284073, "grad_norm": 83.89918878588203, "learning_rate": 2.81006385593135e-06, "loss": 2.4355, "mean_token_accuracy": 0.4586206912994385, "step": 2790 }, { "epoch": 0.0028151536852325802, "grad_norm": 93.45920370138626, "learning_rate": 2.81509981266241e-06, "loss": 2.1915, "mean_token_accuracy": 0.4482758641242981, "step": 2795 }, { "epoch": 0.0028201897383367527, "grad_norm": 66.17805425813921, "learning_rate": 2.8201357693934696e-06, "loss": 1.9405, "mean_token_accuracy": 0.5172413766384125, "step": 2800 }, { "epoch": 0.0028252257914409256, "grad_norm": 80.65609196819172, "learning_rate": 2.8251717261245293e-06, "loss": 1.9453, "mean_token_accuracy": 0.4965517222881317, "step": 2805 }, { "epoch": 0.0028302618445450985, "grad_norm": 97.7429973449736, "learning_rate": 2.830207682855589e-06, "loss": 2.0581, "mean_token_accuracy": 0.47931033968925474, "step": 2810 }, { "epoch": 0.002835297897649271, "grad_norm": 73.69020724777327, "learning_rate": 2.8352436395866487e-06, "loss": 2.06, "mean_token_accuracy": 0.4862068951129913, "step": 2815 }, { "epoch": 0.002840333950753444, "grad_norm": 103.57005210577499, "learning_rate": 2.8402795963177088e-06, "loss": 1.9724, "mean_token_accuracy": 0.5599753677845001, "step": 2820 }, { "epoch": 0.0028453700038576168, "grad_norm": 65.42148294671267, "learning_rate": 2.8453155530487684e-06, "loss": 2.1466, "mean_token_accuracy": 0.4862068951129913, "step": 2825 }, { "epoch": 0.0028504060569617897, "grad_norm": 80.32749919516732, "learning_rate": 2.850351509779828e-06, "loss": 2.3368, "mean_token_accuracy": 0.4413793087005615, "step": 2830 }, { "epoch": 0.002855442110065962, "grad_norm": 63.96996264969172, "learning_rate": 2.8553874665108878e-06, "loss": 2.2333, "mean_token_accuracy": 0.44827585220336913, "step": 2835 }, { "epoch": 0.002860478163170135, "grad_norm": 100.35171769164828, "learning_rate": 2.8604234232419475e-06, "loss": 2.1746, "mean_token_accuracy": 0.48965516686439514, "step": 2840 }, { "epoch": 0.002865514216274308, "grad_norm": 74.6536346084636, "learning_rate": 2.8654593799730076e-06, "loss": 2.4039, "mean_token_accuracy": 0.43103447556495667, "step": 2845 }, { "epoch": 0.0028705502693784804, "grad_norm": 67.03898789894554, "learning_rate": 2.8704953367040672e-06, "loss": 1.868, "mean_token_accuracy": 0.5344827532768249, "step": 2850 }, { "epoch": 0.0028755863224826533, "grad_norm": 62.902392482822286, "learning_rate": 2.875531293435127e-06, "loss": 2.0197, "mean_token_accuracy": 0.4896551609039307, "step": 2855 }, { "epoch": 0.002880622375586826, "grad_norm": 88.00830680347272, "learning_rate": 2.8805672501661866e-06, "loss": 2.1306, "mean_token_accuracy": 0.46551724672317507, "step": 2860 }, { "epoch": 0.0028856584286909986, "grad_norm": 63.916751344187546, "learning_rate": 2.8856032068972463e-06, "loss": 2.3754, "mean_token_accuracy": 0.4137930989265442, "step": 2865 }, { "epoch": 0.0028906944817951715, "grad_norm": 103.41070480830378, "learning_rate": 2.8906391636283064e-06, "loss": 2.3709, "mean_token_accuracy": 0.4517241418361664, "step": 2870 }, { "epoch": 0.0028957305348993444, "grad_norm": 91.28132935611058, "learning_rate": 2.895675120359366e-06, "loss": 2.4747, "mean_token_accuracy": 0.4284482777118683, "step": 2875 }, { "epoch": 0.0029007665880035173, "grad_norm": 65.39938015352139, "learning_rate": 2.9007110770904257e-06, "loss": 2.1448, "mean_token_accuracy": 0.5017587065696716, "step": 2880 }, { "epoch": 0.00290580264110769, "grad_norm": 80.84669521191931, "learning_rate": 2.9057470338214854e-06, "loss": 2.424, "mean_token_accuracy": 0.4517241358757019, "step": 2885 }, { "epoch": 0.0029108386942118627, "grad_norm": 70.26365698499949, "learning_rate": 2.9107829905525455e-06, "loss": 2.0661, "mean_token_accuracy": 0.5137931048870087, "step": 2890 }, { "epoch": 0.0029158747473160356, "grad_norm": 101.26292734443231, "learning_rate": 2.915818947283605e-06, "loss": 2.5474, "mean_token_accuracy": 0.4, "step": 2895 }, { "epoch": 0.002920910800420208, "grad_norm": 60.000357169863605, "learning_rate": 2.920854904014665e-06, "loss": 2.1052, "mean_token_accuracy": 0.4631773352622986, "step": 2900 }, { "epoch": 0.002925946853524381, "grad_norm": 74.98131419220954, "learning_rate": 2.9258908607457245e-06, "loss": 2.1614, "mean_token_accuracy": 0.4551724135875702, "step": 2905 }, { "epoch": 0.002930982906628554, "grad_norm": 68.99947846683835, "learning_rate": 2.930926817476784e-06, "loss": 2.3706, "mean_token_accuracy": 0.4137930989265442, "step": 2910 }, { "epoch": 0.0029360189597327268, "grad_norm": 67.03628627387538, "learning_rate": 2.9359627742078443e-06, "loss": 2.04, "mean_token_accuracy": 0.5137931048870087, "step": 2915 }, { "epoch": 0.0029410550128368992, "grad_norm": 75.64477076811589, "learning_rate": 2.940998730938904e-06, "loss": 2.0422, "mean_token_accuracy": 0.47931033968925474, "step": 2920 }, { "epoch": 0.002946091065941072, "grad_norm": 67.99748776157973, "learning_rate": 2.9460346876699637e-06, "loss": 2.1311, "mean_token_accuracy": 0.4344827592372894, "step": 2925 }, { "epoch": 0.002951127119045245, "grad_norm": 89.5573630946037, "learning_rate": 2.9510706444010233e-06, "loss": 2.0571, "mean_token_accuracy": 0.49655171632766726, "step": 2930 }, { "epoch": 0.0029561631721494175, "grad_norm": 66.55704747284501, "learning_rate": 2.956106601132083e-06, "loss": 2.3881, "mean_token_accuracy": 0.4620689630508423, "step": 2935 }, { "epoch": 0.0029611992252535904, "grad_norm": 87.14039565243294, "learning_rate": 2.961142557863143e-06, "loss": 2.1442, "mean_token_accuracy": 0.49655171632766726, "step": 2940 }, { "epoch": 0.0029662352783577633, "grad_norm": 90.25881878937624, "learning_rate": 2.9661785145942028e-06, "loss": 2.4009, "mean_token_accuracy": 0.46551724076271056, "step": 2945 }, { "epoch": 0.002971271331461936, "grad_norm": 65.05480310172725, "learning_rate": 2.9712144713252625e-06, "loss": 2.2554, "mean_token_accuracy": 0.4827586203813553, "step": 2950 }, { "epoch": 0.0029763073845661086, "grad_norm": 86.4849105855807, "learning_rate": 2.976250428056322e-06, "loss": 2.5262, "mean_token_accuracy": 0.3999999940395355, "step": 2955 }, { "epoch": 0.0029813434376702815, "grad_norm": 88.51232309243866, "learning_rate": 2.981286384787382e-06, "loss": 2.4157, "mean_token_accuracy": 0.4482758641242981, "step": 2960 }, { "epoch": 0.0029863794907744544, "grad_norm": 91.550109868469, "learning_rate": 2.986322341518442e-06, "loss": 2.4113, "mean_token_accuracy": 0.44990926384925845, "step": 2965 }, { "epoch": 0.002991415543878627, "grad_norm": 89.42015661481138, "learning_rate": 2.9913582982495016e-06, "loss": 2.4826, "mean_token_accuracy": 0.4034482777118683, "step": 2970 }, { "epoch": 0.0029964515969828, "grad_norm": 88.55892077387162, "learning_rate": 2.9963942549805613e-06, "loss": 2.5427, "mean_token_accuracy": 0.4172413766384125, "step": 2975 }, { "epoch": 0.0030014876500869727, "grad_norm": 65.73514147933889, "learning_rate": 3.001430211711621e-06, "loss": 1.9901, "mean_token_accuracy": 0.49999999403953554, "step": 2980 }, { "epoch": 0.0030065237031911456, "grad_norm": 72.57805870200298, "learning_rate": 3.0064661684426806e-06, "loss": 2.4738, "mean_token_accuracy": 0.42758620381355283, "step": 2985 }, { "epoch": 0.003011559756295318, "grad_norm": 60.010863596474195, "learning_rate": 3.0115021251737407e-06, "loss": 2.371, "mean_token_accuracy": 0.4655172348022461, "step": 2990 }, { "epoch": 0.003016595809399491, "grad_norm": 95.48496956376763, "learning_rate": 3.0165380819048004e-06, "loss": 1.8541, "mean_token_accuracy": 0.5298850655555725, "step": 2995 }, { "epoch": 0.003021631862503664, "grad_norm": 65.01925032048041, "learning_rate": 3.02157403863586e-06, "loss": 2.3843, "mean_token_accuracy": 0.4381773352622986, "step": 3000 }, { "epoch": 0.0030266679156078363, "grad_norm": 94.96498384919472, "learning_rate": 3.0266099953669197e-06, "loss": 2.2954, "mean_token_accuracy": 0.43226600885391236, "step": 3005 }, { "epoch": 0.0030317039687120092, "grad_norm": 71.6103947976109, "learning_rate": 3.0316459520979794e-06, "loss": 2.2797, "mean_token_accuracy": 0.4586206912994385, "step": 3010 }, { "epoch": 0.003036740021816182, "grad_norm": 49.030310186456774, "learning_rate": 3.0366819088290395e-06, "loss": 1.947, "mean_token_accuracy": 0.5241379320621491, "step": 3015 }, { "epoch": 0.003041776074920355, "grad_norm": 59.85878115626745, "learning_rate": 3.041717865560099e-06, "loss": 2.1276, "mean_token_accuracy": 0.4793103516101837, "step": 3020 }, { "epoch": 0.0030468121280245275, "grad_norm": 69.13894546432422, "learning_rate": 3.046753822291159e-06, "loss": 2.0739, "mean_token_accuracy": 0.48965516686439514, "step": 3025 }, { "epoch": 0.0030518481811287004, "grad_norm": 84.83562826030482, "learning_rate": 3.0517897790222186e-06, "loss": 2.2518, "mean_token_accuracy": 0.4931034505367279, "step": 3030 }, { "epoch": 0.0030568842342328733, "grad_norm": 85.33763457035896, "learning_rate": 3.0568257357532782e-06, "loss": 2.119, "mean_token_accuracy": 0.4689655125141144, "step": 3035 }, { "epoch": 0.0030619202873370457, "grad_norm": 103.52015647679562, "learning_rate": 3.0618616924843383e-06, "loss": 2.4503, "mean_token_accuracy": 0.41379310488700866, "step": 3040 }, { "epoch": 0.0030669563404412186, "grad_norm": 86.49448330285979, "learning_rate": 3.066897649215398e-06, "loss": 2.234, "mean_token_accuracy": 0.45862067937850953, "step": 3045 }, { "epoch": 0.0030719923935453915, "grad_norm": 81.0860817025796, "learning_rate": 3.0719336059464577e-06, "loss": 2.0751, "mean_token_accuracy": 0.49872957468032836, "step": 3050 }, { "epoch": 0.0030770284466495644, "grad_norm": 72.29159826557503, "learning_rate": 3.0769695626775174e-06, "loss": 2.1363, "mean_token_accuracy": 0.4620689630508423, "step": 3055 }, { "epoch": 0.003082064499753737, "grad_norm": 86.2876880027152, "learning_rate": 3.082005519408577e-06, "loss": 2.4084, "mean_token_accuracy": 0.4034482777118683, "step": 3060 }, { "epoch": 0.00308710055285791, "grad_norm": 83.45307043452166, "learning_rate": 3.087041476139637e-06, "loss": 2.4245, "mean_token_accuracy": 0.4379310429096222, "step": 3065 }, { "epoch": 0.0030921366059620827, "grad_norm": 81.41168850676306, "learning_rate": 3.0920774328706972e-06, "loss": 2.1272, "mean_token_accuracy": 0.4586206912994385, "step": 3070 }, { "epoch": 0.003097172659066255, "grad_norm": 93.25355915172533, "learning_rate": 3.097113389601757e-06, "loss": 2.4373, "mean_token_accuracy": 0.41724138855934145, "step": 3075 }, { "epoch": 0.003102208712170428, "grad_norm": 104.56491335235592, "learning_rate": 3.1021493463328166e-06, "loss": 2.1426, "mean_token_accuracy": 0.4551724076271057, "step": 3080 }, { "epoch": 0.003107244765274601, "grad_norm": 106.61106264671604, "learning_rate": 3.1071853030638763e-06, "loss": 2.4611, "mean_token_accuracy": 0.4206896543502808, "step": 3085 }, { "epoch": 0.003112280818378774, "grad_norm": 87.5884365295295, "learning_rate": 3.112221259794936e-06, "loss": 2.3855, "mean_token_accuracy": 0.480762255191803, "step": 3090 }, { "epoch": 0.0031173168714829463, "grad_norm": 77.4187048838736, "learning_rate": 3.117257216525996e-06, "loss": 2.2813, "mean_token_accuracy": 0.4379310369491577, "step": 3095 }, { "epoch": 0.0031223529245871192, "grad_norm": 96.01644052275523, "learning_rate": 3.1222931732570557e-06, "loss": 2.3341, "mean_token_accuracy": 0.47078040838241575, "step": 3100 }, { "epoch": 0.003127388977691292, "grad_norm": 80.92844065704236, "learning_rate": 3.127329129988115e-06, "loss": 2.0389, "mean_token_accuracy": 0.5121597111225128, "step": 3105 }, { "epoch": 0.0031324250307954646, "grad_norm": 59.490080733519015, "learning_rate": 3.132365086719175e-06, "loss": 2.4377, "mean_token_accuracy": 0.4137930989265442, "step": 3110 }, { "epoch": 0.0031374610838996375, "grad_norm": 66.00689060356684, "learning_rate": 3.1374010434502347e-06, "loss": 2.0668, "mean_token_accuracy": 0.49999999403953554, "step": 3115 }, { "epoch": 0.0031424971370038104, "grad_norm": 176.4001965962792, "learning_rate": 3.142437000181295e-06, "loss": 2.5429, "mean_token_accuracy": 0.38620689511299133, "step": 3120 }, { "epoch": 0.003147533190107983, "grad_norm": 103.32199386607853, "learning_rate": 3.147472956912354e-06, "loss": 2.1981, "mean_token_accuracy": 0.45517241954803467, "step": 3125 }, { "epoch": 0.0031525692432121557, "grad_norm": 61.90781695146732, "learning_rate": 3.152508913643414e-06, "loss": 1.9303, "mean_token_accuracy": 0.5068965435028077, "step": 3130 }, { "epoch": 0.0031576052963163286, "grad_norm": 66.86932863486322, "learning_rate": 3.1575448703744735e-06, "loss": 2.1472, "mean_token_accuracy": 0.4655172348022461, "step": 3135 }, { "epoch": 0.0031626413494205015, "grad_norm": 56.911438747214845, "learning_rate": 3.162580827105534e-06, "loss": 2.1996, "mean_token_accuracy": 0.4896551609039307, "step": 3140 }, { "epoch": 0.003167677402524674, "grad_norm": 65.90434831968797, "learning_rate": 3.1676167838365932e-06, "loss": 2.2503, "mean_token_accuracy": 0.47931033968925474, "step": 3145 }, { "epoch": 0.003172713455628847, "grad_norm": 94.15538832461324, "learning_rate": 3.1726527405676533e-06, "loss": 2.1304, "mean_token_accuracy": 0.5172413766384125, "step": 3150 }, { "epoch": 0.00317774950873302, "grad_norm": 75.63183227983484, "learning_rate": 3.1776886972987126e-06, "loss": 2.1292, "mean_token_accuracy": 0.4896551728248596, "step": 3155 }, { "epoch": 0.0031827855618371923, "grad_norm": 67.18445741960737, "learning_rate": 3.1827246540297727e-06, "loss": 2.2006, "mean_token_accuracy": 0.4827586054801941, "step": 3160 }, { "epoch": 0.003187821614941365, "grad_norm": 72.00631080527997, "learning_rate": 3.1877606107608324e-06, "loss": 2.1479, "mean_token_accuracy": 0.4517241358757019, "step": 3165 }, { "epoch": 0.003192857668045538, "grad_norm": 69.13595140264927, "learning_rate": 3.1927965674918925e-06, "loss": 1.7245, "mean_token_accuracy": 0.5838669955730438, "step": 3170 }, { "epoch": 0.003197893721149711, "grad_norm": 71.10822797753556, "learning_rate": 3.1978325242229517e-06, "loss": 2.3047, "mean_token_accuracy": 0.47241378426551817, "step": 3175 }, { "epoch": 0.0032029297742538834, "grad_norm": 93.12915752237711, "learning_rate": 3.202868480954012e-06, "loss": 2.4167, "mean_token_accuracy": 0.3931034505367279, "step": 3180 }, { "epoch": 0.0032079658273580563, "grad_norm": 58.75058719579265, "learning_rate": 3.2079044376850715e-06, "loss": 2.2029, "mean_token_accuracy": 0.42068964838981626, "step": 3185 }, { "epoch": 0.0032130018804622292, "grad_norm": 78.23709193028031, "learning_rate": 3.2129403944161316e-06, "loss": 2.6463, "mean_token_accuracy": 0.3724137842655182, "step": 3190 }, { "epoch": 0.0032180379335664017, "grad_norm": 88.18487494198256, "learning_rate": 3.217976351147191e-06, "loss": 2.0882, "mean_token_accuracy": 0.48275862336158754, "step": 3195 }, { "epoch": 0.0032230739866705746, "grad_norm": 80.77723888561393, "learning_rate": 3.223012307878251e-06, "loss": 2.1852, "mean_token_accuracy": 0.4862068951129913, "step": 3200 }, { "epoch": 0.0032281100397747475, "grad_norm": 65.69255397408914, "learning_rate": 3.22804826460931e-06, "loss": 2.4897, "mean_token_accuracy": 0.41724138259887694, "step": 3205 }, { "epoch": 0.0032331460928789204, "grad_norm": 73.88061599403267, "learning_rate": 3.2330842213403703e-06, "loss": 2.2371, "mean_token_accuracy": 0.47586206197738645, "step": 3210 }, { "epoch": 0.003238182145983093, "grad_norm": 91.98232071006288, "learning_rate": 3.23812017807143e-06, "loss": 2.1787, "mean_token_accuracy": 0.48275862336158754, "step": 3215 }, { "epoch": 0.0032432181990872657, "grad_norm": 62.72222230252338, "learning_rate": 3.24315613480249e-06, "loss": 2.3648, "mean_token_accuracy": 0.42413793206214906, "step": 3220 }, { "epoch": 0.0032482542521914386, "grad_norm": 86.0604729106889, "learning_rate": 3.2481920915335493e-06, "loss": 2.5279, "mean_token_accuracy": 0.38965516686439516, "step": 3225 }, { "epoch": 0.003253290305295611, "grad_norm": 69.3939076142181, "learning_rate": 3.2532280482646094e-06, "loss": 2.2904, "mean_token_accuracy": 0.4849364757537842, "step": 3230 }, { "epoch": 0.003258326358399784, "grad_norm": 64.12567706361092, "learning_rate": 3.258264004995669e-06, "loss": 2.309, "mean_token_accuracy": 0.44827585816383364, "step": 3235 }, { "epoch": 0.003263362411503957, "grad_norm": 69.32636502894857, "learning_rate": 3.263299961726729e-06, "loss": 2.0678, "mean_token_accuracy": 0.4862069010734558, "step": 3240 }, { "epoch": 0.00326839846460813, "grad_norm": 79.91087498801657, "learning_rate": 3.2683359184577885e-06, "loss": 2.0761, "mean_token_accuracy": 0.5119782269001008, "step": 3245 }, { "epoch": 0.0032734345177123023, "grad_norm": 62.59537746986369, "learning_rate": 3.2733718751888486e-06, "loss": 2.4258, "mean_token_accuracy": 0.42413792610168455, "step": 3250 }, { "epoch": 0.003278470570816475, "grad_norm": 76.6221373052642, "learning_rate": 3.278407831919908e-06, "loss": 2.3818, "mean_token_accuracy": 0.42413792610168455, "step": 3255 }, { "epoch": 0.003283506623920648, "grad_norm": 102.34594839825111, "learning_rate": 3.283443788650968e-06, "loss": 1.9283, "mean_token_accuracy": 0.5068965435028077, "step": 3260 }, { "epoch": 0.0032885426770248205, "grad_norm": 74.47466952805327, "learning_rate": 3.2884797453820276e-06, "loss": 2.3048, "mean_token_accuracy": 0.43103448748588563, "step": 3265 }, { "epoch": 0.0032935787301289934, "grad_norm": 85.84855823735926, "learning_rate": 3.2935157021130877e-06, "loss": 2.008, "mean_token_accuracy": 0.47931034564971925, "step": 3270 }, { "epoch": 0.0032986147832331663, "grad_norm": 68.32688721306249, "learning_rate": 3.2985516588441478e-06, "loss": 2.4571, "mean_token_accuracy": 0.4344827592372894, "step": 3275 }, { "epoch": 0.003303650836337339, "grad_norm": 57.944642708698225, "learning_rate": 3.303587615575207e-06, "loss": 2.6626, "mean_token_accuracy": 0.3999999940395355, "step": 3280 }, { "epoch": 0.0033086868894415117, "grad_norm": 75.54629317907177, "learning_rate": 3.308623572306267e-06, "loss": 2.3377, "mean_token_accuracy": 0.4448275864124298, "step": 3285 }, { "epoch": 0.0033137229425456846, "grad_norm": 97.91341302005132, "learning_rate": 3.313659529037327e-06, "loss": 2.2696, "mean_token_accuracy": 0.42068966031074523, "step": 3290 }, { "epoch": 0.0033187589956498575, "grad_norm": 68.00506859333011, "learning_rate": 3.318695485768387e-06, "loss": 2.0894, "mean_token_accuracy": 0.46896551847457885, "step": 3295 }, { "epoch": 0.00332379504875403, "grad_norm": 81.44210177411483, "learning_rate": 3.323731442499446e-06, "loss": 2.0202, "mean_token_accuracy": 0.5103448271751404, "step": 3300 }, { "epoch": 0.003328831101858203, "grad_norm": 84.87738445515838, "learning_rate": 3.3287673992305063e-06, "loss": 2.0818, "mean_token_accuracy": 0.5019963622093201, "step": 3305 }, { "epoch": 0.0033338671549623757, "grad_norm": 67.4530211237485, "learning_rate": 3.3338033559615655e-06, "loss": 2.4397, "mean_token_accuracy": 0.4379310369491577, "step": 3310 }, { "epoch": 0.0033389032080665486, "grad_norm": 69.07746772961687, "learning_rate": 3.3388393126926256e-06, "loss": 2.104, "mean_token_accuracy": 0.46896551847457885, "step": 3315 }, { "epoch": 0.003343939261170721, "grad_norm": 101.65889637029338, "learning_rate": 3.3438752694236853e-06, "loss": 2.4903, "mean_token_accuracy": 0.4103448212146759, "step": 3320 }, { "epoch": 0.003348975314274894, "grad_norm": 122.39798118649, "learning_rate": 3.3489112261547454e-06, "loss": 2.2311, "mean_token_accuracy": 0.4931034445762634, "step": 3325 }, { "epoch": 0.003354011367379067, "grad_norm": 51.86216167236562, "learning_rate": 3.3539471828858046e-06, "loss": 1.9066, "mean_token_accuracy": 0.5310344815254211, "step": 3330 }, { "epoch": 0.0033590474204832394, "grad_norm": 69.06014636067414, "learning_rate": 3.3589831396168647e-06, "loss": 2.3247, "mean_token_accuracy": 0.4551724076271057, "step": 3335 }, { "epoch": 0.0033640834735874123, "grad_norm": 60.105566055871755, "learning_rate": 3.3640190963479244e-06, "loss": 2.5613, "mean_token_accuracy": 0.417241370677948, "step": 3340 }, { "epoch": 0.003369119526691585, "grad_norm": 67.18565868599282, "learning_rate": 3.3690550530789845e-06, "loss": 2.3156, "mean_token_accuracy": 0.3999999940395355, "step": 3345 }, { "epoch": 0.0033741555797957576, "grad_norm": 72.41177643014052, "learning_rate": 3.3740910098100438e-06, "loss": 2.1682, "mean_token_accuracy": 0.4862068951129913, "step": 3350 }, { "epoch": 0.0033791916328999305, "grad_norm": 71.12155487845098, "learning_rate": 3.379126966541104e-06, "loss": 2.3652, "mean_token_accuracy": 0.4344827473163605, "step": 3355 }, { "epoch": 0.0033842276860041034, "grad_norm": 54.85595735011981, "learning_rate": 3.384162923272163e-06, "loss": 2.306, "mean_token_accuracy": 0.42758620977401735, "step": 3360 }, { "epoch": 0.0033892637391082763, "grad_norm": 76.61154397660779, "learning_rate": 3.3891988800032236e-06, "loss": 2.1391, "mean_token_accuracy": 0.4675136208534241, "step": 3365 }, { "epoch": 0.0033942997922124488, "grad_norm": 68.67037474704775, "learning_rate": 3.394234836734283e-06, "loss": 2.2515, "mean_token_accuracy": 0.47931034564971925, "step": 3370 }, { "epoch": 0.0033993358453166217, "grad_norm": 52.73592751943477, "learning_rate": 3.399270793465343e-06, "loss": 2.2811, "mean_token_accuracy": 0.46896551847457885, "step": 3375 }, { "epoch": 0.0034043718984207946, "grad_norm": 66.85389688733349, "learning_rate": 3.4043067501964023e-06, "loss": 2.001, "mean_token_accuracy": 0.5034482777118683, "step": 3380 }, { "epoch": 0.003409407951524967, "grad_norm": 78.88779690444377, "learning_rate": 3.4093427069274624e-06, "loss": 2.4047, "mean_token_accuracy": 0.4724137902259827, "step": 3385 }, { "epoch": 0.00341444400462914, "grad_norm": 101.29439469265937, "learning_rate": 3.414378663658522e-06, "loss": 2.0217, "mean_token_accuracy": 0.5241379320621491, "step": 3390 }, { "epoch": 0.003419480057733313, "grad_norm": 106.62456854021953, "learning_rate": 3.419414620389582e-06, "loss": 1.9196, "mean_token_accuracy": 0.4862068831920624, "step": 3395 }, { "epoch": 0.0034245161108374857, "grad_norm": 73.26336701367737, "learning_rate": 3.4244505771206414e-06, "loss": 2.2365, "mean_token_accuracy": 0.4551724135875702, "step": 3400 }, { "epoch": 0.003429552163941658, "grad_norm": 74.67394340125233, "learning_rate": 3.4294865338517015e-06, "loss": 2.3358, "mean_token_accuracy": 0.42413793206214906, "step": 3405 }, { "epoch": 0.003434588217045831, "grad_norm": 68.53252070520398, "learning_rate": 3.434522490582761e-06, "loss": 2.3594, "mean_token_accuracy": 0.42413793206214906, "step": 3410 }, { "epoch": 0.003439624270150004, "grad_norm": 68.185013590259, "learning_rate": 3.4395584473138213e-06, "loss": 2.1012, "mean_token_accuracy": 0.48154870271682737, "step": 3415 }, { "epoch": 0.0034446603232541765, "grad_norm": 83.30862263246033, "learning_rate": 3.4445944040448805e-06, "loss": 2.1941, "mean_token_accuracy": 0.4620689630508423, "step": 3420 }, { "epoch": 0.0034496963763583494, "grad_norm": 104.99516517035869, "learning_rate": 3.4496303607759406e-06, "loss": 2.1823, "mean_token_accuracy": 0.48457351326942444, "step": 3425 }, { "epoch": 0.0034547324294625223, "grad_norm": 110.00124048542287, "learning_rate": 3.454666317507e-06, "loss": 2.4887, "mean_token_accuracy": 0.38965516686439516, "step": 3430 }, { "epoch": 0.003459768482566695, "grad_norm": 74.82133672955143, "learning_rate": 3.45970227423806e-06, "loss": 2.6637, "mean_token_accuracy": 0.3413793116807938, "step": 3435 }, { "epoch": 0.0034648045356708676, "grad_norm": 72.25224979625271, "learning_rate": 3.4647382309691196e-06, "loss": 2.1014, "mean_token_accuracy": 0.4896551609039307, "step": 3440 }, { "epoch": 0.0034698405887750405, "grad_norm": 108.94761765016044, "learning_rate": 3.4697741877001797e-06, "loss": 2.0139, "mean_token_accuracy": 0.482758617401123, "step": 3445 }, { "epoch": 0.0034748766418792134, "grad_norm": 79.01038491752975, "learning_rate": 3.474810144431239e-06, "loss": 1.9689, "mean_token_accuracy": 0.5068965435028077, "step": 3450 }, { "epoch": 0.003479912694983386, "grad_norm": 78.71801163458723, "learning_rate": 3.479846101162299e-06, "loss": 2.0985, "mean_token_accuracy": 0.5034482836723327, "step": 3455 }, { "epoch": 0.0034849487480875588, "grad_norm": 63.097794630809865, "learning_rate": 3.4848820578933588e-06, "loss": 2.1702, "mean_token_accuracy": 0.46551724076271056, "step": 3460 }, { "epoch": 0.0034899848011917317, "grad_norm": 76.1510617565935, "learning_rate": 3.489918014624419e-06, "loss": 2.1298, "mean_token_accuracy": 0.510344821214676, "step": 3465 }, { "epoch": 0.0034950208542959046, "grad_norm": 92.86291262024004, "learning_rate": 3.494953971355478e-06, "loss": 2.0496, "mean_token_accuracy": 0.4862068951129913, "step": 3470 }, { "epoch": 0.003500056907400077, "grad_norm": 95.99020443429825, "learning_rate": 3.4999899280865382e-06, "loss": 2.1456, "mean_token_accuracy": 0.4896551609039307, "step": 3475 }, { "epoch": 0.00350509296050425, "grad_norm": 82.6329914564027, "learning_rate": 3.5050258848175975e-06, "loss": 2.3636, "mean_token_accuracy": 0.4620689630508423, "step": 3480 }, { "epoch": 0.003510129013608423, "grad_norm": 61.11018049383657, "learning_rate": 3.5100618415486576e-06, "loss": 2.2933, "mean_token_accuracy": 0.42413792610168455, "step": 3485 }, { "epoch": 0.0035151650667125953, "grad_norm": 119.59082860991383, "learning_rate": 3.5150977982797173e-06, "loss": 1.9017, "mean_token_accuracy": 0.5206896424293518, "step": 3490 }, { "epoch": 0.003520201119816768, "grad_norm": 62.215343437407796, "learning_rate": 3.5201337550107774e-06, "loss": 2.232, "mean_token_accuracy": 0.4551724076271057, "step": 3495 }, { "epoch": 0.003525237172920941, "grad_norm": 98.74880231547037, "learning_rate": 3.5251697117418366e-06, "loss": 2.4658, "mean_token_accuracy": 0.4275861978530884, "step": 3500 }, { "epoch": 0.003530273226025114, "grad_norm": 86.30846794670663, "learning_rate": 3.5302056684728967e-06, "loss": 2.5153, "mean_token_accuracy": 0.40000000298023225, "step": 3505 }, { "epoch": 0.0035353092791292865, "grad_norm": 77.02887776073075, "learning_rate": 3.5352416252039564e-06, "loss": 2.6338, "mean_token_accuracy": 0.37586206793785093, "step": 3510 }, { "epoch": 0.0035403453322334594, "grad_norm": 91.40390753427455, "learning_rate": 3.5402775819350165e-06, "loss": 2.0753, "mean_token_accuracy": 0.5034482657909394, "step": 3515 }, { "epoch": 0.0035453813853376323, "grad_norm": 54.2541043009632, "learning_rate": 3.5453135386660757e-06, "loss": 2.1156, "mean_token_accuracy": 0.4655172348022461, "step": 3520 }, { "epoch": 0.0035504174384418047, "grad_norm": 67.84233516952993, "learning_rate": 3.550349495397136e-06, "loss": 2.1998, "mean_token_accuracy": 0.5, "step": 3525 }, { "epoch": 0.0035554534915459776, "grad_norm": 76.00835204515016, "learning_rate": 3.555385452128195e-06, "loss": 2.2102, "mean_token_accuracy": 0.47586206793785096, "step": 3530 }, { "epoch": 0.0035604895446501505, "grad_norm": 64.18721302787216, "learning_rate": 3.560421408859255e-06, "loss": 2.0036, "mean_token_accuracy": 0.4758620738983154, "step": 3535 }, { "epoch": 0.0035655255977543234, "grad_norm": 69.72603043250474, "learning_rate": 3.565457365590315e-06, "loss": 2.2701, "mean_token_accuracy": 0.4413793087005615, "step": 3540 }, { "epoch": 0.003570561650858496, "grad_norm": 49.336332319641855, "learning_rate": 3.570493322321375e-06, "loss": 2.2199, "mean_token_accuracy": 0.42413792610168455, "step": 3545 }, { "epoch": 0.0035755977039626688, "grad_norm": 102.97511347272919, "learning_rate": 3.5755292790524342e-06, "loss": 2.1684, "mean_token_accuracy": 0.48275861144065857, "step": 3550 }, { "epoch": 0.0035806337570668417, "grad_norm": 75.69131383661562, "learning_rate": 3.5805652357834943e-06, "loss": 2.1375, "mean_token_accuracy": 0.48965518474578856, "step": 3555 }, { "epoch": 0.003585669810171014, "grad_norm": 85.29764511331626, "learning_rate": 3.585601192514554e-06, "loss": 1.9744, "mean_token_accuracy": 0.5034482777118683, "step": 3560 }, { "epoch": 0.003590705863275187, "grad_norm": 63.289273927785565, "learning_rate": 3.590637149245614e-06, "loss": 1.9565, "mean_token_accuracy": 0.47586206197738645, "step": 3565 }, { "epoch": 0.00359574191637936, "grad_norm": 80.30134192622461, "learning_rate": 3.5956731059766733e-06, "loss": 2.1315, "mean_token_accuracy": 0.48275862336158754, "step": 3570 }, { "epoch": 0.0036007779694835324, "grad_norm": 50.647434265493764, "learning_rate": 3.6007090627077334e-06, "loss": 2.1857, "mean_token_accuracy": 0.4676345944404602, "step": 3575 }, { "epoch": 0.0036058140225877053, "grad_norm": 65.72498649601756, "learning_rate": 3.6057450194387927e-06, "loss": 2.4623, "mean_token_accuracy": 0.42510586977005005, "step": 3580 }, { "epoch": 0.003610850075691878, "grad_norm": 94.02834505436228, "learning_rate": 3.610780976169853e-06, "loss": 2.4245, "mean_token_accuracy": 0.43793103098869324, "step": 3585 }, { "epoch": 0.003615886128796051, "grad_norm": 74.30497161061314, "learning_rate": 3.6158169329009125e-06, "loss": 2.0252, "mean_token_accuracy": 0.49655172824859617, "step": 3590 }, { "epoch": 0.0036209221819002236, "grad_norm": 47.520490622559606, "learning_rate": 3.6208528896319726e-06, "loss": 2.4116, "mean_token_accuracy": 0.4241379380226135, "step": 3595 }, { "epoch": 0.0036259582350043965, "grad_norm": 63.18103921688582, "learning_rate": 3.625888846363032e-06, "loss": 2.6274, "mean_token_accuracy": 0.4034482777118683, "step": 3600 }, { "epoch": 0.0036309942881085694, "grad_norm": 71.20204844074186, "learning_rate": 3.630924803094092e-06, "loss": 1.8998, "mean_token_accuracy": 0.5206896543502808, "step": 3605 }, { "epoch": 0.003636030341212742, "grad_norm": 86.46180197318455, "learning_rate": 3.6359607598251516e-06, "loss": 2.3773, "mean_token_accuracy": 0.458620685338974, "step": 3610 }, { "epoch": 0.0036410663943169147, "grad_norm": 68.88902774157108, "learning_rate": 3.6409967165562117e-06, "loss": 2.54, "mean_token_accuracy": 0.4034482717514038, "step": 3615 }, { "epoch": 0.0036461024474210876, "grad_norm": 84.03757858949756, "learning_rate": 3.646032673287271e-06, "loss": 1.9773, "mean_token_accuracy": 0.5127646744251251, "step": 3620 }, { "epoch": 0.0036511385005252605, "grad_norm": 98.45081092487356, "learning_rate": 3.651068630018331e-06, "loss": 2.6486, "mean_token_accuracy": 0.3793103456497192, "step": 3625 }, { "epoch": 0.003656174553629433, "grad_norm": 65.57839039921986, "learning_rate": 3.6561045867493903e-06, "loss": 2.2012, "mean_token_accuracy": 0.46376285552978513, "step": 3630 }, { "epoch": 0.003661210606733606, "grad_norm": 48.663535680537514, "learning_rate": 3.661140543480451e-06, "loss": 1.9311, "mean_token_accuracy": 0.5689655184745789, "step": 3635 }, { "epoch": 0.0036662466598377788, "grad_norm": 71.542719255767, "learning_rate": 3.66617650021151e-06, "loss": 1.9894, "mean_token_accuracy": 0.4896551728248596, "step": 3640 }, { "epoch": 0.0036712827129419512, "grad_norm": 80.30064279528494, "learning_rate": 3.67121245694257e-06, "loss": 2.3826, "mean_token_accuracy": 0.4551724076271057, "step": 3645 }, { "epoch": 0.003676318766046124, "grad_norm": 58.98396996137691, "learning_rate": 3.6762484136736294e-06, "loss": 2.1597, "mean_token_accuracy": 0.48771928548812865, "step": 3650 }, { "epoch": 0.003681354819150297, "grad_norm": 219.89255193600903, "learning_rate": 3.6812843704046895e-06, "loss": 2.3411, "mean_token_accuracy": 0.43103448748588563, "step": 3655 }, { "epoch": 0.00368639087225447, "grad_norm": 64.68524497717114, "learning_rate": 3.6863203271357492e-06, "loss": 2.1244, "mean_token_accuracy": 0.4760435581207275, "step": 3660 }, { "epoch": 0.0036914269253586424, "grad_norm": 95.24641271260893, "learning_rate": 3.6913562838668093e-06, "loss": 2.3517, "mean_token_accuracy": 0.4551724135875702, "step": 3665 }, { "epoch": 0.0036964629784628153, "grad_norm": 62.979719804286965, "learning_rate": 3.6963922405978686e-06, "loss": 2.4041, "mean_token_accuracy": 0.40000000298023225, "step": 3670 }, { "epoch": 0.003701499031566988, "grad_norm": 69.66840206666882, "learning_rate": 3.7014281973289287e-06, "loss": 2.1918, "mean_token_accuracy": 0.43793103098869324, "step": 3675 }, { "epoch": 0.0037065350846711607, "grad_norm": 66.95460023221484, "learning_rate": 3.7064641540599883e-06, "loss": 2.5266, "mean_token_accuracy": 0.3793103516101837, "step": 3680 }, { "epoch": 0.0037115711377753336, "grad_norm": 49.14894012709388, "learning_rate": 3.7115001107910484e-06, "loss": 2.2411, "mean_token_accuracy": 0.4931034564971924, "step": 3685 }, { "epoch": 0.0037166071908795065, "grad_norm": 76.64945081703381, "learning_rate": 3.7165360675221085e-06, "loss": 2.2674, "mean_token_accuracy": 0.4620689570903778, "step": 3690 }, { "epoch": 0.0037216432439836794, "grad_norm": 72.07156586487014, "learning_rate": 3.721572024253168e-06, "loss": 2.1469, "mean_token_accuracy": 0.4517241418361664, "step": 3695 }, { "epoch": 0.003726679297087852, "grad_norm": 55.83660019268889, "learning_rate": 3.726607980984228e-06, "loss": 2.0973, "mean_token_accuracy": 0.47241379618644713, "step": 3700 }, { "epoch": 0.0037317153501920247, "grad_norm": 85.50471241152457, "learning_rate": 3.731643937715287e-06, "loss": 2.258, "mean_token_accuracy": 0.4586206912994385, "step": 3705 }, { "epoch": 0.0037367514032961976, "grad_norm": 74.0609858646388, "learning_rate": 3.7366798944463473e-06, "loss": 2.2093, "mean_token_accuracy": 0.5068965554237366, "step": 3710 }, { "epoch": 0.00374178745640037, "grad_norm": 103.45310432229822, "learning_rate": 3.741715851177407e-06, "loss": 2.6597, "mean_token_accuracy": 0.4068965554237366, "step": 3715 }, { "epoch": 0.003746823509504543, "grad_norm": 98.12325393792392, "learning_rate": 3.746751807908467e-06, "loss": 2.4466, "mean_token_accuracy": 0.4448275864124298, "step": 3720 }, { "epoch": 0.003751859562608716, "grad_norm": 57.1653678469413, "learning_rate": 3.7517877646395263e-06, "loss": 2.0738, "mean_token_accuracy": 0.4862068951129913, "step": 3725 }, { "epoch": 0.0037568956157128888, "grad_norm": 70.8981827875069, "learning_rate": 3.7568237213705864e-06, "loss": 1.9772, "mean_token_accuracy": 0.49655171632766726, "step": 3730 }, { "epoch": 0.0037619316688170612, "grad_norm": 61.30132774505522, "learning_rate": 3.761859678101646e-06, "loss": 2.6244, "mean_token_accuracy": 0.4, "step": 3735 }, { "epoch": 0.003766967721921234, "grad_norm": 67.09862478096984, "learning_rate": 3.766895634832706e-06, "loss": 2.0188, "mean_token_accuracy": 0.5172413766384125, "step": 3740 }, { "epoch": 0.003772003775025407, "grad_norm": 66.351129040528, "learning_rate": 3.7719315915637654e-06, "loss": 2.3546, "mean_token_accuracy": 0.3999999940395355, "step": 3745 }, { "epoch": 0.0037770398281295795, "grad_norm": 68.18041714322693, "learning_rate": 3.7769675482948255e-06, "loss": 2.1001, "mean_token_accuracy": 0.4931034445762634, "step": 3750 }, { "epoch": 0.0037820758812337524, "grad_norm": 72.55500172162084, "learning_rate": 3.7820035050258848e-06, "loss": 2.3197, "mean_token_accuracy": 0.40689654350280763, "step": 3755 }, { "epoch": 0.0037871119343379253, "grad_norm": 62.17451142262177, "learning_rate": 3.787039461756945e-06, "loss": 2.176, "mean_token_accuracy": 0.4689655125141144, "step": 3760 }, { "epoch": 0.003792147987442098, "grad_norm": 68.48842259879379, "learning_rate": 3.7920754184880045e-06, "loss": 2.1518, "mean_token_accuracy": 0.4724137902259827, "step": 3765 }, { "epoch": 0.0037971840405462707, "grad_norm": 89.95885898653464, "learning_rate": 3.7971113752190646e-06, "loss": 2.1153, "mean_token_accuracy": 0.4706593990325928, "step": 3770 }, { "epoch": 0.0038022200936504436, "grad_norm": 52.615998656096316, "learning_rate": 3.802147331950124e-06, "loss": 2.5564, "mean_token_accuracy": 0.4258318245410919, "step": 3775 }, { "epoch": 0.0038072561467546165, "grad_norm": 62.07442076895445, "learning_rate": 3.807183288681184e-06, "loss": 1.8344, "mean_token_accuracy": 0.5160919547080993, "step": 3780 }, { "epoch": 0.003812292199858789, "grad_norm": 61.03177882638895, "learning_rate": 3.8122192454122437e-06, "loss": 2.0359, "mean_token_accuracy": 0.5, "step": 3785 }, { "epoch": 0.003817328252962962, "grad_norm": 85.74588906043324, "learning_rate": 3.817255202143303e-06, "loss": 2.3117, "mean_token_accuracy": 0.45517241954803467, "step": 3790 }, { "epoch": 0.0038223643060671347, "grad_norm": 89.95195355878391, "learning_rate": 3.8222911588743634e-06, "loss": 2.1961, "mean_token_accuracy": 0.4448275864124298, "step": 3795 }, { "epoch": 0.003827400359171307, "grad_norm": 71.4226892060403, "learning_rate": 3.8273271156054235e-06, "loss": 2.1332, "mean_token_accuracy": 0.48275862336158754, "step": 3800 }, { "epoch": 0.00383243641227548, "grad_norm": 75.86783507205627, "learning_rate": 3.832363072336483e-06, "loss": 2.4997, "mean_token_accuracy": 0.41724138259887694, "step": 3805 }, { "epoch": 0.003837472465379653, "grad_norm": 74.3842085733325, "learning_rate": 3.837399029067543e-06, "loss": 2.2921, "mean_token_accuracy": 0.44621899724006653, "step": 3810 }, { "epoch": 0.003842508518483826, "grad_norm": 59.76921710712986, "learning_rate": 3.842434985798602e-06, "loss": 2.0702, "mean_token_accuracy": 0.5034482777118683, "step": 3815 }, { "epoch": 0.0038475445715879983, "grad_norm": 79.43928032912176, "learning_rate": 3.847470942529662e-06, "loss": 2.3318, "mean_token_accuracy": 0.4482758641242981, "step": 3820 }, { "epoch": 0.0038525806246921712, "grad_norm": 73.53894426401041, "learning_rate": 3.8525068992607215e-06, "loss": 2.0034, "mean_token_accuracy": 0.4896551728248596, "step": 3825 }, { "epoch": 0.003857616677796344, "grad_norm": 90.13419870256526, "learning_rate": 3.857542855991782e-06, "loss": 2.5503, "mean_token_accuracy": 0.3999999940395355, "step": 3830 }, { "epoch": 0.0038626527309005166, "grad_norm": 71.5623849820841, "learning_rate": 3.862578812722841e-06, "loss": 2.1522, "mean_token_accuracy": 0.47931033968925474, "step": 3835 }, { "epoch": 0.0038676887840046895, "grad_norm": 88.93229072009744, "learning_rate": 3.867614769453901e-06, "loss": 2.3312, "mean_token_accuracy": 0.4482758641242981, "step": 3840 }, { "epoch": 0.0038727248371088624, "grad_norm": 95.57253395168001, "learning_rate": 3.872650726184961e-06, "loss": 2.3891, "mean_token_accuracy": 0.4103448212146759, "step": 3845 }, { "epoch": 0.0038777608902130353, "grad_norm": 99.53244641082048, "learning_rate": 3.877686682916021e-06, "loss": 2.1269, "mean_token_accuracy": 0.4724137902259827, "step": 3850 }, { "epoch": 0.0038827969433172078, "grad_norm": 66.32690974374738, "learning_rate": 3.88272263964708e-06, "loss": 2.3929, "mean_token_accuracy": 0.4068965554237366, "step": 3855 }, { "epoch": 0.0038878329964213807, "grad_norm": 72.58499868972568, "learning_rate": 3.8877585963781405e-06, "loss": 2.2971, "mean_token_accuracy": 0.4517241358757019, "step": 3860 }, { "epoch": 0.0038928690495255536, "grad_norm": 81.1126693490413, "learning_rate": 3.8927945531092e-06, "loss": 2.3377, "mean_token_accuracy": 0.44343616962432864, "step": 3865 }, { "epoch": 0.003897905102629726, "grad_norm": 80.47860055164507, "learning_rate": 3.89783050984026e-06, "loss": 2.3569, "mean_token_accuracy": 0.4310344696044922, "step": 3870 }, { "epoch": 0.003902941155733899, "grad_norm": 80.35359476634429, "learning_rate": 3.902866466571319e-06, "loss": 2.3744, "mean_token_accuracy": 0.4344827473163605, "step": 3875 }, { "epoch": 0.003907977208838072, "grad_norm": 48.78239038260757, "learning_rate": 3.907902423302379e-06, "loss": 1.9804, "mean_token_accuracy": 0.5151477813720703, "step": 3880 }, { "epoch": 0.003913013261942245, "grad_norm": 62.476966785250376, "learning_rate": 3.9129383800334385e-06, "loss": 1.729, "mean_token_accuracy": 0.558620685338974, "step": 3885 }, { "epoch": 0.003918049315046418, "grad_norm": 65.41113658727998, "learning_rate": 3.9179743367644986e-06, "loss": 2.0854, "mean_token_accuracy": 0.48965516686439514, "step": 3890 }, { "epoch": 0.00392308536815059, "grad_norm": 74.59678459847665, "learning_rate": 3.923010293495559e-06, "loss": 2.264, "mean_token_accuracy": 0.4517241358757019, "step": 3895 }, { "epoch": 0.0039281214212547625, "grad_norm": 58.99977571026314, "learning_rate": 3.928046250226619e-06, "loss": 2.1787, "mean_token_accuracy": 0.47586206197738645, "step": 3900 }, { "epoch": 0.0039331574743589354, "grad_norm": 48.427653762972085, "learning_rate": 3.933082206957678e-06, "loss": 2.2662, "mean_token_accuracy": 0.482758617401123, "step": 3905 }, { "epoch": 0.003938193527463108, "grad_norm": 58.61050058841118, "learning_rate": 3.938118163688738e-06, "loss": 2.0359, "mean_token_accuracy": 0.46896551847457885, "step": 3910 }, { "epoch": 0.003943229580567281, "grad_norm": 56.57091735569473, "learning_rate": 3.943154120419797e-06, "loss": 2.3083, "mean_token_accuracy": 0.47586206793785096, "step": 3915 }, { "epoch": 0.003948265633671454, "grad_norm": 66.3296435475827, "learning_rate": 3.9481900771508575e-06, "loss": 2.4199, "mean_token_accuracy": 0.4068965554237366, "step": 3920 }, { "epoch": 0.003953301686775627, "grad_norm": 54.75842624216521, "learning_rate": 3.953226033881917e-06, "loss": 2.2852, "mean_token_accuracy": 0.4724137902259827, "step": 3925 }, { "epoch": 0.003958337739879799, "grad_norm": 87.54273533048352, "learning_rate": 3.958261990612977e-06, "loss": 2.2838, "mean_token_accuracy": 0.42758620381355283, "step": 3930 }, { "epoch": 0.003963373792983972, "grad_norm": 65.62266502700778, "learning_rate": 3.963297947344036e-06, "loss": 2.2713, "mean_token_accuracy": 0.4206896543502808, "step": 3935 }, { "epoch": 0.003968409846088145, "grad_norm": 53.90634618729618, "learning_rate": 3.968333904075096e-06, "loss": 2.1689, "mean_token_accuracy": 0.48451300859451296, "step": 3940 }, { "epoch": 0.003973445899192318, "grad_norm": 99.00851267101648, "learning_rate": 3.973369860806156e-06, "loss": 2.1535, "mean_token_accuracy": 0.47931034564971925, "step": 3945 }, { "epoch": 0.003978481952296491, "grad_norm": 84.25507492981015, "learning_rate": 3.978405817537216e-06, "loss": 2.2916, "mean_token_accuracy": 0.4551724135875702, "step": 3950 }, { "epoch": 0.0039835180054006636, "grad_norm": 62.56338182840301, "learning_rate": 3.983441774268276e-06, "loss": 2.3058, "mean_token_accuracy": 0.42413792610168455, "step": 3955 }, { "epoch": 0.0039885540585048364, "grad_norm": 62.169913336503825, "learning_rate": 3.988477730999336e-06, "loss": 2.0497, "mean_token_accuracy": 0.5034482777118683, "step": 3960 }, { "epoch": 0.0039935901116090085, "grad_norm": 76.66567210340112, "learning_rate": 3.993513687730395e-06, "loss": 2.5195, "mean_token_accuracy": 0.4137930989265442, "step": 3965 }, { "epoch": 0.003998626164713181, "grad_norm": 62.606570648472726, "learning_rate": 3.998549644461455e-06, "loss": 2.1871, "mean_token_accuracy": 0.4965517222881317, "step": 3970 }, { "epoch": 0.004003662217817354, "grad_norm": 86.38547380472534, "learning_rate": 4.003585601192514e-06, "loss": 2.3424, "mean_token_accuracy": 0.4413793087005615, "step": 3975 }, { "epoch": 0.004008698270921527, "grad_norm": 117.61303072590046, "learning_rate": 4.0086215579235744e-06, "loss": 2.2574, "mean_token_accuracy": 0.4379310369491577, "step": 3980 }, { "epoch": 0.0040137343240257, "grad_norm": 99.93116454487797, "learning_rate": 4.013657514654634e-06, "loss": 2.2737, "mean_token_accuracy": 0.44827585816383364, "step": 3985 }, { "epoch": 0.004018770377129873, "grad_norm": 64.85471790719244, "learning_rate": 4.018693471385694e-06, "loss": 2.0841, "mean_token_accuracy": 0.5083743929862976, "step": 3990 }, { "epoch": 0.004023806430234046, "grad_norm": 78.67211476879521, "learning_rate": 4.023729428116754e-06, "loss": 2.25, "mean_token_accuracy": 0.44827587008476255, "step": 3995 }, { "epoch": 0.004028842483338218, "grad_norm": 77.65505798090179, "learning_rate": 4.028765384847814e-06, "loss": 2.373, "mean_token_accuracy": 0.46896551847457885, "step": 4000 }, { "epoch": 0.004033878536442391, "grad_norm": 74.0153516668006, "learning_rate": 4.033801341578873e-06, "loss": 1.8359, "mean_token_accuracy": 0.5482391357421875, "step": 4005 }, { "epoch": 0.004038914589546564, "grad_norm": 82.77301271541326, "learning_rate": 4.038837298309933e-06, "loss": 2.3718, "mean_token_accuracy": 0.37586206793785093, "step": 4010 }, { "epoch": 0.004043950642650737, "grad_norm": 63.0783373081286, "learning_rate": 4.043873255040993e-06, "loss": 2.2047, "mean_token_accuracy": 0.4774954617023468, "step": 4015 }, { "epoch": 0.0040489866957549095, "grad_norm": 62.26969492820148, "learning_rate": 4.048909211772053e-06, "loss": 2.2346, "mean_token_accuracy": 0.47241379618644713, "step": 4020 }, { "epoch": 0.004054022748859082, "grad_norm": 95.70289653640307, "learning_rate": 4.053945168503112e-06, "loss": 2.2305, "mean_token_accuracy": 0.4637023627758026, "step": 4025 }, { "epoch": 0.004059058801963255, "grad_norm": 88.17929760264013, "learning_rate": 4.058981125234172e-06, "loss": 2.1404, "mean_token_accuracy": 0.46551724672317507, "step": 4030 }, { "epoch": 0.004064094855067427, "grad_norm": 72.68733610313816, "learning_rate": 4.064017081965231e-06, "loss": 2.1502, "mean_token_accuracy": 0.46551724076271056, "step": 4035 }, { "epoch": 0.0040691309081716, "grad_norm": 49.552751477214244, "learning_rate": 4.069053038696291e-06, "loss": 2.2424, "mean_token_accuracy": 0.441379314661026, "step": 4040 }, { "epoch": 0.004074166961275773, "grad_norm": 68.25887709540862, "learning_rate": 4.0740889954273515e-06, "loss": 2.311, "mean_token_accuracy": 0.42758620381355283, "step": 4045 }, { "epoch": 0.004079203014379946, "grad_norm": 56.62278606811682, "learning_rate": 4.079124952158412e-06, "loss": 2.3819, "mean_token_accuracy": 0.4551724135875702, "step": 4050 }, { "epoch": 0.004084239067484119, "grad_norm": 69.46349583210451, "learning_rate": 4.084160908889471e-06, "loss": 2.1256, "mean_token_accuracy": 0.5000000059604645, "step": 4055 }, { "epoch": 0.004089275120588292, "grad_norm": 88.98933765721326, "learning_rate": 4.089196865620531e-06, "loss": 2.3185, "mean_token_accuracy": 0.41724138259887694, "step": 4060 }, { "epoch": 0.004094311173692465, "grad_norm": 84.68266311921388, "learning_rate": 4.09423282235159e-06, "loss": 2.3945, "mean_token_accuracy": 0.4344827651977539, "step": 4065 }, { "epoch": 0.004099347226796637, "grad_norm": 89.7442286741659, "learning_rate": 4.09926877908265e-06, "loss": 2.0471, "mean_token_accuracy": 0.5310344755649566, "step": 4070 }, { "epoch": 0.00410438327990081, "grad_norm": 59.5009210643266, "learning_rate": 4.1043047358137096e-06, "loss": 2.3767, "mean_token_accuracy": 0.4, "step": 4075 }, { "epoch": 0.0041094193330049825, "grad_norm": 108.58049568685225, "learning_rate": 4.10934069254477e-06, "loss": 2.0846, "mean_token_accuracy": 0.4517241358757019, "step": 4080 }, { "epoch": 0.0041144553861091554, "grad_norm": 108.38210125411696, "learning_rate": 4.114376649275829e-06, "loss": 2.4153, "mean_token_accuracy": 0.4448275864124298, "step": 4085 }, { "epoch": 0.004119491439213328, "grad_norm": 67.85716210742117, "learning_rate": 4.119412606006889e-06, "loss": 2.326, "mean_token_accuracy": 0.4517241358757019, "step": 4090 }, { "epoch": 0.004124527492317501, "grad_norm": 79.18843431472932, "learning_rate": 4.124448562737949e-06, "loss": 2.4928, "mean_token_accuracy": 0.42413793206214906, "step": 4095 }, { "epoch": 0.004129563545421674, "grad_norm": 63.522685747569085, "learning_rate": 4.129484519469009e-06, "loss": 2.4672, "mean_token_accuracy": 0.43103447556495667, "step": 4100 }, { "epoch": 0.004134599598525846, "grad_norm": 115.60913548148612, "learning_rate": 4.134520476200069e-06, "loss": 2.2976, "mean_token_accuracy": 0.46551724076271056, "step": 4105 }, { "epoch": 0.004139635651630019, "grad_norm": 59.60227736143606, "learning_rate": 4.1395564329311286e-06, "loss": 2.0141, "mean_token_accuracy": 0.5206896543502808, "step": 4110 }, { "epoch": 0.004144671704734192, "grad_norm": 58.75462543465369, "learning_rate": 4.144592389662189e-06, "loss": 2.1149, "mean_token_accuracy": 0.46551724076271056, "step": 4115 }, { "epoch": 0.004149707757838365, "grad_norm": 86.59490868521627, "learning_rate": 4.149628346393248e-06, "loss": 2.4158, "mean_token_accuracy": 0.46551724672317507, "step": 4120 }, { "epoch": 0.004154743810942538, "grad_norm": 78.14889020209608, "learning_rate": 4.154664303124308e-06, "loss": 2.3572, "mean_token_accuracy": 0.4206896543502808, "step": 4125 }, { "epoch": 0.004159779864046711, "grad_norm": 71.21794193286092, "learning_rate": 4.159700259855367e-06, "loss": 2.23, "mean_token_accuracy": 0.482758617401123, "step": 4130 }, { "epoch": 0.0041648159171508835, "grad_norm": 76.80361300018873, "learning_rate": 4.164736216586427e-06, "loss": 2.0961, "mean_token_accuracy": 0.4931034505367279, "step": 4135 }, { "epoch": 0.004169851970255056, "grad_norm": 70.3361635992732, "learning_rate": 4.169772173317487e-06, "loss": 2.1367, "mean_token_accuracy": 0.4931034564971924, "step": 4140 }, { "epoch": 0.0041748880233592285, "grad_norm": 79.17235931033579, "learning_rate": 4.174808130048547e-06, "loss": 2.1642, "mean_token_accuracy": 0.4517241358757019, "step": 4145 }, { "epoch": 0.004179924076463401, "grad_norm": 56.08195935616169, "learning_rate": 4.179844086779607e-06, "loss": 2.3033, "mean_token_accuracy": 0.42758620381355283, "step": 4150 }, { "epoch": 0.004184960129567574, "grad_norm": 65.64879341776756, "learning_rate": 4.184880043510667e-06, "loss": 2.2199, "mean_token_accuracy": 0.482758617401123, "step": 4155 }, { "epoch": 0.004189996182671747, "grad_norm": 64.95543229406819, "learning_rate": 4.189916000241726e-06, "loss": 2.2161, "mean_token_accuracy": 0.46896551847457885, "step": 4160 }, { "epoch": 0.00419503223577592, "grad_norm": 65.43334328689735, "learning_rate": 4.194951956972786e-06, "loss": 2.4811, "mean_token_accuracy": 0.4610405325889587, "step": 4165 }, { "epoch": 0.004200068288880093, "grad_norm": 97.81663041597253, "learning_rate": 4.1999879137038455e-06, "loss": 2.3968, "mean_token_accuracy": 0.44482759237289426, "step": 4170 }, { "epoch": 0.004205104341984265, "grad_norm": 57.72628417556213, "learning_rate": 4.205023870434906e-06, "loss": 2.3253, "mean_token_accuracy": 0.4568058133125305, "step": 4175 }, { "epoch": 0.004210140395088438, "grad_norm": 49.56207392096942, "learning_rate": 4.210059827165965e-06, "loss": 1.9694, "mean_token_accuracy": 0.5034482777118683, "step": 4180 }, { "epoch": 0.004215176448192611, "grad_norm": 62.83170327084586, "learning_rate": 4.215095783897025e-06, "loss": 1.9727, "mean_token_accuracy": 0.534482765197754, "step": 4185 }, { "epoch": 0.004220212501296784, "grad_norm": 62.69438576282593, "learning_rate": 4.220131740628084e-06, "loss": 2.1881, "mean_token_accuracy": 0.4896551787853241, "step": 4190 }, { "epoch": 0.004225248554400957, "grad_norm": 91.54571706784118, "learning_rate": 4.225167697359144e-06, "loss": 2.308, "mean_token_accuracy": 0.44337567687034607, "step": 4195 }, { "epoch": 0.0042302846075051295, "grad_norm": 69.24260088954128, "learning_rate": 4.2302036540902044e-06, "loss": 2.0292, "mean_token_accuracy": 0.4620689630508423, "step": 4200 }, { "epoch": 0.004235320660609302, "grad_norm": 57.414343945959466, "learning_rate": 4.2352396108212645e-06, "loss": 2.5571, "mean_token_accuracy": 0.3793103456497192, "step": 4205 }, { "epoch": 0.004240356713713474, "grad_norm": 58.793211970168, "learning_rate": 4.240275567552324e-06, "loss": 2.2745, "mean_token_accuracy": 0.47586206793785096, "step": 4210 }, { "epoch": 0.004245392766817647, "grad_norm": 65.17152410248484, "learning_rate": 4.245311524283384e-06, "loss": 2.4349, "mean_token_accuracy": 0.4034482777118683, "step": 4215 }, { "epoch": 0.00425042881992182, "grad_norm": 72.72687356845236, "learning_rate": 4.250347481014443e-06, "loss": 2.1792, "mean_token_accuracy": 0.4724137902259827, "step": 4220 }, { "epoch": 0.004255464873025993, "grad_norm": 62.49065108487903, "learning_rate": 4.255383437745503e-06, "loss": 2.3734, "mean_token_accuracy": 0.441379314661026, "step": 4225 }, { "epoch": 0.004260500926130166, "grad_norm": 54.81935974666083, "learning_rate": 4.2604193944765625e-06, "loss": 1.9219, "mean_token_accuracy": 0.48275861144065857, "step": 4230 }, { "epoch": 0.004265536979234339, "grad_norm": 68.19630894776515, "learning_rate": 4.265455351207623e-06, "loss": 2.3024, "mean_token_accuracy": 0.4448275864124298, "step": 4235 }, { "epoch": 0.004270573032338512, "grad_norm": 59.49425627894097, "learning_rate": 4.270491307938682e-06, "loss": 2.2641, "mean_token_accuracy": 0.42758620381355283, "step": 4240 }, { "epoch": 0.004275609085442684, "grad_norm": 76.29664193763357, "learning_rate": 4.275527264669743e-06, "loss": 2.3725, "mean_token_accuracy": 0.4344827592372894, "step": 4245 }, { "epoch": 0.004280645138546857, "grad_norm": 57.16522446964406, "learning_rate": 4.280563221400802e-06, "loss": 2.2484, "mean_token_accuracy": 0.42068966031074523, "step": 4250 }, { "epoch": 0.00428568119165103, "grad_norm": 48.86394236491711, "learning_rate": 4.285599178131862e-06, "loss": 2.2847, "mean_token_accuracy": 0.44827587008476255, "step": 4255 }, { "epoch": 0.0042907172447552025, "grad_norm": 68.95637394188687, "learning_rate": 4.290635134862921e-06, "loss": 2.2714, "mean_token_accuracy": 0.43974592089653014, "step": 4260 }, { "epoch": 0.004295753297859375, "grad_norm": 75.88635881864354, "learning_rate": 4.2956710915939815e-06, "loss": 2.4482, "mean_token_accuracy": 0.44137930274009707, "step": 4265 }, { "epoch": 0.004300789350963548, "grad_norm": 66.10308244526328, "learning_rate": 4.300707048325041e-06, "loss": 2.1913, "mean_token_accuracy": 0.4448275864124298, "step": 4270 }, { "epoch": 0.004305825404067721, "grad_norm": 90.8791867607473, "learning_rate": 4.305743005056101e-06, "loss": 2.3027, "mean_token_accuracy": 0.42758620977401735, "step": 4275 }, { "epoch": 0.004310861457171893, "grad_norm": 59.049802787426515, "learning_rate": 4.31077896178716e-06, "loss": 2.5168, "mean_token_accuracy": 0.46551724076271056, "step": 4280 }, { "epoch": 0.004315897510276066, "grad_norm": 69.93615957051077, "learning_rate": 4.31581491851822e-06, "loss": 1.9633, "mean_token_accuracy": 0.5223230361938477, "step": 4285 }, { "epoch": 0.004320933563380239, "grad_norm": 67.34699780506475, "learning_rate": 4.32085087524928e-06, "loss": 2.364, "mean_token_accuracy": 0.4379310369491577, "step": 4290 }, { "epoch": 0.004325969616484412, "grad_norm": 65.20172878460436, "learning_rate": 4.32588683198034e-06, "loss": 2.1659, "mean_token_accuracy": 0.4709618926048279, "step": 4295 }, { "epoch": 0.004331005669588585, "grad_norm": 49.160352658060376, "learning_rate": 4.3309227887114e-06, "loss": 2.4155, "mean_token_accuracy": 0.4, "step": 4300 }, { "epoch": 0.004336041722692758, "grad_norm": 67.83562963406249, "learning_rate": 4.33595874544246e-06, "loss": 2.1095, "mean_token_accuracy": 0.47931033968925474, "step": 4305 }, { "epoch": 0.004341077775796931, "grad_norm": 63.95533170846425, "learning_rate": 4.340994702173519e-06, "loss": 1.9979, "mean_token_accuracy": 0.46896551847457885, "step": 4310 }, { "epoch": 0.004346113828901103, "grad_norm": 75.13482461195716, "learning_rate": 4.346030658904579e-06, "loss": 2.4099, "mean_token_accuracy": 0.39655172228813174, "step": 4315 }, { "epoch": 0.004351149882005276, "grad_norm": 55.339938507732924, "learning_rate": 4.351066615635638e-06, "loss": 2.3664, "mean_token_accuracy": 0.42758620977401735, "step": 4320 }, { "epoch": 0.0043561859351094485, "grad_norm": 55.65167695804991, "learning_rate": 4.3561025723666985e-06, "loss": 2.3625, "mean_token_accuracy": 0.42758620381355283, "step": 4325 }, { "epoch": 0.004361221988213621, "grad_norm": 98.10746909641446, "learning_rate": 4.361138529097758e-06, "loss": 2.4246, "mean_token_accuracy": 0.4638838529586792, "step": 4330 }, { "epoch": 0.004366258041317794, "grad_norm": 65.04298626807085, "learning_rate": 4.366174485828818e-06, "loss": 1.9045, "mean_token_accuracy": 0.4862068951129913, "step": 4335 }, { "epoch": 0.004371294094421967, "grad_norm": 43.4520912677257, "learning_rate": 4.371210442559878e-06, "loss": 1.9465, "mean_token_accuracy": 0.4965517222881317, "step": 4340 }, { "epoch": 0.00437633014752614, "grad_norm": 69.93909397068933, "learning_rate": 4.376246399290938e-06, "loss": 2.1646, "mean_token_accuracy": 0.44827585220336913, "step": 4345 }, { "epoch": 0.004381366200630312, "grad_norm": 77.74271973392179, "learning_rate": 4.381282356021997e-06, "loss": 2.4375, "mean_token_accuracy": 0.4172413766384125, "step": 4350 }, { "epoch": 0.004386402253734485, "grad_norm": 69.83835820188298, "learning_rate": 4.386318312753057e-06, "loss": 2.1176, "mean_token_accuracy": 0.4758620738983154, "step": 4355 }, { "epoch": 0.004391438306838658, "grad_norm": 71.74437770669098, "learning_rate": 4.391354269484117e-06, "loss": 2.1683, "mean_token_accuracy": 0.4758620738983154, "step": 4360 }, { "epoch": 0.004396474359942831, "grad_norm": 52.51628942437407, "learning_rate": 4.396390226215177e-06, "loss": 2.5393, "mean_token_accuracy": 0.42413792610168455, "step": 4365 }, { "epoch": 0.004401510413047004, "grad_norm": 70.59851980920405, "learning_rate": 4.401426182946236e-06, "loss": 2.13, "mean_token_accuracy": 0.4620689690113068, "step": 4370 }, { "epoch": 0.004406546466151177, "grad_norm": 61.16601154799414, "learning_rate": 4.406462139677296e-06, "loss": 2.1883, "mean_token_accuracy": 0.4620689630508423, "step": 4375 }, { "epoch": 0.004411582519255349, "grad_norm": 61.74240175224402, "learning_rate": 4.411498096408355e-06, "loss": 2.0564, "mean_token_accuracy": 0.46551724076271056, "step": 4380 }, { "epoch": 0.0044166185723595215, "grad_norm": 72.26463937268562, "learning_rate": 4.4165340531394154e-06, "loss": 2.2968, "mean_token_accuracy": 0.4620689630508423, "step": 4385 }, { "epoch": 0.004421654625463694, "grad_norm": 77.76039206164852, "learning_rate": 4.4215700098704755e-06, "loss": 2.2692, "mean_token_accuracy": 0.458620685338974, "step": 4390 }, { "epoch": 0.004426690678567867, "grad_norm": 56.58291496932423, "learning_rate": 4.426605966601536e-06, "loss": 2.5289, "mean_token_accuracy": 0.4344827651977539, "step": 4395 }, { "epoch": 0.00443172673167204, "grad_norm": 58.707053632135995, "learning_rate": 4.431641923332595e-06, "loss": 1.913, "mean_token_accuracy": 0.49655172824859617, "step": 4400 }, { "epoch": 0.004436762784776213, "grad_norm": 61.3813071039486, "learning_rate": 4.436677880063655e-06, "loss": 2.062, "mean_token_accuracy": 0.5034482836723327, "step": 4405 }, { "epoch": 0.004441798837880386, "grad_norm": 66.61686004967721, "learning_rate": 4.441713836794714e-06, "loss": 2.1011, "mean_token_accuracy": 0.43103447556495667, "step": 4410 }, { "epoch": 0.004446834890984558, "grad_norm": 59.98302955793069, "learning_rate": 4.446749793525774e-06, "loss": 2.3519, "mean_token_accuracy": 0.41379310488700866, "step": 4415 }, { "epoch": 0.004451870944088731, "grad_norm": 61.15411290722074, "learning_rate": 4.451785750256834e-06, "loss": 2.2279, "mean_token_accuracy": 0.42413792610168455, "step": 4420 }, { "epoch": 0.004456906997192904, "grad_norm": 51.142757781524864, "learning_rate": 4.456821706987894e-06, "loss": 2.2609, "mean_token_accuracy": 0.44137930274009707, "step": 4425 }, { "epoch": 0.004461943050297077, "grad_norm": 109.01904955739985, "learning_rate": 4.461857663718953e-06, "loss": 2.1417, "mean_token_accuracy": 0.46896551847457885, "step": 4430 }, { "epoch": 0.00446697910340125, "grad_norm": 48.9991020715633, "learning_rate": 4.466893620450013e-06, "loss": 2.3973, "mean_token_accuracy": 0.44482759237289426, "step": 4435 }, { "epoch": 0.0044720151565054225, "grad_norm": 101.26876555247364, "learning_rate": 4.471929577181073e-06, "loss": 2.3059, "mean_token_accuracy": 0.47931033968925474, "step": 4440 }, { "epoch": 0.004477051209609595, "grad_norm": 61.605800254304555, "learning_rate": 4.476965533912133e-06, "loss": 2.163, "mean_token_accuracy": 0.5000000059604645, "step": 4445 }, { "epoch": 0.0044820872627137675, "grad_norm": 73.55537222440582, "learning_rate": 4.4820014906431925e-06, "loss": 2.1569, "mean_token_accuracy": 0.48275861144065857, "step": 4450 }, { "epoch": 0.00448712331581794, "grad_norm": 102.54082419730184, "learning_rate": 4.487037447374253e-06, "loss": 2.4404, "mean_token_accuracy": 0.46896551847457885, "step": 4455 }, { "epoch": 0.004492159368922113, "grad_norm": 63.78217305810308, "learning_rate": 4.492073404105312e-06, "loss": 2.2506, "mean_token_accuracy": 0.4586206912994385, "step": 4460 }, { "epoch": 0.004497195422026286, "grad_norm": 78.11780626517847, "learning_rate": 4.497109360836372e-06, "loss": 2.348, "mean_token_accuracy": 0.4434361755847931, "step": 4465 }, { "epoch": 0.004502231475130459, "grad_norm": 52.63214825163616, "learning_rate": 4.502145317567431e-06, "loss": 2.261, "mean_token_accuracy": 0.42758620977401735, "step": 4470 }, { "epoch": 0.004507267528234632, "grad_norm": 76.67504507830839, "learning_rate": 4.507181274298491e-06, "loss": 2.3101, "mean_token_accuracy": 0.45045371651649474, "step": 4475 }, { "epoch": 0.004512303581338805, "grad_norm": 60.22942829695503, "learning_rate": 4.5122172310295506e-06, "loss": 1.9103, "mean_token_accuracy": 0.482758617401123, "step": 4480 }, { "epoch": 0.004517339634442977, "grad_norm": 82.42019024043736, "learning_rate": 4.517253187760611e-06, "loss": 2.1296, "mean_token_accuracy": 0.49655171632766726, "step": 4485 }, { "epoch": 0.00452237568754715, "grad_norm": 95.54582598762356, "learning_rate": 4.522289144491671e-06, "loss": 2.274, "mean_token_accuracy": 0.4413793206214905, "step": 4490 }, { "epoch": 0.004527411740651323, "grad_norm": 62.373890540238975, "learning_rate": 4.527325101222731e-06, "loss": 2.4116, "mean_token_accuracy": 0.42413793206214906, "step": 4495 }, { "epoch": 0.004532447793755496, "grad_norm": 62.289917758139644, "learning_rate": 4.532361057953791e-06, "loss": 1.9978, "mean_token_accuracy": 0.47931034564971925, "step": 4500 }, { "epoch": 0.0045374838468596685, "grad_norm": 50.160683932339474, "learning_rate": 4.53739701468485e-06, "loss": 2.2359, "mean_token_accuracy": 0.46896552443504336, "step": 4505 }, { "epoch": 0.004542519899963841, "grad_norm": 65.88185342026964, "learning_rate": 4.54243297141591e-06, "loss": 2.1066, "mean_token_accuracy": 0.4862068951129913, "step": 4510 }, { "epoch": 0.004547555953068014, "grad_norm": 100.95707527287281, "learning_rate": 4.5474689281469696e-06, "loss": 2.1162, "mean_token_accuracy": 0.47749546766281126, "step": 4515 }, { "epoch": 0.004552592006172186, "grad_norm": 95.95146769243699, "learning_rate": 4.55250488487803e-06, "loss": 2.3188, "mean_token_accuracy": 0.4482758641242981, "step": 4520 }, { "epoch": 0.004557628059276359, "grad_norm": 95.50098311464909, "learning_rate": 4.557540841609089e-06, "loss": 2.4757, "mean_token_accuracy": 0.4103448212146759, "step": 4525 }, { "epoch": 0.004562664112380532, "grad_norm": 66.92387869361015, "learning_rate": 4.562576798340149e-06, "loss": 2.2637, "mean_token_accuracy": 0.4448275864124298, "step": 4530 }, { "epoch": 0.004567700165484705, "grad_norm": 74.14669347554786, "learning_rate": 4.567612755071208e-06, "loss": 2.3675, "mean_token_accuracy": 0.4689655125141144, "step": 4535 }, { "epoch": 0.004572736218588878, "grad_norm": 64.59498241691422, "learning_rate": 4.572648711802268e-06, "loss": 2.6723, "mean_token_accuracy": 0.3896551728248596, "step": 4540 }, { "epoch": 0.004577772271693051, "grad_norm": 69.61065714540345, "learning_rate": 4.5776846685333285e-06, "loss": 2.4983, "mean_token_accuracy": 0.42758620381355283, "step": 4545 }, { "epoch": 0.004582808324797224, "grad_norm": 58.47058691664711, "learning_rate": 4.5827206252643886e-06, "loss": 2.0005, "mean_token_accuracy": 0.5517241418361664, "step": 4550 }, { "epoch": 0.004587844377901396, "grad_norm": 56.11088926623105, "learning_rate": 4.587756581995448e-06, "loss": 1.9313, "mean_token_accuracy": 0.46896551847457885, "step": 4555 }, { "epoch": 0.004592880431005569, "grad_norm": 86.78285751193991, "learning_rate": 4.592792538726508e-06, "loss": 2.377, "mean_token_accuracy": 0.4482758641242981, "step": 4560 }, { "epoch": 0.0045979164841097415, "grad_norm": 79.69445898404734, "learning_rate": 4.597828495457567e-06, "loss": 1.9947, "mean_token_accuracy": 0.5295220911502838, "step": 4565 }, { "epoch": 0.004602952537213914, "grad_norm": 116.84324927014865, "learning_rate": 4.602864452188627e-06, "loss": 2.4357, "mean_token_accuracy": 0.4068965494632721, "step": 4570 }, { "epoch": 0.004607988590318087, "grad_norm": 52.49389608047635, "learning_rate": 4.6079004089196865e-06, "loss": 2.4641, "mean_token_accuracy": 0.42758620381355283, "step": 4575 }, { "epoch": 0.00461302464342226, "grad_norm": 62.34033980519452, "learning_rate": 4.612936365650747e-06, "loss": 2.3382, "mean_token_accuracy": 0.4402298808097839, "step": 4580 }, { "epoch": 0.004618060696526433, "grad_norm": 65.55807484979542, "learning_rate": 4.617972322381806e-06, "loss": 2.3517, "mean_token_accuracy": 0.4448275864124298, "step": 4585 }, { "epoch": 0.004623096749630605, "grad_norm": 61.214992349349046, "learning_rate": 4.623008279112866e-06, "loss": 2.0357, "mean_token_accuracy": 0.4896551728248596, "step": 4590 }, { "epoch": 0.004628132802734778, "grad_norm": 69.57912414030602, "learning_rate": 4.628044235843926e-06, "loss": 2.2602, "mean_token_accuracy": 0.47931033968925474, "step": 4595 }, { "epoch": 0.004633168855838951, "grad_norm": 44.03075011529503, "learning_rate": 4.633080192574986e-06, "loss": 2.228, "mean_token_accuracy": 0.46551724672317507, "step": 4600 }, { "epoch": 0.004638204908943124, "grad_norm": 58.011647404611644, "learning_rate": 4.6381161493060454e-06, "loss": 1.8955, "mean_token_accuracy": 0.5068965435028077, "step": 4605 }, { "epoch": 0.004643240962047297, "grad_norm": 71.89392724736372, "learning_rate": 4.6431521060371055e-06, "loss": 2.4062, "mean_token_accuracy": 0.458620685338974, "step": 4610 }, { "epoch": 0.00464827701515147, "grad_norm": 76.05217171654296, "learning_rate": 4.648188062768165e-06, "loss": 1.9633, "mean_token_accuracy": 0.4862069010734558, "step": 4615 }, { "epoch": 0.0046533130682556425, "grad_norm": 68.25925693843631, "learning_rate": 4.653224019499225e-06, "loss": 2.0736, "mean_token_accuracy": 0.5034482717514038, "step": 4620 }, { "epoch": 0.0046583491213598146, "grad_norm": 54.400617273650944, "learning_rate": 4.658259976230284e-06, "loss": 2.3649, "mean_token_accuracy": 0.42413793206214906, "step": 4625 }, { "epoch": 0.0046633851744639875, "grad_norm": 70.30471005276473, "learning_rate": 4.663295932961344e-06, "loss": 2.3179, "mean_token_accuracy": 0.4517241358757019, "step": 4630 }, { "epoch": 0.00466842122756816, "grad_norm": 87.76760979261556, "learning_rate": 4.6683318896924035e-06, "loss": 2.3422, "mean_token_accuracy": 0.4620689690113068, "step": 4635 }, { "epoch": 0.004673457280672333, "grad_norm": 73.91291213360861, "learning_rate": 4.673367846423464e-06, "loss": 2.2407, "mean_token_accuracy": 0.4862068951129913, "step": 4640 }, { "epoch": 0.004678493333776506, "grad_norm": 58.18290091036251, "learning_rate": 4.678403803154524e-06, "loss": 2.2202, "mean_token_accuracy": 0.4620689690113068, "step": 4645 }, { "epoch": 0.004683529386880679, "grad_norm": 48.085184516340156, "learning_rate": 4.683439759885584e-06, "loss": 2.0219, "mean_token_accuracy": 0.47586206197738645, "step": 4650 }, { "epoch": 0.004688565439984852, "grad_norm": 54.084923900833566, "learning_rate": 4.688475716616643e-06, "loss": 2.2256, "mean_token_accuracy": 0.46551724672317507, "step": 4655 }, { "epoch": 0.004693601493089024, "grad_norm": 67.46580634055775, "learning_rate": 4.693511673347703e-06, "loss": 2.3484, "mean_token_accuracy": 0.4206896543502808, "step": 4660 }, { "epoch": 0.004698637546193197, "grad_norm": 57.961361849711835, "learning_rate": 4.698547630078762e-06, "loss": 2.4713, "mean_token_accuracy": 0.37586206793785093, "step": 4665 }, { "epoch": 0.00470367359929737, "grad_norm": 63.790142384919214, "learning_rate": 4.7035835868098225e-06, "loss": 2.2818, "mean_token_accuracy": 0.4620689630508423, "step": 4670 }, { "epoch": 0.004708709652401543, "grad_norm": 65.9742494655388, "learning_rate": 4.708619543540882e-06, "loss": 2.1405, "mean_token_accuracy": 0.5034482777118683, "step": 4675 }, { "epoch": 0.004713745705505716, "grad_norm": 67.30241307973168, "learning_rate": 4.713655500271942e-06, "loss": 2.2616, "mean_token_accuracy": 0.4344827592372894, "step": 4680 }, { "epoch": 0.0047187817586098885, "grad_norm": 72.50106901007615, "learning_rate": 4.718691457003001e-06, "loss": 2.3447, "mean_token_accuracy": 0.4620689690113068, "step": 4685 }, { "epoch": 0.004723817811714061, "grad_norm": 56.62281379038907, "learning_rate": 4.723727413734061e-06, "loss": 2.352, "mean_token_accuracy": 0.4206896543502808, "step": 4690 }, { "epoch": 0.004728853864818233, "grad_norm": 79.2218996566941, "learning_rate": 4.728763370465121e-06, "loss": 2.1499, "mean_token_accuracy": 0.4620689630508423, "step": 4695 }, { "epoch": 0.004733889917922406, "grad_norm": 62.15045348248068, "learning_rate": 4.733799327196181e-06, "loss": 2.2403, "mean_token_accuracy": 0.4275861978530884, "step": 4700 }, { "epoch": 0.004738925971026579, "grad_norm": 57.76997918011777, "learning_rate": 4.738835283927241e-06, "loss": 2.3428, "mean_token_accuracy": 0.4206896543502808, "step": 4705 }, { "epoch": 0.004743962024130752, "grad_norm": 64.84623090385051, "learning_rate": 4.743871240658301e-06, "loss": 2.4429, "mean_token_accuracy": 0.3896551787853241, "step": 4710 }, { "epoch": 0.004748998077234925, "grad_norm": 96.81953623021111, "learning_rate": 4.74890719738936e-06, "loss": 2.3722, "mean_token_accuracy": 0.4707198977470398, "step": 4715 }, { "epoch": 0.004754034130339098, "grad_norm": 74.37955995411201, "learning_rate": 4.75394315412042e-06, "loss": 2.4442, "mean_token_accuracy": 0.43448275327682495, "step": 4720 }, { "epoch": 0.004759070183443271, "grad_norm": 51.35350572521384, "learning_rate": 4.758979110851479e-06, "loss": 2.3695, "mean_token_accuracy": 0.42758620381355283, "step": 4725 }, { "epoch": 0.004764106236547443, "grad_norm": 62.03456840429953, "learning_rate": 4.7640150675825395e-06, "loss": 2.1875, "mean_token_accuracy": 0.4551724135875702, "step": 4730 }, { "epoch": 0.004769142289651616, "grad_norm": 75.5145494584868, "learning_rate": 4.769051024313599e-06, "loss": 2.2886, "mean_token_accuracy": 0.4620689630508423, "step": 4735 }, { "epoch": 0.004774178342755789, "grad_norm": 85.27769518322883, "learning_rate": 4.77408698104466e-06, "loss": 2.4552, "mean_token_accuracy": 0.41034482717514037, "step": 4740 }, { "epoch": 0.0047792143958599615, "grad_norm": 55.441122368856306, "learning_rate": 4.779122937775719e-06, "loss": 2.498, "mean_token_accuracy": 0.3758620649576187, "step": 4745 }, { "epoch": 0.004784250448964134, "grad_norm": 72.08228569372173, "learning_rate": 4.784158894506779e-06, "loss": 2.2111, "mean_token_accuracy": 0.4724137902259827, "step": 4750 }, { "epoch": 0.004789286502068307, "grad_norm": 43.94582784000871, "learning_rate": 4.789194851237838e-06, "loss": 2.1538, "mean_token_accuracy": 0.482758629322052, "step": 4755 }, { "epoch": 0.00479432255517248, "grad_norm": 73.53525344219976, "learning_rate": 4.794230807968898e-06, "loss": 2.3359, "mean_token_accuracy": 0.482758617401123, "step": 4760 }, { "epoch": 0.004799358608276652, "grad_norm": 71.77174418250638, "learning_rate": 4.799266764699958e-06, "loss": 2.3293, "mean_token_accuracy": 0.43793103098869324, "step": 4765 }, { "epoch": 0.004804394661380825, "grad_norm": 70.76277547318473, "learning_rate": 4.804302721431018e-06, "loss": 2.091, "mean_token_accuracy": 0.4724137902259827, "step": 4770 }, { "epoch": 0.004809430714484998, "grad_norm": 63.171319060344324, "learning_rate": 4.809338678162077e-06, "loss": 2.1785, "mean_token_accuracy": 0.4793103516101837, "step": 4775 }, { "epoch": 0.004814466767589171, "grad_norm": 66.72407050626937, "learning_rate": 4.814374634893137e-06, "loss": 2.0943, "mean_token_accuracy": 0.46896552443504336, "step": 4780 }, { "epoch": 0.004819502820693344, "grad_norm": 60.33468915114281, "learning_rate": 4.819410591624197e-06, "loss": 1.8636, "mean_token_accuracy": 0.5034482657909394, "step": 4785 }, { "epoch": 0.004824538873797517, "grad_norm": 49.81477886862322, "learning_rate": 4.824446548355257e-06, "loss": 2.3342, "mean_token_accuracy": 0.4517241358757019, "step": 4790 }, { "epoch": 0.00482957492690169, "grad_norm": 104.55613995004154, "learning_rate": 4.8294825050863165e-06, "loss": 2.7633, "mean_token_accuracy": 0.37586206793785093, "step": 4795 }, { "epoch": 0.004834610980005862, "grad_norm": 88.80635937299884, "learning_rate": 4.834518461817377e-06, "loss": 2.5066, "mean_token_accuracy": 0.41724138855934145, "step": 4800 }, { "epoch": 0.0048396470331100346, "grad_norm": 55.427107699392195, "learning_rate": 4.839554418548436e-06, "loss": 2.0271, "mean_token_accuracy": 0.4724137902259827, "step": 4805 }, { "epoch": 0.0048446830862142075, "grad_norm": 74.33928657449144, "learning_rate": 4.844590375279496e-06, "loss": 1.9994, "mean_token_accuracy": 0.5228070139884948, "step": 4810 }, { "epoch": 0.00484971913931838, "grad_norm": 52.11039364281213, "learning_rate": 4.849626332010555e-06, "loss": 1.9534, "mean_token_accuracy": 0.5259528160095215, "step": 4815 }, { "epoch": 0.004854755192422553, "grad_norm": 42.4150483663928, "learning_rate": 4.854662288741615e-06, "loss": 2.413, "mean_token_accuracy": 0.4413793087005615, "step": 4820 }, { "epoch": 0.004859791245526726, "grad_norm": 68.14359699517831, "learning_rate": 4.859698245472675e-06, "loss": 2.1299, "mean_token_accuracy": 0.4862068951129913, "step": 4825 }, { "epoch": 0.004864827298630898, "grad_norm": 55.80214183455283, "learning_rate": 4.864734202203735e-06, "loss": 2.3377, "mean_token_accuracy": 0.4310344815254211, "step": 4830 }, { "epoch": 0.004869863351735071, "grad_norm": 53.26715757398765, "learning_rate": 4.869770158934795e-06, "loss": 2.3427, "mean_token_accuracy": 0.4241379380226135, "step": 4835 }, { "epoch": 0.004874899404839244, "grad_norm": 67.82974998562977, "learning_rate": 4.874806115665855e-06, "loss": 1.937, "mean_token_accuracy": 0.4793103516101837, "step": 4840 }, { "epoch": 0.004879935457943417, "grad_norm": 58.27714151764549, "learning_rate": 4.879842072396914e-06, "loss": 2.2357, "mean_token_accuracy": 0.47684728503227236, "step": 4845 }, { "epoch": 0.00488497151104759, "grad_norm": 55.22575011739198, "learning_rate": 4.884878029127974e-06, "loss": 2.1676, "mean_token_accuracy": 0.441379314661026, "step": 4850 }, { "epoch": 0.004890007564151763, "grad_norm": 67.18177798528704, "learning_rate": 4.8899139858590335e-06, "loss": 2.0052, "mean_token_accuracy": 0.5448275804519653, "step": 4855 }, { "epoch": 0.0048950436172559356, "grad_norm": 64.27041597300736, "learning_rate": 4.894949942590094e-06, "loss": 2.5831, "mean_token_accuracy": 0.3896551728248596, "step": 4860 }, { "epoch": 0.004900079670360108, "grad_norm": 70.55649328261276, "learning_rate": 4.899985899321153e-06, "loss": 2.4933, "mean_token_accuracy": 0.39655172228813174, "step": 4865 }, { "epoch": 0.0049051157234642805, "grad_norm": 50.00552854251337, "learning_rate": 4.905021856052213e-06, "loss": 2.3004, "mean_token_accuracy": 0.46551724076271056, "step": 4870 }, { "epoch": 0.004910151776568453, "grad_norm": 47.03616536333132, "learning_rate": 4.910057812783272e-06, "loss": 2.019, "mean_token_accuracy": 0.4655172348022461, "step": 4875 }, { "epoch": 0.004915187829672626, "grad_norm": 59.92274711167819, "learning_rate": 4.915093769514332e-06, "loss": 2.1261, "mean_token_accuracy": 0.4931034445762634, "step": 4880 }, { "epoch": 0.004920223882776799, "grad_norm": 110.51028995828258, "learning_rate": 4.920129726245392e-06, "loss": 2.384, "mean_token_accuracy": 0.420689657330513, "step": 4885 }, { "epoch": 0.004925259935880972, "grad_norm": 42.23604726138878, "learning_rate": 4.9251656829764525e-06, "loss": 2.5604, "mean_token_accuracy": 0.42413792610168455, "step": 4890 }, { "epoch": 0.004930295988985145, "grad_norm": 52.02558027425352, "learning_rate": 4.930201639707512e-06, "loss": 2.1597, "mean_token_accuracy": 0.471082878112793, "step": 4895 }, { "epoch": 0.004935332042089317, "grad_norm": 62.29288889140503, "learning_rate": 4.935237596438572e-06, "loss": 2.5317, "mean_token_accuracy": 0.4172413766384125, "step": 4900 }, { "epoch": 0.00494036809519349, "grad_norm": 67.17619635608203, "learning_rate": 4.940273553169631e-06, "loss": 1.8321, "mean_token_accuracy": 0.5275862038135528, "step": 4905 }, { "epoch": 0.004945404148297663, "grad_norm": 57.84088141004681, "learning_rate": 4.945309509900691e-06, "loss": 2.0287, "mean_token_accuracy": 0.49655172824859617, "step": 4910 }, { "epoch": 0.004950440201401836, "grad_norm": 62.455860357304665, "learning_rate": 4.950345466631751e-06, "loss": 2.1387, "mean_token_accuracy": 0.4482758641242981, "step": 4915 }, { "epoch": 0.004955476254506009, "grad_norm": 52.760833668853046, "learning_rate": 4.9553814233628105e-06, "loss": 2.1012, "mean_token_accuracy": 0.4675136089324951, "step": 4920 }, { "epoch": 0.0049605123076101815, "grad_norm": 75.79250365458297, "learning_rate": 4.960417380093871e-06, "loss": 2.1276, "mean_token_accuracy": 0.4862069010734558, "step": 4925 }, { "epoch": 0.004965548360714354, "grad_norm": 66.2174262384771, "learning_rate": 4.96545333682493e-06, "loss": 2.3614, "mean_token_accuracy": 0.41724138259887694, "step": 4930 }, { "epoch": 0.0049705844138185264, "grad_norm": 76.75236718541544, "learning_rate": 4.97048929355599e-06, "loss": 2.1555, "mean_token_accuracy": 0.441379314661026, "step": 4935 }, { "epoch": 0.004975620466922699, "grad_norm": 49.16557382693346, "learning_rate": 4.97552525028705e-06, "loss": 2.5746, "mean_token_accuracy": 0.43793103098869324, "step": 4940 }, { "epoch": 0.004980656520026872, "grad_norm": 70.40210340843936, "learning_rate": 4.98056120701811e-06, "loss": 2.2663, "mean_token_accuracy": 0.45862067937850953, "step": 4945 }, { "epoch": 0.004985692573131045, "grad_norm": 56.22664948209718, "learning_rate": 4.9855971637491695e-06, "loss": 2.3825, "mean_token_accuracy": 0.4172413766384125, "step": 4950 }, { "epoch": 0.004990728626235218, "grad_norm": 58.640772496492474, "learning_rate": 4.9906331204802296e-06, "loss": 2.372, "mean_token_accuracy": 0.4275861978530884, "step": 4955 }, { "epoch": 0.004995764679339391, "grad_norm": 52.597269837763356, "learning_rate": 4.995669077211289e-06, "loss": 2.0586, "mean_token_accuracy": 0.482758617401123, "step": 4960 }, { "epoch": 0.005000800732443564, "grad_norm": 61.54477380487586, "learning_rate": 5.000705033942349e-06, "loss": 2.2386, "mean_token_accuracy": 0.5034482717514038, "step": 4965 }, { "epoch": 0.005005836785547736, "grad_norm": 57.72962510637242, "learning_rate": 5.005740990673408e-06, "loss": 2.2071, "mean_token_accuracy": 0.482758617401123, "step": 4970 }, { "epoch": 0.005010872838651909, "grad_norm": 57.799889722372136, "learning_rate": 5.010776947404468e-06, "loss": 2.489, "mean_token_accuracy": 0.41724138259887694, "step": 4975 }, { "epoch": 0.005015908891756082, "grad_norm": 67.77003986715913, "learning_rate": 5.0158129041355275e-06, "loss": 2.371, "mean_token_accuracy": 0.4448275864124298, "step": 4980 }, { "epoch": 0.0050209449448602545, "grad_norm": 76.43776800322563, "learning_rate": 5.020848860866588e-06, "loss": 2.6248, "mean_token_accuracy": 0.44482758045196535, "step": 4985 }, { "epoch": 0.0050259809979644274, "grad_norm": 44.665451568261545, "learning_rate": 5.025884817597648e-06, "loss": 2.4031, "mean_token_accuracy": 0.44827585816383364, "step": 4990 }, { "epoch": 0.0050310170510686, "grad_norm": 60.92144952752837, "learning_rate": 5.030920774328708e-06, "loss": 2.2022, "mean_token_accuracy": 0.42413793206214906, "step": 4995 }, { "epoch": 0.005036053104172773, "grad_norm": 79.81800341358644, "learning_rate": 5.035956731059767e-06, "loss": 2.0991, "mean_token_accuracy": 0.5021778583526612, "step": 5000 }, { "epoch": 0.005041089157276945, "grad_norm": 57.74455605209466, "learning_rate": 5.040992687790827e-06, "loss": 2.2064, "mean_token_accuracy": 0.4620689630508423, "step": 5005 }, { "epoch": 0.005046125210381118, "grad_norm": 44.799091360443555, "learning_rate": 5.046028644521886e-06, "loss": 2.1773, "mean_token_accuracy": 0.4413793087005615, "step": 5010 }, { "epoch": 0.005051161263485291, "grad_norm": 46.792375177451206, "learning_rate": 5.0510646012529465e-06, "loss": 1.9952, "mean_token_accuracy": 0.5137931108474731, "step": 5015 }, { "epoch": 0.005056197316589464, "grad_norm": 52.24108330229672, "learning_rate": 5.056100557984006e-06, "loss": 2.3965, "mean_token_accuracy": 0.40689654350280763, "step": 5020 }, { "epoch": 0.005061233369693637, "grad_norm": 86.39586184224119, "learning_rate": 5.061136514715066e-06, "loss": 2.3281, "mean_token_accuracy": 0.4344827651977539, "step": 5025 }, { "epoch": 0.00506626942279781, "grad_norm": 48.203306834958575, "learning_rate": 5.066172471446125e-06, "loss": 2.1886, "mean_token_accuracy": 0.458620685338974, "step": 5030 }, { "epoch": 0.005071305475901983, "grad_norm": 57.17687520705525, "learning_rate": 5.071208428177185e-06, "loss": 2.3704, "mean_token_accuracy": 0.44137930274009707, "step": 5035 }, { "epoch": 0.005076341529006155, "grad_norm": 62.69867911410207, "learning_rate": 5.076244384908245e-06, "loss": 2.1292, "mean_token_accuracy": 0.4724137902259827, "step": 5040 }, { "epoch": 0.005081377582110328, "grad_norm": 45.02012801633202, "learning_rate": 5.0812803416393054e-06, "loss": 2.0253, "mean_token_accuracy": 0.506896561384201, "step": 5045 }, { "epoch": 0.0050864136352145005, "grad_norm": 60.94816448716288, "learning_rate": 5.086316298370365e-06, "loss": 2.218, "mean_token_accuracy": 0.46061705350875853, "step": 5050 }, { "epoch": 0.005091449688318673, "grad_norm": 77.26236876616662, "learning_rate": 5.091352255101425e-06, "loss": 2.2055, "mean_token_accuracy": 0.4620689570903778, "step": 5055 }, { "epoch": 0.005096485741422846, "grad_norm": 56.9512706879724, "learning_rate": 5.096388211832484e-06, "loss": 2.2843, "mean_token_accuracy": 0.4448275864124298, "step": 5060 }, { "epoch": 0.005101521794527019, "grad_norm": 59.63238208304837, "learning_rate": 5.101424168563544e-06, "loss": 2.3069, "mean_token_accuracy": 0.4413793087005615, "step": 5065 }, { "epoch": 0.005106557847631192, "grad_norm": 52.129514672046895, "learning_rate": 5.106460125294603e-06, "loss": 2.5455, "mean_token_accuracy": 0.458620685338974, "step": 5070 }, { "epoch": 0.005111593900735364, "grad_norm": 56.40752774241312, "learning_rate": 5.1114960820256635e-06, "loss": 2.003, "mean_token_accuracy": 0.5275861978530884, "step": 5075 }, { "epoch": 0.005116629953839537, "grad_norm": 62.155488816053406, "learning_rate": 5.116532038756723e-06, "loss": 1.8662, "mean_token_accuracy": 0.5172413766384125, "step": 5080 }, { "epoch": 0.00512166600694371, "grad_norm": 60.10773183923074, "learning_rate": 5.121567995487783e-06, "loss": 2.332, "mean_token_accuracy": 0.441379314661026, "step": 5085 }, { "epoch": 0.005126702060047883, "grad_norm": 45.36708723794407, "learning_rate": 5.126603952218843e-06, "loss": 2.0886, "mean_token_accuracy": 0.4931034505367279, "step": 5090 }, { "epoch": 0.005131738113152056, "grad_norm": 54.995604496817265, "learning_rate": 5.131639908949903e-06, "loss": 1.975, "mean_token_accuracy": 0.482758617401123, "step": 5095 }, { "epoch": 0.005136774166256229, "grad_norm": 86.00466369830168, "learning_rate": 5.136675865680962e-06, "loss": 2.2981, "mean_token_accuracy": 0.4121597111225128, "step": 5100 }, { "epoch": 0.0051418102193604015, "grad_norm": 52.472386114961644, "learning_rate": 5.141711822412022e-06, "loss": 2.0993, "mean_token_accuracy": 0.4655172348022461, "step": 5105 }, { "epoch": 0.0051468462724645735, "grad_norm": 40.59334492780365, "learning_rate": 5.146747779143082e-06, "loss": 2.2135, "mean_token_accuracy": 0.43103447556495667, "step": 5110 }, { "epoch": 0.005151882325568746, "grad_norm": 57.05439660541193, "learning_rate": 5.151783735874142e-06, "loss": 2.2692, "mean_token_accuracy": 0.4762250483036041, "step": 5115 }, { "epoch": 0.005156918378672919, "grad_norm": 62.17435367166306, "learning_rate": 5.156819692605201e-06, "loss": 2.2575, "mean_token_accuracy": 0.46896551847457885, "step": 5120 }, { "epoch": 0.005161954431777092, "grad_norm": 81.96978366140067, "learning_rate": 5.161855649336261e-06, "loss": 2.1175, "mean_token_accuracy": 0.4517241299152374, "step": 5125 }, { "epoch": 0.005166990484881265, "grad_norm": 56.6224519743192, "learning_rate": 5.16689160606732e-06, "loss": 1.9847, "mean_token_accuracy": 0.5368421018123627, "step": 5130 }, { "epoch": 0.005172026537985438, "grad_norm": 54.49737081118318, "learning_rate": 5.1719275627983804e-06, "loss": 2.1467, "mean_token_accuracy": 0.48620688915252686, "step": 5135 }, { "epoch": 0.005177062591089611, "grad_norm": 113.21496006124241, "learning_rate": 5.1769635195294405e-06, "loss": 2.5677, "mean_token_accuracy": 0.4310344696044922, "step": 5140 }, { "epoch": 0.005182098644193783, "grad_norm": 65.34478537520857, "learning_rate": 5.181999476260501e-06, "loss": 2.3153, "mean_token_accuracy": 0.4551724076271057, "step": 5145 }, { "epoch": 0.005187134697297956, "grad_norm": 63.53608539077253, "learning_rate": 5.18703543299156e-06, "loss": 2.6021, "mean_token_accuracy": 0.42068966031074523, "step": 5150 }, { "epoch": 0.005192170750402129, "grad_norm": 62.76026244838133, "learning_rate": 5.19207138972262e-06, "loss": 2.4379, "mean_token_accuracy": 0.42413793206214906, "step": 5155 }, { "epoch": 0.005197206803506302, "grad_norm": 72.67055814492477, "learning_rate": 5.197107346453679e-06, "loss": 1.8438, "mean_token_accuracy": 0.5551724135875702, "step": 5160 }, { "epoch": 0.0052022428566104745, "grad_norm": 71.07748091316792, "learning_rate": 5.202143303184739e-06, "loss": 2.2333, "mean_token_accuracy": 0.43448275327682495, "step": 5165 }, { "epoch": 0.0052072789097146474, "grad_norm": 60.47556272487013, "learning_rate": 5.207179259915799e-06, "loss": 2.3338, "mean_token_accuracy": 0.4206896543502808, "step": 5170 }, { "epoch": 0.00521231496281882, "grad_norm": 77.68342587456141, "learning_rate": 5.212215216646859e-06, "loss": 2.1392, "mean_token_accuracy": 0.45517241954803467, "step": 5175 }, { "epoch": 0.005217351015922992, "grad_norm": 76.44543284207457, "learning_rate": 5.217251173377918e-06, "loss": 2.1785, "mean_token_accuracy": 0.45172414779663084, "step": 5180 }, { "epoch": 0.005222387069027165, "grad_norm": 61.890828680579716, "learning_rate": 5.222287130108978e-06, "loss": 1.9292, "mean_token_accuracy": 0.5551724135875702, "step": 5185 }, { "epoch": 0.005227423122131338, "grad_norm": 48.671387299657646, "learning_rate": 5.227323086840038e-06, "loss": 2.0347, "mean_token_accuracy": 0.4689655125141144, "step": 5190 }, { "epoch": 0.005232459175235511, "grad_norm": 61.79449331491543, "learning_rate": 5.232359043571098e-06, "loss": 2.4038, "mean_token_accuracy": 0.42758620977401735, "step": 5195 }, { "epoch": 0.005237495228339684, "grad_norm": 59.436565152531095, "learning_rate": 5.2373950003021575e-06, "loss": 2.5369, "mean_token_accuracy": 0.41724138259887694, "step": 5200 }, { "epoch": 0.005242531281443857, "grad_norm": 60.45275169688569, "learning_rate": 5.242430957033218e-06, "loss": 2.4996, "mean_token_accuracy": 0.4482758641242981, "step": 5205 }, { "epoch": 0.00524756733454803, "grad_norm": 66.91125060646078, "learning_rate": 5.247466913764277e-06, "loss": 2.3523, "mean_token_accuracy": 0.4310344815254211, "step": 5210 }, { "epoch": 0.005252603387652202, "grad_norm": 51.212774272790426, "learning_rate": 5.252502870495337e-06, "loss": 2.0909, "mean_token_accuracy": 0.4586206912994385, "step": 5215 }, { "epoch": 0.005257639440756375, "grad_norm": 94.47941921851192, "learning_rate": 5.257538827226396e-06, "loss": 2.3875, "mean_token_accuracy": 0.46551724672317507, "step": 5220 }, { "epoch": 0.005262675493860548, "grad_norm": 47.80111854043782, "learning_rate": 5.262574783957456e-06, "loss": 2.1418, "mean_token_accuracy": 0.47586206793785096, "step": 5225 }, { "epoch": 0.0052677115469647205, "grad_norm": 48.403905631961834, "learning_rate": 5.2676107406885156e-06, "loss": 2.2353, "mean_token_accuracy": 0.49999998807907103, "step": 5230 }, { "epoch": 0.005272747600068893, "grad_norm": 47.84826563045979, "learning_rate": 5.272646697419576e-06, "loss": 2.1832, "mean_token_accuracy": 0.4586206912994385, "step": 5235 }, { "epoch": 0.005277783653173066, "grad_norm": 53.5025591865723, "learning_rate": 5.277682654150636e-06, "loss": 2.226, "mean_token_accuracy": 0.4379310369491577, "step": 5240 }, { "epoch": 0.005282819706277239, "grad_norm": 68.00684103158602, "learning_rate": 5.282718610881696e-06, "loss": 2.6529, "mean_token_accuracy": 0.41379310488700866, "step": 5245 }, { "epoch": 0.005287855759381411, "grad_norm": 54.61815522016781, "learning_rate": 5.287754567612755e-06, "loss": 2.4593, "mean_token_accuracy": 0.42758620977401735, "step": 5250 }, { "epoch": 0.005292891812485584, "grad_norm": 55.75107179089296, "learning_rate": 5.292790524343815e-06, "loss": 2.1424, "mean_token_accuracy": 0.45517240166664125, "step": 5255 }, { "epoch": 0.005297927865589757, "grad_norm": 44.33943991407288, "learning_rate": 5.2978264810748745e-06, "loss": 2.2922, "mean_token_accuracy": 0.4551724076271057, "step": 5260 }, { "epoch": 0.00530296391869393, "grad_norm": 67.01123815884502, "learning_rate": 5.3028624378059346e-06, "loss": 2.6027, "mean_token_accuracy": 0.42758620977401735, "step": 5265 }, { "epoch": 0.005307999971798103, "grad_norm": 70.88445439672337, "learning_rate": 5.307898394536994e-06, "loss": 2.3783, "mean_token_accuracy": 0.41379310488700866, "step": 5270 }, { "epoch": 0.005313036024902276, "grad_norm": 59.1625159017375, "learning_rate": 5.312934351268054e-06, "loss": 1.8583, "mean_token_accuracy": 0.5068965554237366, "step": 5275 }, { "epoch": 0.005318072078006448, "grad_norm": 76.45157150282814, "learning_rate": 5.317970307999114e-06, "loss": 2.2655, "mean_token_accuracy": 0.4517241358757019, "step": 5280 }, { "epoch": 0.005323108131110621, "grad_norm": 55.597350436717164, "learning_rate": 5.323006264730174e-06, "loss": 2.1003, "mean_token_accuracy": 0.4482758641242981, "step": 5285 }, { "epoch": 0.0053281441842147935, "grad_norm": 60.360247882929805, "learning_rate": 5.328042221461233e-06, "loss": 2.2907, "mean_token_accuracy": 0.40344826579093934, "step": 5290 }, { "epoch": 0.005333180237318966, "grad_norm": 53.76963053135653, "learning_rate": 5.3330781781922935e-06, "loss": 2.2519, "mean_token_accuracy": 0.5, "step": 5295 }, { "epoch": 0.005338216290423139, "grad_norm": 43.79342476087142, "learning_rate": 5.338114134923353e-06, "loss": 2.0367, "mean_token_accuracy": 0.4793103516101837, "step": 5300 }, { "epoch": 0.005343252343527312, "grad_norm": 65.07224569261078, "learning_rate": 5.343150091654413e-06, "loss": 2.6087, "mean_token_accuracy": 0.4344827651977539, "step": 5305 }, { "epoch": 0.005348288396631485, "grad_norm": 60.182082038884175, "learning_rate": 5.348186048385472e-06, "loss": 2.5583, "mean_token_accuracy": 0.4517241418361664, "step": 5310 }, { "epoch": 0.005353324449735657, "grad_norm": 63.020502953937104, "learning_rate": 5.353222005116532e-06, "loss": 2.1694, "mean_token_accuracy": 0.4586206912994385, "step": 5315 }, { "epoch": 0.00535836050283983, "grad_norm": 41.8160444016396, "learning_rate": 5.358257961847592e-06, "loss": 2.2148, "mean_token_accuracy": 0.4551724135875702, "step": 5320 }, { "epoch": 0.005363396555944003, "grad_norm": 83.60877480562837, "learning_rate": 5.3632939185786515e-06, "loss": 2.2031, "mean_token_accuracy": 0.44827587008476255, "step": 5325 }, { "epoch": 0.005368432609048176, "grad_norm": 57.71490915260869, "learning_rate": 5.368329875309712e-06, "loss": 2.2611, "mean_token_accuracy": 0.4655172348022461, "step": 5330 }, { "epoch": 0.005373468662152349, "grad_norm": 76.19323528123365, "learning_rate": 5.373365832040772e-06, "loss": 2.387, "mean_token_accuracy": 0.4482758641242981, "step": 5335 }, { "epoch": 0.005378504715256522, "grad_norm": 64.06808216803532, "learning_rate": 5.378401788771832e-06, "loss": 2.4848, "mean_token_accuracy": 0.4344827651977539, "step": 5340 }, { "epoch": 0.0053835407683606945, "grad_norm": 55.9399912469596, "learning_rate": 5.383437745502891e-06, "loss": 2.1121, "mean_token_accuracy": 0.5006049573421478, "step": 5345 }, { "epoch": 0.005388576821464867, "grad_norm": 113.56564731295282, "learning_rate": 5.388473702233951e-06, "loss": 2.0349, "mean_token_accuracy": 0.4947044312953949, "step": 5350 }, { "epoch": 0.0053936128745690395, "grad_norm": 73.21276716942864, "learning_rate": 5.3935096589650104e-06, "loss": 1.9133, "mean_token_accuracy": 0.49709620475769045, "step": 5355 }, { "epoch": 0.005398648927673212, "grad_norm": 53.70820082358731, "learning_rate": 5.3985456156960705e-06, "loss": 2.2817, "mean_token_accuracy": 0.4083484590053558, "step": 5360 }, { "epoch": 0.005403684980777385, "grad_norm": 53.549812595066996, "learning_rate": 5.40358157242713e-06, "loss": 2.15, "mean_token_accuracy": 0.4620689570903778, "step": 5365 }, { "epoch": 0.005408721033881558, "grad_norm": 44.51704364714626, "learning_rate": 5.40861752915819e-06, "loss": 2.0451, "mean_token_accuracy": 0.47931033968925474, "step": 5370 }, { "epoch": 0.005413757086985731, "grad_norm": 92.17141296261514, "learning_rate": 5.413653485889249e-06, "loss": 2.1481, "mean_token_accuracy": 0.4517241358757019, "step": 5375 }, { "epoch": 0.005418793140089904, "grad_norm": 76.61343739248683, "learning_rate": 5.418689442620309e-06, "loss": 2.2007, "mean_token_accuracy": 0.4862069010734558, "step": 5380 }, { "epoch": 0.005423829193194076, "grad_norm": 93.55658908837326, "learning_rate": 5.423725399351369e-06, "loss": 2.5759, "mean_token_accuracy": 0.4068965554237366, "step": 5385 }, { "epoch": 0.005428865246298249, "grad_norm": 47.029094729218436, "learning_rate": 5.4287613560824295e-06, "loss": 1.9805, "mean_token_accuracy": 0.5206896483898162, "step": 5390 }, { "epoch": 0.005433901299402422, "grad_norm": 83.2217552654011, "learning_rate": 5.433797312813489e-06, "loss": 2.3926, "mean_token_accuracy": 0.4896551728248596, "step": 5395 }, { "epoch": 0.005438937352506595, "grad_norm": 70.63687969253428, "learning_rate": 5.438833269544549e-06, "loss": 2.0463, "mean_token_accuracy": 0.46206897497177124, "step": 5400 }, { "epoch": 0.005443973405610768, "grad_norm": 65.12417167724513, "learning_rate": 5.443869226275608e-06, "loss": 2.1981, "mean_token_accuracy": 0.5206896543502808, "step": 5405 }, { "epoch": 0.0054490094587149405, "grad_norm": 69.81079100440061, "learning_rate": 5.448905183006668e-06, "loss": 2.3623, "mean_token_accuracy": 0.4068965554237366, "step": 5410 }, { "epoch": 0.005454045511819113, "grad_norm": 63.16387639721739, "learning_rate": 5.453941139737727e-06, "loss": 2.3249, "mean_token_accuracy": 0.4379310429096222, "step": 5415 }, { "epoch": 0.005459081564923285, "grad_norm": 64.81181393568374, "learning_rate": 5.4589770964687875e-06, "loss": 2.6455, "mean_token_accuracy": 0.3965517282485962, "step": 5420 }, { "epoch": 0.005464117618027458, "grad_norm": 44.52213605967673, "learning_rate": 5.464013053199847e-06, "loss": 2.2336, "mean_token_accuracy": 0.4620689630508423, "step": 5425 }, { "epoch": 0.005469153671131631, "grad_norm": 58.8393114707555, "learning_rate": 5.469049009930907e-06, "loss": 2.2412, "mean_token_accuracy": 0.4431337058544159, "step": 5430 }, { "epoch": 0.005474189724235804, "grad_norm": 50.30879302436615, "learning_rate": 5.474084966661967e-06, "loss": 2.2877, "mean_token_accuracy": 0.46551724076271056, "step": 5435 }, { "epoch": 0.005479225777339977, "grad_norm": 70.76102328863462, "learning_rate": 5.479120923393027e-06, "loss": 2.1271, "mean_token_accuracy": 0.4620689690113068, "step": 5440 }, { "epoch": 0.00548426183044415, "grad_norm": 62.62060847967712, "learning_rate": 5.484156880124086e-06, "loss": 2.4177, "mean_token_accuracy": 0.44827585816383364, "step": 5445 }, { "epoch": 0.005489297883548323, "grad_norm": 55.38496834228082, "learning_rate": 5.489192836855146e-06, "loss": 2.1094, "mean_token_accuracy": 0.4620689630508423, "step": 5450 }, { "epoch": 0.005494333936652495, "grad_norm": 56.23226674735334, "learning_rate": 5.494228793586206e-06, "loss": 2.4609, "mean_token_accuracy": 0.4310344815254211, "step": 5455 }, { "epoch": 0.005499369989756668, "grad_norm": 54.99112966327625, "learning_rate": 5.499264750317266e-06, "loss": 2.1683, "mean_token_accuracy": 0.4896551787853241, "step": 5460 }, { "epoch": 0.005504406042860841, "grad_norm": 64.42595924612726, "learning_rate": 5.504300707048325e-06, "loss": 2.1856, "mean_token_accuracy": 0.47586206197738645, "step": 5465 }, { "epoch": 0.0055094420959650135, "grad_norm": 59.95598926419033, "learning_rate": 5.509336663779385e-06, "loss": 2.2492, "mean_token_accuracy": 0.4862068951129913, "step": 5470 }, { "epoch": 0.005514478149069186, "grad_norm": 44.0925278011909, "learning_rate": 5.514372620510444e-06, "loss": 2.2783, "mean_token_accuracy": 0.4517241358757019, "step": 5475 }, { "epoch": 0.005519514202173359, "grad_norm": 64.05006231377183, "learning_rate": 5.5194085772415045e-06, "loss": 2.354, "mean_token_accuracy": 0.43793103098869324, "step": 5480 }, { "epoch": 0.005524550255277532, "grad_norm": 52.52561344609644, "learning_rate": 5.5244445339725646e-06, "loss": 2.0963, "mean_token_accuracy": 0.4620689690113068, "step": 5485 }, { "epoch": 0.005529586308381704, "grad_norm": 42.114320567333806, "learning_rate": 5.529480490703625e-06, "loss": 2.1934, "mean_token_accuracy": 0.44827585816383364, "step": 5490 }, { "epoch": 0.005534622361485877, "grad_norm": 57.01095671829796, "learning_rate": 5.534516447434684e-06, "loss": 2.4048, "mean_token_accuracy": 0.4, "step": 5495 }, { "epoch": 0.00553965841459005, "grad_norm": 49.00131848788912, "learning_rate": 5.539552404165744e-06, "loss": 2.3366, "mean_token_accuracy": 0.45027223229408264, "step": 5500 }, { "epoch": 0.005544694467694223, "grad_norm": 60.36814182264058, "learning_rate": 5.544588360896803e-06, "loss": 2.1626, "mean_token_accuracy": 0.4862068951129913, "step": 5505 }, { "epoch": 0.005549730520798396, "grad_norm": 65.08074346488905, "learning_rate": 5.549624317627863e-06, "loss": 2.2481, "mean_token_accuracy": 0.42413792610168455, "step": 5510 }, { "epoch": 0.005554766573902569, "grad_norm": 54.70828816858123, "learning_rate": 5.554660274358923e-06, "loss": 2.2637, "mean_token_accuracy": 0.4448275864124298, "step": 5515 }, { "epoch": 0.005559802627006742, "grad_norm": 63.353929079694254, "learning_rate": 5.559696231089983e-06, "loss": 2.2661, "mean_token_accuracy": 0.4896551728248596, "step": 5520 }, { "epoch": 0.005564838680110914, "grad_norm": 51.32308740958065, "learning_rate": 5.564732187821042e-06, "loss": 2.227, "mean_token_accuracy": 0.4310344815254211, "step": 5525 }, { "epoch": 0.005569874733215087, "grad_norm": 46.60665809118542, "learning_rate": 5.569768144552102e-06, "loss": 2.3502, "mean_token_accuracy": 0.4517241418361664, "step": 5530 }, { "epoch": 0.0055749107863192595, "grad_norm": 67.73235174555208, "learning_rate": 5.574804101283162e-06, "loss": 2.5164, "mean_token_accuracy": 0.41034482717514037, "step": 5535 }, { "epoch": 0.005579946839423432, "grad_norm": 52.24925622094272, "learning_rate": 5.579840058014222e-06, "loss": 2.2935, "mean_token_accuracy": 0.4689655125141144, "step": 5540 }, { "epoch": 0.005584982892527605, "grad_norm": 75.32023535126956, "learning_rate": 5.5848760147452815e-06, "loss": 2.3796, "mean_token_accuracy": 0.44827585816383364, "step": 5545 }, { "epoch": 0.005590018945631778, "grad_norm": 66.80441385795447, "learning_rate": 5.589911971476342e-06, "loss": 2.1181, "mean_token_accuracy": 0.48620688915252686, "step": 5550 }, { "epoch": 0.005595054998735951, "grad_norm": 59.94957939020645, "learning_rate": 5.594947928207401e-06, "loss": 2.4706, "mean_token_accuracy": 0.41379311084747317, "step": 5555 }, { "epoch": 0.005600091051840123, "grad_norm": 50.08012602349299, "learning_rate": 5.599983884938461e-06, "loss": 2.0734, "mean_token_accuracy": 0.4896551787853241, "step": 5560 }, { "epoch": 0.005605127104944296, "grad_norm": 65.919910906976, "learning_rate": 5.60501984166952e-06, "loss": 2.5222, "mean_token_accuracy": 0.4215517222881317, "step": 5565 }, { "epoch": 0.005610163158048469, "grad_norm": 60.10771322488036, "learning_rate": 5.61005579840058e-06, "loss": 2.3789, "mean_token_accuracy": 0.4344827651977539, "step": 5570 }, { "epoch": 0.005615199211152642, "grad_norm": 54.08231618732255, "learning_rate": 5.61509175513164e-06, "loss": 2.2147, "mean_token_accuracy": 0.47586206793785096, "step": 5575 }, { "epoch": 0.005620235264256815, "grad_norm": 62.05875026888758, "learning_rate": 5.6201277118627e-06, "loss": 2.0597, "mean_token_accuracy": 0.5034482777118683, "step": 5580 }, { "epoch": 0.005625271317360988, "grad_norm": 49.492586784714035, "learning_rate": 5.62516366859376e-06, "loss": 2.1203, "mean_token_accuracy": 0.46551724076271056, "step": 5585 }, { "epoch": 0.0056303073704651605, "grad_norm": 58.72449283191179, "learning_rate": 5.63019962532482e-06, "loss": 2.291, "mean_token_accuracy": 0.4206896543502808, "step": 5590 }, { "epoch": 0.0056353434235693325, "grad_norm": 55.98856238334883, "learning_rate": 5.635235582055879e-06, "loss": 2.3986, "mean_token_accuracy": 0.42758620381355283, "step": 5595 }, { "epoch": 0.005640379476673505, "grad_norm": 74.18381080201779, "learning_rate": 5.640271538786939e-06, "loss": 2.247, "mean_token_accuracy": 0.42413793206214906, "step": 5600 }, { "epoch": 0.005645415529777678, "grad_norm": 53.5420684785195, "learning_rate": 5.6453074955179985e-06, "loss": 2.5564, "mean_token_accuracy": 0.4137931078672409, "step": 5605 }, { "epoch": 0.005650451582881851, "grad_norm": 56.02088526313848, "learning_rate": 5.650343452249059e-06, "loss": 2.4848, "mean_token_accuracy": 0.4103448331356049, "step": 5610 }, { "epoch": 0.005655487635986024, "grad_norm": 49.00342280378441, "learning_rate": 5.655379408980118e-06, "loss": 2.4967, "mean_token_accuracy": 0.42413792610168455, "step": 5615 }, { "epoch": 0.005660523689090197, "grad_norm": 44.88229759761831, "learning_rate": 5.660415365711178e-06, "loss": 2.056, "mean_token_accuracy": 0.5000000059604645, "step": 5620 }, { "epoch": 0.00566555974219437, "grad_norm": 70.69456880129235, "learning_rate": 5.665451322442237e-06, "loss": 2.4881, "mean_token_accuracy": 0.4047791838645935, "step": 5625 }, { "epoch": 0.005670595795298542, "grad_norm": 59.18438327572698, "learning_rate": 5.670487279173297e-06, "loss": 2.48, "mean_token_accuracy": 0.4689655125141144, "step": 5630 }, { "epoch": 0.005675631848402715, "grad_norm": 40.75703500824426, "learning_rate": 5.675523235904357e-06, "loss": 2.1524, "mean_token_accuracy": 0.48756158351898193, "step": 5635 }, { "epoch": 0.005680667901506888, "grad_norm": 50.08284194333192, "learning_rate": 5.6805591926354175e-06, "loss": 2.1987, "mean_token_accuracy": 0.42758620977401735, "step": 5640 }, { "epoch": 0.005685703954611061, "grad_norm": 56.36385170838698, "learning_rate": 5.685595149366477e-06, "loss": 2.2666, "mean_token_accuracy": 0.44827585816383364, "step": 5645 }, { "epoch": 0.0056907400077152335, "grad_norm": 51.950733914048236, "learning_rate": 5.690631106097537e-06, "loss": 2.1852, "mean_token_accuracy": 0.43103447556495667, "step": 5650 }, { "epoch": 0.005695776060819406, "grad_norm": 60.96397142529166, "learning_rate": 5.695667062828596e-06, "loss": 2.3226, "mean_token_accuracy": 0.4103448212146759, "step": 5655 }, { "epoch": 0.005700812113923579, "grad_norm": 79.61105429853812, "learning_rate": 5.700703019559656e-06, "loss": 2.1115, "mean_token_accuracy": 0.47586206197738645, "step": 5660 }, { "epoch": 0.005705848167027751, "grad_norm": 75.72032938501376, "learning_rate": 5.7057389762907155e-06, "loss": 2.1379, "mean_token_accuracy": 0.5103448331356049, "step": 5665 }, { "epoch": 0.005710884220131924, "grad_norm": 59.13252588476254, "learning_rate": 5.7107749330217756e-06, "loss": 2.2078, "mean_token_accuracy": 0.49655172824859617, "step": 5670 }, { "epoch": 0.005715920273236097, "grad_norm": 68.9960825391538, "learning_rate": 5.715810889752835e-06, "loss": 2.4626, "mean_token_accuracy": 0.4551724076271057, "step": 5675 }, { "epoch": 0.00572095632634027, "grad_norm": 47.17846463373798, "learning_rate": 5.720846846483895e-06, "loss": 2.1256, "mean_token_accuracy": 0.458620685338974, "step": 5680 }, { "epoch": 0.005725992379444443, "grad_norm": 54.73905473454097, "learning_rate": 5.725882803214955e-06, "loss": 2.2757, "mean_token_accuracy": 0.4517241299152374, "step": 5685 }, { "epoch": 0.005731028432548616, "grad_norm": 63.42248559250619, "learning_rate": 5.730918759946015e-06, "loss": 2.2552, "mean_token_accuracy": 0.42758620381355283, "step": 5690 }, { "epoch": 0.005736064485652789, "grad_norm": 42.5949749046304, "learning_rate": 5.735954716677074e-06, "loss": 2.1937, "mean_token_accuracy": 0.4551724135875702, "step": 5695 }, { "epoch": 0.005741100538756961, "grad_norm": 69.13309493901087, "learning_rate": 5.7409906734081345e-06, "loss": 2.3679, "mean_token_accuracy": 0.4379310429096222, "step": 5700 }, { "epoch": 0.005746136591861134, "grad_norm": 59.63654313937762, "learning_rate": 5.746026630139194e-06, "loss": 2.1821, "mean_token_accuracy": 0.44301270246505736, "step": 5705 }, { "epoch": 0.0057511726449653066, "grad_norm": 61.580943844379135, "learning_rate": 5.751062586870254e-06, "loss": 1.983, "mean_token_accuracy": 0.4965517222881317, "step": 5710 }, { "epoch": 0.0057562086980694795, "grad_norm": 63.30812303070324, "learning_rate": 5.756098543601313e-06, "loss": 2.2954, "mean_token_accuracy": 0.47586207985877993, "step": 5715 }, { "epoch": 0.005761244751173652, "grad_norm": 62.340322181989, "learning_rate": 5.761134500332373e-06, "loss": 2.14, "mean_token_accuracy": 0.5, "step": 5720 }, { "epoch": 0.005766280804277825, "grad_norm": 59.57448420965543, "learning_rate": 5.766170457063433e-06, "loss": 2.531, "mean_token_accuracy": 0.3965517282485962, "step": 5725 }, { "epoch": 0.005771316857381997, "grad_norm": 93.63703696832067, "learning_rate": 5.7712064137944925e-06, "loss": 2.4451, "mean_token_accuracy": 0.4379310369491577, "step": 5730 }, { "epoch": 0.00577635291048617, "grad_norm": 66.08940271682413, "learning_rate": 5.7762423705255535e-06, "loss": 2.3376, "mean_token_accuracy": 0.4, "step": 5735 }, { "epoch": 0.005781388963590343, "grad_norm": 45.83839722180335, "learning_rate": 5.781278327256613e-06, "loss": 2.1261, "mean_token_accuracy": 0.49655171632766726, "step": 5740 }, { "epoch": 0.005786425016694516, "grad_norm": 52.48591290460214, "learning_rate": 5.786314283987673e-06, "loss": 2.3372, "mean_token_accuracy": 0.45517241954803467, "step": 5745 }, { "epoch": 0.005791461069798689, "grad_norm": 60.91483557097881, "learning_rate": 5.791350240718732e-06, "loss": 2.3398, "mean_token_accuracy": 0.45172412395477296, "step": 5750 }, { "epoch": 0.005796497122902862, "grad_norm": 70.67164561781587, "learning_rate": 5.796386197449792e-06, "loss": 2.085, "mean_token_accuracy": 0.49655172824859617, "step": 5755 }, { "epoch": 0.005801533176007035, "grad_norm": 50.44081672382117, "learning_rate": 5.8014221541808514e-06, "loss": 2.1841, "mean_token_accuracy": 0.4517241299152374, "step": 5760 }, { "epoch": 0.005806569229111207, "grad_norm": 67.56015597622336, "learning_rate": 5.8064581109119115e-06, "loss": 2.3871, "mean_token_accuracy": 0.4448275864124298, "step": 5765 }, { "epoch": 0.00581160528221538, "grad_norm": 49.52748932236692, "learning_rate": 5.811494067642971e-06, "loss": 2.2865, "mean_token_accuracy": 0.4586206912994385, "step": 5770 }, { "epoch": 0.0058166413353195525, "grad_norm": 67.4483402277736, "learning_rate": 5.816530024374031e-06, "loss": 2.3308, "mean_token_accuracy": 0.45899015069007876, "step": 5775 }, { "epoch": 0.005821677388423725, "grad_norm": 47.80006453818044, "learning_rate": 5.821565981105091e-06, "loss": 2.1648, "mean_token_accuracy": 0.4413793087005615, "step": 5780 }, { "epoch": 0.005826713441527898, "grad_norm": 47.4724644033891, "learning_rate": 5.826601937836151e-06, "loss": 2.1088, "mean_token_accuracy": 0.47586206197738645, "step": 5785 }, { "epoch": 0.005831749494632071, "grad_norm": 49.10590670907701, "learning_rate": 5.83163789456721e-06, "loss": 2.3584, "mean_token_accuracy": 0.42758620977401735, "step": 5790 }, { "epoch": 0.005836785547736244, "grad_norm": 51.676391997937756, "learning_rate": 5.8366738512982704e-06, "loss": 2.0627, "mean_token_accuracy": 0.4379310429096222, "step": 5795 }, { "epoch": 0.005841821600840416, "grad_norm": 46.147196643559006, "learning_rate": 5.84170980802933e-06, "loss": 2.1942, "mean_token_accuracy": 0.4965517222881317, "step": 5800 }, { "epoch": 0.005846857653944589, "grad_norm": 56.03749358458488, "learning_rate": 5.84674576476039e-06, "loss": 2.3903, "mean_token_accuracy": 0.3862068891525269, "step": 5805 }, { "epoch": 0.005851893707048762, "grad_norm": 49.44581882563388, "learning_rate": 5.851781721491449e-06, "loss": 2.4331, "mean_token_accuracy": 0.4034482717514038, "step": 5810 }, { "epoch": 0.005856929760152935, "grad_norm": 71.63146635800206, "learning_rate": 5.856817678222509e-06, "loss": 2.4119, "mean_token_accuracy": 0.3793103456497192, "step": 5815 }, { "epoch": 0.005861965813257108, "grad_norm": 49.778333141589755, "learning_rate": 5.861853634953568e-06, "loss": 2.3262, "mean_token_accuracy": 0.4344827651977539, "step": 5820 }, { "epoch": 0.005867001866361281, "grad_norm": 48.245521713618245, "learning_rate": 5.8668895916846285e-06, "loss": 2.4417, "mean_token_accuracy": 0.4034482777118683, "step": 5825 }, { "epoch": 0.0058720379194654535, "grad_norm": 41.717443758605434, "learning_rate": 5.871925548415689e-06, "loss": 1.977, "mean_token_accuracy": 0.49231699109077454, "step": 5830 }, { "epoch": 0.0058770739725696255, "grad_norm": 62.59403087777202, "learning_rate": 5.876961505146749e-06, "loss": 2.2271, "mean_token_accuracy": 0.4517241358757019, "step": 5835 }, { "epoch": 0.0058821100256737984, "grad_norm": 56.003939996292274, "learning_rate": 5.881997461877808e-06, "loss": 2.6189, "mean_token_accuracy": 0.3931034505367279, "step": 5840 }, { "epoch": 0.005887146078777971, "grad_norm": 47.416387974853365, "learning_rate": 5.887033418608868e-06, "loss": 1.9012, "mean_token_accuracy": 0.4896551787853241, "step": 5845 }, { "epoch": 0.005892182131882144, "grad_norm": 49.586942409837356, "learning_rate": 5.892069375339927e-06, "loss": 2.057, "mean_token_accuracy": 0.5068965494632721, "step": 5850 }, { "epoch": 0.005897218184986317, "grad_norm": 59.682775630254525, "learning_rate": 5.897105332070987e-06, "loss": 2.1288, "mean_token_accuracy": 0.5034482777118683, "step": 5855 }, { "epoch": 0.00590225423809049, "grad_norm": 56.77091109864296, "learning_rate": 5.902141288802047e-06, "loss": 2.8807, "mean_token_accuracy": 0.37241379022598264, "step": 5860 }, { "epoch": 0.005907290291194663, "grad_norm": 46.001687468067615, "learning_rate": 5.907177245533107e-06, "loss": 2.4388, "mean_token_accuracy": 0.4172413766384125, "step": 5865 }, { "epoch": 0.005912326344298835, "grad_norm": 66.65662223029962, "learning_rate": 5.912213202264166e-06, "loss": 2.2835, "mean_token_accuracy": 0.42413792610168455, "step": 5870 }, { "epoch": 0.005917362397403008, "grad_norm": 55.869435267795275, "learning_rate": 5.917249158995226e-06, "loss": 2.2795, "mean_token_accuracy": 0.4275862157344818, "step": 5875 }, { "epoch": 0.005922398450507181, "grad_norm": 59.690548626168386, "learning_rate": 5.922285115726286e-06, "loss": 2.0925, "mean_token_accuracy": 0.475862056016922, "step": 5880 }, { "epoch": 0.005927434503611354, "grad_norm": 63.0369648609136, "learning_rate": 5.927321072457346e-06, "loss": 2.3474, "mean_token_accuracy": 0.45172412395477296, "step": 5885 }, { "epoch": 0.0059324705567155266, "grad_norm": 44.779249009806435, "learning_rate": 5.9323570291884056e-06, "loss": 2.5309, "mean_token_accuracy": 0.3793103456497192, "step": 5890 }, { "epoch": 0.0059375066098196995, "grad_norm": 50.12635118283943, "learning_rate": 5.937392985919466e-06, "loss": 2.4249, "mean_token_accuracy": 0.3758620619773865, "step": 5895 }, { "epoch": 0.005942542662923872, "grad_norm": 53.6000511347473, "learning_rate": 5.942428942650525e-06, "loss": 2.1204, "mean_token_accuracy": 0.4379310369491577, "step": 5900 }, { "epoch": 0.005947578716028044, "grad_norm": 42.2251064214332, "learning_rate": 5.947464899381585e-06, "loss": 2.4249, "mean_token_accuracy": 0.41724138259887694, "step": 5905 }, { "epoch": 0.005952614769132217, "grad_norm": 45.74037917102216, "learning_rate": 5.952500856112644e-06, "loss": 2.2228, "mean_token_accuracy": 0.4517241299152374, "step": 5910 }, { "epoch": 0.00595765082223639, "grad_norm": 47.18715188090433, "learning_rate": 5.957536812843704e-06, "loss": 2.267, "mean_token_accuracy": 0.44827587008476255, "step": 5915 }, { "epoch": 0.005962686875340563, "grad_norm": 73.07126749022403, "learning_rate": 5.962572769574764e-06, "loss": 2.2476, "mean_token_accuracy": 0.48275861144065857, "step": 5920 }, { "epoch": 0.005967722928444736, "grad_norm": 45.65174713110633, "learning_rate": 5.967608726305824e-06, "loss": 2.01, "mean_token_accuracy": 0.5017543852329254, "step": 5925 }, { "epoch": 0.005972758981548909, "grad_norm": 73.17486697360341, "learning_rate": 5.972644683036884e-06, "loss": 2.2443, "mean_token_accuracy": 0.4517241299152374, "step": 5930 }, { "epoch": 0.005977795034653082, "grad_norm": 50.62187265347177, "learning_rate": 5.977680639767944e-06, "loss": 2.3614, "mean_token_accuracy": 0.4413793087005615, "step": 5935 }, { "epoch": 0.005982831087757254, "grad_norm": 71.65213282291283, "learning_rate": 5.982716596499003e-06, "loss": 2.0024, "mean_token_accuracy": 0.5379310309886932, "step": 5940 }, { "epoch": 0.005987867140861427, "grad_norm": 45.805198992796576, "learning_rate": 5.987752553230063e-06, "loss": 2.2747, "mean_token_accuracy": 0.4379310369491577, "step": 5945 }, { "epoch": 0.0059929031939656, "grad_norm": 63.84750561509887, "learning_rate": 5.9927885099611225e-06, "loss": 2.1543, "mean_token_accuracy": 0.46787658929824827, "step": 5950 }, { "epoch": 0.0059979392470697725, "grad_norm": 52.74623210309535, "learning_rate": 5.997824466692183e-06, "loss": 2.3777, "mean_token_accuracy": 0.482758617401123, "step": 5955 }, { "epoch": 0.006002975300173945, "grad_norm": 44.82689823442204, "learning_rate": 6.002860423423242e-06, "loss": 2.3646, "mean_token_accuracy": 0.43103448748588563, "step": 5960 }, { "epoch": 0.006008011353278118, "grad_norm": 36.79439771031893, "learning_rate": 6.007896380154302e-06, "loss": 2.5272, "mean_token_accuracy": 0.4241379380226135, "step": 5965 }, { "epoch": 0.006013047406382291, "grad_norm": 46.76840821428695, "learning_rate": 6.012932336885361e-06, "loss": 2.6179, "mean_token_accuracy": 0.4103448212146759, "step": 5970 }, { "epoch": 0.006018083459486463, "grad_norm": 61.742549064534096, "learning_rate": 6.017968293616421e-06, "loss": 2.4178, "mean_token_accuracy": 0.4413793087005615, "step": 5975 }, { "epoch": 0.006023119512590636, "grad_norm": 39.96388589238497, "learning_rate": 6.0230042503474814e-06, "loss": 2.2839, "mean_token_accuracy": 0.47586206197738645, "step": 5980 }, { "epoch": 0.006028155565694809, "grad_norm": 51.1152972679412, "learning_rate": 6.0280402070785415e-06, "loss": 2.4542, "mean_token_accuracy": 0.46551724076271056, "step": 5985 }, { "epoch": 0.006033191618798982, "grad_norm": 41.03389359657289, "learning_rate": 6.033076163809601e-06, "loss": 2.2936, "mean_token_accuracy": 0.4896551787853241, "step": 5990 }, { "epoch": 0.006038227671903155, "grad_norm": 52.97017434032984, "learning_rate": 6.038112120540661e-06, "loss": 2.6077, "mean_token_accuracy": 0.4172413766384125, "step": 5995 }, { "epoch": 0.006043263725007328, "grad_norm": 54.56388228996818, "learning_rate": 6.04314807727172e-06, "loss": 2.688, "mean_token_accuracy": 0.3551724225282669, "step": 6000 }, { "epoch": 0.006048299778111501, "grad_norm": 59.272229527489564, "learning_rate": 6.04818403400278e-06, "loss": 2.4984, "mean_token_accuracy": 0.44482758045196535, "step": 6005 }, { "epoch": 0.006053335831215673, "grad_norm": 55.3052764060603, "learning_rate": 6.0532199907338395e-06, "loss": 2.3765, "mean_token_accuracy": 0.4586206912994385, "step": 6010 }, { "epoch": 0.0060583718843198455, "grad_norm": 43.43145741753114, "learning_rate": 6.0582559474649e-06, "loss": 2.4768, "mean_token_accuracy": 0.38965516686439516, "step": 6015 }, { "epoch": 0.0060634079374240184, "grad_norm": 74.04778846790151, "learning_rate": 6.063291904195959e-06, "loss": 2.4068, "mean_token_accuracy": 0.41034482717514037, "step": 6020 }, { "epoch": 0.006068443990528191, "grad_norm": 48.8311733790857, "learning_rate": 6.068327860927019e-06, "loss": 2.628, "mean_token_accuracy": 0.4120992124080658, "step": 6025 }, { "epoch": 0.006073480043632364, "grad_norm": 68.89743007492572, "learning_rate": 6.073363817658079e-06, "loss": 2.2201, "mean_token_accuracy": 0.4436781704425812, "step": 6030 }, { "epoch": 0.006078516096736537, "grad_norm": 64.98102392847618, "learning_rate": 6.078399774389139e-06, "loss": 2.3434, "mean_token_accuracy": 0.46551724672317507, "step": 6035 }, { "epoch": 0.00608355214984071, "grad_norm": 53.137492829913924, "learning_rate": 6.083435731120198e-06, "loss": 2.1615, "mean_token_accuracy": 0.4448275864124298, "step": 6040 }, { "epoch": 0.006088588202944882, "grad_norm": 51.69054411972822, "learning_rate": 6.0884716878512585e-06, "loss": 2.1671, "mean_token_accuracy": 0.49999999403953554, "step": 6045 }, { "epoch": 0.006093624256049055, "grad_norm": 40.8330215843773, "learning_rate": 6.093507644582318e-06, "loss": 2.1323, "mean_token_accuracy": 0.4517241358757019, "step": 6050 }, { "epoch": 0.006098660309153228, "grad_norm": 44.62175430710238, "learning_rate": 6.098543601313378e-06, "loss": 2.2325, "mean_token_accuracy": 0.45862069725990295, "step": 6055 }, { "epoch": 0.006103696362257401, "grad_norm": 91.88244434629704, "learning_rate": 6.103579558044437e-06, "loss": 2.1949, "mean_token_accuracy": 0.45862069725990295, "step": 6060 }, { "epoch": 0.006108732415361574, "grad_norm": 47.25705097010408, "learning_rate": 6.108615514775497e-06, "loss": 2.292, "mean_token_accuracy": 0.4758620738983154, "step": 6065 }, { "epoch": 0.0061137684684657466, "grad_norm": 48.18774763111376, "learning_rate": 6.1136514715065565e-06, "loss": 2.3922, "mean_token_accuracy": 0.42413792610168455, "step": 6070 }, { "epoch": 0.0061188045215699195, "grad_norm": 48.67803708131491, "learning_rate": 6.1186874282376166e-06, "loss": 2.2161, "mean_token_accuracy": 0.4950393199920654, "step": 6075 }, { "epoch": 0.0061238405746740915, "grad_norm": 54.9652354132669, "learning_rate": 6.123723384968677e-06, "loss": 2.3173, "mean_token_accuracy": 0.4448275864124298, "step": 6080 }, { "epoch": 0.006128876627778264, "grad_norm": 67.28103712072297, "learning_rate": 6.128759341699737e-06, "loss": 2.2039, "mean_token_accuracy": 0.4448275864124298, "step": 6085 }, { "epoch": 0.006133912680882437, "grad_norm": 46.78691878078224, "learning_rate": 6.133795298430796e-06, "loss": 2.366, "mean_token_accuracy": 0.4344827592372894, "step": 6090 }, { "epoch": 0.00613894873398661, "grad_norm": 51.59746353275638, "learning_rate": 6.138831255161856e-06, "loss": 2.1121, "mean_token_accuracy": 0.48275861144065857, "step": 6095 }, { "epoch": 0.006143984787090783, "grad_norm": 44.57197378514623, "learning_rate": 6.143867211892915e-06, "loss": 2.3257, "mean_token_accuracy": 0.4551724135875702, "step": 6100 }, { "epoch": 0.006149020840194956, "grad_norm": 41.87145916444733, "learning_rate": 6.1489031686239755e-06, "loss": 2.2679, "mean_token_accuracy": 0.47586206197738645, "step": 6105 }, { "epoch": 0.006154056893299129, "grad_norm": 50.881408989922846, "learning_rate": 6.153939125355035e-06, "loss": 2.09, "mean_token_accuracy": 0.46896552443504336, "step": 6110 }, { "epoch": 0.006159092946403301, "grad_norm": 49.065415270065614, "learning_rate": 6.158975082086095e-06, "loss": 2.4182, "mean_token_accuracy": 0.4068965494632721, "step": 6115 }, { "epoch": 0.006164128999507474, "grad_norm": 56.40941295285368, "learning_rate": 6.164011038817154e-06, "loss": 2.2607, "mean_token_accuracy": 0.44827585816383364, "step": 6120 }, { "epoch": 0.006169165052611647, "grad_norm": 59.94106134132087, "learning_rate": 6.169046995548214e-06, "loss": 2.2057, "mean_token_accuracy": 0.4620689690113068, "step": 6125 }, { "epoch": 0.00617420110571582, "grad_norm": 60.51513615811769, "learning_rate": 6.174082952279274e-06, "loss": 2.6637, "mean_token_accuracy": 0.4034482717514038, "step": 6130 }, { "epoch": 0.0061792371588199925, "grad_norm": 45.04109258907642, "learning_rate": 6.179118909010334e-06, "loss": 2.3988, "mean_token_accuracy": 0.4896551728248596, "step": 6135 }, { "epoch": 0.006184273211924165, "grad_norm": 42.07406866907482, "learning_rate": 6.1841548657413945e-06, "loss": 2.2585, "mean_token_accuracy": 0.4689655125141144, "step": 6140 }, { "epoch": 0.006189309265028338, "grad_norm": 35.23544246771574, "learning_rate": 6.189190822472454e-06, "loss": 2.0644, "mean_token_accuracy": 0.48965516686439514, "step": 6145 }, { "epoch": 0.00619434531813251, "grad_norm": 51.947830805105106, "learning_rate": 6.194226779203514e-06, "loss": 2.1011, "mean_token_accuracy": 0.5137930989265442, "step": 6150 }, { "epoch": 0.006199381371236683, "grad_norm": 40.92681724617014, "learning_rate": 6.199262735934573e-06, "loss": 1.7219, "mean_token_accuracy": 0.558620685338974, "step": 6155 }, { "epoch": 0.006204417424340856, "grad_norm": 72.52788549550016, "learning_rate": 6.204298692665633e-06, "loss": 2.3502, "mean_token_accuracy": 0.44482758045196535, "step": 6160 }, { "epoch": 0.006209453477445029, "grad_norm": 40.4514357625008, "learning_rate": 6.2093346493966924e-06, "loss": 2.0021, "mean_token_accuracy": 0.5034482657909394, "step": 6165 }, { "epoch": 0.006214489530549202, "grad_norm": 42.23448253442933, "learning_rate": 6.2143706061277525e-06, "loss": 2.3192, "mean_token_accuracy": 0.43793103098869324, "step": 6170 }, { "epoch": 0.006219525583653375, "grad_norm": 45.95498196457272, "learning_rate": 6.219406562858812e-06, "loss": 2.3219, "mean_token_accuracy": 0.4482758641242981, "step": 6175 }, { "epoch": 0.006224561636757548, "grad_norm": 71.03223809129595, "learning_rate": 6.224442519589872e-06, "loss": 2.533, "mean_token_accuracy": 0.4034482717514038, "step": 6180 }, { "epoch": 0.00622959768986172, "grad_norm": 60.59925401084252, "learning_rate": 6.229478476320932e-06, "loss": 2.6162, "mean_token_accuracy": 0.3620689630508423, "step": 6185 }, { "epoch": 0.006234633742965893, "grad_norm": 63.38753204190949, "learning_rate": 6.234514433051992e-06, "loss": 2.2126, "mean_token_accuracy": 0.44827585816383364, "step": 6190 }, { "epoch": 0.0062396697960700655, "grad_norm": 60.06166266856566, "learning_rate": 6.239550389783051e-06, "loss": 2.2572, "mean_token_accuracy": 0.44827585816383364, "step": 6195 }, { "epoch": 0.0062447058491742384, "grad_norm": 71.29171513043804, "learning_rate": 6.2445863465141114e-06, "loss": 2.5219, "mean_token_accuracy": 0.40344826579093934, "step": 6200 }, { "epoch": 0.006249741902278411, "grad_norm": 49.1617253919964, "learning_rate": 6.249622303245171e-06, "loss": 2.4708, "mean_token_accuracy": 0.3827586233615875, "step": 6205 }, { "epoch": 0.006254777955382584, "grad_norm": 52.14615912760074, "learning_rate": 6.25465825997623e-06, "loss": 2.3844, "mean_token_accuracy": 0.44827585220336913, "step": 6210 }, { "epoch": 0.006259814008486756, "grad_norm": 53.628046240845265, "learning_rate": 6.25969421670729e-06, "loss": 2.1537, "mean_token_accuracy": 0.4413792997598648, "step": 6215 }, { "epoch": 0.006264850061590929, "grad_norm": 51.347910812131715, "learning_rate": 6.26473017343835e-06, "loss": 2.2062, "mean_token_accuracy": 0.4551724076271057, "step": 6220 }, { "epoch": 0.006269886114695102, "grad_norm": 45.999235325877976, "learning_rate": 6.26976613016941e-06, "loss": 2.3164, "mean_token_accuracy": 0.44482759237289426, "step": 6225 }, { "epoch": 0.006274922167799275, "grad_norm": 70.87573238831074, "learning_rate": 6.2748020869004695e-06, "loss": 2.0772, "mean_token_accuracy": 0.42758620381355283, "step": 6230 }, { "epoch": 0.006279958220903448, "grad_norm": 50.89961608996036, "learning_rate": 6.27983804363153e-06, "loss": 2.3694, "mean_token_accuracy": 0.42068964838981626, "step": 6235 }, { "epoch": 0.006284994274007621, "grad_norm": 70.38825319121104, "learning_rate": 6.28487400036259e-06, "loss": 2.2528, "mean_token_accuracy": 0.4413793087005615, "step": 6240 }, { "epoch": 0.006290030327111794, "grad_norm": 43.413320890677525, "learning_rate": 6.28990995709365e-06, "loss": 2.3949, "mean_token_accuracy": 0.4034482717514038, "step": 6245 }, { "epoch": 0.006295066380215966, "grad_norm": 53.35471033987399, "learning_rate": 6.294945913824708e-06, "loss": 2.3449, "mean_token_accuracy": 0.4068965554237366, "step": 6250 }, { "epoch": 0.006300102433320139, "grad_norm": 49.48306681907208, "learning_rate": 6.299981870555768e-06, "loss": 2.4828, "mean_token_accuracy": 0.450030255317688, "step": 6255 }, { "epoch": 0.0063051384864243115, "grad_norm": 51.66348090698549, "learning_rate": 6.305017827286828e-06, "loss": 2.4319, "mean_token_accuracy": 0.4172413766384125, "step": 6260 }, { "epoch": 0.006310174539528484, "grad_norm": 54.983296847206454, "learning_rate": 6.3100537840178885e-06, "loss": 2.5207, "mean_token_accuracy": 0.3896551728248596, "step": 6265 }, { "epoch": 0.006315210592632657, "grad_norm": 57.2159305784855, "learning_rate": 6.315089740748947e-06, "loss": 2.1859, "mean_token_accuracy": 0.5241379261016845, "step": 6270 }, { "epoch": 0.00632024664573683, "grad_norm": 49.84401674196088, "learning_rate": 6.320125697480008e-06, "loss": 2.2443, "mean_token_accuracy": 0.45862067937850953, "step": 6275 }, { "epoch": 0.006325282698841003, "grad_norm": 50.34434897808259, "learning_rate": 6.325161654211068e-06, "loss": 2.3023, "mean_token_accuracy": 0.4843315064907074, "step": 6280 }, { "epoch": 0.006330318751945175, "grad_norm": 37.64893641660931, "learning_rate": 6.330197610942128e-06, "loss": 2.2012, "mean_token_accuracy": 0.47241379618644713, "step": 6285 }, { "epoch": 0.006335354805049348, "grad_norm": 54.30004367765904, "learning_rate": 6.3352335676731865e-06, "loss": 2.2012, "mean_token_accuracy": 0.46067755222320556, "step": 6290 }, { "epoch": 0.006340390858153521, "grad_norm": 62.30228385139683, "learning_rate": 6.3402695244042466e-06, "loss": 2.372, "mean_token_accuracy": 0.47586206793785096, "step": 6295 }, { "epoch": 0.006345426911257694, "grad_norm": 54.45164943957353, "learning_rate": 6.345305481135307e-06, "loss": 2.2323, "mean_token_accuracy": 0.46206897497177124, "step": 6300 }, { "epoch": 0.006350462964361867, "grad_norm": 51.94258537664782, "learning_rate": 6.350341437866367e-06, "loss": 2.4528, "mean_token_accuracy": 0.4517241418361664, "step": 6305 }, { "epoch": 0.00635549901746604, "grad_norm": 41.92105173730461, "learning_rate": 6.355377394597425e-06, "loss": 2.3346, "mean_token_accuracy": 0.5034482717514038, "step": 6310 }, { "epoch": 0.0063605350705702125, "grad_norm": 57.91014312614557, "learning_rate": 6.360413351328485e-06, "loss": 2.1116, "mean_token_accuracy": 0.4931034445762634, "step": 6315 }, { "epoch": 0.0063655711236743845, "grad_norm": 57.6689154729563, "learning_rate": 6.365449308059545e-06, "loss": 2.1509, "mean_token_accuracy": 0.4724137902259827, "step": 6320 }, { "epoch": 0.006370607176778557, "grad_norm": 49.18580348890661, "learning_rate": 6.3704852647906055e-06, "loss": 2.1044, "mean_token_accuracy": 0.4610405325889587, "step": 6325 }, { "epoch": 0.00637564322988273, "grad_norm": 41.5843166932207, "learning_rate": 6.375521221521665e-06, "loss": 2.7451, "mean_token_accuracy": 0.4482758641242981, "step": 6330 }, { "epoch": 0.006380679282986903, "grad_norm": 59.449037433241976, "learning_rate": 6.380557178252725e-06, "loss": 2.5404, "mean_token_accuracy": 0.42976405322551725, "step": 6335 }, { "epoch": 0.006385715336091076, "grad_norm": 47.41394815150942, "learning_rate": 6.385593134983785e-06, "loss": 2.3082, "mean_token_accuracy": 0.42758620381355283, "step": 6340 }, { "epoch": 0.006390751389195249, "grad_norm": 66.59307195900514, "learning_rate": 6.390629091714845e-06, "loss": 2.1217, "mean_token_accuracy": 0.44827585816383364, "step": 6345 }, { "epoch": 0.006395787442299422, "grad_norm": 50.20758022153302, "learning_rate": 6.395665048445903e-06, "loss": 2.1906, "mean_token_accuracy": 0.4689655125141144, "step": 6350 }, { "epoch": 0.006400823495403594, "grad_norm": 46.68766382287173, "learning_rate": 6.4007010051769635e-06, "loss": 2.4806, "mean_token_accuracy": 0.4172413766384125, "step": 6355 }, { "epoch": 0.006405859548507767, "grad_norm": 78.4136007973002, "learning_rate": 6.405736961908024e-06, "loss": 2.8195, "mean_token_accuracy": 0.38965516686439516, "step": 6360 }, { "epoch": 0.00641089560161194, "grad_norm": 47.809726494906585, "learning_rate": 6.410772918639084e-06, "loss": 2.4458, "mean_token_accuracy": 0.4428917169570923, "step": 6365 }, { "epoch": 0.006415931654716113, "grad_norm": 54.69693504704357, "learning_rate": 6.415808875370143e-06, "loss": 2.3096, "mean_token_accuracy": 0.4068965494632721, "step": 6370 }, { "epoch": 0.0064209677078202855, "grad_norm": 45.18477247322321, "learning_rate": 6.420844832101203e-06, "loss": 2.2768, "mean_token_accuracy": 0.4448275864124298, "step": 6375 }, { "epoch": 0.0064260037609244584, "grad_norm": 41.75081284043611, "learning_rate": 6.425880788832263e-06, "loss": 2.1484, "mean_token_accuracy": 0.5000000059604645, "step": 6380 }, { "epoch": 0.006431039814028631, "grad_norm": 60.48977352776294, "learning_rate": 6.430916745563323e-06, "loss": 2.0936, "mean_token_accuracy": 0.4448275864124298, "step": 6385 }, { "epoch": 0.006436075867132803, "grad_norm": 50.43668501080648, "learning_rate": 6.435952702294382e-06, "loss": 2.3059, "mean_token_accuracy": 0.4413793087005615, "step": 6390 }, { "epoch": 0.006441111920236976, "grad_norm": 70.28522233717965, "learning_rate": 6.440988659025442e-06, "loss": 2.2686, "mean_token_accuracy": 0.44652147889137267, "step": 6395 }, { "epoch": 0.006446147973341149, "grad_norm": 57.66148989977817, "learning_rate": 6.446024615756502e-06, "loss": 2.2815, "mean_token_accuracy": 0.4551724135875702, "step": 6400 }, { "epoch": 0.006451184026445322, "grad_norm": 42.62732695699391, "learning_rate": 6.451060572487562e-06, "loss": 2.3338, "mean_token_accuracy": 0.42413792610168455, "step": 6405 }, { "epoch": 0.006456220079549495, "grad_norm": 71.03479029588128, "learning_rate": 6.45609652921862e-06, "loss": 2.1171, "mean_token_accuracy": 0.46467028856277465, "step": 6410 }, { "epoch": 0.006461256132653668, "grad_norm": 54.16591231991322, "learning_rate": 6.4611324859496805e-06, "loss": 1.9617, "mean_token_accuracy": 0.5052026629447937, "step": 6415 }, { "epoch": 0.006466292185757841, "grad_norm": 43.26053484179596, "learning_rate": 6.466168442680741e-06, "loss": 2.2872, "mean_token_accuracy": 0.44827585816383364, "step": 6420 }, { "epoch": 0.006471328238862013, "grad_norm": 52.84199867986601, "learning_rate": 6.471204399411801e-06, "loss": 2.306, "mean_token_accuracy": 0.458620685338974, "step": 6425 }, { "epoch": 0.006476364291966186, "grad_norm": 46.96134223366744, "learning_rate": 6.47624035614286e-06, "loss": 2.3141, "mean_token_accuracy": 0.4931034445762634, "step": 6430 }, { "epoch": 0.006481400345070359, "grad_norm": 73.80457116500342, "learning_rate": 6.48127631287392e-06, "loss": 2.5553, "mean_token_accuracy": 0.39999999701976774, "step": 6435 }, { "epoch": 0.0064864363981745315, "grad_norm": 65.85543230417136, "learning_rate": 6.48631226960498e-06, "loss": 2.4888, "mean_token_accuracy": 0.3999999940395355, "step": 6440 }, { "epoch": 0.006491472451278704, "grad_norm": 41.42742849971942, "learning_rate": 6.49134822633604e-06, "loss": 1.9763, "mean_token_accuracy": 0.4931034445762634, "step": 6445 }, { "epoch": 0.006496508504382877, "grad_norm": 48.89036856678386, "learning_rate": 6.496384183067099e-06, "loss": 2.2551, "mean_token_accuracy": 0.4517241358757019, "step": 6450 }, { "epoch": 0.00650154455748705, "grad_norm": 52.465825307129904, "learning_rate": 6.501420139798159e-06, "loss": 2.3935, "mean_token_accuracy": 0.41034482717514037, "step": 6455 }, { "epoch": 0.006506580610591222, "grad_norm": 59.072941227879916, "learning_rate": 6.506456096529219e-06, "loss": 2.5701, "mean_token_accuracy": 0.37586207389831544, "step": 6460 }, { "epoch": 0.006511616663695395, "grad_norm": 51.59900121801609, "learning_rate": 6.511492053260279e-06, "loss": 2.2572, "mean_token_accuracy": 0.4310344815254211, "step": 6465 }, { "epoch": 0.006516652716799568, "grad_norm": 64.40473563043554, "learning_rate": 6.516528009991338e-06, "loss": 2.4966, "mean_token_accuracy": 0.441379314661026, "step": 6470 }, { "epoch": 0.006521688769903741, "grad_norm": 53.07618014705688, "learning_rate": 6.521563966722398e-06, "loss": 2.0773, "mean_token_accuracy": 0.5053236544132232, "step": 6475 }, { "epoch": 0.006526724823007914, "grad_norm": 49.69534158548225, "learning_rate": 6.526599923453458e-06, "loss": 2.2584, "mean_token_accuracy": 0.43647912740707395, "step": 6480 }, { "epoch": 0.006531760876112087, "grad_norm": 36.00189829175751, "learning_rate": 6.5316358801845185e-06, "loss": 2.2539, "mean_token_accuracy": 0.4413793087005615, "step": 6485 }, { "epoch": 0.00653679692921626, "grad_norm": 51.085414882729374, "learning_rate": 6.536671836915577e-06, "loss": 2.1821, "mean_token_accuracy": 0.4517241418361664, "step": 6490 }, { "epoch": 0.006541832982320432, "grad_norm": 41.33047324342737, "learning_rate": 6.541707793646637e-06, "loss": 2.5362, "mean_token_accuracy": 0.41379310488700866, "step": 6495 }, { "epoch": 0.0065468690354246045, "grad_norm": 39.817116760636985, "learning_rate": 6.546743750377697e-06, "loss": 2.0292, "mean_token_accuracy": 0.45517241954803467, "step": 6500 }, { "epoch": 0.006551905088528777, "grad_norm": 54.476993781896766, "learning_rate": 6.551779707108757e-06, "loss": 2.4226, "mean_token_accuracy": 0.44137930274009707, "step": 6505 }, { "epoch": 0.00655694114163295, "grad_norm": 37.075696447095005, "learning_rate": 6.556815663839816e-06, "loss": 2.1614, "mean_token_accuracy": 0.4724137902259827, "step": 6510 }, { "epoch": 0.006561977194737123, "grad_norm": 41.83870390251137, "learning_rate": 6.561851620570876e-06, "loss": 2.3574, "mean_token_accuracy": 0.44827587008476255, "step": 6515 }, { "epoch": 0.006567013247841296, "grad_norm": 49.884207237603086, "learning_rate": 6.566887577301936e-06, "loss": 1.9869, "mean_token_accuracy": 0.4689655125141144, "step": 6520 }, { "epoch": 0.006572049300945469, "grad_norm": 64.91084528084171, "learning_rate": 6.571923534032996e-06, "loss": 1.9082, "mean_token_accuracy": 0.5275862038135528, "step": 6525 }, { "epoch": 0.006577085354049641, "grad_norm": 67.48064653609676, "learning_rate": 6.576959490764055e-06, "loss": 2.3476, "mean_token_accuracy": 0.41034482717514037, "step": 6530 }, { "epoch": 0.006582121407153814, "grad_norm": 47.06341963789077, "learning_rate": 6.581995447495115e-06, "loss": 2.1041, "mean_token_accuracy": 0.4896551728248596, "step": 6535 }, { "epoch": 0.006587157460257987, "grad_norm": 40.80627504404265, "learning_rate": 6.587031404226175e-06, "loss": 2.145, "mean_token_accuracy": 0.4517241418361664, "step": 6540 }, { "epoch": 0.00659219351336216, "grad_norm": 61.507831111427976, "learning_rate": 6.5920673609572355e-06, "loss": 2.2986, "mean_token_accuracy": 0.42413793206214906, "step": 6545 }, { "epoch": 0.006597229566466333, "grad_norm": 37.707830868354016, "learning_rate": 6.5971033176882956e-06, "loss": 2.1459, "mean_token_accuracy": 0.47931034564971925, "step": 6550 }, { "epoch": 0.0066022656195705055, "grad_norm": 44.45500335259625, "learning_rate": 6.602139274419354e-06, "loss": 2.1738, "mean_token_accuracy": 0.44482759237289426, "step": 6555 }, { "epoch": 0.006607301672674678, "grad_norm": 53.06572702603528, "learning_rate": 6.607175231150414e-06, "loss": 2.4652, "mean_token_accuracy": 0.42068964838981626, "step": 6560 }, { "epoch": 0.0066123377257788505, "grad_norm": 51.13847730255945, "learning_rate": 6.612211187881474e-06, "loss": 2.3187, "mean_token_accuracy": 0.417241370677948, "step": 6565 }, { "epoch": 0.006617373778883023, "grad_norm": 61.94679831656144, "learning_rate": 6.617247144612534e-06, "loss": 1.8596, "mean_token_accuracy": 0.5539624989032745, "step": 6570 }, { "epoch": 0.006622409831987196, "grad_norm": 57.21162842111359, "learning_rate": 6.6222831013435935e-06, "loss": 2.1794, "mean_token_accuracy": 0.482758617401123, "step": 6575 }, { "epoch": 0.006627445885091369, "grad_norm": 60.123527880645874, "learning_rate": 6.627319058074654e-06, "loss": 2.2354, "mean_token_accuracy": 0.4724137902259827, "step": 6580 }, { "epoch": 0.006632481938195542, "grad_norm": 47.63020519891939, "learning_rate": 6.632355014805714e-06, "loss": 1.9821, "mean_token_accuracy": 0.47241380214691164, "step": 6585 }, { "epoch": 0.006637517991299715, "grad_norm": 48.66873674193257, "learning_rate": 6.637390971536774e-06, "loss": 2.0273, "mean_token_accuracy": 0.5000000059604645, "step": 6590 }, { "epoch": 0.006642554044403888, "grad_norm": 50.00042456961159, "learning_rate": 6.642426928267832e-06, "loss": 2.2344, "mean_token_accuracy": 0.44827585518360136, "step": 6595 }, { "epoch": 0.00664759009750806, "grad_norm": 56.230081942508846, "learning_rate": 6.647462884998892e-06, "loss": 1.9971, "mean_token_accuracy": 0.4965517222881317, "step": 6600 }, { "epoch": 0.006652626150612233, "grad_norm": 51.54557010375823, "learning_rate": 6.6524988417299524e-06, "loss": 2.5054, "mean_token_accuracy": 0.42068966031074523, "step": 6605 }, { "epoch": 0.006657662203716406, "grad_norm": 96.51899146918406, "learning_rate": 6.6575347984610125e-06, "loss": 2.3347, "mean_token_accuracy": 0.44827587008476255, "step": 6610 }, { "epoch": 0.006662698256820579, "grad_norm": 48.47226280128962, "learning_rate": 6.662570755192071e-06, "loss": 2.3117, "mean_token_accuracy": 0.4034482777118683, "step": 6615 }, { "epoch": 0.0066677343099247515, "grad_norm": 55.760672261985775, "learning_rate": 6.667606711923131e-06, "loss": 2.2402, "mean_token_accuracy": 0.4517241299152374, "step": 6620 }, { "epoch": 0.006672770363028924, "grad_norm": 54.06452456481192, "learning_rate": 6.672642668654191e-06, "loss": 2.3452, "mean_token_accuracy": 0.4689655125141144, "step": 6625 }, { "epoch": 0.006677806416133097, "grad_norm": 36.6955803829172, "learning_rate": 6.677678625385251e-06, "loss": 2.1433, "mean_token_accuracy": 0.46896551847457885, "step": 6630 }, { "epoch": 0.006682842469237269, "grad_norm": 57.01798009939822, "learning_rate": 6.6827145821163105e-06, "loss": 2.4756, "mean_token_accuracy": 0.4172413766384125, "step": 6635 }, { "epoch": 0.006687878522341442, "grad_norm": 42.74085015435525, "learning_rate": 6.687750538847371e-06, "loss": 2.4783, "mean_token_accuracy": 0.4310344815254211, "step": 6640 }, { "epoch": 0.006692914575445615, "grad_norm": 50.32812591651769, "learning_rate": 6.692786495578431e-06, "loss": 2.2453, "mean_token_accuracy": 0.47586206793785096, "step": 6645 }, { "epoch": 0.006697950628549788, "grad_norm": 47.23870937895076, "learning_rate": 6.697822452309491e-06, "loss": 2.3842, "mean_token_accuracy": 0.43793103098869324, "step": 6650 }, { "epoch": 0.006702986681653961, "grad_norm": 53.125040717996974, "learning_rate": 6.702858409040549e-06, "loss": 2.0319, "mean_token_accuracy": 0.5248768568038941, "step": 6655 }, { "epoch": 0.006708022734758134, "grad_norm": 48.4897488020064, "learning_rate": 6.707894365771609e-06, "loss": 2.2106, "mean_token_accuracy": 0.4793103516101837, "step": 6660 }, { "epoch": 0.006713058787862306, "grad_norm": 37.454403300158496, "learning_rate": 6.712930322502669e-06, "loss": 2.0216, "mean_token_accuracy": 0.46551724672317507, "step": 6665 }, { "epoch": 0.006718094840966479, "grad_norm": 50.30966431581214, "learning_rate": 6.7179662792337295e-06, "loss": 2.2227, "mean_token_accuracy": 0.47241379618644713, "step": 6670 }, { "epoch": 0.006723130894070652, "grad_norm": 63.24348477311662, "learning_rate": 6.723002235964789e-06, "loss": 2.2059, "mean_token_accuracy": 0.4724137902259827, "step": 6675 }, { "epoch": 0.0067281669471748245, "grad_norm": 39.8398484643016, "learning_rate": 6.728038192695849e-06, "loss": 2.126, "mean_token_accuracy": 0.49655171632766726, "step": 6680 }, { "epoch": 0.006733203000278997, "grad_norm": 47.64301163347034, "learning_rate": 6.733074149426909e-06, "loss": 2.2225, "mean_token_accuracy": 0.4275862008333206, "step": 6685 }, { "epoch": 0.00673823905338317, "grad_norm": 45.702792838032366, "learning_rate": 6.738110106157969e-06, "loss": 2.2141, "mean_token_accuracy": 0.42758620381355283, "step": 6690 }, { "epoch": 0.006743275106487343, "grad_norm": 57.452127969203346, "learning_rate": 6.7431460628890274e-06, "loss": 2.5248, "mean_token_accuracy": 0.42758620977401735, "step": 6695 }, { "epoch": 0.006748311159591515, "grad_norm": 53.024427870925045, "learning_rate": 6.7481820196200875e-06, "loss": 2.1566, "mean_token_accuracy": 0.493103438615799, "step": 6700 }, { "epoch": 0.006753347212695688, "grad_norm": 59.97626946191907, "learning_rate": 6.753217976351148e-06, "loss": 2.4165, "mean_token_accuracy": 0.45517241954803467, "step": 6705 }, { "epoch": 0.006758383265799861, "grad_norm": 48.20810556569317, "learning_rate": 6.758253933082208e-06, "loss": 2.1737, "mean_token_accuracy": 0.4206896543502808, "step": 6710 }, { "epoch": 0.006763419318904034, "grad_norm": 46.63656709914599, "learning_rate": 6.763289889813266e-06, "loss": 2.0034, "mean_token_accuracy": 0.5206896483898162, "step": 6715 }, { "epoch": 0.006768455372008207, "grad_norm": 57.40403145219679, "learning_rate": 6.768325846544326e-06, "loss": 2.294, "mean_token_accuracy": 0.38275861740112305, "step": 6720 }, { "epoch": 0.00677349142511238, "grad_norm": 37.16886830025921, "learning_rate": 6.773361803275386e-06, "loss": 1.9982, "mean_token_accuracy": 0.46376285552978513, "step": 6725 }, { "epoch": 0.006778527478216553, "grad_norm": 53.48813957852404, "learning_rate": 6.778397760006447e-06, "loss": 2.5181, "mean_token_accuracy": 0.4206896543502808, "step": 6730 }, { "epoch": 0.006783563531320725, "grad_norm": 65.47277696014064, "learning_rate": 6.783433716737506e-06, "loss": 2.091, "mean_token_accuracy": 0.47241378426551817, "step": 6735 }, { "epoch": 0.0067885995844248976, "grad_norm": 56.640810367366534, "learning_rate": 6.788469673468566e-06, "loss": 2.2398, "mean_token_accuracy": 0.4586206912994385, "step": 6740 }, { "epoch": 0.0067936356375290705, "grad_norm": 47.60842460570875, "learning_rate": 6.793505630199626e-06, "loss": 2.3848, "mean_token_accuracy": 0.4188747704029083, "step": 6745 }, { "epoch": 0.006798671690633243, "grad_norm": 46.917684100079526, "learning_rate": 6.798541586930686e-06, "loss": 2.3391, "mean_token_accuracy": 0.42758620381355283, "step": 6750 }, { "epoch": 0.006803707743737416, "grad_norm": 60.644771256802855, "learning_rate": 6.803577543661744e-06, "loss": 2.4641, "mean_token_accuracy": 0.43103447556495667, "step": 6755 }, { "epoch": 0.006808743796841589, "grad_norm": 62.82804240875024, "learning_rate": 6.8086135003928045e-06, "loss": 2.2765, "mean_token_accuracy": 0.46551724076271056, "step": 6760 }, { "epoch": 0.006813779849945762, "grad_norm": 40.53424666995209, "learning_rate": 6.813649457123865e-06, "loss": 2.3878, "mean_token_accuracy": 0.4344827651977539, "step": 6765 }, { "epoch": 0.006818815903049934, "grad_norm": 40.35131518017455, "learning_rate": 6.818685413854925e-06, "loss": 1.9425, "mean_token_accuracy": 0.4862069010734558, "step": 6770 }, { "epoch": 0.006823851956154107, "grad_norm": 51.78643046388614, "learning_rate": 6.823721370585984e-06, "loss": 1.9515, "mean_token_accuracy": 0.4980641186237335, "step": 6775 }, { "epoch": 0.00682888800925828, "grad_norm": 43.96695647448754, "learning_rate": 6.828757327317044e-06, "loss": 2.4561, "mean_token_accuracy": 0.41379310488700866, "step": 6780 }, { "epoch": 0.006833924062362453, "grad_norm": 63.86217062717558, "learning_rate": 6.833793284048104e-06, "loss": 2.6298, "mean_token_accuracy": 0.3620689630508423, "step": 6785 }, { "epoch": 0.006838960115466626, "grad_norm": 36.578459286712544, "learning_rate": 6.838829240779164e-06, "loss": 2.173, "mean_token_accuracy": 0.4862068951129913, "step": 6790 }, { "epoch": 0.006843996168570799, "grad_norm": 64.08570577875474, "learning_rate": 6.843865197510223e-06, "loss": 2.2524, "mean_token_accuracy": 0.47065940499305725, "step": 6795 }, { "epoch": 0.0068490322216749715, "grad_norm": 35.99027574878657, "learning_rate": 6.848901154241283e-06, "loss": 1.9677, "mean_token_accuracy": 0.5103448331356049, "step": 6800 }, { "epoch": 0.0068540682747791435, "grad_norm": 50.82366506340916, "learning_rate": 6.853937110972343e-06, "loss": 2.6616, "mean_token_accuracy": 0.4344827651977539, "step": 6805 }, { "epoch": 0.006859104327883316, "grad_norm": 56.74084504843212, "learning_rate": 6.858973067703403e-06, "loss": 2.5657, "mean_token_accuracy": 0.4310344815254211, "step": 6810 }, { "epoch": 0.006864140380987489, "grad_norm": 54.51289033212468, "learning_rate": 6.864009024434462e-06, "loss": 2.3658, "mean_token_accuracy": 0.4448275864124298, "step": 6815 }, { "epoch": 0.006869176434091662, "grad_norm": 46.783396914734425, "learning_rate": 6.869044981165522e-06, "loss": 2.5182, "mean_token_accuracy": 0.41034482717514037, "step": 6820 }, { "epoch": 0.006874212487195835, "grad_norm": 79.28297437408888, "learning_rate": 6.874080937896582e-06, "loss": 2.2713, "mean_token_accuracy": 0.4334543228149414, "step": 6825 }, { "epoch": 0.006879248540300008, "grad_norm": 45.494765448083164, "learning_rate": 6.8791168946276425e-06, "loss": 2.4228, "mean_token_accuracy": 0.3999999940395355, "step": 6830 }, { "epoch": 0.006884284593404181, "grad_norm": 55.301649930743146, "learning_rate": 6.884152851358701e-06, "loss": 2.3687, "mean_token_accuracy": 0.4103448212146759, "step": 6835 }, { "epoch": 0.006889320646508353, "grad_norm": 44.22728771278015, "learning_rate": 6.889188808089761e-06, "loss": 1.9686, "mean_token_accuracy": 0.4931034445762634, "step": 6840 }, { "epoch": 0.006894356699612526, "grad_norm": 48.12891439657761, "learning_rate": 6.894224764820821e-06, "loss": 2.4267, "mean_token_accuracy": 0.4034482777118683, "step": 6845 }, { "epoch": 0.006899392752716699, "grad_norm": 43.82322937757349, "learning_rate": 6.899260721551881e-06, "loss": 2.5481, "mean_token_accuracy": 0.42758620977401735, "step": 6850 }, { "epoch": 0.006904428805820872, "grad_norm": 41.232169674165256, "learning_rate": 6.90429667828294e-06, "loss": 2.0909, "mean_token_accuracy": 0.4517241418361664, "step": 6855 }, { "epoch": 0.0069094648589250445, "grad_norm": 39.12170262843131, "learning_rate": 6.909332635014e-06, "loss": 2.2639, "mean_token_accuracy": 0.42413793206214906, "step": 6860 }, { "epoch": 0.006914500912029217, "grad_norm": 44.177066952309005, "learning_rate": 6.91436859174506e-06, "loss": 2.1092, "mean_token_accuracy": 0.49999999403953554, "step": 6865 }, { "epoch": 0.00691953696513339, "grad_norm": 35.88057306583056, "learning_rate": 6.91940454847612e-06, "loss": 2.3898, "mean_token_accuracy": 0.41724138259887694, "step": 6870 }, { "epoch": 0.006924573018237562, "grad_norm": 46.648218080915434, "learning_rate": 6.924440505207179e-06, "loss": 2.2594, "mean_token_accuracy": 0.4413793087005615, "step": 6875 }, { "epoch": 0.006929609071341735, "grad_norm": 46.64638252103064, "learning_rate": 6.929476461938239e-06, "loss": 2.5701, "mean_token_accuracy": 0.3724137932062149, "step": 6880 }, { "epoch": 0.006934645124445908, "grad_norm": 60.967390761094904, "learning_rate": 6.934512418669299e-06, "loss": 2.4825, "mean_token_accuracy": 0.39655172228813174, "step": 6885 }, { "epoch": 0.006939681177550081, "grad_norm": 42.448747225099766, "learning_rate": 6.9395483754003595e-06, "loss": 2.289, "mean_token_accuracy": 0.41724138259887694, "step": 6890 }, { "epoch": 0.006944717230654254, "grad_norm": 49.65101480148005, "learning_rate": 6.944584332131418e-06, "loss": 2.3864, "mean_token_accuracy": 0.48275861144065857, "step": 6895 }, { "epoch": 0.006949753283758427, "grad_norm": 48.59491465201465, "learning_rate": 6.949620288862478e-06, "loss": 2.4128, "mean_token_accuracy": 0.4517241358757019, "step": 6900 }, { "epoch": 0.0069547893368626, "grad_norm": 48.388399846741024, "learning_rate": 6.954656245593538e-06, "loss": 2.4457, "mean_token_accuracy": 0.42068964838981626, "step": 6905 }, { "epoch": 0.006959825389966772, "grad_norm": 56.41617185193928, "learning_rate": 6.959692202324598e-06, "loss": 2.3499, "mean_token_accuracy": 0.45862069725990295, "step": 6910 }, { "epoch": 0.006964861443070945, "grad_norm": 50.51187410921404, "learning_rate": 6.9647281590556574e-06, "loss": 2.059, "mean_token_accuracy": 0.49655171632766726, "step": 6915 }, { "epoch": 0.0069698974961751176, "grad_norm": 85.11663788352037, "learning_rate": 6.9697641157867175e-06, "loss": 2.2402, "mean_token_accuracy": 0.42607380747795104, "step": 6920 }, { "epoch": 0.0069749335492792905, "grad_norm": 41.13440245696484, "learning_rate": 6.974800072517778e-06, "loss": 2.5678, "mean_token_accuracy": 0.3793103456497192, "step": 6925 }, { "epoch": 0.006979969602383463, "grad_norm": 61.38473172336067, "learning_rate": 6.979836029248838e-06, "loss": 2.3286, "mean_token_accuracy": 0.4034482777118683, "step": 6930 }, { "epoch": 0.006985005655487636, "grad_norm": 45.999149982488184, "learning_rate": 6.984871985979896e-06, "loss": 2.0911, "mean_token_accuracy": 0.49999999403953554, "step": 6935 }, { "epoch": 0.006990041708591809, "grad_norm": 40.14202405659626, "learning_rate": 6.989907942710956e-06, "loss": 2.3611, "mean_token_accuracy": 0.4430732011795044, "step": 6940 }, { "epoch": 0.006995077761695981, "grad_norm": 44.87245761526127, "learning_rate": 6.994943899442016e-06, "loss": 2.1066, "mean_token_accuracy": 0.5034482657909394, "step": 6945 }, { "epoch": 0.007000113814800154, "grad_norm": 42.42499090282306, "learning_rate": 6.9999798561730765e-06, "loss": 2.3274, "mean_token_accuracy": 0.4068965494632721, "step": 6950 }, { "epoch": 0.007005149867904327, "grad_norm": 40.83366280287029, "learning_rate": 7.0050158129041366e-06, "loss": 2.5474, "mean_token_accuracy": 0.4103448212146759, "step": 6955 }, { "epoch": 0.0070101859210085, "grad_norm": 57.589357362046215, "learning_rate": 7.010051769635195e-06, "loss": 2.3615, "mean_token_accuracy": 0.4517241358757019, "step": 6960 }, { "epoch": 0.007015221974112673, "grad_norm": 40.936388699541595, "learning_rate": 7.015087726366255e-06, "loss": 2.4903, "mean_token_accuracy": 0.4103448331356049, "step": 6965 }, { "epoch": 0.007020258027216846, "grad_norm": 40.902305019210914, "learning_rate": 7.020123683097315e-06, "loss": 2.3589, "mean_token_accuracy": 0.3896551698446274, "step": 6970 }, { "epoch": 0.007025294080321019, "grad_norm": 42.38641675691526, "learning_rate": 7.025159639828375e-06, "loss": 2.0391, "mean_token_accuracy": 0.5041871905326843, "step": 6975 }, { "epoch": 0.007030330133425191, "grad_norm": 41.51045590162831, "learning_rate": 7.0301955965594345e-06, "loss": 2.1452, "mean_token_accuracy": 0.4862068951129913, "step": 6980 }, { "epoch": 0.0070353661865293635, "grad_norm": 35.919549653825314, "learning_rate": 7.035231553290495e-06, "loss": 2.1058, "mean_token_accuracy": 0.44482758045196535, "step": 6985 }, { "epoch": 0.007040402239633536, "grad_norm": 42.620218788263536, "learning_rate": 7.040267510021555e-06, "loss": 2.2592, "mean_token_accuracy": 0.458620673418045, "step": 6990 }, { "epoch": 0.007045438292737709, "grad_norm": 42.64000935045413, "learning_rate": 7.045303466752615e-06, "loss": 2.1976, "mean_token_accuracy": 0.4413793087005615, "step": 6995 }, { "epoch": 0.007050474345841882, "grad_norm": 52.48581687340867, "learning_rate": 7.050339423483673e-06, "loss": 2.2135, "mean_token_accuracy": 0.4689655125141144, "step": 7000 }, { "epoch": 0.007055510398946055, "grad_norm": 41.59638511429294, "learning_rate": 7.055375380214733e-06, "loss": 2.4292, "mean_token_accuracy": 0.4, "step": 7005 }, { "epoch": 0.007060546452050228, "grad_norm": 43.87306165002983, "learning_rate": 7.060411336945793e-06, "loss": 2.3456, "mean_token_accuracy": 0.45329703092575074, "step": 7010 }, { "epoch": 0.0070655825051544, "grad_norm": 50.80426034621016, "learning_rate": 7.0654472936768535e-06, "loss": 2.154, "mean_token_accuracy": 0.48620688915252686, "step": 7015 }, { "epoch": 0.007070618558258573, "grad_norm": 61.30114396391118, "learning_rate": 7.070483250407913e-06, "loss": 2.7253, "mean_token_accuracy": 0.3827586233615875, "step": 7020 }, { "epoch": 0.007075654611362746, "grad_norm": 47.43481055790515, "learning_rate": 7.075519207138973e-06, "loss": 2.1718, "mean_token_accuracy": 0.458620685338974, "step": 7025 }, { "epoch": 0.007080690664466919, "grad_norm": 37.75871150231663, "learning_rate": 7.080555163870033e-06, "loss": 2.2518, "mean_token_accuracy": 0.4482758641242981, "step": 7030 }, { "epoch": 0.007085726717571092, "grad_norm": 41.71150169854353, "learning_rate": 7.085591120601093e-06, "loss": 2.3305, "mean_token_accuracy": 0.47241378426551817, "step": 7035 }, { "epoch": 0.0070907627706752645, "grad_norm": 54.28638817606991, "learning_rate": 7.0906270773321515e-06, "loss": 2.3878, "mean_token_accuracy": 0.4241379380226135, "step": 7040 }, { "epoch": 0.007095798823779437, "grad_norm": 54.87041248299273, "learning_rate": 7.0956630340632116e-06, "loss": 2.0312, "mean_token_accuracy": 0.4851179659366608, "step": 7045 }, { "epoch": 0.0071008348768836094, "grad_norm": 46.303498100930106, "learning_rate": 7.100698990794272e-06, "loss": 2.5216, "mean_token_accuracy": 0.37241379618644715, "step": 7050 }, { "epoch": 0.007105870929987782, "grad_norm": 63.59682745721162, "learning_rate": 7.105734947525332e-06, "loss": 2.6238, "mean_token_accuracy": 0.3793103456497192, "step": 7055 }, { "epoch": 0.007110906983091955, "grad_norm": 60.02220891170065, "learning_rate": 7.11077090425639e-06, "loss": 2.1847, "mean_token_accuracy": 0.4344827592372894, "step": 7060 }, { "epoch": 0.007115943036196128, "grad_norm": 34.54793210198037, "learning_rate": 7.11580686098745e-06, "loss": 2.1355, "mean_token_accuracy": 0.47416818141937256, "step": 7065 }, { "epoch": 0.007120979089300301, "grad_norm": 35.76723916001373, "learning_rate": 7.12084281771851e-06, "loss": 2.3813, "mean_token_accuracy": 0.45517241954803467, "step": 7070 }, { "epoch": 0.007126015142404474, "grad_norm": 34.621456064436906, "learning_rate": 7.1258787744495705e-06, "loss": 2.2343, "mean_token_accuracy": 0.4344827592372894, "step": 7075 }, { "epoch": 0.007131051195508647, "grad_norm": 55.39100176217363, "learning_rate": 7.13091473118063e-06, "loss": 2.5818, "mean_token_accuracy": 0.44827585816383364, "step": 7080 }, { "epoch": 0.007136087248612819, "grad_norm": 38.870365812464414, "learning_rate": 7.13595068791169e-06, "loss": 2.4907, "mean_token_accuracy": 0.43793103098869324, "step": 7085 }, { "epoch": 0.007141123301716992, "grad_norm": 50.11195221501107, "learning_rate": 7.14098664464275e-06, "loss": 2.0391, "mean_token_accuracy": 0.48275861144065857, "step": 7090 }, { "epoch": 0.007146159354821165, "grad_norm": 42.72960315470217, "learning_rate": 7.14602260137381e-06, "loss": 2.4855, "mean_token_accuracy": 0.42413792610168455, "step": 7095 }, { "epoch": 0.0071511954079253376, "grad_norm": 51.15688743407462, "learning_rate": 7.1510585581048684e-06, "loss": 2.8293, "mean_token_accuracy": 0.3689655065536499, "step": 7100 }, { "epoch": 0.0071562314610295105, "grad_norm": 36.360669140435384, "learning_rate": 7.1560945148359285e-06, "loss": 2.4846, "mean_token_accuracy": 0.4310344815254211, "step": 7105 }, { "epoch": 0.007161267514133683, "grad_norm": 44.930176568604665, "learning_rate": 7.161130471566989e-06, "loss": 2.2118, "mean_token_accuracy": 0.42758620977401735, "step": 7110 }, { "epoch": 0.007166303567237855, "grad_norm": 48.940881453163314, "learning_rate": 7.166166428298049e-06, "loss": 2.6227, "mean_token_accuracy": 0.3896551728248596, "step": 7115 }, { "epoch": 0.007171339620342028, "grad_norm": 36.751809186726064, "learning_rate": 7.171202385029108e-06, "loss": 2.1878, "mean_token_accuracy": 0.48620688915252686, "step": 7120 }, { "epoch": 0.007176375673446201, "grad_norm": 49.89972201755115, "learning_rate": 7.176238341760168e-06, "loss": 2.3019, "mean_token_accuracy": 0.42758620381355283, "step": 7125 }, { "epoch": 0.007181411726550374, "grad_norm": 36.71295017293243, "learning_rate": 7.181274298491228e-06, "loss": 1.9507, "mean_token_accuracy": 0.510344821214676, "step": 7130 }, { "epoch": 0.007186447779654547, "grad_norm": 69.97056559126018, "learning_rate": 7.186310255222288e-06, "loss": 2.3335, "mean_token_accuracy": 0.4448275864124298, "step": 7135 }, { "epoch": 0.00719148383275872, "grad_norm": 63.53009980885941, "learning_rate": 7.191346211953347e-06, "loss": 2.3054, "mean_token_accuracy": 0.43103447556495667, "step": 7140 }, { "epoch": 0.007196519885862893, "grad_norm": 35.44089407594087, "learning_rate": 7.196382168684407e-06, "loss": 2.3375, "mean_token_accuracy": 0.4413793087005615, "step": 7145 }, { "epoch": 0.007201555938967065, "grad_norm": 41.84611359776772, "learning_rate": 7.201418125415467e-06, "loss": 2.3276, "mean_token_accuracy": 0.4310344815254211, "step": 7150 }, { "epoch": 0.007206591992071238, "grad_norm": 44.616005258217434, "learning_rate": 7.206454082146527e-06, "loss": 2.3455, "mean_token_accuracy": 0.43103447556495667, "step": 7155 }, { "epoch": 0.007211628045175411, "grad_norm": 39.730617227637744, "learning_rate": 7.211490038877585e-06, "loss": 2.1367, "mean_token_accuracy": 0.4344827592372894, "step": 7160 }, { "epoch": 0.0072166640982795835, "grad_norm": 51.97990611504933, "learning_rate": 7.2165259956086455e-06, "loss": 2.254, "mean_token_accuracy": 0.43448275327682495, "step": 7165 }, { "epoch": 0.007221700151383756, "grad_norm": 45.32075821540736, "learning_rate": 7.221561952339706e-06, "loss": 2.6748, "mean_token_accuracy": 0.37586206793785093, "step": 7170 }, { "epoch": 0.007226736204487929, "grad_norm": 53.216386937354066, "learning_rate": 7.226597909070766e-06, "loss": 2.2826, "mean_token_accuracy": 0.4689655125141144, "step": 7175 }, { "epoch": 0.007231772257592102, "grad_norm": 45.460423208274044, "learning_rate": 7.231633865801825e-06, "loss": 2.1541, "mean_token_accuracy": 0.441379314661026, "step": 7180 }, { "epoch": 0.007236808310696274, "grad_norm": 42.53052807470696, "learning_rate": 7.236669822532885e-06, "loss": 2.208, "mean_token_accuracy": 0.4758620738983154, "step": 7185 }, { "epoch": 0.007241844363800447, "grad_norm": 36.42312803969594, "learning_rate": 7.241705779263945e-06, "loss": 2.287, "mean_token_accuracy": 0.4551724076271057, "step": 7190 }, { "epoch": 0.00724688041690462, "grad_norm": 66.7069081742678, "learning_rate": 7.246741735995005e-06, "loss": 2.0455, "mean_token_accuracy": 0.4689655125141144, "step": 7195 }, { "epoch": 0.007251916470008793, "grad_norm": 45.309369072802916, "learning_rate": 7.251777692726064e-06, "loss": 2.5352, "mean_token_accuracy": 0.3896551728248596, "step": 7200 }, { "epoch": 0.007256952523112966, "grad_norm": 60.952399529324026, "learning_rate": 7.256813649457124e-06, "loss": 2.4926, "mean_token_accuracy": 0.42238354682922363, "step": 7205 }, { "epoch": 0.007261988576217139, "grad_norm": 49.81871048491977, "learning_rate": 7.261849606188184e-06, "loss": 2.2722, "mean_token_accuracy": 0.4241379380226135, "step": 7210 }, { "epoch": 0.007267024629321312, "grad_norm": 47.851171879008675, "learning_rate": 7.266885562919244e-06, "loss": 2.2262, "mean_token_accuracy": 0.4551724076271057, "step": 7215 }, { "epoch": 0.007272060682425484, "grad_norm": 48.03082363527323, "learning_rate": 7.271921519650303e-06, "loss": 2.3253, "mean_token_accuracy": 0.42413793206214906, "step": 7220 }, { "epoch": 0.0072770967355296565, "grad_norm": 42.18251540875435, "learning_rate": 7.276957476381363e-06, "loss": 2.0987, "mean_token_accuracy": 0.47586206793785096, "step": 7225 }, { "epoch": 0.0072821327886338294, "grad_norm": 51.33179474610389, "learning_rate": 7.281993433112423e-06, "loss": 2.4539, "mean_token_accuracy": 0.4448275864124298, "step": 7230 }, { "epoch": 0.007287168841738002, "grad_norm": 49.01537882849991, "learning_rate": 7.2870293898434835e-06, "loss": 2.2662, "mean_token_accuracy": 0.42758620381355283, "step": 7235 }, { "epoch": 0.007292204894842175, "grad_norm": 46.04061801252075, "learning_rate": 7.292065346574542e-06, "loss": 2.0704, "mean_token_accuracy": 0.5539279341697693, "step": 7240 }, { "epoch": 0.007297240947946348, "grad_norm": 87.56177636085452, "learning_rate": 7.297101303305602e-06, "loss": 2.3001, "mean_token_accuracy": 0.47586206197738645, "step": 7245 }, { "epoch": 0.007302277001050521, "grad_norm": 63.621548698017534, "learning_rate": 7.302137260036662e-06, "loss": 2.1936, "mean_token_accuracy": 0.4655172348022461, "step": 7250 }, { "epoch": 0.007307313054154693, "grad_norm": 46.414432950132, "learning_rate": 7.307173216767722e-06, "loss": 2.5565, "mean_token_accuracy": 0.37543859481811526, "step": 7255 }, { "epoch": 0.007312349107258866, "grad_norm": 38.9196822916388, "learning_rate": 7.312209173498781e-06, "loss": 2.3082, "mean_token_accuracy": 0.417241370677948, "step": 7260 }, { "epoch": 0.007317385160363039, "grad_norm": 42.048293480324, "learning_rate": 7.3172451302298416e-06, "loss": 2.4473, "mean_token_accuracy": 0.4172413766384125, "step": 7265 }, { "epoch": 0.007322421213467212, "grad_norm": 44.41952114574183, "learning_rate": 7.322281086960902e-06, "loss": 2.6062, "mean_token_accuracy": 0.4103448331356049, "step": 7270 }, { "epoch": 0.007327457266571385, "grad_norm": 48.99043258135825, "learning_rate": 7.327317043691962e-06, "loss": 2.1611, "mean_token_accuracy": 0.4758620738983154, "step": 7275 }, { "epoch": 0.0073324933196755575, "grad_norm": 35.01901562228969, "learning_rate": 7.33235300042302e-06, "loss": 2.153, "mean_token_accuracy": 0.46551724076271056, "step": 7280 }, { "epoch": 0.0073375293727797304, "grad_norm": 29.19362158498308, "learning_rate": 7.33738895715408e-06, "loss": 1.9198, "mean_token_accuracy": 0.5448275744915009, "step": 7285 }, { "epoch": 0.0073425654258839025, "grad_norm": 53.34989488039861, "learning_rate": 7.34242491388514e-06, "loss": 2.5123, "mean_token_accuracy": 0.4206896543502808, "step": 7290 }, { "epoch": 0.007347601478988075, "grad_norm": 61.67489299187737, "learning_rate": 7.3474608706162005e-06, "loss": 2.3752, "mean_token_accuracy": 0.42758620381355283, "step": 7295 }, { "epoch": 0.007352637532092248, "grad_norm": 40.30318813251888, "learning_rate": 7.352496827347259e-06, "loss": 2.267, "mean_token_accuracy": 0.4310344815254211, "step": 7300 }, { "epoch": 0.007357673585196421, "grad_norm": 57.08723519385807, "learning_rate": 7.357532784078319e-06, "loss": 2.3227, "mean_token_accuracy": 0.43986691236495973, "step": 7305 }, { "epoch": 0.007362709638300594, "grad_norm": 75.57077908588624, "learning_rate": 7.362568740809379e-06, "loss": 2.0686, "mean_token_accuracy": 0.48124622106552123, "step": 7310 }, { "epoch": 0.007367745691404767, "grad_norm": 114.78906317035006, "learning_rate": 7.367604697540439e-06, "loss": 2.1749, "mean_token_accuracy": 0.46896551847457885, "step": 7315 }, { "epoch": 0.00737278174450894, "grad_norm": 86.06727225037893, "learning_rate": 7.3726406542714984e-06, "loss": 2.0564, "mean_token_accuracy": 0.5137930929660797, "step": 7320 }, { "epoch": 0.007377817797613112, "grad_norm": 55.05800807458252, "learning_rate": 7.3776766110025585e-06, "loss": 2.2081, "mean_token_accuracy": 0.44482759237289426, "step": 7325 }, { "epoch": 0.007382853850717285, "grad_norm": 41.05143809129591, "learning_rate": 7.382712567733619e-06, "loss": 2.4455, "mean_token_accuracy": 0.43448275327682495, "step": 7330 }, { "epoch": 0.007387889903821458, "grad_norm": 58.9581671994853, "learning_rate": 7.387748524464679e-06, "loss": 2.3688, "mean_token_accuracy": 0.4034482717514038, "step": 7335 }, { "epoch": 0.007392925956925631, "grad_norm": 54.62953796419114, "learning_rate": 7.392784481195737e-06, "loss": 2.2827, "mean_token_accuracy": 0.4655172348022461, "step": 7340 }, { "epoch": 0.0073979620100298035, "grad_norm": 52.97286611481698, "learning_rate": 7.397820437926797e-06, "loss": 2.1995, "mean_token_accuracy": 0.4206896543502808, "step": 7345 }, { "epoch": 0.007402998063133976, "grad_norm": 55.79868702350211, "learning_rate": 7.402856394657857e-06, "loss": 2.4488, "mean_token_accuracy": 0.4517241358757019, "step": 7350 }, { "epoch": 0.007408034116238149, "grad_norm": 45.48915047364951, "learning_rate": 7.4078923513889174e-06, "loss": 2.1298, "mean_token_accuracy": 0.47586206793785096, "step": 7355 }, { "epoch": 0.007413070169342321, "grad_norm": 48.04390928096492, "learning_rate": 7.412928308119977e-06, "loss": 2.4245, "mean_token_accuracy": 0.4206896543502808, "step": 7360 }, { "epoch": 0.007418106222446494, "grad_norm": 38.62262242716692, "learning_rate": 7.417964264851037e-06, "loss": 2.4254, "mean_token_accuracy": 0.4517241418361664, "step": 7365 }, { "epoch": 0.007423142275550667, "grad_norm": 38.74466594304998, "learning_rate": 7.423000221582097e-06, "loss": 2.3334, "mean_token_accuracy": 0.41724138259887694, "step": 7370 }, { "epoch": 0.00742817832865484, "grad_norm": 45.456584143004314, "learning_rate": 7.428036178313157e-06, "loss": 2.5147, "mean_token_accuracy": 0.3896551728248596, "step": 7375 }, { "epoch": 0.007433214381759013, "grad_norm": 44.209083090275534, "learning_rate": 7.433072135044217e-06, "loss": 2.3097, "mean_token_accuracy": 0.4448275864124298, "step": 7380 }, { "epoch": 0.007438250434863186, "grad_norm": 48.190024803535884, "learning_rate": 7.4381080917752755e-06, "loss": 2.134, "mean_token_accuracy": 0.493103438615799, "step": 7385 }, { "epoch": 0.007443286487967359, "grad_norm": 47.13081119078609, "learning_rate": 7.443144048506336e-06, "loss": 2.8193, "mean_token_accuracy": 0.4034482717514038, "step": 7390 }, { "epoch": 0.007448322541071531, "grad_norm": 32.16926280846503, "learning_rate": 7.448180005237396e-06, "loss": 2.3151, "mean_token_accuracy": 0.4551724135875702, "step": 7395 }, { "epoch": 0.007453358594175704, "grad_norm": 52.59528993764966, "learning_rate": 7.453215961968456e-06, "loss": 2.7566, "mean_token_accuracy": 0.38620689511299133, "step": 7400 }, { "epoch": 0.0074583946472798765, "grad_norm": 49.81781580297935, "learning_rate": 7.458251918699514e-06, "loss": 2.4321, "mean_token_accuracy": 0.42758620381355283, "step": 7405 }, { "epoch": 0.007463430700384049, "grad_norm": 62.28853719820733, "learning_rate": 7.463287875430574e-06, "loss": 2.3217, "mean_token_accuracy": 0.4379310369491577, "step": 7410 }, { "epoch": 0.007468466753488222, "grad_norm": 51.31986401745938, "learning_rate": 7.468323832161634e-06, "loss": 2.4849, "mean_token_accuracy": 0.3896551638841629, "step": 7415 }, { "epoch": 0.007473502806592395, "grad_norm": 48.378725380322784, "learning_rate": 7.4733597888926945e-06, "loss": 2.0426, "mean_token_accuracy": 0.47586206197738645, "step": 7420 }, { "epoch": 0.007478538859696568, "grad_norm": 53.47077832783121, "learning_rate": 7.478395745623754e-06, "loss": 2.244, "mean_token_accuracy": 0.49655172824859617, "step": 7425 }, { "epoch": 0.00748357491280074, "grad_norm": 56.998836287754415, "learning_rate": 7.483431702354814e-06, "loss": 2.4785, "mean_token_accuracy": 0.3862069010734558, "step": 7430 }, { "epoch": 0.007488610965904913, "grad_norm": 46.11160333115532, "learning_rate": 7.488467659085874e-06, "loss": 2.4127, "mean_token_accuracy": 0.441379314661026, "step": 7435 }, { "epoch": 0.007493647019009086, "grad_norm": 58.27831726409421, "learning_rate": 7.493503615816934e-06, "loss": 2.4934, "mean_token_accuracy": 0.4103448331356049, "step": 7440 }, { "epoch": 0.007498683072113259, "grad_norm": 40.930010813898996, "learning_rate": 7.4985395725479925e-06, "loss": 2.5607, "mean_token_accuracy": 0.37241379618644715, "step": 7445 }, { "epoch": 0.007503719125217432, "grad_norm": 66.3185144914046, "learning_rate": 7.5035755292790526e-06, "loss": 2.4734, "mean_token_accuracy": 0.4034482777118683, "step": 7450 }, { "epoch": 0.007508755178321605, "grad_norm": 33.60927099771563, "learning_rate": 7.508611486010113e-06, "loss": 2.2858, "mean_token_accuracy": 0.4275861978530884, "step": 7455 }, { "epoch": 0.0075137912314257775, "grad_norm": 51.45387795202174, "learning_rate": 7.513647442741173e-06, "loss": 2.2988, "mean_token_accuracy": 0.4448275864124298, "step": 7460 }, { "epoch": 0.00751882728452995, "grad_norm": 49.88928089858543, "learning_rate": 7.518683399472232e-06, "loss": 2.1042, "mean_token_accuracy": 0.5103448331356049, "step": 7465 }, { "epoch": 0.0075238633376341225, "grad_norm": 42.43795352052936, "learning_rate": 7.523719356203292e-06, "loss": 2.0689, "mean_token_accuracy": 0.4724137902259827, "step": 7470 }, { "epoch": 0.007528899390738295, "grad_norm": 42.86491293975076, "learning_rate": 7.528755312934352e-06, "loss": 2.1156, "mean_token_accuracy": 0.4862069070339203, "step": 7475 }, { "epoch": 0.007533935443842468, "grad_norm": 46.90900950426308, "learning_rate": 7.533791269665412e-06, "loss": 2.508, "mean_token_accuracy": 0.417241370677948, "step": 7480 }, { "epoch": 0.007538971496946641, "grad_norm": 49.16215530452209, "learning_rate": 7.538827226396471e-06, "loss": 2.3843, "mean_token_accuracy": 0.458620685338974, "step": 7485 }, { "epoch": 0.007544007550050814, "grad_norm": 39.59053964805687, "learning_rate": 7.543863183127531e-06, "loss": 2.1863, "mean_token_accuracy": 0.4758620738983154, "step": 7490 }, { "epoch": 0.007549043603154987, "grad_norm": 42.8136710568869, "learning_rate": 7.548899139858591e-06, "loss": 2.3064, "mean_token_accuracy": 0.441379314661026, "step": 7495 }, { "epoch": 0.007554079656259159, "grad_norm": 43.91565828053337, "learning_rate": 7.553935096589651e-06, "loss": 2.6552, "mean_token_accuracy": 0.39655172228813174, "step": 7500 }, { "epoch": 0.007559115709363332, "grad_norm": 42.645135244946374, "learning_rate": 7.5589710533207094e-06, "loss": 2.1104, "mean_token_accuracy": 0.47931033968925474, "step": 7505 }, { "epoch": 0.007564151762467505, "grad_norm": 46.96750923330262, "learning_rate": 7.5640070100517695e-06, "loss": 2.41, "mean_token_accuracy": 0.37586206793785093, "step": 7510 }, { "epoch": 0.007569187815571678, "grad_norm": 59.30592769963364, "learning_rate": 7.56904296678283e-06, "loss": 2.2797, "mean_token_accuracy": 0.45517240166664125, "step": 7515 }, { "epoch": 0.007574223868675851, "grad_norm": 43.34814182854512, "learning_rate": 7.57407892351389e-06, "loss": 2.3341, "mean_token_accuracy": 0.47241380214691164, "step": 7520 }, { "epoch": 0.0075792599217800235, "grad_norm": 39.766194807364954, "learning_rate": 7.579114880244949e-06, "loss": 1.9177, "mean_token_accuracy": 0.5137931048870087, "step": 7525 }, { "epoch": 0.007584295974884196, "grad_norm": 47.52299872328629, "learning_rate": 7.584150836976009e-06, "loss": 2.5546, "mean_token_accuracy": 0.3793103456497192, "step": 7530 }, { "epoch": 0.007589332027988368, "grad_norm": 42.412062957608335, "learning_rate": 7.589186793707069e-06, "loss": 2.3719, "mean_token_accuracy": 0.4034482777118683, "step": 7535 }, { "epoch": 0.007594368081092541, "grad_norm": 58.95088711549987, "learning_rate": 7.594222750438129e-06, "loss": 2.3328, "mean_token_accuracy": 0.4261342942714691, "step": 7540 }, { "epoch": 0.007599404134196714, "grad_norm": 50.84044022227517, "learning_rate": 7.599258707169188e-06, "loss": 2.2715, "mean_token_accuracy": 0.4620689570903778, "step": 7545 }, { "epoch": 0.007604440187300887, "grad_norm": 32.41353159043417, "learning_rate": 7.604294663900248e-06, "loss": 2.0957, "mean_token_accuracy": 0.47586206793785096, "step": 7550 }, { "epoch": 0.00760947624040506, "grad_norm": 53.91798170282348, "learning_rate": 7.609330620631308e-06, "loss": 2.2431, "mean_token_accuracy": 0.4190562665462494, "step": 7555 }, { "epoch": 0.007614512293509233, "grad_norm": 41.611142788557196, "learning_rate": 7.614366577362368e-06, "loss": 2.2547, "mean_token_accuracy": 0.4379310369491577, "step": 7560 }, { "epoch": 0.007619548346613405, "grad_norm": 51.2639911083754, "learning_rate": 7.619402534093427e-06, "loss": 2.3106, "mean_token_accuracy": 0.44658197164535524, "step": 7565 }, { "epoch": 0.007624584399717578, "grad_norm": 52.95813443200162, "learning_rate": 7.624438490824487e-06, "loss": 2.633, "mean_token_accuracy": 0.3931034505367279, "step": 7570 }, { "epoch": 0.007629620452821751, "grad_norm": 50.922433698370426, "learning_rate": 7.629474447555547e-06, "loss": 2.1826, "mean_token_accuracy": 0.4448275864124298, "step": 7575 }, { "epoch": 0.007634656505925924, "grad_norm": 43.28774317244623, "learning_rate": 7.634510404286607e-06, "loss": 2.3969, "mean_token_accuracy": 0.4431941986083984, "step": 7580 }, { "epoch": 0.0076396925590300965, "grad_norm": 70.09333539919614, "learning_rate": 7.639546361017666e-06, "loss": 2.1912, "mean_token_accuracy": 0.46551724672317507, "step": 7585 }, { "epoch": 0.007644728612134269, "grad_norm": 35.64048317441706, "learning_rate": 7.644582317748727e-06, "loss": 2.0953, "mean_token_accuracy": 0.45517241954803467, "step": 7590 }, { "epoch": 0.007649764665238442, "grad_norm": 38.46646477447764, "learning_rate": 7.649618274479786e-06, "loss": 2.1526, "mean_token_accuracy": 0.5034482717514038, "step": 7595 }, { "epoch": 0.007654800718342614, "grad_norm": 41.14308446109098, "learning_rate": 7.654654231210847e-06, "loss": 2.4023, "mean_token_accuracy": 0.4344827592372894, "step": 7600 }, { "epoch": 0.007659836771446787, "grad_norm": 50.13062794776937, "learning_rate": 7.659690187941905e-06, "loss": 2.3628, "mean_token_accuracy": 0.4310344815254211, "step": 7605 }, { "epoch": 0.00766487282455096, "grad_norm": 49.47233875223298, "learning_rate": 7.664726144672966e-06, "loss": 2.2587, "mean_token_accuracy": 0.493103438615799, "step": 7610 }, { "epoch": 0.007669908877655133, "grad_norm": 46.79330785810778, "learning_rate": 7.669762101404025e-06, "loss": 2.257, "mean_token_accuracy": 0.43103448748588563, "step": 7615 }, { "epoch": 0.007674944930759306, "grad_norm": 43.624608234233115, "learning_rate": 7.674798058135086e-06, "loss": 2.2174, "mean_token_accuracy": 0.4379310369491577, "step": 7620 }, { "epoch": 0.007679980983863479, "grad_norm": 41.3989602121546, "learning_rate": 7.679834014866143e-06, "loss": 2.3472, "mean_token_accuracy": 0.39655172228813174, "step": 7625 }, { "epoch": 0.007685017036967652, "grad_norm": 42.18964005080521, "learning_rate": 7.684869971597204e-06, "loss": 2.2834, "mean_token_accuracy": 0.4517241358757019, "step": 7630 }, { "epoch": 0.007690053090071824, "grad_norm": 61.18417420388271, "learning_rate": 7.689905928328264e-06, "loss": 2.6339, "mean_token_accuracy": 0.4148820281028748, "step": 7635 }, { "epoch": 0.007695089143175997, "grad_norm": 35.60457577088861, "learning_rate": 7.694941885059325e-06, "loss": 1.9169, "mean_token_accuracy": 0.5551724076271057, "step": 7640 }, { "epoch": 0.00770012519628017, "grad_norm": 40.23032198226782, "learning_rate": 7.699977841790384e-06, "loss": 2.6328, "mean_token_accuracy": 0.34482758641242983, "step": 7645 }, { "epoch": 0.0077051612493843425, "grad_norm": 43.579986828190805, "learning_rate": 7.705013798521443e-06, "loss": 2.4993, "mean_token_accuracy": 0.42413793206214906, "step": 7650 }, { "epoch": 0.007710197302488515, "grad_norm": 47.96196094150128, "learning_rate": 7.710049755252504e-06, "loss": 2.458, "mean_token_accuracy": 0.41724138259887694, "step": 7655 }, { "epoch": 0.007715233355592688, "grad_norm": 48.11525324326653, "learning_rate": 7.715085711983563e-06, "loss": 2.5764, "mean_token_accuracy": 0.3793103456497192, "step": 7660 }, { "epoch": 0.007720269408696861, "grad_norm": 33.9895637232027, "learning_rate": 7.720121668714622e-06, "loss": 2.3611, "mean_token_accuracy": 0.3931034505367279, "step": 7665 }, { "epoch": 0.007725305461801033, "grad_norm": 52.86292244157372, "learning_rate": 7.725157625445682e-06, "loss": 2.2499, "mean_token_accuracy": 0.42068966031074523, "step": 7670 }, { "epoch": 0.007730341514905206, "grad_norm": 42.653361613734404, "learning_rate": 7.730193582176743e-06, "loss": 2.3643, "mean_token_accuracy": 0.44482758045196535, "step": 7675 }, { "epoch": 0.007735377568009379, "grad_norm": 46.33142546120593, "learning_rate": 7.735229538907802e-06, "loss": 2.4415, "mean_token_accuracy": 0.4344827651977539, "step": 7680 }, { "epoch": 0.007740413621113552, "grad_norm": 43.52860533957293, "learning_rate": 7.740265495638861e-06, "loss": 2.4598, "mean_token_accuracy": 0.4000000059604645, "step": 7685 }, { "epoch": 0.007745449674217725, "grad_norm": 39.834301317700906, "learning_rate": 7.745301452369922e-06, "loss": 2.1268, "mean_token_accuracy": 0.44827585816383364, "step": 7690 }, { "epoch": 0.007750485727321898, "grad_norm": 36.51372472704589, "learning_rate": 7.750337409100981e-06, "loss": 2.2211, "mean_token_accuracy": 0.48620688915252686, "step": 7695 }, { "epoch": 0.007755521780426071, "grad_norm": 40.87093081805841, "learning_rate": 7.755373365832042e-06, "loss": 2.0887, "mean_token_accuracy": 0.4896551787853241, "step": 7700 }, { "epoch": 0.007760557833530243, "grad_norm": 48.91354782935568, "learning_rate": 7.7604093225631e-06, "loss": 2.3963, "mean_token_accuracy": 0.4310344815254211, "step": 7705 }, { "epoch": 0.0077655938866344155, "grad_norm": 38.817663032493066, "learning_rate": 7.76544527929416e-06, "loss": 2.1348, "mean_token_accuracy": 0.46896551847457885, "step": 7710 }, { "epoch": 0.007770629939738588, "grad_norm": 49.931966801400854, "learning_rate": 7.77048123602522e-06, "loss": 2.2143, "mean_token_accuracy": 0.4482758641242981, "step": 7715 }, { "epoch": 0.007775665992842761, "grad_norm": 40.49895562901206, "learning_rate": 7.775517192756281e-06, "loss": 2.2857, "mean_token_accuracy": 0.44827587008476255, "step": 7720 }, { "epoch": 0.007780702045946934, "grad_norm": 59.34824961207105, "learning_rate": 7.780553149487339e-06, "loss": 2.5745, "mean_token_accuracy": 0.42758620381355283, "step": 7725 }, { "epoch": 0.007785738099051107, "grad_norm": 52.490397455647674, "learning_rate": 7.7855891062184e-06, "loss": 2.1525, "mean_token_accuracy": 0.4620689630508423, "step": 7730 }, { "epoch": 0.00779077415215528, "grad_norm": 57.12734894283857, "learning_rate": 7.790625062949459e-06, "loss": 2.3675, "mean_token_accuracy": 0.4358136713504791, "step": 7735 }, { "epoch": 0.007795810205259452, "grad_norm": 35.61173530034061, "learning_rate": 7.79566101968052e-06, "loss": 2.0446, "mean_token_accuracy": 0.4931034505367279, "step": 7740 }, { "epoch": 0.007800846258363625, "grad_norm": 40.00574990266676, "learning_rate": 7.800696976411579e-06, "loss": 2.2376, "mean_token_accuracy": 0.41379310488700866, "step": 7745 }, { "epoch": 0.007805882311467798, "grad_norm": 58.59536698522381, "learning_rate": 7.805732933142638e-06, "loss": 2.4005, "mean_token_accuracy": 0.3965517282485962, "step": 7750 }, { "epoch": 0.007810918364571971, "grad_norm": 42.72620314201858, "learning_rate": 7.8107688898737e-06, "loss": 2.4971, "mean_token_accuracy": 0.4344827592372894, "step": 7755 }, { "epoch": 0.007815954417676144, "grad_norm": 54.89578534200915, "learning_rate": 7.815804846604758e-06, "loss": 2.3608, "mean_token_accuracy": 0.4206896543502808, "step": 7760 }, { "epoch": 0.007820990470780316, "grad_norm": 37.01029195744664, "learning_rate": 7.820840803335818e-06, "loss": 2.1321, "mean_token_accuracy": 0.4896551728248596, "step": 7765 }, { "epoch": 0.00782602652388449, "grad_norm": 40.008356401804875, "learning_rate": 7.825876760066877e-06, "loss": 2.268, "mean_token_accuracy": 0.46896551847457885, "step": 7770 }, { "epoch": 0.007831062576988661, "grad_norm": 64.97541840339466, "learning_rate": 7.830912716797938e-06, "loss": 2.3936, "mean_token_accuracy": 0.42068966031074523, "step": 7775 }, { "epoch": 0.007836098630092835, "grad_norm": 40.205765222638945, "learning_rate": 7.835948673528997e-06, "loss": 2.3242, "mean_token_accuracy": 0.4724137902259827, "step": 7780 }, { "epoch": 0.007841134683197007, "grad_norm": 35.61571063010522, "learning_rate": 7.840984630260058e-06, "loss": 2.2819, "mean_token_accuracy": 0.4000000059604645, "step": 7785 }, { "epoch": 0.00784617073630118, "grad_norm": 34.920276614563896, "learning_rate": 7.846020586991117e-06, "loss": 2.054, "mean_token_accuracy": 0.47586206793785096, "step": 7790 }, { "epoch": 0.007851206789405353, "grad_norm": 37.76935688270458, "learning_rate": 7.851056543722177e-06, "loss": 2.4469, "mean_token_accuracy": 0.4344827592372894, "step": 7795 }, { "epoch": 0.007856242842509525, "grad_norm": 51.292214779958435, "learning_rate": 7.856092500453238e-06, "loss": 2.5364, "mean_token_accuracy": 0.4034482777118683, "step": 7800 }, { "epoch": 0.007861278895613699, "grad_norm": 57.726874259807644, "learning_rate": 7.861128457184297e-06, "loss": 2.4918, "mean_token_accuracy": 0.4034482777118683, "step": 7805 }, { "epoch": 0.007866314948717871, "grad_norm": 42.92663352203394, "learning_rate": 7.866164413915356e-06, "loss": 2.2924, "mean_token_accuracy": 0.43103448748588563, "step": 7810 }, { "epoch": 0.007871351001822045, "grad_norm": 58.559171094667974, "learning_rate": 7.871200370646415e-06, "loss": 2.3535, "mean_token_accuracy": 0.43986691236495973, "step": 7815 }, { "epoch": 0.007876387054926217, "grad_norm": 37.632915335504485, "learning_rate": 7.876236327377476e-06, "loss": 2.2677, "mean_token_accuracy": 0.42758620381355283, "step": 7820 }, { "epoch": 0.007881423108030389, "grad_norm": 44.94353635415, "learning_rate": 7.881272284108535e-06, "loss": 2.6584, "mean_token_accuracy": 0.38965516090393065, "step": 7825 }, { "epoch": 0.007886459161134562, "grad_norm": 46.459638036345424, "learning_rate": 7.886308240839595e-06, "loss": 2.3074, "mean_token_accuracy": 0.4068965494632721, "step": 7830 }, { "epoch": 0.007891495214238734, "grad_norm": 40.9678738471561, "learning_rate": 7.891344197570654e-06, "loss": 1.9877, "mean_token_accuracy": 0.46896551847457885, "step": 7835 }, { "epoch": 0.007896531267342908, "grad_norm": 35.270568273042805, "learning_rate": 7.896380154301715e-06, "loss": 2.5115, "mean_token_accuracy": 0.43448275327682495, "step": 7840 }, { "epoch": 0.00790156732044708, "grad_norm": 41.37868217635883, "learning_rate": 7.901416111032774e-06, "loss": 2.3873, "mean_token_accuracy": 0.38620689511299133, "step": 7845 }, { "epoch": 0.007906603373551254, "grad_norm": 36.816621183394375, "learning_rate": 7.906452067763833e-06, "loss": 2.5478, "mean_token_accuracy": 0.4034482717514038, "step": 7850 }, { "epoch": 0.007911639426655426, "grad_norm": 38.40622947514277, "learning_rate": 7.911488024494894e-06, "loss": 2.4048, "mean_token_accuracy": 0.4103448212146759, "step": 7855 }, { "epoch": 0.007916675479759598, "grad_norm": 64.2355979643051, "learning_rate": 7.916523981225954e-06, "loss": 1.9694, "mean_token_accuracy": 0.4689655125141144, "step": 7860 }, { "epoch": 0.007921711532863772, "grad_norm": 34.714103206978635, "learning_rate": 7.921559937957015e-06, "loss": 2.5681, "mean_token_accuracy": 0.3793103456497192, "step": 7865 }, { "epoch": 0.007926747585967944, "grad_norm": 39.01254189774147, "learning_rate": 7.926595894688072e-06, "loss": 2.7166, "mean_token_accuracy": 0.3482758581638336, "step": 7870 }, { "epoch": 0.007931783639072118, "grad_norm": 34.12397176007829, "learning_rate": 7.931631851419133e-06, "loss": 2.2516, "mean_token_accuracy": 0.4413793087005615, "step": 7875 }, { "epoch": 0.00793681969217629, "grad_norm": 33.56962315886412, "learning_rate": 7.936667808150192e-06, "loss": 2.1755, "mean_token_accuracy": 0.4241379380226135, "step": 7880 }, { "epoch": 0.007941855745280463, "grad_norm": 35.76202580989349, "learning_rate": 7.941703764881253e-06, "loss": 2.1647, "mean_token_accuracy": 0.42413793206214906, "step": 7885 }, { "epoch": 0.007946891798384636, "grad_norm": 42.44219298453, "learning_rate": 7.946739721612313e-06, "loss": 2.0553, "mean_token_accuracy": 0.45517241954803467, "step": 7890 }, { "epoch": 0.007951927851488808, "grad_norm": 44.999233786125274, "learning_rate": 7.951775678343372e-06, "loss": 2.4038, "mean_token_accuracy": 0.44827585816383364, "step": 7895 }, { "epoch": 0.007956963904592981, "grad_norm": 32.965110297352844, "learning_rate": 7.956811635074433e-06, "loss": 2.0719, "mean_token_accuracy": 0.46896551847457885, "step": 7900 }, { "epoch": 0.007961999957697153, "grad_norm": 32.48970915329887, "learning_rate": 7.961847591805492e-06, "loss": 2.1785, "mean_token_accuracy": 0.4172413766384125, "step": 7905 }, { "epoch": 0.007967036010801327, "grad_norm": 36.76062399210729, "learning_rate": 7.966883548536551e-06, "loss": 2.1412, "mean_token_accuracy": 0.42758620977401735, "step": 7910 }, { "epoch": 0.0079720720639055, "grad_norm": 38.249870561280375, "learning_rate": 7.97191950526761e-06, "loss": 2.1193, "mean_token_accuracy": 0.482758629322052, "step": 7915 }, { "epoch": 0.007977108117009673, "grad_norm": 32.57216676657864, "learning_rate": 7.976955461998671e-06, "loss": 1.993, "mean_token_accuracy": 0.4793103337287903, "step": 7920 }, { "epoch": 0.007982144170113845, "grad_norm": 37.797191997060736, "learning_rate": 7.98199141872973e-06, "loss": 2.5122, "mean_token_accuracy": 0.41379310488700866, "step": 7925 }, { "epoch": 0.007987180223218017, "grad_norm": 41.1157806705336, "learning_rate": 7.98702737546079e-06, "loss": 2.1286, "mean_token_accuracy": 0.42758620977401735, "step": 7930 }, { "epoch": 0.00799221627632219, "grad_norm": 52.97541033800784, "learning_rate": 7.99206333219185e-06, "loss": 2.3396, "mean_token_accuracy": 0.47931033968925474, "step": 7935 }, { "epoch": 0.007997252329426363, "grad_norm": 33.541916680506986, "learning_rate": 7.99709928892291e-06, "loss": 2.3101, "mean_token_accuracy": 0.4413793206214905, "step": 7940 }, { "epoch": 0.008002288382530537, "grad_norm": 42.78588823599312, "learning_rate": 8.00213524565397e-06, "loss": 2.2737, "mean_token_accuracy": 0.4379310369491577, "step": 7945 }, { "epoch": 0.008007324435634709, "grad_norm": 40.79040540595521, "learning_rate": 8.007171202385029e-06, "loss": 2.4116, "mean_token_accuracy": 0.41034482717514037, "step": 7950 }, { "epoch": 0.008012360488738882, "grad_norm": 42.04350609933651, "learning_rate": 8.01220715911609e-06, "loss": 2.2567, "mean_token_accuracy": 0.4206896424293518, "step": 7955 }, { "epoch": 0.008017396541843054, "grad_norm": 65.63300468855537, "learning_rate": 8.017243115847149e-06, "loss": 2.5179, "mean_token_accuracy": 0.35862069129943847, "step": 7960 }, { "epoch": 0.008022432594947226, "grad_norm": 62.49629976732185, "learning_rate": 8.02227907257821e-06, "loss": 2.2414, "mean_token_accuracy": 0.42758620381355283, "step": 7965 }, { "epoch": 0.0080274686480514, "grad_norm": 50.490741559164285, "learning_rate": 8.027315029309267e-06, "loss": 2.4121, "mean_token_accuracy": 0.41724138259887694, "step": 7970 }, { "epoch": 0.008032504701155572, "grad_norm": 39.45273352591065, "learning_rate": 8.032350986040328e-06, "loss": 2.1944, "mean_token_accuracy": 0.4206896543502808, "step": 7975 }, { "epoch": 0.008037540754259746, "grad_norm": 37.438793730310145, "learning_rate": 8.037386942771388e-06, "loss": 2.0942, "mean_token_accuracy": 0.48620688915252686, "step": 7980 }, { "epoch": 0.008042576807363918, "grad_norm": 53.69375202443497, "learning_rate": 8.042422899502449e-06, "loss": 2.1803, "mean_token_accuracy": 0.44827585220336913, "step": 7985 }, { "epoch": 0.008047612860468092, "grad_norm": 55.15305514642944, "learning_rate": 8.047458856233508e-06, "loss": 2.3959, "mean_token_accuracy": 0.4379310250282288, "step": 7990 }, { "epoch": 0.008052648913572264, "grad_norm": 38.64104129707437, "learning_rate": 8.052494812964567e-06, "loss": 2.19, "mean_token_accuracy": 0.4344827592372894, "step": 7995 }, { "epoch": 0.008057684966676436, "grad_norm": 58.24588947930217, "learning_rate": 8.057530769695628e-06, "loss": 2.4183, "mean_token_accuracy": 0.4275861978530884, "step": 8000 }, { "epoch": 0.00806272101978061, "grad_norm": 40.03924470610734, "learning_rate": 8.062566726426687e-06, "loss": 2.1939, "mean_token_accuracy": 0.47931033968925474, "step": 8005 }, { "epoch": 0.008067757072884782, "grad_norm": 42.81956428420776, "learning_rate": 8.067602683157746e-06, "loss": 2.4836, "mean_token_accuracy": 0.3862068891525269, "step": 8010 }, { "epoch": 0.008072793125988955, "grad_norm": 34.87071358648223, "learning_rate": 8.072638639888806e-06, "loss": 2.2091, "mean_token_accuracy": 0.42068966031074523, "step": 8015 }, { "epoch": 0.008077829179093127, "grad_norm": 72.49380101156484, "learning_rate": 8.077674596619867e-06, "loss": 2.3102, "mean_token_accuracy": 0.43103448748588563, "step": 8020 }, { "epoch": 0.008082865232197301, "grad_norm": 50.21162649746146, "learning_rate": 8.082710553350926e-06, "loss": 2.3308, "mean_token_accuracy": 0.44137930274009707, "step": 8025 }, { "epoch": 0.008087901285301473, "grad_norm": 39.84420939519963, "learning_rate": 8.087746510081985e-06, "loss": 2.5493, "mean_token_accuracy": 0.39310343861579894, "step": 8030 }, { "epoch": 0.008092937338405645, "grad_norm": 48.5081883347131, "learning_rate": 8.092782466813046e-06, "loss": 2.1594, "mean_token_accuracy": 0.441379314661026, "step": 8035 }, { "epoch": 0.008097973391509819, "grad_norm": 53.68934938315932, "learning_rate": 8.097818423544105e-06, "loss": 2.3162, "mean_token_accuracy": 0.4517241358757019, "step": 8040 }, { "epoch": 0.008103009444613991, "grad_norm": 41.84541629098553, "learning_rate": 8.102854380275166e-06, "loss": 1.9821, "mean_token_accuracy": 0.5103448271751404, "step": 8045 }, { "epoch": 0.008108045497718165, "grad_norm": 66.47421368788807, "learning_rate": 8.107890337006224e-06, "loss": 2.4003, "mean_token_accuracy": 0.42413793206214906, "step": 8050 }, { "epoch": 0.008113081550822337, "grad_norm": 36.45912900699369, "learning_rate": 8.112926293737285e-06, "loss": 2.1878, "mean_token_accuracy": 0.46551724076271056, "step": 8055 }, { "epoch": 0.00811811760392651, "grad_norm": 39.97560119457867, "learning_rate": 8.117962250468344e-06, "loss": 2.1582, "mean_token_accuracy": 0.4517241358757019, "step": 8060 }, { "epoch": 0.008123153657030683, "grad_norm": 41.58880024560286, "learning_rate": 8.122998207199405e-06, "loss": 2.3462, "mean_token_accuracy": 0.4551724135875702, "step": 8065 }, { "epoch": 0.008128189710134855, "grad_norm": 44.26450059515507, "learning_rate": 8.128034163930463e-06, "loss": 2.0619, "mean_token_accuracy": 0.45862067937850953, "step": 8070 }, { "epoch": 0.008133225763239028, "grad_norm": 37.63761804732457, "learning_rate": 8.133070120661524e-06, "loss": 2.2692, "mean_token_accuracy": 0.47586206197738645, "step": 8075 }, { "epoch": 0.0081382618163432, "grad_norm": 40.85379689044504, "learning_rate": 8.138106077392583e-06, "loss": 2.3687, "mean_token_accuracy": 0.4241379201412201, "step": 8080 }, { "epoch": 0.008143297869447374, "grad_norm": 34.960317811560124, "learning_rate": 8.143142034123644e-06, "loss": 2.0302, "mean_token_accuracy": 0.4724137902259827, "step": 8085 }, { "epoch": 0.008148333922551546, "grad_norm": 41.97406448951179, "learning_rate": 8.148177990854703e-06, "loss": 2.2372, "mean_token_accuracy": 0.4620689630508423, "step": 8090 }, { "epoch": 0.00815336997565572, "grad_norm": 50.088763402484254, "learning_rate": 8.153213947585762e-06, "loss": 2.0148, "mean_token_accuracy": 0.5083484530448914, "step": 8095 }, { "epoch": 0.008158406028759892, "grad_norm": 41.3082877624813, "learning_rate": 8.158249904316823e-06, "loss": 2.121, "mean_token_accuracy": 0.4635813653469086, "step": 8100 }, { "epoch": 0.008163442081864064, "grad_norm": 36.25225758129978, "learning_rate": 8.163285861047882e-06, "loss": 2.2037, "mean_token_accuracy": 0.4711433708667755, "step": 8105 }, { "epoch": 0.008168478134968238, "grad_norm": 34.254175564685156, "learning_rate": 8.168321817778942e-06, "loss": 2.0332, "mean_token_accuracy": 0.4413793087005615, "step": 8110 }, { "epoch": 0.00817351418807241, "grad_norm": 36.044330006326014, "learning_rate": 8.173357774510001e-06, "loss": 2.2798, "mean_token_accuracy": 0.3965517163276672, "step": 8115 }, { "epoch": 0.008178550241176584, "grad_norm": 37.45435926828359, "learning_rate": 8.178393731241062e-06, "loss": 2.3105, "mean_token_accuracy": 0.4379310369491577, "step": 8120 }, { "epoch": 0.008183586294280756, "grad_norm": 49.65605341966188, "learning_rate": 8.183429687972121e-06, "loss": 2.1947, "mean_token_accuracy": 0.46551724076271056, "step": 8125 }, { "epoch": 0.00818862234738493, "grad_norm": 57.50848782891851, "learning_rate": 8.18846564470318e-06, "loss": 2.5328, "mean_token_accuracy": 0.4103448212146759, "step": 8130 }, { "epoch": 0.008193658400489101, "grad_norm": 40.497723755430066, "learning_rate": 8.193501601434241e-06, "loss": 2.6684, "mean_token_accuracy": 0.36206897497177126, "step": 8135 }, { "epoch": 0.008198694453593273, "grad_norm": 38.91745328429812, "learning_rate": 8.1985375581653e-06, "loss": 2.5117, "mean_token_accuracy": 0.41724138259887694, "step": 8140 }, { "epoch": 0.008203730506697447, "grad_norm": 41.95578759760776, "learning_rate": 8.203573514896362e-06, "loss": 2.6908, "mean_token_accuracy": 0.39655172228813174, "step": 8145 }, { "epoch": 0.00820876655980162, "grad_norm": 36.58132849975484, "learning_rate": 8.208609471627419e-06, "loss": 2.0842, "mean_token_accuracy": 0.47931034564971925, "step": 8150 }, { "epoch": 0.008213802612905793, "grad_norm": 45.23125145655737, "learning_rate": 8.21364542835848e-06, "loss": 2.2524, "mean_token_accuracy": 0.4689655125141144, "step": 8155 }, { "epoch": 0.008218838666009965, "grad_norm": 49.47783473807567, "learning_rate": 8.21868138508954e-06, "loss": 2.0822, "mean_token_accuracy": 0.49879008531570435, "step": 8160 }, { "epoch": 0.008223874719114139, "grad_norm": 44.80987018798864, "learning_rate": 8.2237173418206e-06, "loss": 2.3988, "mean_token_accuracy": 0.4620689630508423, "step": 8165 }, { "epoch": 0.008228910772218311, "grad_norm": 34.00098919013609, "learning_rate": 8.228753298551658e-06, "loss": 2.4857, "mean_token_accuracy": 0.41379310488700866, "step": 8170 }, { "epoch": 0.008233946825322483, "grad_norm": 46.77333255101476, "learning_rate": 8.233789255282719e-06, "loss": 2.0845, "mean_token_accuracy": 0.5172413766384125, "step": 8175 }, { "epoch": 0.008238982878426657, "grad_norm": 43.920721387950344, "learning_rate": 8.238825212013778e-06, "loss": 2.3801, "mean_token_accuracy": 0.41724138259887694, "step": 8180 }, { "epoch": 0.008244018931530829, "grad_norm": 42.612665764814295, "learning_rate": 8.243861168744839e-06, "loss": 2.3864, "mean_token_accuracy": 0.42413793206214906, "step": 8185 }, { "epoch": 0.008249054984635002, "grad_norm": 50.25398520828357, "learning_rate": 8.248897125475898e-06, "loss": 2.1631, "mean_token_accuracy": 0.46896551847457885, "step": 8190 }, { "epoch": 0.008254091037739174, "grad_norm": 60.12588568454377, "learning_rate": 8.253933082206957e-06, "loss": 2.0079, "mean_token_accuracy": 0.5068965494632721, "step": 8195 }, { "epoch": 0.008259127090843348, "grad_norm": 40.82705027320322, "learning_rate": 8.258969038938018e-06, "loss": 2.2173, "mean_token_accuracy": 0.488384747505188, "step": 8200 }, { "epoch": 0.00826416314394752, "grad_norm": 43.393192343596866, "learning_rate": 8.264004995669078e-06, "loss": 2.5521, "mean_token_accuracy": 0.40344826579093934, "step": 8205 }, { "epoch": 0.008269199197051692, "grad_norm": 58.900952500629366, "learning_rate": 8.269040952400139e-06, "loss": 2.3182, "mean_token_accuracy": 0.4310344815254211, "step": 8210 }, { "epoch": 0.008274235250155866, "grad_norm": 38.14638932995811, "learning_rate": 8.274076909131196e-06, "loss": 2.4245, "mean_token_accuracy": 0.44827585816383364, "step": 8215 }, { "epoch": 0.008279271303260038, "grad_norm": 37.38631091056911, "learning_rate": 8.279112865862257e-06, "loss": 2.4876, "mean_token_accuracy": 0.3999999940395355, "step": 8220 }, { "epoch": 0.008284307356364212, "grad_norm": 38.72564801450457, "learning_rate": 8.284148822593316e-06, "loss": 2.3549, "mean_token_accuracy": 0.4551724076271057, "step": 8225 }, { "epoch": 0.008289343409468384, "grad_norm": 42.6978603522511, "learning_rate": 8.289184779324377e-06, "loss": 2.2665, "mean_token_accuracy": 0.42413793206214906, "step": 8230 }, { "epoch": 0.008294379462572558, "grad_norm": 33.87195102632296, "learning_rate": 8.294220736055437e-06, "loss": 2.1292, "mean_token_accuracy": 0.48142769932746887, "step": 8235 }, { "epoch": 0.00829941551567673, "grad_norm": 69.36294434779421, "learning_rate": 8.299256692786496e-06, "loss": 2.2775, "mean_token_accuracy": 0.458620685338974, "step": 8240 }, { "epoch": 0.008304451568780902, "grad_norm": 40.956191026101365, "learning_rate": 8.304292649517557e-06, "loss": 2.0405, "mean_token_accuracy": 0.4724137902259827, "step": 8245 }, { "epoch": 0.008309487621885076, "grad_norm": 41.88479779991668, "learning_rate": 8.309328606248616e-06, "loss": 2.2381, "mean_token_accuracy": 0.4425892233848572, "step": 8250 }, { "epoch": 0.008314523674989248, "grad_norm": 52.61882079229251, "learning_rate": 8.314364562979675e-06, "loss": 2.1083, "mean_token_accuracy": 0.4551724135875702, "step": 8255 }, { "epoch": 0.008319559728093421, "grad_norm": 65.35804446109087, "learning_rate": 8.319400519710735e-06, "loss": 2.3923, "mean_token_accuracy": 0.48275861144065857, "step": 8260 }, { "epoch": 0.008324595781197593, "grad_norm": 57.60275869147242, "learning_rate": 8.324436476441795e-06, "loss": 2.5744, "mean_token_accuracy": 0.3620689660310745, "step": 8265 }, { "epoch": 0.008329631834301767, "grad_norm": 44.053857815518576, "learning_rate": 8.329472433172855e-06, "loss": 2.4967, "mean_token_accuracy": 0.3896551728248596, "step": 8270 }, { "epoch": 0.008334667887405939, "grad_norm": 42.719321551200395, "learning_rate": 8.334508389903914e-06, "loss": 2.3409, "mean_token_accuracy": 0.46896551847457885, "step": 8275 }, { "epoch": 0.008339703940510111, "grad_norm": 43.611556390877425, "learning_rate": 8.339544346634973e-06, "loss": 2.3195, "mean_token_accuracy": 0.4448275834321976, "step": 8280 }, { "epoch": 0.008344739993614285, "grad_norm": 50.46139890887202, "learning_rate": 8.344580303366034e-06, "loss": 2.046, "mean_token_accuracy": 0.5034482777118683, "step": 8285 }, { "epoch": 0.008349776046718457, "grad_norm": 28.860968470344737, "learning_rate": 8.349616260097093e-06, "loss": 2.1862, "mean_token_accuracy": 0.4379310369491577, "step": 8290 }, { "epoch": 0.00835481209982263, "grad_norm": 69.69906829796771, "learning_rate": 8.354652216828153e-06, "loss": 2.577, "mean_token_accuracy": 0.3827586233615875, "step": 8295 }, { "epoch": 0.008359848152926803, "grad_norm": 52.572973166661, "learning_rate": 8.359688173559214e-06, "loss": 2.8307, "mean_token_accuracy": 0.38275861740112305, "step": 8300 }, { "epoch": 0.008364884206030977, "grad_norm": 31.898356867082015, "learning_rate": 8.364724130290273e-06, "loss": 2.2226, "mean_token_accuracy": 0.4497882664203644, "step": 8305 }, { "epoch": 0.008369920259135149, "grad_norm": 39.30706262525915, "learning_rate": 8.369760087021334e-06, "loss": 1.8708, "mean_token_accuracy": 0.5275862038135528, "step": 8310 }, { "epoch": 0.00837495631223932, "grad_norm": 30.970151881832347, "learning_rate": 8.374796043752391e-06, "loss": 2.3245, "mean_token_accuracy": 0.42413793206214906, "step": 8315 }, { "epoch": 0.008379992365343494, "grad_norm": 33.16500427142407, "learning_rate": 8.379832000483452e-06, "loss": 2.3575, "mean_token_accuracy": 0.41724138259887694, "step": 8320 }, { "epoch": 0.008385028418447666, "grad_norm": 41.805727711977745, "learning_rate": 8.384867957214512e-06, "loss": 2.568, "mean_token_accuracy": 0.4689655125141144, "step": 8325 }, { "epoch": 0.00839006447155184, "grad_norm": 31.872970009545238, "learning_rate": 8.389903913945573e-06, "loss": 2.2111, "mean_token_accuracy": 0.4413793087005615, "step": 8330 }, { "epoch": 0.008395100524656012, "grad_norm": 47.08279578101426, "learning_rate": 8.394939870676632e-06, "loss": 2.5163, "mean_token_accuracy": 0.38620689511299133, "step": 8335 }, { "epoch": 0.008400136577760186, "grad_norm": 33.704780663549656, "learning_rate": 8.399975827407691e-06, "loss": 2.6529, "mean_token_accuracy": 0.4103448212146759, "step": 8340 }, { "epoch": 0.008405172630864358, "grad_norm": 30.837387375797213, "learning_rate": 8.405011784138752e-06, "loss": 2.1594, "mean_token_accuracy": 0.45335754156112673, "step": 8345 }, { "epoch": 0.00841020868396853, "grad_norm": 52.45484445909543, "learning_rate": 8.410047740869811e-06, "loss": 2.4392, "mean_token_accuracy": 0.4172413766384125, "step": 8350 }, { "epoch": 0.008415244737072704, "grad_norm": 47.76608733928675, "learning_rate": 8.41508369760087e-06, "loss": 2.354, "mean_token_accuracy": 0.4586206912994385, "step": 8355 }, { "epoch": 0.008420280790176876, "grad_norm": 34.27814291661414, "learning_rate": 8.42011965433193e-06, "loss": 2.388, "mean_token_accuracy": 0.4000000059604645, "step": 8360 }, { "epoch": 0.00842531684328105, "grad_norm": 49.11198164309136, "learning_rate": 8.42515561106299e-06, "loss": 2.322, "mean_token_accuracy": 0.44137929677963256, "step": 8365 }, { "epoch": 0.008430352896385222, "grad_norm": 42.31509318885319, "learning_rate": 8.43019156779405e-06, "loss": 2.4799, "mean_token_accuracy": 0.4310344815254211, "step": 8370 }, { "epoch": 0.008435388949489395, "grad_norm": 51.40820805360451, "learning_rate": 8.43522752452511e-06, "loss": 2.3267, "mean_token_accuracy": 0.441379314661026, "step": 8375 }, { "epoch": 0.008440425002593567, "grad_norm": 43.753664682766484, "learning_rate": 8.440263481256168e-06, "loss": 2.3846, "mean_token_accuracy": 0.46896551847457885, "step": 8380 }, { "epoch": 0.00844546105569774, "grad_norm": 48.13972778588484, "learning_rate": 8.44529943798723e-06, "loss": 2.433, "mean_token_accuracy": 0.4310344815254211, "step": 8385 }, { "epoch": 0.008450497108801913, "grad_norm": 41.326123472679015, "learning_rate": 8.450335394718289e-06, "loss": 2.4033, "mean_token_accuracy": 0.4068965554237366, "step": 8390 }, { "epoch": 0.008455533161906085, "grad_norm": 50.25378083705675, "learning_rate": 8.455371351449348e-06, "loss": 2.535, "mean_token_accuracy": 0.4172413766384125, "step": 8395 }, { "epoch": 0.008460569215010259, "grad_norm": 52.0906988744877, "learning_rate": 8.460407308180409e-06, "loss": 2.487, "mean_token_accuracy": 0.3551724135875702, "step": 8400 }, { "epoch": 0.008465605268114431, "grad_norm": 35.796510007826384, "learning_rate": 8.465443264911468e-06, "loss": 2.4722, "mean_token_accuracy": 0.41905626058578493, "step": 8405 }, { "epoch": 0.008470641321218605, "grad_norm": 36.42615482899636, "learning_rate": 8.470479221642529e-06, "loss": 2.4102, "mean_token_accuracy": 0.43793103098869324, "step": 8410 }, { "epoch": 0.008475677374322777, "grad_norm": 34.04840358539263, "learning_rate": 8.475515178373587e-06, "loss": 2.0857, "mean_token_accuracy": 0.4740471839904785, "step": 8415 }, { "epoch": 0.008480713427426949, "grad_norm": 38.36875296363297, "learning_rate": 8.480551135104648e-06, "loss": 2.3213, "mean_token_accuracy": 0.42758620381355283, "step": 8420 }, { "epoch": 0.008485749480531123, "grad_norm": 33.45723674163815, "learning_rate": 8.485587091835707e-06, "loss": 2.2982, "mean_token_accuracy": 0.42068966031074523, "step": 8425 }, { "epoch": 0.008490785533635295, "grad_norm": 40.84221107925927, "learning_rate": 8.490623048566768e-06, "loss": 2.2565, "mean_token_accuracy": 0.44482759237289426, "step": 8430 }, { "epoch": 0.008495821586739468, "grad_norm": 34.69469725065078, "learning_rate": 8.495659005297827e-06, "loss": 2.5097, "mean_token_accuracy": 0.4517241418361664, "step": 8435 }, { "epoch": 0.00850085763984364, "grad_norm": 40.34390793703597, "learning_rate": 8.500694962028886e-06, "loss": 2.2338, "mean_token_accuracy": 0.42758620977401735, "step": 8440 }, { "epoch": 0.008505893692947814, "grad_norm": 73.85690999251504, "learning_rate": 8.505730918759947e-06, "loss": 2.5933, "mean_token_accuracy": 0.38620689809322356, "step": 8445 }, { "epoch": 0.008510929746051986, "grad_norm": 44.998841024004676, "learning_rate": 8.510766875491006e-06, "loss": 2.2552, "mean_token_accuracy": 0.4379310369491577, "step": 8450 }, { "epoch": 0.008515965799156158, "grad_norm": 51.3707729358596, "learning_rate": 8.515802832222066e-06, "loss": 2.5021, "mean_token_accuracy": 0.3620689630508423, "step": 8455 }, { "epoch": 0.008521001852260332, "grad_norm": 24.101087322421513, "learning_rate": 8.520838788953125e-06, "loss": 1.909, "mean_token_accuracy": 0.5344827532768249, "step": 8460 }, { "epoch": 0.008526037905364504, "grad_norm": 47.28834755963221, "learning_rate": 8.525874745684186e-06, "loss": 2.4499, "mean_token_accuracy": 0.3965517282485962, "step": 8465 }, { "epoch": 0.008531073958468678, "grad_norm": 44.24203588040872, "learning_rate": 8.530910702415245e-06, "loss": 1.9313, "mean_token_accuracy": 0.5241379320621491, "step": 8470 }, { "epoch": 0.00853611001157285, "grad_norm": 48.91440137614651, "learning_rate": 8.535946659146304e-06, "loss": 2.441, "mean_token_accuracy": 0.4, "step": 8475 }, { "epoch": 0.008541146064677024, "grad_norm": 33.09101341416672, "learning_rate": 8.540982615877364e-06, "loss": 2.4386, "mean_token_accuracy": 0.4241379380226135, "step": 8480 }, { "epoch": 0.008546182117781196, "grad_norm": 49.80674398208219, "learning_rate": 8.546018572608425e-06, "loss": 2.4408, "mean_token_accuracy": 0.4068965494632721, "step": 8485 }, { "epoch": 0.008551218170885368, "grad_norm": 33.570541412030465, "learning_rate": 8.551054529339486e-06, "loss": 2.3873, "mean_token_accuracy": 0.41724138855934145, "step": 8490 }, { "epoch": 0.008556254223989541, "grad_norm": 41.25732377379406, "learning_rate": 8.556090486070543e-06, "loss": 2.118, "mean_token_accuracy": 0.4655172348022461, "step": 8495 }, { "epoch": 0.008561290277093713, "grad_norm": 37.305382099284905, "learning_rate": 8.561126442801604e-06, "loss": 2.536, "mean_token_accuracy": 0.35517241060733795, "step": 8500 }, { "epoch": 0.008566326330197887, "grad_norm": 36.75693946224768, "learning_rate": 8.566162399532663e-06, "loss": 2.3517, "mean_token_accuracy": 0.42758620977401735, "step": 8505 }, { "epoch": 0.00857136238330206, "grad_norm": 38.92055962350644, "learning_rate": 8.571198356263724e-06, "loss": 2.2516, "mean_token_accuracy": 0.4689655065536499, "step": 8510 }, { "epoch": 0.008576398436406233, "grad_norm": 40.83239379741632, "learning_rate": 8.576234312994782e-06, "loss": 2.0389, "mean_token_accuracy": 0.5379310309886932, "step": 8515 }, { "epoch": 0.008581434489510405, "grad_norm": 40.174235708758424, "learning_rate": 8.581270269725843e-06, "loss": 2.1814, "mean_token_accuracy": 0.4655172348022461, "step": 8520 }, { "epoch": 0.008586470542614577, "grad_norm": 39.61815409953896, "learning_rate": 8.586306226456902e-06, "loss": 2.3356, "mean_token_accuracy": 0.42068966031074523, "step": 8525 }, { "epoch": 0.00859150659571875, "grad_norm": 49.367727612798916, "learning_rate": 8.591342183187963e-06, "loss": 2.3857, "mean_token_accuracy": 0.45517241954803467, "step": 8530 }, { "epoch": 0.008596542648822923, "grad_norm": 35.705311181018494, "learning_rate": 8.596378139919022e-06, "loss": 2.2989, "mean_token_accuracy": 0.4517241418361664, "step": 8535 }, { "epoch": 0.008601578701927097, "grad_norm": 35.31542421008467, "learning_rate": 8.601414096650082e-06, "loss": 2.2813, "mean_token_accuracy": 0.4896551728248596, "step": 8540 }, { "epoch": 0.008606614755031269, "grad_norm": 46.1537174592479, "learning_rate": 8.606450053381142e-06, "loss": 2.6581, "mean_token_accuracy": 0.37241379022598264, "step": 8545 }, { "epoch": 0.008611650808135442, "grad_norm": 38.98543506655495, "learning_rate": 8.611486010112202e-06, "loss": 2.5408, "mean_token_accuracy": 0.43103447556495667, "step": 8550 }, { "epoch": 0.008616686861239614, "grad_norm": 41.395770800519884, "learning_rate": 8.616521966843261e-06, "loss": 2.5932, "mean_token_accuracy": 0.4034482777118683, "step": 8555 }, { "epoch": 0.008621722914343787, "grad_norm": 51.973730865817735, "learning_rate": 8.62155792357432e-06, "loss": 2.0703, "mean_token_accuracy": 0.4862068951129913, "step": 8560 }, { "epoch": 0.00862675896744796, "grad_norm": 39.53364695474878, "learning_rate": 8.626593880305381e-06, "loss": 2.2078, "mean_token_accuracy": 0.47586206197738645, "step": 8565 }, { "epoch": 0.008631795020552132, "grad_norm": 42.99717329018743, "learning_rate": 8.63162983703644e-06, "loss": 2.19, "mean_token_accuracy": 0.482758617401123, "step": 8570 }, { "epoch": 0.008636831073656306, "grad_norm": 36.192265195542795, "learning_rate": 8.6366657937675e-06, "loss": 2.3426, "mean_token_accuracy": 0.441379314661026, "step": 8575 }, { "epoch": 0.008641867126760478, "grad_norm": 44.509754288641254, "learning_rate": 8.64170175049856e-06, "loss": 2.5496, "mean_token_accuracy": 0.40544464290142057, "step": 8580 }, { "epoch": 0.008646903179864652, "grad_norm": 42.55656077506188, "learning_rate": 8.64673770722962e-06, "loss": 2.225, "mean_token_accuracy": 0.45517241954803467, "step": 8585 }, { "epoch": 0.008651939232968824, "grad_norm": 43.70543672001637, "learning_rate": 8.65177366396068e-06, "loss": 2.3263, "mean_token_accuracy": 0.45517241954803467, "step": 8590 }, { "epoch": 0.008656975286072996, "grad_norm": 38.05152548624296, "learning_rate": 8.65680962069174e-06, "loss": 2.3196, "mean_token_accuracy": 0.4344827592372894, "step": 8595 }, { "epoch": 0.00866201133917717, "grad_norm": 31.14734596815842, "learning_rate": 8.6618455774228e-06, "loss": 2.3625, "mean_token_accuracy": 0.44482759237289426, "step": 8600 }, { "epoch": 0.008667047392281342, "grad_norm": 37.841178598641775, "learning_rate": 8.666881534153859e-06, "loss": 2.2106, "mean_token_accuracy": 0.43793103098869324, "step": 8605 }, { "epoch": 0.008672083445385515, "grad_norm": 40.33737448405054, "learning_rate": 8.67191749088492e-06, "loss": 2.0556, "mean_token_accuracy": 0.45668481588363646, "step": 8610 }, { "epoch": 0.008677119498489688, "grad_norm": 35.0094061971413, "learning_rate": 8.676953447615979e-06, "loss": 2.312, "mean_token_accuracy": 0.47114337682724, "step": 8615 }, { "epoch": 0.008682155551593861, "grad_norm": 44.92520828183209, "learning_rate": 8.681989404347038e-06, "loss": 2.2136, "mean_token_accuracy": 0.4551724135875702, "step": 8620 }, { "epoch": 0.008687191604698033, "grad_norm": 42.8886256403922, "learning_rate": 8.687025361078097e-06, "loss": 2.5508, "mean_token_accuracy": 0.4103448212146759, "step": 8625 }, { "epoch": 0.008692227657802205, "grad_norm": 40.88069658026503, "learning_rate": 8.692061317809158e-06, "loss": 2.321, "mean_token_accuracy": 0.42758620381355283, "step": 8630 }, { "epoch": 0.008697263710906379, "grad_norm": 40.499993649477204, "learning_rate": 8.697097274540217e-06, "loss": 2.2651, "mean_token_accuracy": 0.4620689690113068, "step": 8635 }, { "epoch": 0.008702299764010551, "grad_norm": 37.44539081801667, "learning_rate": 8.702133231271277e-06, "loss": 2.4669, "mean_token_accuracy": 0.3758620709180832, "step": 8640 }, { "epoch": 0.008707335817114725, "grad_norm": 43.00615053294406, "learning_rate": 8.707169188002338e-06, "loss": 2.1534, "mean_token_accuracy": 0.4896551728248596, "step": 8645 }, { "epoch": 0.008712371870218897, "grad_norm": 36.150761656034426, "learning_rate": 8.712205144733397e-06, "loss": 2.4289, "mean_token_accuracy": 0.4379310369491577, "step": 8650 }, { "epoch": 0.00871740792332307, "grad_norm": 32.623081683858004, "learning_rate": 8.717241101464458e-06, "loss": 2.2866, "mean_token_accuracy": 0.44137930274009707, "step": 8655 }, { "epoch": 0.008722443976427243, "grad_norm": 37.20720087192561, "learning_rate": 8.722277058195515e-06, "loss": 2.5528, "mean_token_accuracy": 0.37586206793785093, "step": 8660 }, { "epoch": 0.008727480029531415, "grad_norm": 47.81739334123764, "learning_rate": 8.727313014926576e-06, "loss": 2.3752, "mean_token_accuracy": 0.47241378426551817, "step": 8665 }, { "epoch": 0.008732516082635589, "grad_norm": 29.557989687858534, "learning_rate": 8.732348971657636e-06, "loss": 2.1488, "mean_token_accuracy": 0.4793103337287903, "step": 8670 }, { "epoch": 0.00873755213573976, "grad_norm": 37.71365427684128, "learning_rate": 8.737384928388697e-06, "loss": 2.2568, "mean_token_accuracy": 0.47241379618644713, "step": 8675 }, { "epoch": 0.008742588188843934, "grad_norm": 70.82503241671895, "learning_rate": 8.742420885119756e-06, "loss": 2.4081, "mean_token_accuracy": 0.38275861740112305, "step": 8680 }, { "epoch": 0.008747624241948106, "grad_norm": 29.05161967453014, "learning_rate": 8.747456841850815e-06, "loss": 2.1983, "mean_token_accuracy": 0.4379310369491577, "step": 8685 }, { "epoch": 0.00875266029505228, "grad_norm": 43.57380949547287, "learning_rate": 8.752492798581876e-06, "loss": 2.1979, "mean_token_accuracy": 0.4517241418361664, "step": 8690 }, { "epoch": 0.008757696348156452, "grad_norm": 38.23286519420096, "learning_rate": 8.757528755312935e-06, "loss": 2.1895, "mean_token_accuracy": 0.4448275864124298, "step": 8695 }, { "epoch": 0.008762732401260624, "grad_norm": 37.86838052576704, "learning_rate": 8.762564712043995e-06, "loss": 2.0717, "mean_token_accuracy": 0.4482758641242981, "step": 8700 }, { "epoch": 0.008767768454364798, "grad_norm": 27.461225141146343, "learning_rate": 8.767600668775054e-06, "loss": 2.3125, "mean_token_accuracy": 0.4689655125141144, "step": 8705 }, { "epoch": 0.00877280450746897, "grad_norm": 49.42405799120369, "learning_rate": 8.772636625506115e-06, "loss": 2.4737, "mean_token_accuracy": 0.4068965494632721, "step": 8710 }, { "epoch": 0.008777840560573144, "grad_norm": 29.38572461818643, "learning_rate": 8.777672582237174e-06, "loss": 2.3134, "mean_token_accuracy": 0.4068965554237366, "step": 8715 }, { "epoch": 0.008782876613677316, "grad_norm": 33.767494241591706, "learning_rate": 8.782708538968233e-06, "loss": 2.572, "mean_token_accuracy": 0.4379310369491577, "step": 8720 }, { "epoch": 0.008787912666781488, "grad_norm": 48.39461483688185, "learning_rate": 8.787744495699293e-06, "loss": 2.3966, "mean_token_accuracy": 0.4275861978530884, "step": 8725 }, { "epoch": 0.008792948719885662, "grad_norm": 38.000891790029144, "learning_rate": 8.792780452430353e-06, "loss": 2.3606, "mean_token_accuracy": 0.4034482777118683, "step": 8730 }, { "epoch": 0.008797984772989834, "grad_norm": 46.65716281408244, "learning_rate": 8.797816409161413e-06, "loss": 2.5848, "mean_token_accuracy": 0.4103448212146759, "step": 8735 }, { "epoch": 0.008803020826094007, "grad_norm": 36.24999275021773, "learning_rate": 8.802852365892472e-06, "loss": 2.238, "mean_token_accuracy": 0.45027223229408264, "step": 8740 }, { "epoch": 0.00880805687919818, "grad_norm": 37.003292026075044, "learning_rate": 8.807888322623533e-06, "loss": 2.2523, "mean_token_accuracy": 0.4344827592372894, "step": 8745 }, { "epoch": 0.008813092932302353, "grad_norm": 43.76924611698608, "learning_rate": 8.812924279354592e-06, "loss": 2.4025, "mean_token_accuracy": 0.46551724076271056, "step": 8750 }, { "epoch": 0.008818128985406525, "grad_norm": 45.49062638467538, "learning_rate": 8.817960236085653e-06, "loss": 2.1171, "mean_token_accuracy": 0.4517241418361664, "step": 8755 }, { "epoch": 0.008823165038510697, "grad_norm": 37.34148266789959, "learning_rate": 8.82299619281671e-06, "loss": 2.1282, "mean_token_accuracy": 0.47586206793785096, "step": 8760 }, { "epoch": 0.008828201091614871, "grad_norm": 48.44665781185846, "learning_rate": 8.828032149547772e-06, "loss": 2.0659, "mean_token_accuracy": 0.48275862336158754, "step": 8765 }, { "epoch": 0.008833237144719043, "grad_norm": 40.81478468434724, "learning_rate": 8.833068106278831e-06, "loss": 2.3315, "mean_token_accuracy": 0.4413793087005615, "step": 8770 }, { "epoch": 0.008838273197823217, "grad_norm": 28.433689295949712, "learning_rate": 8.838104063009892e-06, "loss": 2.4215, "mean_token_accuracy": 0.417241370677948, "step": 8775 }, { "epoch": 0.008843309250927389, "grad_norm": 51.63645324935579, "learning_rate": 8.843140019740951e-06, "loss": 2.3854, "mean_token_accuracy": 0.39655172228813174, "step": 8780 }, { "epoch": 0.008848345304031563, "grad_norm": 44.843455274997986, "learning_rate": 8.84817597647201e-06, "loss": 2.002, "mean_token_accuracy": 0.5137930929660797, "step": 8785 }, { "epoch": 0.008853381357135735, "grad_norm": 43.91729593034281, "learning_rate": 8.853211933203071e-06, "loss": 2.2936, "mean_token_accuracy": 0.4517241358757019, "step": 8790 }, { "epoch": 0.008858417410239907, "grad_norm": 45.90493445949028, "learning_rate": 8.85824788993413e-06, "loss": 1.9773, "mean_token_accuracy": 0.4931034505367279, "step": 8795 }, { "epoch": 0.00886345346334408, "grad_norm": 50.237365248672106, "learning_rate": 8.86328384666519e-06, "loss": 2.1663, "mean_token_accuracy": 0.4344827592372894, "step": 8800 }, { "epoch": 0.008868489516448252, "grad_norm": 46.8542522745384, "learning_rate": 8.868319803396249e-06, "loss": 2.3889, "mean_token_accuracy": 0.44325469732284545, "step": 8805 }, { "epoch": 0.008873525569552426, "grad_norm": 50.31435705567498, "learning_rate": 8.87335576012731e-06, "loss": 2.6584, "mean_token_accuracy": 0.41379310488700866, "step": 8810 }, { "epoch": 0.008878561622656598, "grad_norm": 41.08626928352491, "learning_rate": 8.87839171685837e-06, "loss": 2.3161, "mean_token_accuracy": 0.4517241418361664, "step": 8815 }, { "epoch": 0.008883597675760772, "grad_norm": 34.973993618368965, "learning_rate": 8.883427673589428e-06, "loss": 2.2866, "mean_token_accuracy": 0.47586207985877993, "step": 8820 }, { "epoch": 0.008888633728864944, "grad_norm": 60.628902434693316, "learning_rate": 8.888463630320488e-06, "loss": 2.3726, "mean_token_accuracy": 0.45517241954803467, "step": 8825 }, { "epoch": 0.008893669781969116, "grad_norm": 33.119677580682875, "learning_rate": 8.893499587051549e-06, "loss": 2.4343, "mean_token_accuracy": 0.4172413766384125, "step": 8830 }, { "epoch": 0.00889870583507329, "grad_norm": 35.82282077874185, "learning_rate": 8.898535543782608e-06, "loss": 2.1443, "mean_token_accuracy": 0.47931034564971925, "step": 8835 }, { "epoch": 0.008903741888177462, "grad_norm": 74.12009728054932, "learning_rate": 8.903571500513667e-06, "loss": 2.2997, "mean_token_accuracy": 0.43448275327682495, "step": 8840 }, { "epoch": 0.008908777941281636, "grad_norm": 42.03595518693085, "learning_rate": 8.908607457244728e-06, "loss": 2.5405, "mean_token_accuracy": 0.41034482717514037, "step": 8845 }, { "epoch": 0.008913813994385808, "grad_norm": 37.246355210177335, "learning_rate": 8.913643413975787e-06, "loss": 2.3698, "mean_token_accuracy": 0.4310344815254211, "step": 8850 }, { "epoch": 0.008918850047489981, "grad_norm": 39.412295209463686, "learning_rate": 8.918679370706848e-06, "loss": 1.8509, "mean_token_accuracy": 0.5310344815254211, "step": 8855 }, { "epoch": 0.008923886100594153, "grad_norm": 42.34480877915383, "learning_rate": 8.923715327437906e-06, "loss": 2.3356, "mean_token_accuracy": 0.4103448331356049, "step": 8860 }, { "epoch": 0.008928922153698325, "grad_norm": 36.93129834496132, "learning_rate": 8.928751284168967e-06, "loss": 1.9451, "mean_token_accuracy": 0.46551724076271056, "step": 8865 }, { "epoch": 0.0089339582068025, "grad_norm": 44.40214287692589, "learning_rate": 8.933787240900026e-06, "loss": 2.3087, "mean_token_accuracy": 0.4689655125141144, "step": 8870 }, { "epoch": 0.008938994259906671, "grad_norm": 46.459044601030485, "learning_rate": 8.938823197631087e-06, "loss": 2.5888, "mean_token_accuracy": 0.41724138259887694, "step": 8875 }, { "epoch": 0.008944030313010845, "grad_norm": 36.17053422894732, "learning_rate": 8.943859154362146e-06, "loss": 2.3261, "mean_token_accuracy": 0.4379310369491577, "step": 8880 }, { "epoch": 0.008949066366115017, "grad_norm": 39.471827120213575, "learning_rate": 8.948895111093206e-06, "loss": 2.2563, "mean_token_accuracy": 0.43448275327682495, "step": 8885 }, { "epoch": 0.00895410241921919, "grad_norm": 35.29991544199468, "learning_rate": 8.953931067824266e-06, "loss": 2.4227, "mean_token_accuracy": 0.42758620381355283, "step": 8890 }, { "epoch": 0.008959138472323363, "grad_norm": 32.2206525931091, "learning_rate": 8.958967024555326e-06, "loss": 2.0856, "mean_token_accuracy": 0.47241378426551817, "step": 8895 }, { "epoch": 0.008964174525427535, "grad_norm": 40.16277530680962, "learning_rate": 8.964002981286385e-06, "loss": 2.126, "mean_token_accuracy": 0.5068965494632721, "step": 8900 }, { "epoch": 0.008969210578531709, "grad_norm": 36.27583049175853, "learning_rate": 8.969038938017444e-06, "loss": 2.3759, "mean_token_accuracy": 0.41379311084747317, "step": 8905 }, { "epoch": 0.00897424663163588, "grad_norm": 40.78625627856431, "learning_rate": 8.974074894748505e-06, "loss": 2.4439, "mean_token_accuracy": 0.39310344457626345, "step": 8910 }, { "epoch": 0.008979282684740054, "grad_norm": 35.61396377356604, "learning_rate": 8.979110851479564e-06, "loss": 2.657, "mean_token_accuracy": 0.37586207389831544, "step": 8915 }, { "epoch": 0.008984318737844227, "grad_norm": 41.937265004924406, "learning_rate": 8.984146808210624e-06, "loss": 2.4642, "mean_token_accuracy": 0.4034482777118683, "step": 8920 }, { "epoch": 0.0089893547909484, "grad_norm": 46.92405418834552, "learning_rate": 8.989182764941683e-06, "loss": 2.1024, "mean_token_accuracy": 0.482758617401123, "step": 8925 }, { "epoch": 0.008994390844052572, "grad_norm": 40.53266462613649, "learning_rate": 8.994218721672744e-06, "loss": 2.5797, "mean_token_accuracy": 0.4137930989265442, "step": 8930 }, { "epoch": 0.008999426897156744, "grad_norm": 39.23018318463504, "learning_rate": 8.999254678403803e-06, "loss": 2.2317, "mean_token_accuracy": 0.46551724076271056, "step": 8935 }, { "epoch": 0.009004462950260918, "grad_norm": 31.550674691626337, "learning_rate": 9.004290635134862e-06, "loss": 2.8168, "mean_token_accuracy": 0.38620689511299133, "step": 8940 }, { "epoch": 0.00900949900336509, "grad_norm": 68.64533245222765, "learning_rate": 9.009326591865923e-06, "loss": 2.2131, "mean_token_accuracy": 0.482758629322052, "step": 8945 }, { "epoch": 0.009014535056469264, "grad_norm": 42.42249912773483, "learning_rate": 9.014362548596983e-06, "loss": 2.1215, "mean_token_accuracy": 0.4862068951129913, "step": 8950 }, { "epoch": 0.009019571109573436, "grad_norm": 45.18732616195425, "learning_rate": 9.019398505328044e-06, "loss": 2.4248, "mean_token_accuracy": 0.44187192916870116, "step": 8955 }, { "epoch": 0.00902460716267761, "grad_norm": 35.63652813760277, "learning_rate": 9.024434462059101e-06, "loss": 2.6377, "mean_token_accuracy": 0.4068965554237366, "step": 8960 }, { "epoch": 0.009029643215781782, "grad_norm": 35.049794696554954, "learning_rate": 9.029470418790162e-06, "loss": 2.5643, "mean_token_accuracy": 0.4206896543502808, "step": 8965 }, { "epoch": 0.009034679268885954, "grad_norm": 41.191098651544245, "learning_rate": 9.034506375521221e-06, "loss": 2.3087, "mean_token_accuracy": 0.45862069725990295, "step": 8970 }, { "epoch": 0.009039715321990128, "grad_norm": 33.91435207143863, "learning_rate": 9.039542332252282e-06, "loss": 2.1685, "mean_token_accuracy": 0.458620685338974, "step": 8975 }, { "epoch": 0.0090447513750943, "grad_norm": 31.991502555305818, "learning_rate": 9.044578288983342e-06, "loss": 2.2538, "mean_token_accuracy": 0.44137930274009707, "step": 8980 }, { "epoch": 0.009049787428198473, "grad_norm": 31.863882731371987, "learning_rate": 9.0496142457144e-06, "loss": 2.5573, "mean_token_accuracy": 0.38275861740112305, "step": 8985 }, { "epoch": 0.009054823481302645, "grad_norm": 36.29538954972781, "learning_rate": 9.054650202445462e-06, "loss": 2.1569, "mean_token_accuracy": 0.4517241358757019, "step": 8990 }, { "epoch": 0.009059859534406819, "grad_norm": 35.64888336346998, "learning_rate": 9.059686159176521e-06, "loss": 2.1827, "mean_token_accuracy": 0.4413793087005615, "step": 8995 }, { "epoch": 0.009064895587510991, "grad_norm": 35.91379375037939, "learning_rate": 9.064722115907582e-06, "loss": 2.3477, "mean_token_accuracy": 0.4517241299152374, "step": 9000 }, { "epoch": 0.009069931640615163, "grad_norm": 36.88970119610507, "learning_rate": 9.06975807263864e-06, "loss": 2.6087, "mean_token_accuracy": 0.3965517163276672, "step": 9005 }, { "epoch": 0.009074967693719337, "grad_norm": 41.9216727845431, "learning_rate": 9.0747940293697e-06, "loss": 2.3214, "mean_token_accuracy": 0.458620685338974, "step": 9010 }, { "epoch": 0.009080003746823509, "grad_norm": 48.24826254484031, "learning_rate": 9.07982998610076e-06, "loss": 2.2357, "mean_token_accuracy": 0.46551724076271056, "step": 9015 }, { "epoch": 0.009085039799927683, "grad_norm": 89.105300244921, "learning_rate": 9.08486594283182e-06, "loss": 2.3236, "mean_token_accuracy": 0.43647912740707395, "step": 9020 }, { "epoch": 0.009090075853031855, "grad_norm": 47.32426116857205, "learning_rate": 9.08990189956288e-06, "loss": 2.6921, "mean_token_accuracy": 0.37586206793785093, "step": 9025 }, { "epoch": 0.009095111906136029, "grad_norm": 35.637115173034246, "learning_rate": 9.094937856293939e-06, "loss": 2.4778, "mean_token_accuracy": 0.3793103456497192, "step": 9030 }, { "epoch": 0.0091001479592402, "grad_norm": 46.00403941566564, "learning_rate": 9.099973813025e-06, "loss": 2.4927, "mean_token_accuracy": 0.39310344457626345, "step": 9035 }, { "epoch": 0.009105184012344373, "grad_norm": 38.43241376328721, "learning_rate": 9.10500976975606e-06, "loss": 2.2848, "mean_token_accuracy": 0.42758620381355283, "step": 9040 }, { "epoch": 0.009110220065448546, "grad_norm": 35.176183223070055, "learning_rate": 9.110045726487119e-06, "loss": 2.4917, "mean_token_accuracy": 0.38965516686439516, "step": 9045 }, { "epoch": 0.009115256118552718, "grad_norm": 38.76078797257197, "learning_rate": 9.115081683218178e-06, "loss": 2.5065, "mean_token_accuracy": 0.37586206793785093, "step": 9050 }, { "epoch": 0.009120292171656892, "grad_norm": 39.51896197275594, "learning_rate": 9.120117639949239e-06, "loss": 2.1839, "mean_token_accuracy": 0.48620688915252686, "step": 9055 }, { "epoch": 0.009125328224761064, "grad_norm": 52.32388189414563, "learning_rate": 9.125153596680298e-06, "loss": 2.3236, "mean_token_accuracy": 0.4655172348022461, "step": 9060 }, { "epoch": 0.009130364277865238, "grad_norm": 36.430747498799896, "learning_rate": 9.130189553411357e-06, "loss": 1.9821, "mean_token_accuracy": 0.4931034445762634, "step": 9065 }, { "epoch": 0.00913540033096941, "grad_norm": 40.2020694323548, "learning_rate": 9.135225510142417e-06, "loss": 2.2924, "mean_token_accuracy": 0.5, "step": 9070 }, { "epoch": 0.009140436384073582, "grad_norm": 41.64997068405822, "learning_rate": 9.140261466873477e-06, "loss": 2.2588, "mean_token_accuracy": 0.45517241954803467, "step": 9075 }, { "epoch": 0.009145472437177756, "grad_norm": 39.939107386196405, "learning_rate": 9.145297423604537e-06, "loss": 2.0679, "mean_token_accuracy": 0.46551724076271056, "step": 9080 }, { "epoch": 0.009150508490281928, "grad_norm": 37.80708946159949, "learning_rate": 9.150333380335596e-06, "loss": 2.2589, "mean_token_accuracy": 0.4689655303955078, "step": 9085 }, { "epoch": 0.009155544543386102, "grad_norm": 40.82085204326002, "learning_rate": 9.155369337066657e-06, "loss": 2.3523, "mean_token_accuracy": 0.4310344815254211, "step": 9090 }, { "epoch": 0.009160580596490274, "grad_norm": 36.91840742315867, "learning_rate": 9.160405293797716e-06, "loss": 2.2179, "mean_token_accuracy": 0.4206896543502808, "step": 9095 }, { "epoch": 0.009165616649594447, "grad_norm": 39.089459598305226, "learning_rate": 9.165441250528777e-06, "loss": 2.5603, "mean_token_accuracy": 0.4413793087005615, "step": 9100 }, { "epoch": 0.00917065270269862, "grad_norm": 41.483819350705524, "learning_rate": 9.170477207259835e-06, "loss": 2.4374, "mean_token_accuracy": 0.4068965494632721, "step": 9105 }, { "epoch": 0.009175688755802791, "grad_norm": 38.93403221599773, "learning_rate": 9.175513163990896e-06, "loss": 2.4541, "mean_token_accuracy": 0.4532970368862152, "step": 9110 }, { "epoch": 0.009180724808906965, "grad_norm": 37.15036938373122, "learning_rate": 9.180549120721955e-06, "loss": 2.2243, "mean_token_accuracy": 0.4724137902259827, "step": 9115 }, { "epoch": 0.009185760862011137, "grad_norm": 43.307143940722554, "learning_rate": 9.185585077453016e-06, "loss": 2.342, "mean_token_accuracy": 0.4517241418361664, "step": 9120 }, { "epoch": 0.009190796915115311, "grad_norm": 36.52593057082021, "learning_rate": 9.190621034184075e-06, "loss": 2.4541, "mean_token_accuracy": 0.3827586233615875, "step": 9125 }, { "epoch": 0.009195832968219483, "grad_norm": 31.69491766657416, "learning_rate": 9.195656990915134e-06, "loss": 2.2288, "mean_token_accuracy": 0.4880822718143463, "step": 9130 }, { "epoch": 0.009200869021323657, "grad_norm": 33.46994089808851, "learning_rate": 9.200692947646195e-06, "loss": 2.5069, "mean_token_accuracy": 0.4000000059604645, "step": 9135 }, { "epoch": 0.009205905074427829, "grad_norm": 47.98058660860216, "learning_rate": 9.205728904377255e-06, "loss": 2.0576, "mean_token_accuracy": 0.47931034564971925, "step": 9140 }, { "epoch": 0.009210941127532, "grad_norm": 36.29921682152968, "learning_rate": 9.210764861108314e-06, "loss": 2.1427, "mean_token_accuracy": 0.44827585816383364, "step": 9145 }, { "epoch": 0.009215977180636175, "grad_norm": 33.49505729104911, "learning_rate": 9.215800817839373e-06, "loss": 2.6732, "mean_token_accuracy": 0.4034482777118683, "step": 9150 }, { "epoch": 0.009221013233740347, "grad_norm": 44.95200390828714, "learning_rate": 9.220836774570434e-06, "loss": 2.4702, "mean_token_accuracy": 0.41379310488700866, "step": 9155 }, { "epoch": 0.00922604928684452, "grad_norm": 42.43805088101427, "learning_rate": 9.225872731301493e-06, "loss": 2.6847, "mean_token_accuracy": 0.39655172228813174, "step": 9160 }, { "epoch": 0.009231085339948692, "grad_norm": 32.32555586065422, "learning_rate": 9.230908688032552e-06, "loss": 1.752, "mean_token_accuracy": 0.5379310369491577, "step": 9165 }, { "epoch": 0.009236121393052866, "grad_norm": 34.91766987949955, "learning_rate": 9.235944644763612e-06, "loss": 2.1642, "mean_token_accuracy": 0.4661329984664917, "step": 9170 }, { "epoch": 0.009241157446157038, "grad_norm": 34.98220392415641, "learning_rate": 9.240980601494673e-06, "loss": 2.3802, "mean_token_accuracy": 0.4517241358757019, "step": 9175 }, { "epoch": 0.00924619349926121, "grad_norm": 33.05414702913924, "learning_rate": 9.246016558225732e-06, "loss": 2.7526, "mean_token_accuracy": 0.4137930989265442, "step": 9180 }, { "epoch": 0.009251229552365384, "grad_norm": 33.42497700812565, "learning_rate": 9.251052514956791e-06, "loss": 2.617, "mean_token_accuracy": 0.382758629322052, "step": 9185 }, { "epoch": 0.009256265605469556, "grad_norm": 38.947026588854825, "learning_rate": 9.256088471687852e-06, "loss": 2.3767, "mean_token_accuracy": 0.43448275327682495, "step": 9190 }, { "epoch": 0.00926130165857373, "grad_norm": 51.582724085860605, "learning_rate": 9.261124428418911e-06, "loss": 2.4202, "mean_token_accuracy": 0.3827586233615875, "step": 9195 }, { "epoch": 0.009266337711677902, "grad_norm": 48.45775234282198, "learning_rate": 9.266160385149972e-06, "loss": 2.4542, "mean_token_accuracy": 0.43793103098869324, "step": 9200 }, { "epoch": 0.009271373764782076, "grad_norm": 33.477875381335934, "learning_rate": 9.27119634188103e-06, "loss": 2.2288, "mean_token_accuracy": 0.43793103098869324, "step": 9205 }, { "epoch": 0.009276409817886248, "grad_norm": 33.39386192015122, "learning_rate": 9.276232298612091e-06, "loss": 2.2179, "mean_token_accuracy": 0.4931034445762634, "step": 9210 }, { "epoch": 0.00928144587099042, "grad_norm": 37.65668756353228, "learning_rate": 9.28126825534315e-06, "loss": 2.5813, "mean_token_accuracy": 0.3965517282485962, "step": 9215 }, { "epoch": 0.009286481924094593, "grad_norm": 45.799604301856725, "learning_rate": 9.286304212074211e-06, "loss": 2.2835, "mean_token_accuracy": 0.458620685338974, "step": 9220 }, { "epoch": 0.009291517977198765, "grad_norm": 37.43982355107599, "learning_rate": 9.29134016880527e-06, "loss": 2.1804, "mean_token_accuracy": 0.47586206793785096, "step": 9225 }, { "epoch": 0.00929655403030294, "grad_norm": 30.442295618337628, "learning_rate": 9.29637612553633e-06, "loss": 2.1534, "mean_token_accuracy": 0.44827585816383364, "step": 9230 }, { "epoch": 0.009301590083407111, "grad_norm": 28.580133085103068, "learning_rate": 9.30141208226739e-06, "loss": 2.3145, "mean_token_accuracy": 0.4482758641242981, "step": 9235 }, { "epoch": 0.009306626136511285, "grad_norm": 48.86313982794057, "learning_rate": 9.30644803899845e-06, "loss": 2.5066, "mean_token_accuracy": 0.4586206912994385, "step": 9240 }, { "epoch": 0.009311662189615457, "grad_norm": 35.62881252197929, "learning_rate": 9.311483995729509e-06, "loss": 2.3503, "mean_token_accuracy": 0.37586206793785093, "step": 9245 }, { "epoch": 0.009316698242719629, "grad_norm": 37.529854807030055, "learning_rate": 9.316519952460568e-06, "loss": 2.168, "mean_token_accuracy": 0.4310344815254211, "step": 9250 }, { "epoch": 0.009321734295823803, "grad_norm": 30.44971412684755, "learning_rate": 9.32155590919163e-06, "loss": 2.2228, "mean_token_accuracy": 0.46551724076271056, "step": 9255 }, { "epoch": 0.009326770348927975, "grad_norm": 40.207080397248326, "learning_rate": 9.326591865922688e-06, "loss": 2.4239, "mean_token_accuracy": 0.4068965554237366, "step": 9260 }, { "epoch": 0.009331806402032149, "grad_norm": 37.07009577442753, "learning_rate": 9.331627822653748e-06, "loss": 2.6045, "mean_token_accuracy": 0.38965516686439516, "step": 9265 }, { "epoch": 0.00933684245513632, "grad_norm": 37.49203138237693, "learning_rate": 9.336663779384807e-06, "loss": 2.3839, "mean_token_accuracy": 0.41034482717514037, "step": 9270 }, { "epoch": 0.009341878508240494, "grad_norm": 37.72476793928882, "learning_rate": 9.341699736115868e-06, "loss": 2.733, "mean_token_accuracy": 0.3551724135875702, "step": 9275 }, { "epoch": 0.009346914561344666, "grad_norm": 39.5307546229843, "learning_rate": 9.346735692846927e-06, "loss": 2.3011, "mean_token_accuracy": 0.4436176776885986, "step": 9280 }, { "epoch": 0.009351950614448839, "grad_norm": 35.12676499845565, "learning_rate": 9.351771649577986e-06, "loss": 2.2473, "mean_token_accuracy": 0.4551724076271057, "step": 9285 }, { "epoch": 0.009356986667553012, "grad_norm": 37.01640778622918, "learning_rate": 9.356807606309047e-06, "loss": 2.3446, "mean_token_accuracy": 0.4344827592372894, "step": 9290 }, { "epoch": 0.009362022720657184, "grad_norm": 32.90171139625545, "learning_rate": 9.361843563040107e-06, "loss": 2.5519, "mean_token_accuracy": 0.4620689690113068, "step": 9295 }, { "epoch": 0.009367058773761358, "grad_norm": 38.007285404658894, "learning_rate": 9.366879519771168e-06, "loss": 2.439, "mean_token_accuracy": 0.42758620977401735, "step": 9300 }, { "epoch": 0.00937209482686553, "grad_norm": 35.19365639538145, "learning_rate": 9.371915476502225e-06, "loss": 2.3766, "mean_token_accuracy": 0.4398064136505127, "step": 9305 }, { "epoch": 0.009377130879969704, "grad_norm": 41.130173655164306, "learning_rate": 9.376951433233286e-06, "loss": 2.1596, "mean_token_accuracy": 0.5068965554237366, "step": 9310 }, { "epoch": 0.009382166933073876, "grad_norm": 41.138525515556516, "learning_rate": 9.381987389964345e-06, "loss": 2.1327, "mean_token_accuracy": 0.501996374130249, "step": 9315 }, { "epoch": 0.009387202986178048, "grad_norm": 39.71963933737859, "learning_rate": 9.387023346695406e-06, "loss": 2.2676, "mean_token_accuracy": 0.4310344815254211, "step": 9320 }, { "epoch": 0.009392239039282222, "grad_norm": 29.04877378610156, "learning_rate": 9.392059303426466e-06, "loss": 2.3532, "mean_token_accuracy": 0.4551724135875702, "step": 9325 }, { "epoch": 0.009397275092386394, "grad_norm": 47.707668170366624, "learning_rate": 9.397095260157525e-06, "loss": 2.2283, "mean_token_accuracy": 0.46551724076271056, "step": 9330 }, { "epoch": 0.009402311145490568, "grad_norm": 39.31902167780123, "learning_rate": 9.402131216888586e-06, "loss": 2.3474, "mean_token_accuracy": 0.42068966031074523, "step": 9335 }, { "epoch": 0.00940734719859474, "grad_norm": 35.20443660332652, "learning_rate": 9.407167173619645e-06, "loss": 2.3389, "mean_token_accuracy": 0.36206896007061007, "step": 9340 }, { "epoch": 0.009412383251698913, "grad_norm": 28.877074126269388, "learning_rate": 9.412203130350704e-06, "loss": 2.0916, "mean_token_accuracy": 0.4724137902259827, "step": 9345 }, { "epoch": 0.009417419304803085, "grad_norm": 62.02005661605764, "learning_rate": 9.417239087081763e-06, "loss": 2.4439, "mean_token_accuracy": 0.4034482777118683, "step": 9350 }, { "epoch": 0.009422455357907257, "grad_norm": 54.1560187250223, "learning_rate": 9.422275043812824e-06, "loss": 2.24, "mean_token_accuracy": 0.4448275864124298, "step": 9355 }, { "epoch": 0.009427491411011431, "grad_norm": 37.19352151029625, "learning_rate": 9.427311000543884e-06, "loss": 2.2809, "mean_token_accuracy": 0.4448275864124298, "step": 9360 }, { "epoch": 0.009432527464115603, "grad_norm": 55.89592120683475, "learning_rate": 9.432346957274943e-06, "loss": 2.4476, "mean_token_accuracy": 0.41724138855934145, "step": 9365 }, { "epoch": 0.009437563517219777, "grad_norm": 38.05658085036962, "learning_rate": 9.437382914006002e-06, "loss": 2.197, "mean_token_accuracy": 0.4586206912994385, "step": 9370 }, { "epoch": 0.009442599570323949, "grad_norm": 48.367057019437205, "learning_rate": 9.442418870737063e-06, "loss": 2.3918, "mean_token_accuracy": 0.42758620381355283, "step": 9375 }, { "epoch": 0.009447635623428123, "grad_norm": 35.34738493121819, "learning_rate": 9.447454827468122e-06, "loss": 2.3408, "mean_token_accuracy": 0.4294615864753723, "step": 9380 }, { "epoch": 0.009452671676532295, "grad_norm": 35.47678578946282, "learning_rate": 9.452490784199182e-06, "loss": 2.3693, "mean_token_accuracy": 0.44827585816383364, "step": 9385 }, { "epoch": 0.009457707729636467, "grad_norm": 45.824387931172495, "learning_rate": 9.457526740930243e-06, "loss": 2.5497, "mean_token_accuracy": 0.3931034505367279, "step": 9390 }, { "epoch": 0.00946274378274064, "grad_norm": 40.298470472182366, "learning_rate": 9.462562697661302e-06, "loss": 2.3054, "mean_token_accuracy": 0.46896551847457885, "step": 9395 }, { "epoch": 0.009467779835844813, "grad_norm": 36.99580884410551, "learning_rate": 9.467598654392363e-06, "loss": 2.3268, "mean_token_accuracy": 0.42413792610168455, "step": 9400 }, { "epoch": 0.009472815888948986, "grad_norm": 36.53748203376824, "learning_rate": 9.47263461112342e-06, "loss": 2.2065, "mean_token_accuracy": 0.4551724076271057, "step": 9405 }, { "epoch": 0.009477851942053158, "grad_norm": 30.8940653222547, "learning_rate": 9.477670567854481e-06, "loss": 2.2611, "mean_token_accuracy": 0.42758620977401735, "step": 9410 }, { "epoch": 0.009482887995157332, "grad_norm": 28.734091237414834, "learning_rate": 9.48270652458554e-06, "loss": 2.1265, "mean_token_accuracy": 0.5034482777118683, "step": 9415 }, { "epoch": 0.009487924048261504, "grad_norm": 31.1004818606481, "learning_rate": 9.487742481316602e-06, "loss": 2.1517, "mean_token_accuracy": 0.4551724076271057, "step": 9420 }, { "epoch": 0.009492960101365676, "grad_norm": 34.473597381292784, "learning_rate": 9.49277843804766e-06, "loss": 2.6196, "mean_token_accuracy": 0.45862067937850953, "step": 9425 }, { "epoch": 0.00949799615446985, "grad_norm": 53.67873321079563, "learning_rate": 9.49781439477872e-06, "loss": 2.4002, "mean_token_accuracy": 0.41724138259887694, "step": 9430 }, { "epoch": 0.009503032207574022, "grad_norm": 47.26783759849226, "learning_rate": 9.502850351509781e-06, "loss": 2.4613, "mean_token_accuracy": 0.4034482777118683, "step": 9435 }, { "epoch": 0.009508068260678196, "grad_norm": 31.863222782905083, "learning_rate": 9.50788630824084e-06, "loss": 1.9586, "mean_token_accuracy": 0.4551724135875702, "step": 9440 }, { "epoch": 0.009513104313782368, "grad_norm": 39.919495468484904, "learning_rate": 9.512922264971901e-06, "loss": 2.2941, "mean_token_accuracy": 0.47241380214691164, "step": 9445 }, { "epoch": 0.009518140366886542, "grad_norm": 48.282929799846265, "learning_rate": 9.517958221702959e-06, "loss": 2.6901, "mean_token_accuracy": 0.4137930989265442, "step": 9450 }, { "epoch": 0.009523176419990714, "grad_norm": 37.14896936637999, "learning_rate": 9.52299417843402e-06, "loss": 2.3656, "mean_token_accuracy": 0.4, "step": 9455 }, { "epoch": 0.009528212473094886, "grad_norm": 47.67910216055814, "learning_rate": 9.528030135165079e-06, "loss": 2.0631, "mean_token_accuracy": 0.4689655303955078, "step": 9460 }, { "epoch": 0.00953324852619906, "grad_norm": 26.999679700384807, "learning_rate": 9.53306609189614e-06, "loss": 2.2564, "mean_token_accuracy": 0.4517241418361664, "step": 9465 }, { "epoch": 0.009538284579303231, "grad_norm": 36.86952532916416, "learning_rate": 9.538102048627197e-06, "loss": 2.1999, "mean_token_accuracy": 0.38620689511299133, "step": 9470 }, { "epoch": 0.009543320632407405, "grad_norm": 26.76187454186431, "learning_rate": 9.543138005358258e-06, "loss": 2.4029, "mean_token_accuracy": 0.4517241418361664, "step": 9475 }, { "epoch": 0.009548356685511577, "grad_norm": 37.90728106773331, "learning_rate": 9.54817396208932e-06, "loss": 2.2918, "mean_token_accuracy": 0.4517241358757019, "step": 9480 }, { "epoch": 0.009553392738615751, "grad_norm": 41.43769948358146, "learning_rate": 9.553209918820379e-06, "loss": 2.3432, "mean_token_accuracy": 0.4172413766384125, "step": 9485 }, { "epoch": 0.009558428791719923, "grad_norm": 41.565916707039364, "learning_rate": 9.558245875551438e-06, "loss": 2.117, "mean_token_accuracy": 0.49704434871673586, "step": 9490 }, { "epoch": 0.009563464844824095, "grad_norm": 53.22743699044022, "learning_rate": 9.563281832282497e-06, "loss": 2.3736, "mean_token_accuracy": 0.43103448748588563, "step": 9495 }, { "epoch": 0.009568500897928269, "grad_norm": 38.46665459041057, "learning_rate": 9.568317789013558e-06, "loss": 2.1753, "mean_token_accuracy": 0.4724137902259827, "step": 9500 }, { "epoch": 0.00957353695103244, "grad_norm": 32.935608646062974, "learning_rate": 9.573353745744617e-06, "loss": 2.5808, "mean_token_accuracy": 0.4261947989463806, "step": 9505 }, { "epoch": 0.009578573004136615, "grad_norm": 40.2120567077209, "learning_rate": 9.578389702475677e-06, "loss": 2.3295, "mean_token_accuracy": 0.4379310369491577, "step": 9510 }, { "epoch": 0.009583609057240787, "grad_norm": 34.18815683141399, "learning_rate": 9.583425659206736e-06, "loss": 2.351, "mean_token_accuracy": 0.46037508845329284, "step": 9515 }, { "epoch": 0.00958864511034496, "grad_norm": 54.585054653436785, "learning_rate": 9.588461615937797e-06, "loss": 2.3846, "mean_token_accuracy": 0.4620689690113068, "step": 9520 }, { "epoch": 0.009593681163449132, "grad_norm": 43.93374931186114, "learning_rate": 9.593497572668856e-06, "loss": 2.5726, "mean_token_accuracy": 0.4482758641242981, "step": 9525 }, { "epoch": 0.009598717216553304, "grad_norm": 44.360102782051726, "learning_rate": 9.598533529399915e-06, "loss": 2.4388, "mean_token_accuracy": 0.4758620738983154, "step": 9530 }, { "epoch": 0.009603753269657478, "grad_norm": 46.75849766622284, "learning_rate": 9.603569486130976e-06, "loss": 2.3864, "mean_token_accuracy": 0.42758620381355283, "step": 9535 }, { "epoch": 0.00960878932276165, "grad_norm": 53.288768433113255, "learning_rate": 9.608605442862035e-06, "loss": 2.4145, "mean_token_accuracy": 0.45390198826789857, "step": 9540 }, { "epoch": 0.009613825375865824, "grad_norm": 41.53323184911322, "learning_rate": 9.613641399593096e-06, "loss": 2.8338, "mean_token_accuracy": 0.3379310369491577, "step": 9545 }, { "epoch": 0.009618861428969996, "grad_norm": 57.717260151059946, "learning_rate": 9.618677356324154e-06, "loss": 2.2696, "mean_token_accuracy": 0.5172413945198059, "step": 9550 }, { "epoch": 0.00962389748207417, "grad_norm": 50.134721596711685, "learning_rate": 9.623713313055215e-06, "loss": 2.3364, "mean_token_accuracy": 0.3999999940395355, "step": 9555 }, { "epoch": 0.009628933535178342, "grad_norm": 40.60547468196455, "learning_rate": 9.628749269786274e-06, "loss": 2.3199, "mean_token_accuracy": 0.4517241299152374, "step": 9560 }, { "epoch": 0.009633969588282514, "grad_norm": 39.38986228375763, "learning_rate": 9.633785226517335e-06, "loss": 1.9784, "mean_token_accuracy": 0.4564428269863129, "step": 9565 }, { "epoch": 0.009639005641386688, "grad_norm": 35.58370937791689, "learning_rate": 9.638821183248394e-06, "loss": 2.2372, "mean_token_accuracy": 0.4413793087005615, "step": 9570 }, { "epoch": 0.00964404169449086, "grad_norm": 41.49612424376096, "learning_rate": 9.643857139979454e-06, "loss": 2.4391, "mean_token_accuracy": 0.4172413766384125, "step": 9575 }, { "epoch": 0.009649077747595033, "grad_norm": 29.33403181280721, "learning_rate": 9.648893096710515e-06, "loss": 2.1304, "mean_token_accuracy": 0.44827585816383364, "step": 9580 }, { "epoch": 0.009654113800699205, "grad_norm": 38.16799360078175, "learning_rate": 9.653929053441574e-06, "loss": 2.4373, "mean_token_accuracy": 0.4379310369491577, "step": 9585 }, { "epoch": 0.00965914985380338, "grad_norm": 37.07512875659534, "learning_rate": 9.658965010172633e-06, "loss": 2.3941, "mean_token_accuracy": 0.4206896543502808, "step": 9590 }, { "epoch": 0.009664185906907551, "grad_norm": 44.19484681262662, "learning_rate": 9.664000966903692e-06, "loss": 2.506, "mean_token_accuracy": 0.458620685338974, "step": 9595 }, { "epoch": 0.009669221960011723, "grad_norm": 32.4605651046261, "learning_rate": 9.669036923634753e-06, "loss": 2.486, "mean_token_accuracy": 0.42758620977401735, "step": 9600 }, { "epoch": 0.009674258013115897, "grad_norm": 40.13781900070692, "learning_rate": 9.674072880365812e-06, "loss": 2.4369, "mean_token_accuracy": 0.4310344815254211, "step": 9605 }, { "epoch": 0.009679294066220069, "grad_norm": 40.321007584891326, "learning_rate": 9.679108837096872e-06, "loss": 2.2452, "mean_token_accuracy": 0.4534180223941803, "step": 9610 }, { "epoch": 0.009684330119324243, "grad_norm": 46.95991123018105, "learning_rate": 9.684144793827931e-06, "loss": 2.23, "mean_token_accuracy": 0.4482758641242981, "step": 9615 }, { "epoch": 0.009689366172428415, "grad_norm": 38.62739915985313, "learning_rate": 9.689180750558992e-06, "loss": 2.489, "mean_token_accuracy": 0.39310344457626345, "step": 9620 }, { "epoch": 0.009694402225532587, "grad_norm": 62.271892841552415, "learning_rate": 9.694216707290051e-06, "loss": 2.6741, "mean_token_accuracy": 0.3999999940395355, "step": 9625 }, { "epoch": 0.00969943827863676, "grad_norm": 27.64831639695065, "learning_rate": 9.69925266402111e-06, "loss": 2.0584, "mean_token_accuracy": 0.49999999403953554, "step": 9630 }, { "epoch": 0.009704474331740933, "grad_norm": 49.90490509813038, "learning_rate": 9.704288620752171e-06, "loss": 2.5651, "mean_token_accuracy": 0.40344828367233276, "step": 9635 }, { "epoch": 0.009709510384845106, "grad_norm": 54.015041389139334, "learning_rate": 9.70932457748323e-06, "loss": 2.8962, "mean_token_accuracy": 0.3896551728248596, "step": 9640 }, { "epoch": 0.009714546437949279, "grad_norm": 44.89892736778689, "learning_rate": 9.714360534214292e-06, "loss": 2.1613, "mean_token_accuracy": 0.42758620977401735, "step": 9645 }, { "epoch": 0.009719582491053452, "grad_norm": 38.46119154950889, "learning_rate": 9.71939649094535e-06, "loss": 2.0622, "mean_token_accuracy": 0.4965517222881317, "step": 9650 }, { "epoch": 0.009724618544157624, "grad_norm": 50.21323139349961, "learning_rate": 9.72443244767641e-06, "loss": 2.2516, "mean_token_accuracy": 0.44827585816383364, "step": 9655 }, { "epoch": 0.009729654597261796, "grad_norm": 37.73573326108465, "learning_rate": 9.72946840440747e-06, "loss": 2.2475, "mean_token_accuracy": 0.4482758641242981, "step": 9660 }, { "epoch": 0.00973469065036597, "grad_norm": 40.6102850334886, "learning_rate": 9.73450436113853e-06, "loss": 2.2481, "mean_token_accuracy": 0.4448275983333588, "step": 9665 }, { "epoch": 0.009739726703470142, "grad_norm": 29.76052277546363, "learning_rate": 9.73954031786959e-06, "loss": 2.5081, "mean_token_accuracy": 0.43103447556495667, "step": 9670 }, { "epoch": 0.009744762756574316, "grad_norm": 32.381522891329375, "learning_rate": 9.744576274600649e-06, "loss": 2.2839, "mean_token_accuracy": 0.44137930274009707, "step": 9675 }, { "epoch": 0.009749798809678488, "grad_norm": 46.2840128628751, "learning_rate": 9.74961223133171e-06, "loss": 2.6206, "mean_token_accuracy": 0.42758620977401735, "step": 9680 }, { "epoch": 0.009754834862782662, "grad_norm": 43.417552676867075, "learning_rate": 9.754648188062769e-06, "loss": 2.2318, "mean_token_accuracy": 0.45862067937850953, "step": 9685 }, { "epoch": 0.009759870915886834, "grad_norm": 31.91717182901664, "learning_rate": 9.759684144793828e-06, "loss": 2.0837, "mean_token_accuracy": 0.49310343265533446, "step": 9690 }, { "epoch": 0.009764906968991006, "grad_norm": 36.06732688179533, "learning_rate": 9.764720101524888e-06, "loss": 2.4187, "mean_token_accuracy": 0.4379310369491577, "step": 9695 }, { "epoch": 0.00976994302209518, "grad_norm": 26.48922905878847, "learning_rate": 9.769756058255948e-06, "loss": 2.0905, "mean_token_accuracy": 0.4744101643562317, "step": 9700 }, { "epoch": 0.009774979075199352, "grad_norm": 28.02741967196921, "learning_rate": 9.774792014987008e-06, "loss": 2.2322, "mean_token_accuracy": 0.4862068951129913, "step": 9705 }, { "epoch": 0.009780015128303525, "grad_norm": 41.51969097502853, "learning_rate": 9.779827971718067e-06, "loss": 2.4651, "mean_token_accuracy": 0.44482758045196535, "step": 9710 }, { "epoch": 0.009785051181407697, "grad_norm": 35.66419905736955, "learning_rate": 9.784863928449126e-06, "loss": 2.2878, "mean_token_accuracy": 0.44827585816383364, "step": 9715 }, { "epoch": 0.009790087234511871, "grad_norm": 31.90114095618044, "learning_rate": 9.789899885180187e-06, "loss": 2.1398, "mean_token_accuracy": 0.4689655125141144, "step": 9720 }, { "epoch": 0.009795123287616043, "grad_norm": 39.50798321565754, "learning_rate": 9.794935841911246e-06, "loss": 2.1036, "mean_token_accuracy": 0.5, "step": 9725 }, { "epoch": 0.009800159340720215, "grad_norm": 40.770153479433105, "learning_rate": 9.799971798642306e-06, "loss": 2.2716, "mean_token_accuracy": 0.43448275327682495, "step": 9730 }, { "epoch": 0.009805195393824389, "grad_norm": 36.35831334602836, "learning_rate": 9.805007755373367e-06, "loss": 2.4703, "mean_token_accuracy": 0.4172413766384125, "step": 9735 }, { "epoch": 0.009810231446928561, "grad_norm": 47.07511466216155, "learning_rate": 9.810043712104426e-06, "loss": 2.48, "mean_token_accuracy": 0.4, "step": 9740 }, { "epoch": 0.009815267500032735, "grad_norm": 39.24649678591775, "learning_rate": 9.815079668835487e-06, "loss": 2.1266, "mean_token_accuracy": 0.4482758641242981, "step": 9745 }, { "epoch": 0.009820303553136907, "grad_norm": 50.72562269044652, "learning_rate": 9.820115625566544e-06, "loss": 2.4771, "mean_token_accuracy": 0.441379314661026, "step": 9750 }, { "epoch": 0.00982533960624108, "grad_norm": 29.216256530995295, "learning_rate": 9.825151582297605e-06, "loss": 2.2538, "mean_token_accuracy": 0.4672111332416534, "step": 9755 }, { "epoch": 0.009830375659345253, "grad_norm": 42.76259851964069, "learning_rate": 9.830187539028665e-06, "loss": 2.2418, "mean_token_accuracy": 0.458620685338974, "step": 9760 }, { "epoch": 0.009835411712449425, "grad_norm": 39.224557538858306, "learning_rate": 9.835223495759726e-06, "loss": 2.3883, "mean_token_accuracy": 0.4137930989265442, "step": 9765 }, { "epoch": 0.009840447765553598, "grad_norm": 48.384732272443365, "learning_rate": 9.840259452490785e-06, "loss": 2.2704, "mean_token_accuracy": 0.4379310369491577, "step": 9770 }, { "epoch": 0.00984548381865777, "grad_norm": 49.31576871313363, "learning_rate": 9.845295409221844e-06, "loss": 2.1596, "mean_token_accuracy": 0.4896551787853241, "step": 9775 }, { "epoch": 0.009850519871761944, "grad_norm": 51.34938786740068, "learning_rate": 9.850331365952905e-06, "loss": 2.389, "mean_token_accuracy": 0.4517241358757019, "step": 9780 }, { "epoch": 0.009855555924866116, "grad_norm": 35.20188050811193, "learning_rate": 9.855367322683964e-06, "loss": 2.3697, "mean_token_accuracy": 0.46551724672317507, "step": 9785 }, { "epoch": 0.00986059197797029, "grad_norm": 26.90127488898475, "learning_rate": 9.860403279415023e-06, "loss": 2.3977, "mean_token_accuracy": 0.41863279342651366, "step": 9790 }, { "epoch": 0.009865628031074462, "grad_norm": 48.6790640080729, "learning_rate": 9.865439236146083e-06, "loss": 2.6652, "mean_token_accuracy": 0.3965517282485962, "step": 9795 }, { "epoch": 0.009870664084178634, "grad_norm": 37.18319928261827, "learning_rate": 9.870475192877144e-06, "loss": 2.0745, "mean_token_accuracy": 0.5068965554237366, "step": 9800 }, { "epoch": 0.009875700137282808, "grad_norm": 34.914326881492144, "learning_rate": 9.875511149608203e-06, "loss": 2.2877, "mean_token_accuracy": 0.4551724076271057, "step": 9805 }, { "epoch": 0.00988073619038698, "grad_norm": 28.541790634884798, "learning_rate": 9.880547106339262e-06, "loss": 2.2875, "mean_token_accuracy": 0.4586206912994385, "step": 9810 }, { "epoch": 0.009885772243491154, "grad_norm": 32.10128822311137, "learning_rate": 9.885583063070321e-06, "loss": 2.3616, "mean_token_accuracy": 0.4379310429096222, "step": 9815 }, { "epoch": 0.009890808296595326, "grad_norm": 34.681438426135955, "learning_rate": 9.890619019801382e-06, "loss": 2.5218, "mean_token_accuracy": 0.42758620977401735, "step": 9820 }, { "epoch": 0.0098958443496995, "grad_norm": 36.78314398172527, "learning_rate": 9.895654976532442e-06, "loss": 2.4562, "mean_token_accuracy": 0.4068965494632721, "step": 9825 }, { "epoch": 0.009900880402803671, "grad_norm": 36.400868026310306, "learning_rate": 9.900690933263503e-06, "loss": 2.4623, "mean_token_accuracy": 0.4551724135875702, "step": 9830 }, { "epoch": 0.009905916455907843, "grad_norm": 63.09611463095056, "learning_rate": 9.905726889994562e-06, "loss": 2.8517, "mean_token_accuracy": 0.3862069070339203, "step": 9835 }, { "epoch": 0.009910952509012017, "grad_norm": 37.59352311555945, "learning_rate": 9.910762846725621e-06, "loss": 2.4938, "mean_token_accuracy": 0.4344827592372894, "step": 9840 }, { "epoch": 0.00991598856211619, "grad_norm": 33.1384832019594, "learning_rate": 9.915798803456682e-06, "loss": 2.4016, "mean_token_accuracy": 0.4068965554237366, "step": 9845 }, { "epoch": 0.009921024615220363, "grad_norm": 42.68695036645854, "learning_rate": 9.920834760187741e-06, "loss": 2.4635, "mean_token_accuracy": 0.42758620977401735, "step": 9850 }, { "epoch": 0.009926060668324535, "grad_norm": 43.95232104622241, "learning_rate": 9.9258707169188e-06, "loss": 2.4681, "mean_token_accuracy": 0.42413792610168455, "step": 9855 }, { "epoch": 0.009931096721428709, "grad_norm": 30.44499917573671, "learning_rate": 9.93090667364986e-06, "loss": 2.2056, "mean_token_accuracy": 0.4103448212146759, "step": 9860 }, { "epoch": 0.00993613277453288, "grad_norm": 38.329240619404715, "learning_rate": 9.93594263038092e-06, "loss": 2.294, "mean_token_accuracy": 0.44827585816383364, "step": 9865 }, { "epoch": 0.009941168827637053, "grad_norm": 34.0841614870972, "learning_rate": 9.94097858711198e-06, "loss": 1.9979, "mean_token_accuracy": 0.47241378426551817, "step": 9870 }, { "epoch": 0.009946204880741227, "grad_norm": 44.164764186961506, "learning_rate": 9.94601454384304e-06, "loss": 2.4154, "mean_token_accuracy": 0.42758620977401735, "step": 9875 }, { "epoch": 0.009951240933845399, "grad_norm": 44.39462956801971, "learning_rate": 9.9510505005741e-06, "loss": 2.474, "mean_token_accuracy": 0.3999999940395355, "step": 9880 }, { "epoch": 0.009956276986949572, "grad_norm": 55.0620247814656, "learning_rate": 9.95608645730516e-06, "loss": 2.2507, "mean_token_accuracy": 0.4551724076271057, "step": 9885 }, { "epoch": 0.009961313040053744, "grad_norm": 36.68055150617545, "learning_rate": 9.96112241403622e-06, "loss": 2.3992, "mean_token_accuracy": 0.4379310250282288, "step": 9890 }, { "epoch": 0.009966349093157918, "grad_norm": 39.585869992984804, "learning_rate": 9.966158370767278e-06, "loss": 2.7119, "mean_token_accuracy": 0.37241379022598264, "step": 9895 }, { "epoch": 0.00997138514626209, "grad_norm": 40.458863814081724, "learning_rate": 9.971194327498339e-06, "loss": 2.4842, "mean_token_accuracy": 0.41724138259887694, "step": 9900 }, { "epoch": 0.009976421199366262, "grad_norm": 53.56858318648054, "learning_rate": 9.976230284229398e-06, "loss": 2.416, "mean_token_accuracy": 0.48620688915252686, "step": 9905 }, { "epoch": 0.009981457252470436, "grad_norm": 44.82050561599625, "learning_rate": 9.981266240960459e-06, "loss": 2.071, "mean_token_accuracy": 0.5091954052448273, "step": 9910 }, { "epoch": 0.009986493305574608, "grad_norm": 38.30779622627783, "learning_rate": 9.986302197691517e-06, "loss": 2.174, "mean_token_accuracy": 0.4379310369491577, "step": 9915 }, { "epoch": 0.009991529358678782, "grad_norm": 67.06462653549127, "learning_rate": 9.991338154422578e-06, "loss": 2.2627, "mean_token_accuracy": 0.4379310369491577, "step": 9920 }, { "epoch": 0.009996565411782954, "grad_norm": 28.459262555152275, "learning_rate": 9.996374111153637e-06, "loss": 2.3344, "mean_token_accuracy": 0.44482758045196535, "step": 9925 }, { "epoch": 0.010001601464887128, "grad_norm": 46.06415459922314, "learning_rate": 1.0001410067884698e-05, "loss": 2.3853, "mean_token_accuracy": 0.47035692930221557, "step": 9930 }, { "epoch": 0.0100066375179913, "grad_norm": 51.086771355234134, "learning_rate": 1.0006446024615757e-05, "loss": 2.5218, "mean_token_accuracy": 0.36896551251411436, "step": 9935 }, { "epoch": 0.010011673571095472, "grad_norm": 32.83820277340714, "learning_rate": 1.0011481981346816e-05, "loss": 2.3408, "mean_token_accuracy": 0.4689655125141144, "step": 9940 }, { "epoch": 0.010016709624199645, "grad_norm": 48.87697795659866, "learning_rate": 1.0016517938077877e-05, "loss": 2.6272, "mean_token_accuracy": 0.39655172228813174, "step": 9945 }, { "epoch": 0.010021745677303818, "grad_norm": 34.953158531330146, "learning_rate": 1.0021553894808937e-05, "loss": 2.5131, "mean_token_accuracy": 0.42758620977401735, "step": 9950 }, { "epoch": 0.010026781730407991, "grad_norm": 28.441336427561588, "learning_rate": 1.0026589851539996e-05, "loss": 2.4865, "mean_token_accuracy": 0.3793103456497192, "step": 9955 }, { "epoch": 0.010031817783512163, "grad_norm": 42.87131918602361, "learning_rate": 1.0031625808271055e-05, "loss": 2.422, "mean_token_accuracy": 0.4344827473163605, "step": 9960 }, { "epoch": 0.010036853836616337, "grad_norm": 43.3397507835919, "learning_rate": 1.0036661765002116e-05, "loss": 2.3817, "mean_token_accuracy": 0.4551724135875702, "step": 9965 }, { "epoch": 0.010041889889720509, "grad_norm": 38.42467000016695, "learning_rate": 1.0041697721733175e-05, "loss": 2.2616, "mean_token_accuracy": 0.4413793087005615, "step": 9970 }, { "epoch": 0.010046925942824681, "grad_norm": 31.831649271520824, "learning_rate": 1.0046733678464234e-05, "loss": 2.1811, "mean_token_accuracy": 0.47931034564971925, "step": 9975 }, { "epoch": 0.010051961995928855, "grad_norm": 43.05992344527729, "learning_rate": 1.0051769635195295e-05, "loss": 2.4242, "mean_token_accuracy": 0.41379310488700866, "step": 9980 }, { "epoch": 0.010056998049033027, "grad_norm": 45.65732736647551, "learning_rate": 1.0056805591926355e-05, "loss": 2.319, "mean_token_accuracy": 0.47931033968925474, "step": 9985 }, { "epoch": 0.0100620341021372, "grad_norm": 30.152063473395394, "learning_rate": 1.0061841548657416e-05, "loss": 2.157, "mean_token_accuracy": 0.4708409011363983, "step": 9990 }, { "epoch": 0.010067070155241373, "grad_norm": 42.11063830354294, "learning_rate": 1.0066877505388473e-05, "loss": 2.4317, "mean_token_accuracy": 0.43629764914512636, "step": 9995 }, { "epoch": 0.010072106208345546, "grad_norm": 38.66812569506359, "learning_rate": 1.0071913462119534e-05, "loss": 2.1862, "mean_token_accuracy": 0.47586207985877993, "step": 10000 }, { "epoch": 0.010077142261449719, "grad_norm": 46.125095337828306, "learning_rate": 1.0076949418850593e-05, "loss": 2.0364, "mean_token_accuracy": 0.45892317295074464, "step": 10005 }, { "epoch": 0.01008217831455389, "grad_norm": 41.57642310009351, "learning_rate": 1.0081985375581654e-05, "loss": 2.4041, "mean_token_accuracy": 0.417241370677948, "step": 10010 }, { "epoch": 0.010087214367658064, "grad_norm": 38.99155556204114, "learning_rate": 1.0087021332312712e-05, "loss": 2.3741, "mean_token_accuracy": 0.4482758641242981, "step": 10015 }, { "epoch": 0.010092250420762236, "grad_norm": 33.56325731922833, "learning_rate": 1.0092057289043773e-05, "loss": 2.4864, "mean_token_accuracy": 0.42413792908191683, "step": 10020 }, { "epoch": 0.01009728647386641, "grad_norm": 33.18879509078343, "learning_rate": 1.0097093245774834e-05, "loss": 2.3649, "mean_token_accuracy": 0.4015124022960663, "step": 10025 }, { "epoch": 0.010102322526970582, "grad_norm": 34.39943228589022, "learning_rate": 1.0102129202505893e-05, "loss": 2.6679, "mean_token_accuracy": 0.40852994918823243, "step": 10030 }, { "epoch": 0.010107358580074756, "grad_norm": 37.828265395747664, "learning_rate": 1.0107165159236952e-05, "loss": 2.2851, "mean_token_accuracy": 0.41379310488700866, "step": 10035 }, { "epoch": 0.010112394633178928, "grad_norm": 36.38859441554674, "learning_rate": 1.0112201115968012e-05, "loss": 2.5865, "mean_token_accuracy": 0.4329098641872406, "step": 10040 }, { "epoch": 0.0101174306862831, "grad_norm": 31.873835647500325, "learning_rate": 1.0117237072699072e-05, "loss": 2.8942, "mean_token_accuracy": 0.3620689630508423, "step": 10045 }, { "epoch": 0.010122466739387274, "grad_norm": 40.698633581325126, "learning_rate": 1.0122273029430132e-05, "loss": 2.2452, "mean_token_accuracy": 0.417241370677948, "step": 10050 }, { "epoch": 0.010127502792491446, "grad_norm": 42.06101459085175, "learning_rate": 1.0127308986161191e-05, "loss": 2.6023, "mean_token_accuracy": 0.4137930989265442, "step": 10055 }, { "epoch": 0.01013253884559562, "grad_norm": 39.467673678439034, "learning_rate": 1.013234494289225e-05, "loss": 2.6424, "mean_token_accuracy": 0.3965517282485962, "step": 10060 }, { "epoch": 0.010137574898699792, "grad_norm": 42.071961100338484, "learning_rate": 1.0137380899623311e-05, "loss": 1.959, "mean_token_accuracy": 0.49458128213882446, "step": 10065 }, { "epoch": 0.010142610951803965, "grad_norm": 39.88961383158405, "learning_rate": 1.014241685635437e-05, "loss": 2.2529, "mean_token_accuracy": 0.49655172824859617, "step": 10070 }, { "epoch": 0.010147647004908137, "grad_norm": 34.121754964629766, "learning_rate": 1.014745281308543e-05, "loss": 2.0943, "mean_token_accuracy": 0.4620689570903778, "step": 10075 }, { "epoch": 0.01015268305801231, "grad_norm": 29.790457740494546, "learning_rate": 1.015248876981649e-05, "loss": 2.0784, "mean_token_accuracy": 0.5000000059604645, "step": 10080 }, { "epoch": 0.010157719111116483, "grad_norm": 31.451171586131146, "learning_rate": 1.015752472654755e-05, "loss": 2.2829, "mean_token_accuracy": 0.44137930274009707, "step": 10085 }, { "epoch": 0.010162755164220655, "grad_norm": 36.046115622484, "learning_rate": 1.0162560683278611e-05, "loss": 2.3188, "mean_token_accuracy": 0.4310344815254211, "step": 10090 }, { "epoch": 0.010167791217324829, "grad_norm": 46.43566816366216, "learning_rate": 1.0167596640009668e-05, "loss": 2.4158, "mean_token_accuracy": 0.39655171930789945, "step": 10095 }, { "epoch": 0.010172827270429001, "grad_norm": 31.508864275643315, "learning_rate": 1.017263259674073e-05, "loss": 2.4279, "mean_token_accuracy": 0.44827585816383364, "step": 10100 }, { "epoch": 0.010177863323533175, "grad_norm": 53.9199628523731, "learning_rate": 1.0177668553471789e-05, "loss": 2.4693, "mean_token_accuracy": 0.4326073706150055, "step": 10105 }, { "epoch": 0.010182899376637347, "grad_norm": 70.96315335275057, "learning_rate": 1.018270451020285e-05, "loss": 2.3394, "mean_token_accuracy": 0.43103448748588563, "step": 10110 }, { "epoch": 0.010187935429741519, "grad_norm": 30.03274480049322, "learning_rate": 1.0187740466933909e-05, "loss": 2.3719, "mean_token_accuracy": 0.45862069725990295, "step": 10115 }, { "epoch": 0.010192971482845693, "grad_norm": 69.3129552628717, "learning_rate": 1.0192776423664968e-05, "loss": 2.3225, "mean_token_accuracy": 0.4551724135875702, "step": 10120 }, { "epoch": 0.010198007535949865, "grad_norm": 41.14229513935154, "learning_rate": 1.0197812380396029e-05, "loss": 2.3886, "mean_token_accuracy": 0.4034482717514038, "step": 10125 }, { "epoch": 0.010203043589054038, "grad_norm": 47.73622815749257, "learning_rate": 1.0202848337127088e-05, "loss": 2.4968, "mean_token_accuracy": 0.3655172407627106, "step": 10130 }, { "epoch": 0.01020807964215821, "grad_norm": 45.56822357280052, "learning_rate": 1.0207884293858148e-05, "loss": 2.5413, "mean_token_accuracy": 0.42758620977401735, "step": 10135 }, { "epoch": 0.010213115695262384, "grad_norm": 37.646686550718584, "learning_rate": 1.0212920250589207e-05, "loss": 2.6364, "mean_token_accuracy": 0.42413793206214906, "step": 10140 }, { "epoch": 0.010218151748366556, "grad_norm": 49.39968114323596, "learning_rate": 1.0217956207320268e-05, "loss": 2.5315, "mean_token_accuracy": 0.43793103098869324, "step": 10145 }, { "epoch": 0.010223187801470728, "grad_norm": 45.94514004688942, "learning_rate": 1.0222992164051327e-05, "loss": 2.2119, "mean_token_accuracy": 0.4551724135875702, "step": 10150 }, { "epoch": 0.010228223854574902, "grad_norm": 37.12571323654598, "learning_rate": 1.0228028120782386e-05, "loss": 2.1912, "mean_token_accuracy": 0.47931033968925474, "step": 10155 }, { "epoch": 0.010233259907679074, "grad_norm": 34.323056762948674, "learning_rate": 1.0233064077513445e-05, "loss": 2.3982, "mean_token_accuracy": 0.4068965494632721, "step": 10160 }, { "epoch": 0.010238295960783248, "grad_norm": 33.61140332733371, "learning_rate": 1.0238100034244506e-05, "loss": 2.4634, "mean_token_accuracy": 0.403448274731636, "step": 10165 }, { "epoch": 0.01024333201388742, "grad_norm": 38.788665882277485, "learning_rate": 1.0243135990975566e-05, "loss": 2.5118, "mean_token_accuracy": 0.38620689511299133, "step": 10170 }, { "epoch": 0.010248368066991594, "grad_norm": 25.526287462456242, "learning_rate": 1.0248171947706625e-05, "loss": 2.1438, "mean_token_accuracy": 0.4931034505367279, "step": 10175 }, { "epoch": 0.010253404120095766, "grad_norm": 30.326332465652825, "learning_rate": 1.0253207904437686e-05, "loss": 2.3796, "mean_token_accuracy": 0.41724138259887694, "step": 10180 }, { "epoch": 0.010258440173199938, "grad_norm": 29.005554764384442, "learning_rate": 1.0258243861168745e-05, "loss": 2.545, "mean_token_accuracy": 0.3724137932062149, "step": 10185 }, { "epoch": 0.010263476226304111, "grad_norm": 31.73406266245715, "learning_rate": 1.0263279817899806e-05, "loss": 2.4644, "mean_token_accuracy": 0.39655172228813174, "step": 10190 }, { "epoch": 0.010268512279408283, "grad_norm": 34.43022747983628, "learning_rate": 1.0268315774630864e-05, "loss": 2.3896, "mean_token_accuracy": 0.40889292359352114, "step": 10195 }, { "epoch": 0.010273548332512457, "grad_norm": 27.99367376866882, "learning_rate": 1.0273351731361925e-05, "loss": 1.9994, "mean_token_accuracy": 0.507758617401123, "step": 10200 }, { "epoch": 0.01027858438561663, "grad_norm": 30.967094601001342, "learning_rate": 1.0278387688092984e-05, "loss": 2.3204, "mean_token_accuracy": 0.41379310190677643, "step": 10205 }, { "epoch": 0.010283620438720803, "grad_norm": 29.206943835308003, "learning_rate": 1.0283423644824045e-05, "loss": 2.462, "mean_token_accuracy": 0.4206896543502808, "step": 10210 }, { "epoch": 0.010288656491824975, "grad_norm": 29.54404023854891, "learning_rate": 1.0288459601555104e-05, "loss": 2.2165, "mean_token_accuracy": 0.458620685338974, "step": 10215 }, { "epoch": 0.010293692544929147, "grad_norm": 33.49345790588835, "learning_rate": 1.0293495558286163e-05, "loss": 2.152, "mean_token_accuracy": 0.4675136089324951, "step": 10220 }, { "epoch": 0.01029872859803332, "grad_norm": 33.7405598595986, "learning_rate": 1.0298531515017224e-05, "loss": 2.4107, "mean_token_accuracy": 0.4052026689052582, "step": 10225 }, { "epoch": 0.010303764651137493, "grad_norm": 36.611624491665, "learning_rate": 1.0303567471748283e-05, "loss": 2.4126, "mean_token_accuracy": 0.41724138259887694, "step": 10230 }, { "epoch": 0.010308800704241667, "grad_norm": 37.98533037557905, "learning_rate": 1.0308603428479344e-05, "loss": 2.3959, "mean_token_accuracy": 0.4344827651977539, "step": 10235 }, { "epoch": 0.010313836757345839, "grad_norm": 35.07177866820885, "learning_rate": 1.0313639385210402e-05, "loss": 2.2599, "mean_token_accuracy": 0.4517241358757019, "step": 10240 }, { "epoch": 0.010318872810450012, "grad_norm": 33.1345513630012, "learning_rate": 1.0318675341941463e-05, "loss": 2.3856, "mean_token_accuracy": 0.45862069725990295, "step": 10245 }, { "epoch": 0.010323908863554184, "grad_norm": 40.76178624731318, "learning_rate": 1.0323711298672522e-05, "loss": 2.4771, "mean_token_accuracy": 0.43793103098869324, "step": 10250 }, { "epoch": 0.010328944916658356, "grad_norm": 33.80518574326572, "learning_rate": 1.0328747255403583e-05, "loss": 2.5765, "mean_token_accuracy": 0.4482758641242981, "step": 10255 }, { "epoch": 0.01033398096976253, "grad_norm": 30.474424754649327, "learning_rate": 1.033378321213464e-05, "loss": 2.4128, "mean_token_accuracy": 0.4172413766384125, "step": 10260 }, { "epoch": 0.010339017022866702, "grad_norm": 40.975024126470394, "learning_rate": 1.0338819168865702e-05, "loss": 2.2503, "mean_token_accuracy": 0.4620689690113068, "step": 10265 }, { "epoch": 0.010344053075970876, "grad_norm": 37.17312442751016, "learning_rate": 1.0343855125596761e-05, "loss": 2.372, "mean_token_accuracy": 0.4448275864124298, "step": 10270 }, { "epoch": 0.010349089129075048, "grad_norm": 32.218541451818474, "learning_rate": 1.0348891082327822e-05, "loss": 1.894, "mean_token_accuracy": 0.510344821214676, "step": 10275 }, { "epoch": 0.010354125182179222, "grad_norm": 44.016033789572056, "learning_rate": 1.0353927039058881e-05, "loss": 2.5603, "mean_token_accuracy": 0.4241379380226135, "step": 10280 }, { "epoch": 0.010359161235283394, "grad_norm": 32.84931297504163, "learning_rate": 1.035896299578994e-05, "loss": 2.7023, "mean_token_accuracy": 0.3896551728248596, "step": 10285 }, { "epoch": 0.010364197288387566, "grad_norm": 60.93161200252047, "learning_rate": 1.0363998952521001e-05, "loss": 2.5679, "mean_token_accuracy": 0.36896551847457887, "step": 10290 }, { "epoch": 0.01036923334149174, "grad_norm": 49.15507426802025, "learning_rate": 1.036903490925206e-05, "loss": 2.0556, "mean_token_accuracy": 0.4917725265026093, "step": 10295 }, { "epoch": 0.010374269394595912, "grad_norm": 38.44915570473643, "learning_rate": 1.037407086598312e-05, "loss": 2.088, "mean_token_accuracy": 0.4862068951129913, "step": 10300 }, { "epoch": 0.010379305447700085, "grad_norm": 44.386929537455174, "learning_rate": 1.0379106822714179e-05, "loss": 2.4016, "mean_token_accuracy": 0.4137930989265442, "step": 10305 }, { "epoch": 0.010384341500804257, "grad_norm": 33.66940250326215, "learning_rate": 1.038414277944524e-05, "loss": 2.2458, "mean_token_accuracy": 0.4257108271121979, "step": 10310 }, { "epoch": 0.010389377553908431, "grad_norm": 32.54140177408694, "learning_rate": 1.03891787361763e-05, "loss": 2.6449, "mean_token_accuracy": 0.4, "step": 10315 }, { "epoch": 0.010394413607012603, "grad_norm": 32.01748553433284, "learning_rate": 1.0394214692907359e-05, "loss": 2.553, "mean_token_accuracy": 0.4000000059604645, "step": 10320 }, { "epoch": 0.010399449660116775, "grad_norm": 31.971247633558377, "learning_rate": 1.039925064963842e-05, "loss": 2.3284, "mean_token_accuracy": 0.42758620381355283, "step": 10325 }, { "epoch": 0.010404485713220949, "grad_norm": 34.620665921994465, "learning_rate": 1.0404286606369479e-05, "loss": 2.159, "mean_token_accuracy": 0.43793103098869324, "step": 10330 }, { "epoch": 0.010409521766325121, "grad_norm": 32.01140298663732, "learning_rate": 1.040932256310054e-05, "loss": 2.3091, "mean_token_accuracy": 0.41379311084747317, "step": 10335 }, { "epoch": 0.010414557819429295, "grad_norm": 30.11274705485961, "learning_rate": 1.0414358519831597e-05, "loss": 2.687, "mean_token_accuracy": 0.379310342669487, "step": 10340 }, { "epoch": 0.010419593872533467, "grad_norm": 30.338783706914825, "learning_rate": 1.0419394476562658e-05, "loss": 2.5365, "mean_token_accuracy": 0.42068966031074523, "step": 10345 }, { "epoch": 0.01042462992563764, "grad_norm": 31.367240557110694, "learning_rate": 1.0424430433293717e-05, "loss": 2.1733, "mean_token_accuracy": 0.46551724076271056, "step": 10350 }, { "epoch": 0.010429665978741813, "grad_norm": 29.963685590914608, "learning_rate": 1.0429466390024778e-05, "loss": 2.3913, "mean_token_accuracy": 0.46551724076271056, "step": 10355 }, { "epoch": 0.010434702031845985, "grad_norm": 36.380367870754824, "learning_rate": 1.0434502346755836e-05, "loss": 2.1971, "mean_token_accuracy": 0.46896551847457885, "step": 10360 }, { "epoch": 0.010439738084950159, "grad_norm": 34.096170672241456, "learning_rate": 1.0439538303486897e-05, "loss": 2.1788, "mean_token_accuracy": 0.4068965494632721, "step": 10365 }, { "epoch": 0.01044477413805433, "grad_norm": 42.67944538689265, "learning_rate": 1.0444574260217956e-05, "loss": 2.5724, "mean_token_accuracy": 0.3896551728248596, "step": 10370 }, { "epoch": 0.010449810191158504, "grad_norm": 27.381774069460064, "learning_rate": 1.0449610216949017e-05, "loss": 2.4781, "mean_token_accuracy": 0.42068964838981626, "step": 10375 }, { "epoch": 0.010454846244262676, "grad_norm": 41.50178999751459, "learning_rate": 1.0454646173680076e-05, "loss": 2.6237, "mean_token_accuracy": 0.4517241418361664, "step": 10380 }, { "epoch": 0.01045988229736685, "grad_norm": 38.11152593697709, "learning_rate": 1.0459682130411136e-05, "loss": 2.4905, "mean_token_accuracy": 0.3999999940395355, "step": 10385 }, { "epoch": 0.010464918350471022, "grad_norm": 34.56273592883333, "learning_rate": 1.0464718087142197e-05, "loss": 2.2841, "mean_token_accuracy": 0.4517241418361664, "step": 10390 }, { "epoch": 0.010469954403575194, "grad_norm": 31.5623634777684, "learning_rate": 1.0469754043873256e-05, "loss": 2.0629, "mean_token_accuracy": 0.4862069010734558, "step": 10395 }, { "epoch": 0.010474990456679368, "grad_norm": 36.89789198023529, "learning_rate": 1.0474790000604315e-05, "loss": 2.0878, "mean_token_accuracy": 0.48620688915252686, "step": 10400 }, { "epoch": 0.01048002650978354, "grad_norm": 29.092667266763794, "learning_rate": 1.0479825957335374e-05, "loss": 2.1674, "mean_token_accuracy": 0.4517241358757019, "step": 10405 }, { "epoch": 0.010485062562887714, "grad_norm": 33.796137606706274, "learning_rate": 1.0484861914066435e-05, "loss": 2.3987, "mean_token_accuracy": 0.4344827651977539, "step": 10410 }, { "epoch": 0.010490098615991886, "grad_norm": 41.349286992092715, "learning_rate": 1.0489897870797494e-05, "loss": 1.9461, "mean_token_accuracy": 0.5379310309886932, "step": 10415 }, { "epoch": 0.01049513466909606, "grad_norm": 43.83738701693651, "learning_rate": 1.0494933827528554e-05, "loss": 2.1715, "mean_token_accuracy": 0.5, "step": 10420 }, { "epoch": 0.010500170722200232, "grad_norm": 48.605883467615634, "learning_rate": 1.0499969784259615e-05, "loss": 1.924, "mean_token_accuracy": 0.4896551609039307, "step": 10425 }, { "epoch": 0.010505206775304404, "grad_norm": 43.23936858690715, "learning_rate": 1.0505005740990674e-05, "loss": 2.4113, "mean_token_accuracy": 0.38620689511299133, "step": 10430 }, { "epoch": 0.010510242828408577, "grad_norm": 36.5430019894329, "learning_rate": 1.0510041697721735e-05, "loss": 2.1961, "mean_token_accuracy": 0.4517241418361664, "step": 10435 }, { "epoch": 0.01051527888151275, "grad_norm": 34.447764293989415, "learning_rate": 1.0515077654452792e-05, "loss": 2.362, "mean_token_accuracy": 0.4448275864124298, "step": 10440 }, { "epoch": 0.010520314934616923, "grad_norm": 26.080254247863245, "learning_rate": 1.0520113611183853e-05, "loss": 1.9788, "mean_token_accuracy": 0.5241379320621491, "step": 10445 }, { "epoch": 0.010525350987721095, "grad_norm": 36.549197409715205, "learning_rate": 1.0525149567914913e-05, "loss": 2.3184, "mean_token_accuracy": 0.44827585816383364, "step": 10450 }, { "epoch": 0.010530387040825269, "grad_norm": 34.923478669428654, "learning_rate": 1.0530185524645974e-05, "loss": 2.5212, "mean_token_accuracy": 0.3896551728248596, "step": 10455 }, { "epoch": 0.010535423093929441, "grad_norm": 37.53359655933454, "learning_rate": 1.0535221481377031e-05, "loss": 2.6596, "mean_token_accuracy": 0.39812461733818055, "step": 10460 }, { "epoch": 0.010540459147033613, "grad_norm": 36.16227551252876, "learning_rate": 1.0540257438108092e-05, "loss": 2.2883, "mean_token_accuracy": 0.4551724076271057, "step": 10465 }, { "epoch": 0.010545495200137787, "grad_norm": 40.87509166642309, "learning_rate": 1.0545293394839151e-05, "loss": 2.456, "mean_token_accuracy": 0.43793103098869324, "step": 10470 }, { "epoch": 0.010550531253241959, "grad_norm": 31.590113096171606, "learning_rate": 1.0550329351570212e-05, "loss": 2.4117, "mean_token_accuracy": 0.4517241418361664, "step": 10475 }, { "epoch": 0.010555567306346133, "grad_norm": 30.305767696671513, "learning_rate": 1.0555365308301272e-05, "loss": 2.5476, "mean_token_accuracy": 0.41724138259887694, "step": 10480 }, { "epoch": 0.010560603359450305, "grad_norm": 32.76867948987744, "learning_rate": 1.056040126503233e-05, "loss": 2.2252, "mean_token_accuracy": 0.42758620977401735, "step": 10485 }, { "epoch": 0.010565639412554478, "grad_norm": 40.53187548573647, "learning_rate": 1.0565437221763392e-05, "loss": 2.4542, "mean_token_accuracy": 0.4241379380226135, "step": 10490 }, { "epoch": 0.01057067546565865, "grad_norm": 46.384836393572755, "learning_rate": 1.0570473178494451e-05, "loss": 2.3364, "mean_token_accuracy": 0.4379310429096222, "step": 10495 }, { "epoch": 0.010575711518762822, "grad_norm": 40.46301443808825, "learning_rate": 1.057550913522551e-05, "loss": 2.0937, "mean_token_accuracy": 0.47241380214691164, "step": 10500 }, { "epoch": 0.010580747571866996, "grad_norm": 33.78554610465026, "learning_rate": 1.058054509195657e-05, "loss": 2.5259, "mean_token_accuracy": 0.41034482717514037, "step": 10505 }, { "epoch": 0.010585783624971168, "grad_norm": 29.225273971815636, "learning_rate": 1.058558104868763e-05, "loss": 2.2701, "mean_token_accuracy": 0.4448275864124298, "step": 10510 }, { "epoch": 0.010590819678075342, "grad_norm": 34.01289709218677, "learning_rate": 1.059061700541869e-05, "loss": 2.3709, "mean_token_accuracy": 0.43103448748588563, "step": 10515 }, { "epoch": 0.010595855731179514, "grad_norm": 30.382913515346488, "learning_rate": 1.0595652962149749e-05, "loss": 2.1501, "mean_token_accuracy": 0.4465819835662842, "step": 10520 }, { "epoch": 0.010600891784283688, "grad_norm": 33.081721420695494, "learning_rate": 1.060068891888081e-05, "loss": 2.2121, "mean_token_accuracy": 0.458620685338974, "step": 10525 }, { "epoch": 0.01060592783738786, "grad_norm": 35.17090716258229, "learning_rate": 1.0605724875611869e-05, "loss": 2.3656, "mean_token_accuracy": 0.3793103456497192, "step": 10530 }, { "epoch": 0.010610963890492032, "grad_norm": 32.192309809976365, "learning_rate": 1.061076083234293e-05, "loss": 2.5107, "mean_token_accuracy": 0.38275861740112305, "step": 10535 }, { "epoch": 0.010615999943596206, "grad_norm": 31.04615069279317, "learning_rate": 1.0615796789073988e-05, "loss": 2.3761, "mean_token_accuracy": 0.44827587008476255, "step": 10540 }, { "epoch": 0.010621035996700378, "grad_norm": 64.55964817708052, "learning_rate": 1.0620832745805049e-05, "loss": 2.1871, "mean_token_accuracy": 0.4689655125141144, "step": 10545 }, { "epoch": 0.010626072049804551, "grad_norm": 38.85016308431354, "learning_rate": 1.0625868702536108e-05, "loss": 2.2563, "mean_token_accuracy": 0.41034482717514037, "step": 10550 }, { "epoch": 0.010631108102908723, "grad_norm": 26.576059714129702, "learning_rate": 1.0630904659267169e-05, "loss": 2.5498, "mean_token_accuracy": 0.4551724076271057, "step": 10555 }, { "epoch": 0.010636144156012895, "grad_norm": 38.01246436180795, "learning_rate": 1.0635940615998228e-05, "loss": 2.3082, "mean_token_accuracy": 0.46061705946922304, "step": 10560 }, { "epoch": 0.01064118020911707, "grad_norm": 37.18754122588303, "learning_rate": 1.0640976572729287e-05, "loss": 2.6082, "mean_token_accuracy": 0.42758620977401735, "step": 10565 }, { "epoch": 0.010646216262221241, "grad_norm": 42.63476028829352, "learning_rate": 1.0646012529460348e-05, "loss": 2.2656, "mean_token_accuracy": 0.4620689630508423, "step": 10570 }, { "epoch": 0.010651252315325415, "grad_norm": 40.975244659439724, "learning_rate": 1.0651048486191408e-05, "loss": 2.553, "mean_token_accuracy": 0.3931034505367279, "step": 10575 }, { "epoch": 0.010656288368429587, "grad_norm": 33.23995476948762, "learning_rate": 1.0656084442922467e-05, "loss": 2.2532, "mean_token_accuracy": 0.4517241418361664, "step": 10580 }, { "epoch": 0.01066132442153376, "grad_norm": 30.75365679975307, "learning_rate": 1.0661120399653526e-05, "loss": 2.2224, "mean_token_accuracy": 0.4413793087005615, "step": 10585 }, { "epoch": 0.010666360474637933, "grad_norm": 27.457811915426404, "learning_rate": 1.0666156356384587e-05, "loss": 2.4204, "mean_token_accuracy": 0.4000000059604645, "step": 10590 }, { "epoch": 0.010671396527742105, "grad_norm": 29.16548481748518, "learning_rate": 1.0671192313115646e-05, "loss": 2.4316, "mean_token_accuracy": 0.41379310488700866, "step": 10595 }, { "epoch": 0.010676432580846279, "grad_norm": 42.18592461218951, "learning_rate": 1.0676228269846705e-05, "loss": 2.5808, "mean_token_accuracy": 0.45347853302955626, "step": 10600 }, { "epoch": 0.01068146863395045, "grad_norm": 27.027727745012783, "learning_rate": 1.0681264226577765e-05, "loss": 2.1103, "mean_token_accuracy": 0.5034482717514038, "step": 10605 }, { "epoch": 0.010686504687054624, "grad_norm": 33.77769669905049, "learning_rate": 1.0686300183308826e-05, "loss": 2.1884, "mean_token_accuracy": 0.4482758641242981, "step": 10610 }, { "epoch": 0.010691540740158796, "grad_norm": 29.04056765645392, "learning_rate": 1.0691336140039885e-05, "loss": 2.5517, "mean_token_accuracy": 0.4103448212146759, "step": 10615 }, { "epoch": 0.01069657679326297, "grad_norm": 31.874408928154775, "learning_rate": 1.0696372096770944e-05, "loss": 2.5515, "mean_token_accuracy": 0.3965517282485962, "step": 10620 }, { "epoch": 0.010701612846367142, "grad_norm": 37.52674642679141, "learning_rate": 1.0701408053502005e-05, "loss": 2.1416, "mean_token_accuracy": 0.4275861978530884, "step": 10625 }, { "epoch": 0.010706648899471314, "grad_norm": 44.616324128214735, "learning_rate": 1.0706444010233064e-05, "loss": 2.4567, "mean_token_accuracy": 0.4508166968822479, "step": 10630 }, { "epoch": 0.010711684952575488, "grad_norm": 30.28047573965209, "learning_rate": 1.0711479966964125e-05, "loss": 2.2173, "mean_token_accuracy": 0.4655172348022461, "step": 10635 }, { "epoch": 0.01071672100567966, "grad_norm": 40.30605274727645, "learning_rate": 1.0716515923695185e-05, "loss": 2.3271, "mean_token_accuracy": 0.4137930989265442, "step": 10640 }, { "epoch": 0.010721757058783834, "grad_norm": 38.06545042897882, "learning_rate": 1.0721551880426244e-05, "loss": 2.5044, "mean_token_accuracy": 0.38620689511299133, "step": 10645 }, { "epoch": 0.010726793111888006, "grad_norm": 33.37743765259418, "learning_rate": 1.0726587837157303e-05, "loss": 2.2496, "mean_token_accuracy": 0.43793103098869324, "step": 10650 }, { "epoch": 0.01073182916499218, "grad_norm": 39.03853637308467, "learning_rate": 1.0731623793888364e-05, "loss": 2.2717, "mean_token_accuracy": 0.4206896543502808, "step": 10655 }, { "epoch": 0.010736865218096352, "grad_norm": 30.311936754021644, "learning_rate": 1.0736659750619423e-05, "loss": 2.2295, "mean_token_accuracy": 0.42758620381355283, "step": 10660 }, { "epoch": 0.010741901271200524, "grad_norm": 30.667808738026263, "learning_rate": 1.0741695707350483e-05, "loss": 2.2949, "mean_token_accuracy": 0.443254691362381, "step": 10665 }, { "epoch": 0.010746937324304697, "grad_norm": 37.447558920945205, "learning_rate": 1.0746731664081543e-05, "loss": 2.1711, "mean_token_accuracy": 0.49999999403953554, "step": 10670 }, { "epoch": 0.01075197337740887, "grad_norm": 26.372475278157463, "learning_rate": 1.0751767620812603e-05, "loss": 2.1953, "mean_token_accuracy": 0.5068965554237366, "step": 10675 }, { "epoch": 0.010757009430513043, "grad_norm": 31.452994819804683, "learning_rate": 1.0756803577543664e-05, "loss": 2.2469, "mean_token_accuracy": 0.4638838529586792, "step": 10680 }, { "epoch": 0.010762045483617215, "grad_norm": 45.01702147043336, "learning_rate": 1.0761839534274721e-05, "loss": 2.4548, "mean_token_accuracy": 0.3896551728248596, "step": 10685 }, { "epoch": 0.010767081536721389, "grad_norm": 30.509440563135705, "learning_rate": 1.0766875491005782e-05, "loss": 2.4362, "mean_token_accuracy": 0.4068965494632721, "step": 10690 }, { "epoch": 0.010772117589825561, "grad_norm": 37.82580881728543, "learning_rate": 1.0771911447736841e-05, "loss": 2.4261, "mean_token_accuracy": 0.4034482717514038, "step": 10695 }, { "epoch": 0.010777153642929733, "grad_norm": 30.617201300860028, "learning_rate": 1.0776947404467902e-05, "loss": 2.2913, "mean_token_accuracy": 0.4413793087005615, "step": 10700 }, { "epoch": 0.010782189696033907, "grad_norm": 31.574496057490556, "learning_rate": 1.078198336119896e-05, "loss": 2.4517, "mean_token_accuracy": 0.42758620381355283, "step": 10705 }, { "epoch": 0.010787225749138079, "grad_norm": 30.274046926342876, "learning_rate": 1.0787019317930021e-05, "loss": 2.0461, "mean_token_accuracy": 0.44827585816383364, "step": 10710 }, { "epoch": 0.010792261802242253, "grad_norm": 42.660090113818136, "learning_rate": 1.079205527466108e-05, "loss": 2.2115, "mean_token_accuracy": 0.4630541861057281, "step": 10715 }, { "epoch": 0.010797297855346425, "grad_norm": 29.498819295321166, "learning_rate": 1.0797091231392141e-05, "loss": 2.2346, "mean_token_accuracy": 0.4344827592372894, "step": 10720 }, { "epoch": 0.010802333908450599, "grad_norm": 33.12173831816271, "learning_rate": 1.08021271881232e-05, "loss": 2.533, "mean_token_accuracy": 0.4172413766384125, "step": 10725 }, { "epoch": 0.01080736996155477, "grad_norm": 30.280856042169987, "learning_rate": 1.080716314485426e-05, "loss": 2.2034, "mean_token_accuracy": 0.48275862336158754, "step": 10730 }, { "epoch": 0.010812406014658943, "grad_norm": 41.147301396772875, "learning_rate": 1.081219910158532e-05, "loss": 2.3396, "mean_token_accuracy": 0.4482758641242981, "step": 10735 }, { "epoch": 0.010817442067763116, "grad_norm": 43.41143222499023, "learning_rate": 1.081723505831638e-05, "loss": 2.3149, "mean_token_accuracy": 0.37586206793785093, "step": 10740 }, { "epoch": 0.010822478120867288, "grad_norm": 33.3717507098183, "learning_rate": 1.0822271015047439e-05, "loss": 2.4314, "mean_token_accuracy": 0.4310344815254211, "step": 10745 }, { "epoch": 0.010827514173971462, "grad_norm": 31.4728786030362, "learning_rate": 1.0827306971778498e-05, "loss": 2.4191, "mean_token_accuracy": 0.4517241358757019, "step": 10750 }, { "epoch": 0.010832550227075634, "grad_norm": 46.142685102008066, "learning_rate": 1.083234292850956e-05, "loss": 2.6398, "mean_token_accuracy": 0.3827586233615875, "step": 10755 }, { "epoch": 0.010837586280179808, "grad_norm": 31.440824186910028, "learning_rate": 1.0837378885240619e-05, "loss": 2.3034, "mean_token_accuracy": 0.46896551847457885, "step": 10760 }, { "epoch": 0.01084262233328398, "grad_norm": 31.191653278009152, "learning_rate": 1.0842414841971678e-05, "loss": 2.2683, "mean_token_accuracy": 0.4482758641242981, "step": 10765 }, { "epoch": 0.010847658386388152, "grad_norm": 34.481908805591246, "learning_rate": 1.0847450798702739e-05, "loss": 2.3137, "mean_token_accuracy": 0.41034482717514037, "step": 10770 }, { "epoch": 0.010852694439492326, "grad_norm": 35.177721248481724, "learning_rate": 1.0852486755433798e-05, "loss": 2.4809, "mean_token_accuracy": 0.42413793206214906, "step": 10775 }, { "epoch": 0.010857730492596498, "grad_norm": 24.668699965673156, "learning_rate": 1.0857522712164859e-05, "loss": 2.2867, "mean_token_accuracy": 0.4843920111656189, "step": 10780 }, { "epoch": 0.010862766545700672, "grad_norm": 38.65989026859781, "learning_rate": 1.0862558668895916e-05, "loss": 2.7365, "mean_token_accuracy": 0.3862068891525269, "step": 10785 }, { "epoch": 0.010867802598804844, "grad_norm": 32.6797676511802, "learning_rate": 1.0867594625626977e-05, "loss": 2.0282, "mean_token_accuracy": 0.493103438615799, "step": 10790 }, { "epoch": 0.010872838651909017, "grad_norm": 38.27756675451432, "learning_rate": 1.0872630582358037e-05, "loss": 2.2889, "mean_token_accuracy": 0.42068966627120974, "step": 10795 }, { "epoch": 0.01087787470501319, "grad_norm": 30.721209632790146, "learning_rate": 1.0877666539089098e-05, "loss": 2.3026, "mean_token_accuracy": 0.4034482777118683, "step": 10800 }, { "epoch": 0.010882910758117361, "grad_norm": 28.100437125715874, "learning_rate": 1.0882702495820155e-05, "loss": 2.1089, "mean_token_accuracy": 0.47931034564971925, "step": 10805 }, { "epoch": 0.010887946811221535, "grad_norm": 40.72504770972406, "learning_rate": 1.0887738452551216e-05, "loss": 2.0119, "mean_token_accuracy": 0.45960590839385984, "step": 10810 }, { "epoch": 0.010892982864325707, "grad_norm": 35.685912974091096, "learning_rate": 1.0892774409282275e-05, "loss": 2.4779, "mean_token_accuracy": 0.4379310369491577, "step": 10815 }, { "epoch": 0.010898018917429881, "grad_norm": 29.391599354248623, "learning_rate": 1.0897810366013336e-05, "loss": 2.1848, "mean_token_accuracy": 0.47586206793785096, "step": 10820 }, { "epoch": 0.010903054970534053, "grad_norm": 34.55382534828304, "learning_rate": 1.0902846322744396e-05, "loss": 2.3804, "mean_token_accuracy": 0.44676345586776733, "step": 10825 }, { "epoch": 0.010908091023638227, "grad_norm": 27.222208406079336, "learning_rate": 1.0907882279475455e-05, "loss": 2.114, "mean_token_accuracy": 0.47931034564971925, "step": 10830 }, { "epoch": 0.010913127076742399, "grad_norm": 38.90441483749146, "learning_rate": 1.0912918236206516e-05, "loss": 2.1331, "mean_token_accuracy": 0.4517241358757019, "step": 10835 }, { "epoch": 0.01091816312984657, "grad_norm": 27.594837667700784, "learning_rate": 1.0917954192937575e-05, "loss": 2.4736, "mean_token_accuracy": 0.40689656138420105, "step": 10840 }, { "epoch": 0.010923199182950745, "grad_norm": 31.031302054703925, "learning_rate": 1.0922990149668634e-05, "loss": 2.3521, "mean_token_accuracy": 0.441379314661026, "step": 10845 }, { "epoch": 0.010928235236054917, "grad_norm": 35.17986897043136, "learning_rate": 1.0928026106399694e-05, "loss": 2.041, "mean_token_accuracy": 0.4815486967563629, "step": 10850 }, { "epoch": 0.01093327128915909, "grad_norm": 31.86622657961703, "learning_rate": 1.0933062063130754e-05, "loss": 1.9663, "mean_token_accuracy": 0.4862068951129913, "step": 10855 }, { "epoch": 0.010938307342263262, "grad_norm": 30.268773930429212, "learning_rate": 1.0938098019861814e-05, "loss": 2.5621, "mean_token_accuracy": 0.4586206912994385, "step": 10860 }, { "epoch": 0.010943343395367436, "grad_norm": 45.43403418647319, "learning_rate": 1.0943133976592873e-05, "loss": 2.2358, "mean_token_accuracy": 0.4344827473163605, "step": 10865 }, { "epoch": 0.010948379448471608, "grad_norm": 33.582158780226145, "learning_rate": 1.0948169933323934e-05, "loss": 2.6082, "mean_token_accuracy": 0.3876588046550751, "step": 10870 }, { "epoch": 0.01095341550157578, "grad_norm": 31.410646610305644, "learning_rate": 1.0953205890054993e-05, "loss": 2.0884, "mean_token_accuracy": 0.48965518474578856, "step": 10875 }, { "epoch": 0.010958451554679954, "grad_norm": 32.015748320983846, "learning_rate": 1.0958241846786054e-05, "loss": 2.4072, "mean_token_accuracy": 0.4448275864124298, "step": 10880 }, { "epoch": 0.010963487607784126, "grad_norm": 35.290072474386754, "learning_rate": 1.0963277803517112e-05, "loss": 2.5729, "mean_token_accuracy": 0.36551723480224607, "step": 10885 }, { "epoch": 0.0109685236608883, "grad_norm": 37.62099214540939, "learning_rate": 1.0968313760248173e-05, "loss": 2.1619, "mean_token_accuracy": 0.48965516686439514, "step": 10890 }, { "epoch": 0.010973559713992472, "grad_norm": 39.17360022731531, "learning_rate": 1.0973349716979232e-05, "loss": 2.1668, "mean_token_accuracy": 0.4931034445762634, "step": 10895 }, { "epoch": 0.010978595767096646, "grad_norm": 39.20607909256005, "learning_rate": 1.0978385673710293e-05, "loss": 2.362, "mean_token_accuracy": 0.4137930989265442, "step": 10900 }, { "epoch": 0.010983631820200818, "grad_norm": 28.8195460410601, "learning_rate": 1.098342163044135e-05, "loss": 2.0535, "mean_token_accuracy": 0.47931034564971925, "step": 10905 }, { "epoch": 0.01098866787330499, "grad_norm": 26.96027186118255, "learning_rate": 1.0988457587172411e-05, "loss": 2.4447, "mean_token_accuracy": 0.4310344815254211, "step": 10910 }, { "epoch": 0.010993703926409163, "grad_norm": 32.46548692038139, "learning_rate": 1.099349354390347e-05, "loss": 2.479, "mean_token_accuracy": 0.3931034505367279, "step": 10915 }, { "epoch": 0.010998739979513335, "grad_norm": 36.583635851619455, "learning_rate": 1.0998529500634532e-05, "loss": 2.0519, "mean_token_accuracy": 0.5073891639709472, "step": 10920 }, { "epoch": 0.01100377603261751, "grad_norm": 41.64154573575927, "learning_rate": 1.100356545736559e-05, "loss": 2.286, "mean_token_accuracy": 0.42758620977401735, "step": 10925 }, { "epoch": 0.011008812085721681, "grad_norm": 40.41842045859674, "learning_rate": 1.100860141409665e-05, "loss": 2.4726, "mean_token_accuracy": 0.4517241418361664, "step": 10930 }, { "epoch": 0.011013848138825855, "grad_norm": 62.12691163358502, "learning_rate": 1.1013637370827711e-05, "loss": 2.3722, "mean_token_accuracy": 0.46551724076271056, "step": 10935 }, { "epoch": 0.011018884191930027, "grad_norm": 37.149261798603646, "learning_rate": 1.101867332755877e-05, "loss": 2.6742, "mean_token_accuracy": 0.3862069010734558, "step": 10940 }, { "epoch": 0.011023920245034199, "grad_norm": 34.67222367300885, "learning_rate": 1.102370928428983e-05, "loss": 2.4205, "mean_token_accuracy": 0.40689654350280763, "step": 10945 }, { "epoch": 0.011028956298138373, "grad_norm": 37.71382518710506, "learning_rate": 1.1028745241020889e-05, "loss": 2.623, "mean_token_accuracy": 0.37586206793785093, "step": 10950 }, { "epoch": 0.011033992351242545, "grad_norm": 32.413317647004746, "learning_rate": 1.103378119775195e-05, "loss": 2.4341, "mean_token_accuracy": 0.38620689511299133, "step": 10955 }, { "epoch": 0.011039028404346719, "grad_norm": 35.67497569497302, "learning_rate": 1.1038817154483009e-05, "loss": 2.2033, "mean_token_accuracy": 0.4517241358757019, "step": 10960 }, { "epoch": 0.01104406445745089, "grad_norm": 30.863900522813342, "learning_rate": 1.1043853111214068e-05, "loss": 2.6417, "mean_token_accuracy": 0.35862069129943847, "step": 10965 }, { "epoch": 0.011049100510555064, "grad_norm": 35.02413144173615, "learning_rate": 1.1048889067945129e-05, "loss": 2.7192, "mean_token_accuracy": 0.3482758641242981, "step": 10970 }, { "epoch": 0.011054136563659236, "grad_norm": 36.63426614943195, "learning_rate": 1.1053925024676188e-05, "loss": 2.4429, "mean_token_accuracy": 0.43278887271881106, "step": 10975 }, { "epoch": 0.011059172616763409, "grad_norm": 30.269112291284028, "learning_rate": 1.105896098140725e-05, "loss": 2.0593, "mean_token_accuracy": 0.510344821214676, "step": 10980 }, { "epoch": 0.011064208669867582, "grad_norm": 42.663104483967274, "learning_rate": 1.1063996938138307e-05, "loss": 2.4622, "mean_token_accuracy": 0.41724138259887694, "step": 10985 }, { "epoch": 0.011069244722971754, "grad_norm": 27.953517198650847, "learning_rate": 1.1069032894869368e-05, "loss": 3.0749, "mean_token_accuracy": 0.3655172407627106, "step": 10990 }, { "epoch": 0.011074280776075928, "grad_norm": 30.661192056723667, "learning_rate": 1.1074068851600427e-05, "loss": 2.1688, "mean_token_accuracy": 0.46733213067054746, "step": 10995 }, { "epoch": 0.0110793168291801, "grad_norm": 29.223269218994012, "learning_rate": 1.1079104808331488e-05, "loss": 2.4347, "mean_token_accuracy": 0.40689654350280763, "step": 11000 }, { "epoch": 0.011084352882284274, "grad_norm": 43.18024493372328, "learning_rate": 1.1084140765062546e-05, "loss": 2.1825, "mean_token_accuracy": 0.4620689690113068, "step": 11005 }, { "epoch": 0.011089388935388446, "grad_norm": 38.806657948380135, "learning_rate": 1.1089176721793607e-05, "loss": 2.3115, "mean_token_accuracy": 0.441379314661026, "step": 11010 }, { "epoch": 0.011094424988492618, "grad_norm": 29.00955146269285, "learning_rate": 1.1094212678524668e-05, "loss": 2.5242, "mean_token_accuracy": 0.3827586114406586, "step": 11015 }, { "epoch": 0.011099461041596792, "grad_norm": 30.07485564419664, "learning_rate": 1.1099248635255727e-05, "loss": 2.8016, "mean_token_accuracy": 0.4000000059604645, "step": 11020 }, { "epoch": 0.011104497094700964, "grad_norm": 28.114295267933155, "learning_rate": 1.1104284591986786e-05, "loss": 2.3921, "mean_token_accuracy": 0.4411373257637024, "step": 11025 }, { "epoch": 0.011109533147805137, "grad_norm": 26.67568879535564, "learning_rate": 1.1109320548717845e-05, "loss": 2.1925, "mean_token_accuracy": 0.4628554105758667, "step": 11030 }, { "epoch": 0.01111456920090931, "grad_norm": 34.98690467561199, "learning_rate": 1.1114356505448906e-05, "loss": 2.4062, "mean_token_accuracy": 0.4229280173778534, "step": 11035 }, { "epoch": 0.011119605254013483, "grad_norm": 29.837911136343347, "learning_rate": 1.1119392462179965e-05, "loss": 2.2937, "mean_token_accuracy": 0.42758620381355283, "step": 11040 }, { "epoch": 0.011124641307117655, "grad_norm": 31.885912913079434, "learning_rate": 1.1124428418911026e-05, "loss": 2.1902, "mean_token_accuracy": 0.4365396201610565, "step": 11045 }, { "epoch": 0.011129677360221827, "grad_norm": 33.496562829065844, "learning_rate": 1.1129464375642084e-05, "loss": 2.2347, "mean_token_accuracy": 0.46551724076271056, "step": 11050 }, { "epoch": 0.011134713413326001, "grad_norm": 33.07013525649395, "learning_rate": 1.1134500332373145e-05, "loss": 2.4862, "mean_token_accuracy": 0.37931033968925476, "step": 11055 }, { "epoch": 0.011139749466430173, "grad_norm": 37.02871091344708, "learning_rate": 1.1139536289104204e-05, "loss": 2.1365, "mean_token_accuracy": 0.48850575685501096, "step": 11060 }, { "epoch": 0.011144785519534347, "grad_norm": 38.73879710788627, "learning_rate": 1.1144572245835265e-05, "loss": 2.4776, "mean_token_accuracy": 0.4137930989265442, "step": 11065 }, { "epoch": 0.011149821572638519, "grad_norm": 28.330673424128214, "learning_rate": 1.1149608202566324e-05, "loss": 2.2198, "mean_token_accuracy": 0.4724137902259827, "step": 11070 }, { "epoch": 0.011154857625742693, "grad_norm": 37.837426934207514, "learning_rate": 1.1154644159297384e-05, "loss": 2.2808, "mean_token_accuracy": 0.46896551847457885, "step": 11075 }, { "epoch": 0.011159893678846865, "grad_norm": 28.955344729503338, "learning_rate": 1.1159680116028445e-05, "loss": 2.2519, "mean_token_accuracy": 0.4655172348022461, "step": 11080 }, { "epoch": 0.011164929731951037, "grad_norm": 32.96526797175497, "learning_rate": 1.1164716072759504e-05, "loss": 2.1388, "mean_token_accuracy": 0.46551724672317507, "step": 11085 }, { "epoch": 0.01116996578505521, "grad_norm": 27.156821986326424, "learning_rate": 1.1169752029490563e-05, "loss": 2.2883, "mean_token_accuracy": 0.458620685338974, "step": 11090 }, { "epoch": 0.011175001838159383, "grad_norm": 32.685599485769295, "learning_rate": 1.1174787986221622e-05, "loss": 2.346, "mean_token_accuracy": 0.44646098017692565, "step": 11095 }, { "epoch": 0.011180037891263556, "grad_norm": 24.513999428761405, "learning_rate": 1.1179823942952683e-05, "loss": 2.1141, "mean_token_accuracy": 0.4655172348022461, "step": 11100 }, { "epoch": 0.011185073944367728, "grad_norm": 21.7578777902776, "learning_rate": 1.1184859899683743e-05, "loss": 2.3718, "mean_token_accuracy": 0.4379310429096222, "step": 11105 }, { "epoch": 0.011190109997471902, "grad_norm": 32.63089585725225, "learning_rate": 1.1189895856414802e-05, "loss": 2.4199, "mean_token_accuracy": 0.41379310488700866, "step": 11110 }, { "epoch": 0.011195146050576074, "grad_norm": 46.00322654408078, "learning_rate": 1.1194931813145863e-05, "loss": 2.6226, "mean_token_accuracy": 0.4206896543502808, "step": 11115 }, { "epoch": 0.011200182103680246, "grad_norm": 35.00582142668977, "learning_rate": 1.1199967769876922e-05, "loss": 2.3573, "mean_token_accuracy": 0.4551724135875702, "step": 11120 }, { "epoch": 0.01120521815678442, "grad_norm": 28.746498102901292, "learning_rate": 1.1205003726607983e-05, "loss": 2.6983, "mean_token_accuracy": 0.4068965494632721, "step": 11125 }, { "epoch": 0.011210254209888592, "grad_norm": 34.09145114869498, "learning_rate": 1.121003968333904e-05, "loss": 2.4323, "mean_token_accuracy": 0.441379314661026, "step": 11130 }, { "epoch": 0.011215290262992766, "grad_norm": 25.872618432889894, "learning_rate": 1.1215075640070101e-05, "loss": 2.2974, "mean_token_accuracy": 0.41379311084747317, "step": 11135 }, { "epoch": 0.011220326316096938, "grad_norm": 28.214100238942155, "learning_rate": 1.122011159680116e-05, "loss": 2.4921, "mean_token_accuracy": 0.39655172526836396, "step": 11140 }, { "epoch": 0.011225362369201112, "grad_norm": 35.07036940965686, "learning_rate": 1.1225147553532222e-05, "loss": 2.4969, "mean_token_accuracy": 0.41724138259887694, "step": 11145 }, { "epoch": 0.011230398422305284, "grad_norm": 47.16272793714296, "learning_rate": 1.123018351026328e-05, "loss": 2.4695, "mean_token_accuracy": 0.4034482717514038, "step": 11150 }, { "epoch": 0.011235434475409456, "grad_norm": 33.1641828555765, "learning_rate": 1.123521946699434e-05, "loss": 2.1111, "mean_token_accuracy": 0.44827585518360136, "step": 11155 }, { "epoch": 0.01124047052851363, "grad_norm": 45.461005017927846, "learning_rate": 1.12402554237254e-05, "loss": 2.2241, "mean_token_accuracy": 0.417241370677948, "step": 11160 }, { "epoch": 0.011245506581617801, "grad_norm": 32.834775475881635, "learning_rate": 1.124529138045646e-05, "loss": 2.4683, "mean_token_accuracy": 0.42413792610168455, "step": 11165 }, { "epoch": 0.011250542634721975, "grad_norm": 30.542638051166918, "learning_rate": 1.125032733718752e-05, "loss": 2.2336, "mean_token_accuracy": 0.4517241418361664, "step": 11170 }, { "epoch": 0.011255578687826147, "grad_norm": 37.97004941684028, "learning_rate": 1.1255363293918579e-05, "loss": 2.4737, "mean_token_accuracy": 0.42068964838981626, "step": 11175 }, { "epoch": 0.011260614740930321, "grad_norm": 25.21621941964731, "learning_rate": 1.126039925064964e-05, "loss": 2.3255, "mean_token_accuracy": 0.42413792610168455, "step": 11180 }, { "epoch": 0.011265650794034493, "grad_norm": 28.696916748890857, "learning_rate": 1.1265435207380699e-05, "loss": 2.4528, "mean_token_accuracy": 0.41724138259887694, "step": 11185 }, { "epoch": 0.011270686847138665, "grad_norm": 22.949255114064812, "learning_rate": 1.1270471164111758e-05, "loss": 2.2295, "mean_token_accuracy": 0.48275862336158754, "step": 11190 }, { "epoch": 0.011275722900242839, "grad_norm": 34.225415844031936, "learning_rate": 1.1275507120842818e-05, "loss": 2.1593, "mean_token_accuracy": 0.4793103516101837, "step": 11195 }, { "epoch": 0.01128075895334701, "grad_norm": 35.40808704076608, "learning_rate": 1.1280543077573878e-05, "loss": 2.326, "mean_token_accuracy": 0.42068964838981626, "step": 11200 }, { "epoch": 0.011285795006451185, "grad_norm": 40.77515201218449, "learning_rate": 1.1285579034304938e-05, "loss": 2.2034, "mean_token_accuracy": 0.46896552443504336, "step": 11205 }, { "epoch": 0.011290831059555357, "grad_norm": 34.7462626263935, "learning_rate": 1.1290614991035997e-05, "loss": 2.6503, "mean_token_accuracy": 0.3517241418361664, "step": 11210 }, { "epoch": 0.01129586711265953, "grad_norm": 32.09408511157453, "learning_rate": 1.1295650947767058e-05, "loss": 2.2219, "mean_token_accuracy": 0.43103448748588563, "step": 11215 }, { "epoch": 0.011300903165763702, "grad_norm": 28.95363557734509, "learning_rate": 1.1300686904498117e-05, "loss": 2.3018, "mean_token_accuracy": 0.3896551728248596, "step": 11220 }, { "epoch": 0.011305939218867874, "grad_norm": 30.60510430538887, "learning_rate": 1.1305722861229178e-05, "loss": 2.3663, "mean_token_accuracy": 0.4034482777118683, "step": 11225 }, { "epoch": 0.011310975271972048, "grad_norm": 31.873080131196023, "learning_rate": 1.1310758817960236e-05, "loss": 2.323, "mean_token_accuracy": 0.43448275327682495, "step": 11230 }, { "epoch": 0.01131601132507622, "grad_norm": 28.18307099489073, "learning_rate": 1.1315794774691297e-05, "loss": 2.6619, "mean_token_accuracy": 0.4206896543502808, "step": 11235 }, { "epoch": 0.011321047378180394, "grad_norm": 32.97547241319515, "learning_rate": 1.1320830731422356e-05, "loss": 2.2373, "mean_token_accuracy": 0.4310344815254211, "step": 11240 }, { "epoch": 0.011326083431284566, "grad_norm": 50.067571260052695, "learning_rate": 1.1325866688153417e-05, "loss": 2.6148, "mean_token_accuracy": 0.35862069129943847, "step": 11245 }, { "epoch": 0.01133111948438874, "grad_norm": 32.835463579930064, "learning_rate": 1.1330902644884474e-05, "loss": 2.0574, "mean_token_accuracy": 0.4931034505367279, "step": 11250 }, { "epoch": 0.011336155537492912, "grad_norm": 29.09041071054721, "learning_rate": 1.1335938601615535e-05, "loss": 2.2842, "mean_token_accuracy": 0.5102238357067108, "step": 11255 }, { "epoch": 0.011341191590597084, "grad_norm": 36.44403016123223, "learning_rate": 1.1340974558346595e-05, "loss": 2.338, "mean_token_accuracy": 0.4310344815254211, "step": 11260 }, { "epoch": 0.011346227643701258, "grad_norm": 30.47387174292945, "learning_rate": 1.1346010515077656e-05, "loss": 2.3608, "mean_token_accuracy": 0.43103448748588563, "step": 11265 }, { "epoch": 0.01135126369680543, "grad_norm": 43.57486939529165, "learning_rate": 1.1351046471808715e-05, "loss": 2.3635, "mean_token_accuracy": 0.4620689630508423, "step": 11270 }, { "epoch": 0.011356299749909603, "grad_norm": 30.80385732606569, "learning_rate": 1.1356082428539774e-05, "loss": 2.3265, "mean_token_accuracy": 0.4517241418361664, "step": 11275 }, { "epoch": 0.011361335803013775, "grad_norm": 50.56760163880261, "learning_rate": 1.1361118385270835e-05, "loss": 2.2775, "mean_token_accuracy": 0.4310344815254211, "step": 11280 }, { "epoch": 0.01136637185611795, "grad_norm": 31.833970951701428, "learning_rate": 1.1366154342001894e-05, "loss": 2.1973, "mean_token_accuracy": 0.5034482777118683, "step": 11285 }, { "epoch": 0.011371407909222121, "grad_norm": 31.02810684886034, "learning_rate": 1.1371190298732954e-05, "loss": 2.1515, "mean_token_accuracy": 0.458620685338974, "step": 11290 }, { "epoch": 0.011376443962326293, "grad_norm": 35.70808072877237, "learning_rate": 1.1376226255464013e-05, "loss": 2.2424, "mean_token_accuracy": 0.4551724135875702, "step": 11295 }, { "epoch": 0.011381480015430467, "grad_norm": 38.79489333374522, "learning_rate": 1.1381262212195074e-05, "loss": 2.258, "mean_token_accuracy": 0.47241378426551817, "step": 11300 }, { "epoch": 0.011386516068534639, "grad_norm": 31.24392550176355, "learning_rate": 1.1386298168926133e-05, "loss": 2.5387, "mean_token_accuracy": 0.3793103456497192, "step": 11305 }, { "epoch": 0.011391552121638813, "grad_norm": 31.509881235195156, "learning_rate": 1.1391334125657192e-05, "loss": 2.1173, "mean_token_accuracy": 0.4938423693180084, "step": 11310 }, { "epoch": 0.011396588174742985, "grad_norm": 31.499717094399063, "learning_rate": 1.1396370082388253e-05, "loss": 2.0832, "mean_token_accuracy": 0.4862068951129913, "step": 11315 }, { "epoch": 0.011401624227847159, "grad_norm": 24.264805572099565, "learning_rate": 1.1401406039119312e-05, "loss": 2.2539, "mean_token_accuracy": 0.46551724076271056, "step": 11320 }, { "epoch": 0.01140666028095133, "grad_norm": 27.005003970828003, "learning_rate": 1.1406441995850373e-05, "loss": 2.7109, "mean_token_accuracy": 0.37586207389831544, "step": 11325 }, { "epoch": 0.011411696334055503, "grad_norm": 28.823876961086924, "learning_rate": 1.1411477952581431e-05, "loss": 2.4805, "mean_token_accuracy": 0.4379310429096222, "step": 11330 }, { "epoch": 0.011416732387159676, "grad_norm": 30.45345939695766, "learning_rate": 1.1416513909312492e-05, "loss": 2.0434, "mean_token_accuracy": 0.47586206793785096, "step": 11335 }, { "epoch": 0.011421768440263848, "grad_norm": 33.73712091994839, "learning_rate": 1.1421549866043551e-05, "loss": 2.3956, "mean_token_accuracy": 0.4620689630508423, "step": 11340 }, { "epoch": 0.011426804493368022, "grad_norm": 36.010374141068525, "learning_rate": 1.1426585822774612e-05, "loss": 2.4198, "mean_token_accuracy": 0.44482758045196535, "step": 11345 }, { "epoch": 0.011431840546472194, "grad_norm": 29.630348630082686, "learning_rate": 1.143162177950567e-05, "loss": 2.4567, "mean_token_accuracy": 0.4689655125141144, "step": 11350 }, { "epoch": 0.011436876599576368, "grad_norm": 29.800977235316534, "learning_rate": 1.143665773623673e-05, "loss": 2.2673, "mean_token_accuracy": 0.4655172348022461, "step": 11355 }, { "epoch": 0.01144191265268054, "grad_norm": 27.859238389939275, "learning_rate": 1.144169369296779e-05, "loss": 2.1193, "mean_token_accuracy": 0.48965516686439514, "step": 11360 }, { "epoch": 0.011446948705784712, "grad_norm": 35.39450583241857, "learning_rate": 1.144672964969885e-05, "loss": 2.4734, "mean_token_accuracy": 0.41379310488700866, "step": 11365 }, { "epoch": 0.011451984758888886, "grad_norm": 33.915949128640754, "learning_rate": 1.145176560642991e-05, "loss": 2.217, "mean_token_accuracy": 0.4275861978530884, "step": 11370 }, { "epoch": 0.011457020811993058, "grad_norm": 48.20686091215249, "learning_rate": 1.145680156316097e-05, "loss": 2.1881, "mean_token_accuracy": 0.41724138259887694, "step": 11375 }, { "epoch": 0.011462056865097232, "grad_norm": 38.3299278932728, "learning_rate": 1.146183751989203e-05, "loss": 2.5015, "mean_token_accuracy": 0.43103447556495667, "step": 11380 }, { "epoch": 0.011467092918201404, "grad_norm": 32.7158403231524, "learning_rate": 1.146687347662309e-05, "loss": 2.3428, "mean_token_accuracy": 0.4379310369491577, "step": 11385 }, { "epoch": 0.011472128971305577, "grad_norm": 33.26722294101563, "learning_rate": 1.1471909433354149e-05, "loss": 2.4984, "mean_token_accuracy": 0.4068965494632721, "step": 11390 }, { "epoch": 0.01147716502440975, "grad_norm": 25.846843769642586, "learning_rate": 1.1476945390085208e-05, "loss": 2.3896, "mean_token_accuracy": 0.42758620977401735, "step": 11395 }, { "epoch": 0.011482201077513922, "grad_norm": 35.05438406558011, "learning_rate": 1.1481981346816269e-05, "loss": 2.6029, "mean_token_accuracy": 0.36896551847457887, "step": 11400 }, { "epoch": 0.011487237130618095, "grad_norm": 29.293321031606666, "learning_rate": 1.1487017303547328e-05, "loss": 2.5893, "mean_token_accuracy": 0.37586206793785093, "step": 11405 }, { "epoch": 0.011492273183722267, "grad_norm": 28.752858193786885, "learning_rate": 1.1492053260278387e-05, "loss": 2.3525, "mean_token_accuracy": 0.44482758045196535, "step": 11410 }, { "epoch": 0.011497309236826441, "grad_norm": 29.837383742888036, "learning_rate": 1.1497089217009448e-05, "loss": 2.3698, "mean_token_accuracy": 0.41034482717514037, "step": 11415 }, { "epoch": 0.011502345289930613, "grad_norm": 36.076392292081245, "learning_rate": 1.1502125173740508e-05, "loss": 2.2143, "mean_token_accuracy": 0.4310344815254211, "step": 11420 }, { "epoch": 0.011507381343034787, "grad_norm": 30.270360452784974, "learning_rate": 1.1507161130471569e-05, "loss": 2.2552, "mean_token_accuracy": 0.46551724076271056, "step": 11425 }, { "epoch": 0.011512417396138959, "grad_norm": 28.41563831409088, "learning_rate": 1.1512197087202626e-05, "loss": 2.3775, "mean_token_accuracy": 0.42413792610168455, "step": 11430 }, { "epoch": 0.011517453449243131, "grad_norm": 34.58551614517551, "learning_rate": 1.1517233043933687e-05, "loss": 2.4192, "mean_token_accuracy": 0.4068965494632721, "step": 11435 }, { "epoch": 0.011522489502347305, "grad_norm": 29.814524399114482, "learning_rate": 1.1522269000664746e-05, "loss": 2.6102, "mean_token_accuracy": 0.3896551728248596, "step": 11440 }, { "epoch": 0.011527525555451477, "grad_norm": 33.218503540841375, "learning_rate": 1.1527304957395807e-05, "loss": 2.3605, "mean_token_accuracy": 0.4310344815254211, "step": 11445 }, { "epoch": 0.01153256160855565, "grad_norm": 26.991593312418317, "learning_rate": 1.1532340914126867e-05, "loss": 2.1858, "mean_token_accuracy": 0.4068965554237366, "step": 11450 }, { "epoch": 0.011537597661659823, "grad_norm": 31.969388025806072, "learning_rate": 1.1537376870857926e-05, "loss": 2.5483, "mean_token_accuracy": 0.41379310488700866, "step": 11455 }, { "epoch": 0.011542633714763995, "grad_norm": 26.521594588707206, "learning_rate": 1.1542412827588985e-05, "loss": 2.1821, "mean_token_accuracy": 0.4562807857990265, "step": 11460 }, { "epoch": 0.011547669767868168, "grad_norm": 38.75884281078359, "learning_rate": 1.1547448784320046e-05, "loss": 2.2722, "mean_token_accuracy": 0.4413793087005615, "step": 11465 }, { "epoch": 0.01155270582097234, "grad_norm": 26.2340559642593, "learning_rate": 1.1552484741051107e-05, "loss": 2.3726, "mean_token_accuracy": 0.44827585816383364, "step": 11470 }, { "epoch": 0.011557741874076514, "grad_norm": 34.17758089850744, "learning_rate": 1.1557520697782165e-05, "loss": 2.2684, "mean_token_accuracy": 0.44827585816383364, "step": 11475 }, { "epoch": 0.011562777927180686, "grad_norm": 38.78050550743558, "learning_rate": 1.1562556654513225e-05, "loss": 2.2958, "mean_token_accuracy": 0.4620689570903778, "step": 11480 }, { "epoch": 0.01156781398028486, "grad_norm": 39.9482929360104, "learning_rate": 1.1567592611244285e-05, "loss": 2.2652, "mean_token_accuracy": 0.441379314661026, "step": 11485 }, { "epoch": 0.011572850033389032, "grad_norm": 34.383272971167514, "learning_rate": 1.1572628567975346e-05, "loss": 2.4905, "mean_token_accuracy": 0.39655172228813174, "step": 11490 }, { "epoch": 0.011577886086493204, "grad_norm": 35.92989023232943, "learning_rate": 1.1577664524706403e-05, "loss": 2.1399, "mean_token_accuracy": 0.4862068951129913, "step": 11495 }, { "epoch": 0.011582922139597378, "grad_norm": 35.44359713743167, "learning_rate": 1.1582700481437464e-05, "loss": 2.5389, "mean_token_accuracy": 0.4344827473163605, "step": 11500 }, { "epoch": 0.01158795819270155, "grad_norm": 30.112689003701576, "learning_rate": 1.1587736438168523e-05, "loss": 2.2239, "mean_token_accuracy": 0.4620689690113068, "step": 11505 }, { "epoch": 0.011592994245805724, "grad_norm": 39.33231840187201, "learning_rate": 1.1592772394899584e-05, "loss": 2.3532, "mean_token_accuracy": 0.43448275327682495, "step": 11510 }, { "epoch": 0.011598030298909896, "grad_norm": 33.836971216147816, "learning_rate": 1.1597808351630644e-05, "loss": 2.2874, "mean_token_accuracy": 0.4172413766384125, "step": 11515 }, { "epoch": 0.01160306635201407, "grad_norm": 31.48522080302191, "learning_rate": 1.1602844308361703e-05, "loss": 2.0137, "mean_token_accuracy": 0.47586206793785096, "step": 11520 }, { "epoch": 0.011608102405118241, "grad_norm": 37.354047830631124, "learning_rate": 1.1607880265092764e-05, "loss": 2.3898, "mean_token_accuracy": 0.480762255191803, "step": 11525 }, { "epoch": 0.011613138458222413, "grad_norm": 27.73356906619096, "learning_rate": 1.1612916221823823e-05, "loss": 2.2007, "mean_token_accuracy": 0.4517241358757019, "step": 11530 }, { "epoch": 0.011618174511326587, "grad_norm": 35.47297794558834, "learning_rate": 1.1617952178554882e-05, "loss": 2.1781, "mean_token_accuracy": 0.47586207985877993, "step": 11535 }, { "epoch": 0.01162321056443076, "grad_norm": 42.539437963521095, "learning_rate": 1.1622988135285942e-05, "loss": 2.3222, "mean_token_accuracy": 0.39310343861579894, "step": 11540 }, { "epoch": 0.011628246617534933, "grad_norm": 36.40070439042412, "learning_rate": 1.1628024092017003e-05, "loss": 2.5816, "mean_token_accuracy": 0.4206896543502808, "step": 11545 }, { "epoch": 0.011633282670639105, "grad_norm": 33.16120407285037, "learning_rate": 1.1633060048748062e-05, "loss": 2.3945, "mean_token_accuracy": 0.4068965494632721, "step": 11550 }, { "epoch": 0.011638318723743279, "grad_norm": 29.484771041421798, "learning_rate": 1.1638096005479121e-05, "loss": 2.1433, "mean_token_accuracy": 0.4931034505367279, "step": 11555 }, { "epoch": 0.01164335477684745, "grad_norm": 36.40939325677855, "learning_rate": 1.1643131962210182e-05, "loss": 2.4058, "mean_token_accuracy": 0.41379310488700866, "step": 11560 }, { "epoch": 0.011648390829951623, "grad_norm": 27.946950946732393, "learning_rate": 1.1648167918941241e-05, "loss": 2.2502, "mean_token_accuracy": 0.47586206793785096, "step": 11565 }, { "epoch": 0.011653426883055797, "grad_norm": 38.44211173044791, "learning_rate": 1.1653203875672302e-05, "loss": 2.2147, "mean_token_accuracy": 0.45862067937850953, "step": 11570 }, { "epoch": 0.011658462936159969, "grad_norm": 41.09401139542813, "learning_rate": 1.165823983240336e-05, "loss": 2.1679, "mean_token_accuracy": 0.4517241358757019, "step": 11575 }, { "epoch": 0.011663498989264142, "grad_norm": 36.09791853494381, "learning_rate": 1.166327578913442e-05, "loss": 2.3712, "mean_token_accuracy": 0.3793103456497192, "step": 11580 }, { "epoch": 0.011668535042368314, "grad_norm": 28.79494635873865, "learning_rate": 1.166831174586548e-05, "loss": 2.2485, "mean_token_accuracy": 0.4517241358757019, "step": 11585 }, { "epoch": 0.011673571095472488, "grad_norm": 28.460158171464016, "learning_rate": 1.1673347702596541e-05, "loss": 2.2697, "mean_token_accuracy": 0.47931034564971925, "step": 11590 }, { "epoch": 0.01167860714857666, "grad_norm": 31.95727198297756, "learning_rate": 1.1678383659327598e-05, "loss": 2.2268, "mean_token_accuracy": 0.47241379618644713, "step": 11595 }, { "epoch": 0.011683643201680832, "grad_norm": 39.39864172166567, "learning_rate": 1.168341961605866e-05, "loss": 2.6456, "mean_token_accuracy": 0.3599515974521637, "step": 11600 }, { "epoch": 0.011688679254785006, "grad_norm": 33.92420170376145, "learning_rate": 1.1688455572789719e-05, "loss": 2.5457, "mean_token_accuracy": 0.3931034475564957, "step": 11605 }, { "epoch": 0.011693715307889178, "grad_norm": 35.31032262005259, "learning_rate": 1.169349152952078e-05, "loss": 2.7874, "mean_token_accuracy": 0.3827586233615875, "step": 11610 }, { "epoch": 0.011698751360993352, "grad_norm": 36.408309424903294, "learning_rate": 1.1698527486251839e-05, "loss": 2.1933, "mean_token_accuracy": 0.43103447556495667, "step": 11615 }, { "epoch": 0.011703787414097524, "grad_norm": 26.883386887357823, "learning_rate": 1.1703563442982898e-05, "loss": 2.2353, "mean_token_accuracy": 0.505807626247406, "step": 11620 }, { "epoch": 0.011708823467201698, "grad_norm": 30.3866929815451, "learning_rate": 1.1708599399713959e-05, "loss": 2.3994, "mean_token_accuracy": 0.4137930989265442, "step": 11625 }, { "epoch": 0.01171385952030587, "grad_norm": 27.47648489661462, "learning_rate": 1.1713635356445018e-05, "loss": 2.1753, "mean_token_accuracy": 0.4758620738983154, "step": 11630 }, { "epoch": 0.011718895573410042, "grad_norm": 28.30304310887975, "learning_rate": 1.1718671313176078e-05, "loss": 2.1896, "mean_token_accuracy": 0.46551724076271056, "step": 11635 }, { "epoch": 0.011723931626514215, "grad_norm": 31.20339825793529, "learning_rate": 1.1723707269907137e-05, "loss": 2.1507, "mean_token_accuracy": 0.4586206912994385, "step": 11640 }, { "epoch": 0.011728967679618387, "grad_norm": 26.769937723201615, "learning_rate": 1.1728743226638198e-05, "loss": 2.1793, "mean_token_accuracy": 0.4905172407627106, "step": 11645 }, { "epoch": 0.011734003732722561, "grad_norm": 47.6107599957727, "learning_rate": 1.1733779183369257e-05, "loss": 2.7639, "mean_token_accuracy": 0.3655172407627106, "step": 11650 }, { "epoch": 0.011739039785826733, "grad_norm": 31.107877045445967, "learning_rate": 1.1738815140100316e-05, "loss": 2.231, "mean_token_accuracy": 0.43974592089653014, "step": 11655 }, { "epoch": 0.011744075838930907, "grad_norm": 28.79640354428086, "learning_rate": 1.1743851096831377e-05, "loss": 2.2575, "mean_token_accuracy": 0.4379310369491577, "step": 11660 }, { "epoch": 0.011749111892035079, "grad_norm": 23.592700094394363, "learning_rate": 1.1748887053562436e-05, "loss": 1.9979, "mean_token_accuracy": 0.5034482657909394, "step": 11665 }, { "epoch": 0.011754147945139251, "grad_norm": 30.827337374231064, "learning_rate": 1.1753923010293497e-05, "loss": 2.2609, "mean_token_accuracy": 0.4156079888343811, "step": 11670 }, { "epoch": 0.011759183998243425, "grad_norm": 57.831809437527056, "learning_rate": 1.1758958967024555e-05, "loss": 2.4051, "mean_token_accuracy": 0.4275861978530884, "step": 11675 }, { "epoch": 0.011764220051347597, "grad_norm": 38.3851403276558, "learning_rate": 1.1763994923755616e-05, "loss": 2.5404, "mean_token_accuracy": 0.4068965494632721, "step": 11680 }, { "epoch": 0.01176925610445177, "grad_norm": 33.92180640453206, "learning_rate": 1.1769030880486675e-05, "loss": 2.0088, "mean_token_accuracy": 0.5310344815254211, "step": 11685 }, { "epoch": 0.011774292157555943, "grad_norm": 34.4335348158334, "learning_rate": 1.1774066837217736e-05, "loss": 2.3242, "mean_token_accuracy": 0.4448275864124298, "step": 11690 }, { "epoch": 0.011779328210660116, "grad_norm": 37.86839475338455, "learning_rate": 1.1779102793948794e-05, "loss": 2.3915, "mean_token_accuracy": 0.45747126936912536, "step": 11695 }, { "epoch": 0.011784364263764288, "grad_norm": 29.533663641715155, "learning_rate": 1.1784138750679855e-05, "loss": 2.2349, "mean_token_accuracy": 0.47096188068389894, "step": 11700 }, { "epoch": 0.01178940031686846, "grad_norm": 29.147766761954117, "learning_rate": 1.1789174707410914e-05, "loss": 2.0756, "mean_token_accuracy": 0.47586206197738645, "step": 11705 }, { "epoch": 0.011794436369972634, "grad_norm": 30.423986876778347, "learning_rate": 1.1794210664141975e-05, "loss": 2.2408, "mean_token_accuracy": 0.4103448212146759, "step": 11710 }, { "epoch": 0.011799472423076806, "grad_norm": 48.76108093174837, "learning_rate": 1.1799246620873034e-05, "loss": 2.6077, "mean_token_accuracy": 0.4034482777118683, "step": 11715 }, { "epoch": 0.01180450847618098, "grad_norm": 33.45300175282271, "learning_rate": 1.1804282577604093e-05, "loss": 2.3763, "mean_token_accuracy": 0.4379310429096222, "step": 11720 }, { "epoch": 0.011809544529285152, "grad_norm": 35.70077979385335, "learning_rate": 1.1809318534335154e-05, "loss": 2.4995, "mean_token_accuracy": 0.4620689690113068, "step": 11725 }, { "epoch": 0.011814580582389326, "grad_norm": 31.22088762399235, "learning_rate": 1.1814354491066214e-05, "loss": 2.4538, "mean_token_accuracy": 0.42758620977401735, "step": 11730 }, { "epoch": 0.011819616635493498, "grad_norm": 29.091625305492162, "learning_rate": 1.1819390447797273e-05, "loss": 2.2808, "mean_token_accuracy": 0.4620689630508423, "step": 11735 }, { "epoch": 0.01182465268859767, "grad_norm": 31.713195667671137, "learning_rate": 1.1824426404528332e-05, "loss": 2.1855, "mean_token_accuracy": 0.4620689690113068, "step": 11740 }, { "epoch": 0.011829688741701844, "grad_norm": 41.178585232472834, "learning_rate": 1.1829462361259393e-05, "loss": 2.5098, "mean_token_accuracy": 0.4344827592372894, "step": 11745 }, { "epoch": 0.011834724794806016, "grad_norm": 26.55604875629458, "learning_rate": 1.1834498317990452e-05, "loss": 2.2414, "mean_token_accuracy": 0.43793103098869324, "step": 11750 }, { "epoch": 0.01183976084791019, "grad_norm": 33.219480125829634, "learning_rate": 1.1839534274721511e-05, "loss": 2.341, "mean_token_accuracy": 0.460556560754776, "step": 11755 }, { "epoch": 0.011844796901014362, "grad_norm": 35.91387042737535, "learning_rate": 1.1844570231452572e-05, "loss": 2.4802, "mean_token_accuracy": 0.41185723543167113, "step": 11760 }, { "epoch": 0.011849832954118535, "grad_norm": 30.829630617005744, "learning_rate": 1.1849606188183632e-05, "loss": 2.2007, "mean_token_accuracy": 0.4689655065536499, "step": 11765 }, { "epoch": 0.011854869007222707, "grad_norm": 26.424355055171844, "learning_rate": 1.1854642144914693e-05, "loss": 2.3634, "mean_token_accuracy": 0.4551724135875702, "step": 11770 }, { "epoch": 0.01185990506032688, "grad_norm": 39.74142420053445, "learning_rate": 1.185967810164575e-05, "loss": 2.6674, "mean_token_accuracy": 0.3517241358757019, "step": 11775 }, { "epoch": 0.011864941113431053, "grad_norm": 29.204152397600183, "learning_rate": 1.1864714058376811e-05, "loss": 2.4455, "mean_token_accuracy": 0.41724138259887694, "step": 11780 }, { "epoch": 0.011869977166535225, "grad_norm": 30.820441868810942, "learning_rate": 1.186975001510787e-05, "loss": 2.4034, "mean_token_accuracy": 0.4379310429096222, "step": 11785 }, { "epoch": 0.011875013219639399, "grad_norm": 34.93596359481468, "learning_rate": 1.1874785971838931e-05, "loss": 2.2903, "mean_token_accuracy": 0.41379310488700866, "step": 11790 }, { "epoch": 0.011880049272743571, "grad_norm": 35.18496665378755, "learning_rate": 1.1879821928569989e-05, "loss": 2.3573, "mean_token_accuracy": 0.44827585816383364, "step": 11795 }, { "epoch": 0.011885085325847745, "grad_norm": 26.243347308215515, "learning_rate": 1.188485788530105e-05, "loss": 2.3211, "mean_token_accuracy": 0.47586206793785096, "step": 11800 }, { "epoch": 0.011890121378951917, "grad_norm": 42.77812947857821, "learning_rate": 1.1889893842032109e-05, "loss": 2.3755, "mean_token_accuracy": 0.4620689630508423, "step": 11805 }, { "epoch": 0.011895157432056089, "grad_norm": 29.44263931797846, "learning_rate": 1.189492979876317e-05, "loss": 2.4405, "mean_token_accuracy": 0.42413792610168455, "step": 11810 }, { "epoch": 0.011900193485160263, "grad_norm": 29.622466696353953, "learning_rate": 1.189996575549423e-05, "loss": 2.222, "mean_token_accuracy": 0.42758620381355283, "step": 11815 }, { "epoch": 0.011905229538264435, "grad_norm": 59.869047997826236, "learning_rate": 1.1905001712225289e-05, "loss": 2.4852, "mean_token_accuracy": 0.42068964838981626, "step": 11820 }, { "epoch": 0.011910265591368608, "grad_norm": 40.603104169243366, "learning_rate": 1.191003766895635e-05, "loss": 2.6181, "mean_token_accuracy": 0.4034482717514038, "step": 11825 }, { "epoch": 0.01191530164447278, "grad_norm": 30.373590109935176, "learning_rate": 1.1915073625687409e-05, "loss": 2.3887, "mean_token_accuracy": 0.4241379380226135, "step": 11830 }, { "epoch": 0.011920337697576954, "grad_norm": 27.992116159530656, "learning_rate": 1.1920109582418468e-05, "loss": 2.6311, "mean_token_accuracy": 0.39655173420906065, "step": 11835 }, { "epoch": 0.011925373750681126, "grad_norm": 21.99121016474274, "learning_rate": 1.1925145539149527e-05, "loss": 2.3176, "mean_token_accuracy": 0.4586206912994385, "step": 11840 }, { "epoch": 0.011930409803785298, "grad_norm": 24.090678675968853, "learning_rate": 1.1930181495880588e-05, "loss": 2.096, "mean_token_accuracy": 0.463520872592926, "step": 11845 }, { "epoch": 0.011935445856889472, "grad_norm": 22.389842023634845, "learning_rate": 1.1935217452611647e-05, "loss": 2.1761, "mean_token_accuracy": 0.4344827651977539, "step": 11850 }, { "epoch": 0.011940481909993644, "grad_norm": 23.848870125804265, "learning_rate": 1.1940253409342707e-05, "loss": 2.626, "mean_token_accuracy": 0.3670901358127594, "step": 11855 }, { "epoch": 0.011945517963097818, "grad_norm": 30.34740952210801, "learning_rate": 1.1945289366073768e-05, "loss": 2.3298, "mean_token_accuracy": 0.42758620381355283, "step": 11860 }, { "epoch": 0.01195055401620199, "grad_norm": 29.72381944433842, "learning_rate": 1.1950325322804827e-05, "loss": 2.1655, "mean_token_accuracy": 0.4586206912994385, "step": 11865 }, { "epoch": 0.011955590069306164, "grad_norm": 37.395219009976124, "learning_rate": 1.1955361279535888e-05, "loss": 2.3425, "mean_token_accuracy": 0.43448275327682495, "step": 11870 }, { "epoch": 0.011960626122410336, "grad_norm": 33.79469696269186, "learning_rate": 1.1960397236266947e-05, "loss": 2.2985, "mean_token_accuracy": 0.4689655065536499, "step": 11875 }, { "epoch": 0.011965662175514508, "grad_norm": 57.48276174097468, "learning_rate": 1.1965433192998006e-05, "loss": 2.508, "mean_token_accuracy": 0.42758620381355283, "step": 11880 }, { "epoch": 0.011970698228618681, "grad_norm": 23.57946935792069, "learning_rate": 1.1970469149729066e-05, "loss": 2.4551, "mean_token_accuracy": 0.4379310250282288, "step": 11885 }, { "epoch": 0.011975734281722853, "grad_norm": 30.65853885506041, "learning_rate": 1.1975505106460127e-05, "loss": 2.2331, "mean_token_accuracy": 0.4517241299152374, "step": 11890 }, { "epoch": 0.011980770334827027, "grad_norm": 32.8691052090834, "learning_rate": 1.1980541063191186e-05, "loss": 2.0642, "mean_token_accuracy": 0.4845130145549774, "step": 11895 }, { "epoch": 0.0119858063879312, "grad_norm": 30.200057255884282, "learning_rate": 1.1985577019922245e-05, "loss": 2.3444, "mean_token_accuracy": 0.4034482717514038, "step": 11900 }, { "epoch": 0.011990842441035373, "grad_norm": 33.21811713029088, "learning_rate": 1.1990612976653304e-05, "loss": 2.1185, "mean_token_accuracy": 0.41379310488700866, "step": 11905 }, { "epoch": 0.011995878494139545, "grad_norm": 26.002346921505122, "learning_rate": 1.1995648933384365e-05, "loss": 2.3714, "mean_token_accuracy": 0.4431337058544159, "step": 11910 }, { "epoch": 0.012000914547243717, "grad_norm": 28.41105857341897, "learning_rate": 1.2000684890115425e-05, "loss": 2.2993, "mean_token_accuracy": 0.4120387136936188, "step": 11915 }, { "epoch": 0.01200595060034789, "grad_norm": 35.13040381814801, "learning_rate": 1.2005720846846484e-05, "loss": 2.0992, "mean_token_accuracy": 0.4344827651977539, "step": 11920 }, { "epoch": 0.012010986653452063, "grad_norm": 33.902052562487064, "learning_rate": 1.2010756803577545e-05, "loss": 2.3548, "mean_token_accuracy": 0.42758620977401735, "step": 11925 }, { "epoch": 0.012016022706556237, "grad_norm": 30.351260673050795, "learning_rate": 1.2015792760308604e-05, "loss": 2.3414, "mean_token_accuracy": 0.4448275864124298, "step": 11930 }, { "epoch": 0.012021058759660409, "grad_norm": 29.727351781076695, "learning_rate": 1.2020828717039665e-05, "loss": 2.1047, "mean_token_accuracy": 0.517241370677948, "step": 11935 }, { "epoch": 0.012026094812764582, "grad_norm": 42.319718975242544, "learning_rate": 1.2025864673770722e-05, "loss": 2.4983, "mean_token_accuracy": 0.44827587008476255, "step": 11940 }, { "epoch": 0.012031130865868754, "grad_norm": 34.90828718420889, "learning_rate": 1.2030900630501783e-05, "loss": 2.2032, "mean_token_accuracy": 0.4517241358757019, "step": 11945 }, { "epoch": 0.012036166918972926, "grad_norm": 36.6431165004841, "learning_rate": 1.2035936587232843e-05, "loss": 2.4441, "mean_token_accuracy": 0.4379310369491577, "step": 11950 }, { "epoch": 0.0120412029720771, "grad_norm": 48.10309109629409, "learning_rate": 1.2040972543963904e-05, "loss": 2.2942, "mean_token_accuracy": 0.44137930274009707, "step": 11955 }, { "epoch": 0.012046239025181272, "grad_norm": 40.61548526380205, "learning_rate": 1.2046008500694963e-05, "loss": 2.7224, "mean_token_accuracy": 0.3862068891525269, "step": 11960 }, { "epoch": 0.012051275078285446, "grad_norm": 35.51999667987734, "learning_rate": 1.2051044457426022e-05, "loss": 2.4255, "mean_token_accuracy": 0.4172413766384125, "step": 11965 }, { "epoch": 0.012056311131389618, "grad_norm": 36.76471067656789, "learning_rate": 1.2056080414157083e-05, "loss": 2.2225, "mean_token_accuracy": 0.46896551847457885, "step": 11970 }, { "epoch": 0.012061347184493792, "grad_norm": 31.02849146260985, "learning_rate": 1.2061116370888142e-05, "loss": 2.2786, "mean_token_accuracy": 0.4275862157344818, "step": 11975 }, { "epoch": 0.012066383237597964, "grad_norm": 25.74508082509686, "learning_rate": 1.2066152327619202e-05, "loss": 2.2063, "mean_token_accuracy": 0.42758620381355283, "step": 11980 }, { "epoch": 0.012071419290702136, "grad_norm": 24.714618897499193, "learning_rate": 1.207118828435026e-05, "loss": 2.4041, "mean_token_accuracy": 0.4, "step": 11985 }, { "epoch": 0.01207645534380631, "grad_norm": 36.555366923221555, "learning_rate": 1.2076224241081322e-05, "loss": 2.2502, "mean_token_accuracy": 0.44593595564365385, "step": 11990 }, { "epoch": 0.012081491396910482, "grad_norm": 34.32218517791795, "learning_rate": 1.2081260197812381e-05, "loss": 2.5374, "mean_token_accuracy": 0.42413792610168455, "step": 11995 }, { "epoch": 0.012086527450014655, "grad_norm": 29.016327400206517, "learning_rate": 1.208629615454344e-05, "loss": 2.3157, "mean_token_accuracy": 0.42068966031074523, "step": 12000 }, { "epoch": 0.012091563503118827, "grad_norm": 32.278258368869174, "learning_rate": 1.2091332111274501e-05, "loss": 2.2051, "mean_token_accuracy": 0.4344827592372894, "step": 12005 }, { "epoch": 0.012096599556223001, "grad_norm": 34.500678726034955, "learning_rate": 1.209636806800556e-05, "loss": 2.2132, "mean_token_accuracy": 0.4746521472930908, "step": 12010 }, { "epoch": 0.012101635609327173, "grad_norm": 23.72196587971329, "learning_rate": 1.2101404024736621e-05, "loss": 2.254, "mean_token_accuracy": 0.49310343265533446, "step": 12015 }, { "epoch": 0.012106671662431345, "grad_norm": 25.039264292882553, "learning_rate": 1.2106439981467679e-05, "loss": 2.551, "mean_token_accuracy": 0.36896551847457887, "step": 12020 }, { "epoch": 0.012111707715535519, "grad_norm": 27.67678840246632, "learning_rate": 1.211147593819874e-05, "loss": 2.1059, "mean_token_accuracy": 0.46896551847457885, "step": 12025 }, { "epoch": 0.012116743768639691, "grad_norm": 32.01213154648573, "learning_rate": 1.21165118949298e-05, "loss": 2.1221, "mean_token_accuracy": 0.47241379618644713, "step": 12030 }, { "epoch": 0.012121779821743865, "grad_norm": 32.43952805086492, "learning_rate": 1.212154785166086e-05, "loss": 2.4842, "mean_token_accuracy": 0.4310344815254211, "step": 12035 }, { "epoch": 0.012126815874848037, "grad_norm": 32.91024604085388, "learning_rate": 1.2126583808391918e-05, "loss": 2.5699, "mean_token_accuracy": 0.3482758581638336, "step": 12040 }, { "epoch": 0.01213185192795221, "grad_norm": 40.23602559617418, "learning_rate": 1.2131619765122979e-05, "loss": 2.2727, "mean_token_accuracy": 0.47352216243743894, "step": 12045 }, { "epoch": 0.012136887981056383, "grad_norm": 31.90534793888992, "learning_rate": 1.2136655721854038e-05, "loss": 2.409, "mean_token_accuracy": 0.42413792610168455, "step": 12050 }, { "epoch": 0.012141924034160555, "grad_norm": 29.161363129473802, "learning_rate": 1.2141691678585099e-05, "loss": 2.4105, "mean_token_accuracy": 0.41724138259887694, "step": 12055 }, { "epoch": 0.012146960087264728, "grad_norm": 30.206054380356, "learning_rate": 1.2146727635316158e-05, "loss": 2.2371, "mean_token_accuracy": 0.47434966564178466, "step": 12060 }, { "epoch": 0.0121519961403689, "grad_norm": 30.366279329096894, "learning_rate": 1.2151763592047217e-05, "loss": 2.1487, "mean_token_accuracy": 0.47404718995094297, "step": 12065 }, { "epoch": 0.012157032193473074, "grad_norm": 32.6056171639377, "learning_rate": 1.2156799548778278e-05, "loss": 2.3538, "mean_token_accuracy": 0.48862674832344055, "step": 12070 }, { "epoch": 0.012162068246577246, "grad_norm": 35.79740680820382, "learning_rate": 1.2161835505509338e-05, "loss": 2.3017, "mean_token_accuracy": 0.41724138259887694, "step": 12075 }, { "epoch": 0.01216710429968142, "grad_norm": 32.8361135077361, "learning_rate": 1.2166871462240397e-05, "loss": 2.052, "mean_token_accuracy": 0.5068965435028077, "step": 12080 }, { "epoch": 0.012172140352785592, "grad_norm": 27.195149448502644, "learning_rate": 1.2171907418971456e-05, "loss": 2.0828, "mean_token_accuracy": 0.44482758045196535, "step": 12085 }, { "epoch": 0.012177176405889764, "grad_norm": 35.51346782622862, "learning_rate": 1.2176943375702517e-05, "loss": 2.2336, "mean_token_accuracy": 0.4884452521800995, "step": 12090 }, { "epoch": 0.012182212458993938, "grad_norm": 47.91198506078173, "learning_rate": 1.2181979332433576e-05, "loss": 2.7316, "mean_token_accuracy": 0.37586207389831544, "step": 12095 }, { "epoch": 0.01218724851209811, "grad_norm": 25.24952257274832, "learning_rate": 1.2187015289164636e-05, "loss": 2.2287, "mean_token_accuracy": 0.44827585816383364, "step": 12100 }, { "epoch": 0.012192284565202284, "grad_norm": 41.366680473854885, "learning_rate": 1.2192051245895696e-05, "loss": 2.1682, "mean_token_accuracy": 0.49655171632766726, "step": 12105 }, { "epoch": 0.012197320618306456, "grad_norm": 29.580693126967436, "learning_rate": 1.2197087202626756e-05, "loss": 2.4111, "mean_token_accuracy": 0.4103448212146759, "step": 12110 }, { "epoch": 0.01220235667141063, "grad_norm": 27.795948916880032, "learning_rate": 1.2202123159357817e-05, "loss": 2.1578, "mean_token_accuracy": 0.47434966564178466, "step": 12115 }, { "epoch": 0.012207392724514802, "grad_norm": 24.31278972152385, "learning_rate": 1.2207159116088874e-05, "loss": 2.2462, "mean_token_accuracy": 0.46206897497177124, "step": 12120 }, { "epoch": 0.012212428777618974, "grad_norm": 44.73619705892749, "learning_rate": 1.2212195072819935e-05, "loss": 2.4258, "mean_token_accuracy": 0.3907440960407257, "step": 12125 }, { "epoch": 0.012217464830723147, "grad_norm": 44.09987761342455, "learning_rate": 1.2217231029550994e-05, "loss": 2.4476, "mean_token_accuracy": 0.41379311084747317, "step": 12130 }, { "epoch": 0.01222250088382732, "grad_norm": 31.159134806619846, "learning_rate": 1.2222266986282055e-05, "loss": 2.5624, "mean_token_accuracy": 0.38275861740112305, "step": 12135 }, { "epoch": 0.012227536936931493, "grad_norm": 35.7648789972433, "learning_rate": 1.2227302943013113e-05, "loss": 2.5196, "mean_token_accuracy": 0.39655172228813174, "step": 12140 }, { "epoch": 0.012232572990035665, "grad_norm": 38.242615424674746, "learning_rate": 1.2232338899744174e-05, "loss": 2.6402, "mean_token_accuracy": 0.39655172228813174, "step": 12145 }, { "epoch": 0.012237609043139839, "grad_norm": 40.887422675709274, "learning_rate": 1.2237374856475233e-05, "loss": 2.3138, "mean_token_accuracy": 0.415426504611969, "step": 12150 }, { "epoch": 0.012242645096244011, "grad_norm": 31.931748143686473, "learning_rate": 1.2242410813206294e-05, "loss": 2.4109, "mean_token_accuracy": 0.4379310250282288, "step": 12155 }, { "epoch": 0.012247681149348183, "grad_norm": 24.02194620414054, "learning_rate": 1.2247446769937353e-05, "loss": 2.1376, "mean_token_accuracy": 0.43103448748588563, "step": 12160 }, { "epoch": 0.012252717202452357, "grad_norm": 32.9457221709957, "learning_rate": 1.2252482726668413e-05, "loss": 2.3615, "mean_token_accuracy": 0.39310344457626345, "step": 12165 }, { "epoch": 0.012257753255556529, "grad_norm": 33.00860703281463, "learning_rate": 1.2257518683399474e-05, "loss": 2.4587, "mean_token_accuracy": 0.4034482777118683, "step": 12170 }, { "epoch": 0.012262789308660703, "grad_norm": 44.89004291993204, "learning_rate": 1.2262554640130533e-05, "loss": 2.4888, "mean_token_accuracy": 0.35172412991523744, "step": 12175 }, { "epoch": 0.012267825361764875, "grad_norm": 44.11400169077773, "learning_rate": 1.2267590596861592e-05, "loss": 2.5631, "mean_token_accuracy": 0.41034482717514037, "step": 12180 }, { "epoch": 0.012272861414869048, "grad_norm": 30.01498010857881, "learning_rate": 1.2272626553592651e-05, "loss": 2.3986, "mean_token_accuracy": 0.4344827473163605, "step": 12185 }, { "epoch": 0.01227789746797322, "grad_norm": 21.309668670364317, "learning_rate": 1.2277662510323712e-05, "loss": 2.114, "mean_token_accuracy": 0.48965516686439514, "step": 12190 }, { "epoch": 0.012282933521077392, "grad_norm": 26.11629365341752, "learning_rate": 1.2282698467054771e-05, "loss": 2.3484, "mean_token_accuracy": 0.41034482717514037, "step": 12195 }, { "epoch": 0.012287969574181566, "grad_norm": 27.875106173708257, "learning_rate": 1.228773442378583e-05, "loss": 2.3855, "mean_token_accuracy": 0.44827585816383364, "step": 12200 }, { "epoch": 0.012293005627285738, "grad_norm": 30.164966670858732, "learning_rate": 1.2292770380516892e-05, "loss": 2.3645, "mean_token_accuracy": 0.4482758641242981, "step": 12205 }, { "epoch": 0.012298041680389912, "grad_norm": 26.723320574896206, "learning_rate": 1.2297806337247951e-05, "loss": 2.2234, "mean_token_accuracy": 0.458620685338974, "step": 12210 }, { "epoch": 0.012303077733494084, "grad_norm": 27.655417095411156, "learning_rate": 1.2302842293979012e-05, "loss": 2.1854, "mean_token_accuracy": 0.42413793206214906, "step": 12215 }, { "epoch": 0.012308113786598258, "grad_norm": 25.755773423332897, "learning_rate": 1.230787825071007e-05, "loss": 2.3357, "mean_token_accuracy": 0.4655172348022461, "step": 12220 }, { "epoch": 0.01231314983970243, "grad_norm": 31.448424780292072, "learning_rate": 1.231291420744113e-05, "loss": 2.2223, "mean_token_accuracy": 0.4620689630508423, "step": 12225 }, { "epoch": 0.012318185892806602, "grad_norm": 24.98279423637835, "learning_rate": 1.231795016417219e-05, "loss": 2.3607, "mean_token_accuracy": 0.4448275864124298, "step": 12230 }, { "epoch": 0.012323221945910776, "grad_norm": 35.59861885622897, "learning_rate": 1.232298612090325e-05, "loss": 2.7285, "mean_token_accuracy": 0.3896551728248596, "step": 12235 }, { "epoch": 0.012328257999014948, "grad_norm": 20.429017503250947, "learning_rate": 1.2328022077634308e-05, "loss": 1.92, "mean_token_accuracy": 0.5127041757106781, "step": 12240 }, { "epoch": 0.012333294052119121, "grad_norm": 30.131159760352496, "learning_rate": 1.2333058034365369e-05, "loss": 2.5884, "mean_token_accuracy": 0.39310343861579894, "step": 12245 }, { "epoch": 0.012338330105223293, "grad_norm": 48.5446176386253, "learning_rate": 1.2338093991096428e-05, "loss": 2.3249, "mean_token_accuracy": 0.42068964838981626, "step": 12250 }, { "epoch": 0.012343366158327467, "grad_norm": 27.146414488717674, "learning_rate": 1.234312994782749e-05, "loss": 2.2559, "mean_token_accuracy": 0.4620689570903778, "step": 12255 }, { "epoch": 0.01234840221143164, "grad_norm": 43.04661433142306, "learning_rate": 1.2348165904558549e-05, "loss": 2.6008, "mean_token_accuracy": 0.3793103456497192, "step": 12260 }, { "epoch": 0.012353438264535811, "grad_norm": 25.40896500313097, "learning_rate": 1.2353201861289608e-05, "loss": 2.2481, "mean_token_accuracy": 0.44137930274009707, "step": 12265 }, { "epoch": 0.012358474317639985, "grad_norm": 23.37790638152415, "learning_rate": 1.2358237818020669e-05, "loss": 2.5951, "mean_token_accuracy": 0.3655172407627106, "step": 12270 }, { "epoch": 0.012363510370744157, "grad_norm": 27.49061747667942, "learning_rate": 1.2363273774751728e-05, "loss": 2.3631, "mean_token_accuracy": 0.4433151841163635, "step": 12275 }, { "epoch": 0.01236854642384833, "grad_norm": 35.385313867573075, "learning_rate": 1.2368309731482789e-05, "loss": 2.3532, "mean_token_accuracy": 0.40689654350280763, "step": 12280 }, { "epoch": 0.012373582476952503, "grad_norm": 25.341615106386364, "learning_rate": 1.2373345688213846e-05, "loss": 2.204, "mean_token_accuracy": 0.5241379320621491, "step": 12285 }, { "epoch": 0.012378618530056677, "grad_norm": 29.179479229905585, "learning_rate": 1.2378381644944907e-05, "loss": 2.3344, "mean_token_accuracy": 0.4310344815254211, "step": 12290 }, { "epoch": 0.012383654583160849, "grad_norm": 25.541063248156956, "learning_rate": 1.2383417601675967e-05, "loss": 2.7222, "mean_token_accuracy": 0.36896551251411436, "step": 12295 }, { "epoch": 0.01238869063626502, "grad_norm": 35.224607397398145, "learning_rate": 1.2388453558407028e-05, "loss": 2.322, "mean_token_accuracy": 0.43793103098869324, "step": 12300 }, { "epoch": 0.012393726689369194, "grad_norm": 22.708548898606317, "learning_rate": 1.2393489515138087e-05, "loss": 2.0088, "mean_token_accuracy": 0.5137931108474731, "step": 12305 }, { "epoch": 0.012398762742473366, "grad_norm": 32.69161134529763, "learning_rate": 1.2398525471869146e-05, "loss": 2.4177, "mean_token_accuracy": 0.3862069010734558, "step": 12310 }, { "epoch": 0.01240379879557754, "grad_norm": 47.44484033035358, "learning_rate": 1.2403561428600207e-05, "loss": 2.5551, "mean_token_accuracy": 0.4137930989265442, "step": 12315 }, { "epoch": 0.012408834848681712, "grad_norm": 38.70807064210927, "learning_rate": 1.2408597385331266e-05, "loss": 2.3365, "mean_token_accuracy": 0.4482758641242981, "step": 12320 }, { "epoch": 0.012413870901785886, "grad_norm": 27.9617671840753, "learning_rate": 1.2413633342062326e-05, "loss": 2.0856, "mean_token_accuracy": 0.501875388622284, "step": 12325 }, { "epoch": 0.012418906954890058, "grad_norm": 24.76509584209334, "learning_rate": 1.2418669298793385e-05, "loss": 2.1031, "mean_token_accuracy": 0.4950393199920654, "step": 12330 }, { "epoch": 0.01242394300799423, "grad_norm": 27.286537860002337, "learning_rate": 1.2423705255524446e-05, "loss": 2.344, "mean_token_accuracy": 0.4344827592372894, "step": 12335 }, { "epoch": 0.012428979061098404, "grad_norm": 26.310202965921295, "learning_rate": 1.2428741212255505e-05, "loss": 2.6101, "mean_token_accuracy": 0.3655172407627106, "step": 12340 }, { "epoch": 0.012434015114202576, "grad_norm": 29.288826593649755, "learning_rate": 1.2433777168986564e-05, "loss": 2.4956, "mean_token_accuracy": 0.41034482717514037, "step": 12345 }, { "epoch": 0.01243905116730675, "grad_norm": 31.112832553224887, "learning_rate": 1.2438813125717624e-05, "loss": 2.4809, "mean_token_accuracy": 0.42068964838981626, "step": 12350 }, { "epoch": 0.012444087220410922, "grad_norm": 35.57880917039802, "learning_rate": 1.2443849082448685e-05, "loss": 2.421, "mean_token_accuracy": 0.45517241954803467, "step": 12355 }, { "epoch": 0.012449123273515095, "grad_norm": 32.95203755786022, "learning_rate": 1.2448885039179744e-05, "loss": 2.2853, "mean_token_accuracy": 0.4, "step": 12360 }, { "epoch": 0.012454159326619267, "grad_norm": 27.700703152268495, "learning_rate": 1.2453920995910803e-05, "loss": 2.7464, "mean_token_accuracy": 0.36896551251411436, "step": 12365 }, { "epoch": 0.01245919537972344, "grad_norm": 35.365804558101395, "learning_rate": 1.2458956952641864e-05, "loss": 2.6811, "mean_token_accuracy": 0.324137932062149, "step": 12370 }, { "epoch": 0.012464231432827613, "grad_norm": 32.97976107029191, "learning_rate": 1.2463992909372923e-05, "loss": 2.1364, "mean_token_accuracy": 0.47241379618644713, "step": 12375 }, { "epoch": 0.012469267485931785, "grad_norm": 32.98809461892356, "learning_rate": 1.2469028866103984e-05, "loss": 2.3336, "mean_token_accuracy": 0.4620689630508423, "step": 12380 }, { "epoch": 0.012474303539035959, "grad_norm": 27.020086462010717, "learning_rate": 1.2474064822835042e-05, "loss": 2.3314, "mean_token_accuracy": 0.44827585816383364, "step": 12385 }, { "epoch": 0.012479339592140131, "grad_norm": 41.278708666932026, "learning_rate": 1.2479100779566103e-05, "loss": 2.394, "mean_token_accuracy": 0.4172413766384125, "step": 12390 }, { "epoch": 0.012484375645244303, "grad_norm": 32.959432220913286, "learning_rate": 1.2484136736297162e-05, "loss": 2.0557, "mean_token_accuracy": 0.5, "step": 12395 }, { "epoch": 0.012489411698348477, "grad_norm": 25.54759803744777, "learning_rate": 1.2489172693028223e-05, "loss": 2.3786, "mean_token_accuracy": 0.4137930989265442, "step": 12400 }, { "epoch": 0.012494447751452649, "grad_norm": 36.15524711163905, "learning_rate": 1.2494208649759282e-05, "loss": 2.7923, "mean_token_accuracy": 0.37931033968925476, "step": 12405 }, { "epoch": 0.012499483804556823, "grad_norm": 24.931286486004183, "learning_rate": 1.2499244606490341e-05, "loss": 2.1781, "mean_token_accuracy": 0.4517241358757019, "step": 12410 }, { "epoch": 0.012504519857660995, "grad_norm": 24.71513888532816, "learning_rate": 1.2504280563221402e-05, "loss": 2.4558, "mean_token_accuracy": 0.4379310369491577, "step": 12415 }, { "epoch": 0.012509555910765168, "grad_norm": 24.781446474498864, "learning_rate": 1.250931651995246e-05, "loss": 2.1278, "mean_token_accuracy": 0.48620688915252686, "step": 12420 }, { "epoch": 0.01251459196386934, "grad_norm": 35.22602686370479, "learning_rate": 1.2514352476683523e-05, "loss": 2.5821, "mean_token_accuracy": 0.40689654350280763, "step": 12425 }, { "epoch": 0.012519628016973513, "grad_norm": 35.25326745978308, "learning_rate": 1.251938843341458e-05, "loss": 2.0507, "mean_token_accuracy": 0.4983061134815216, "step": 12430 }, { "epoch": 0.012524664070077686, "grad_norm": 41.8082689159271, "learning_rate": 1.252442439014564e-05, "loss": 2.8233, "mean_token_accuracy": 0.3793103456497192, "step": 12435 }, { "epoch": 0.012529700123181858, "grad_norm": 32.11253142547804, "learning_rate": 1.25294603468767e-05, "loss": 2.6194, "mean_token_accuracy": 0.42413793206214906, "step": 12440 }, { "epoch": 0.012534736176286032, "grad_norm": 29.054580901967682, "learning_rate": 1.253449630360776e-05, "loss": 2.2797, "mean_token_accuracy": 0.4379310250282288, "step": 12445 }, { "epoch": 0.012539772229390204, "grad_norm": 25.22875251004564, "learning_rate": 1.253953226033882e-05, "loss": 2.4613, "mean_token_accuracy": 0.4, "step": 12450 }, { "epoch": 0.012544808282494378, "grad_norm": 51.59575048924858, "learning_rate": 1.254456821706988e-05, "loss": 2.3611, "mean_token_accuracy": 0.4551724135875702, "step": 12455 }, { "epoch": 0.01254984433559855, "grad_norm": 27.2790310159277, "learning_rate": 1.2549604173800939e-05, "loss": 2.1536, "mean_token_accuracy": 0.43793103098869324, "step": 12460 }, { "epoch": 0.012554880388702722, "grad_norm": 31.30030855839559, "learning_rate": 1.2554640130532e-05, "loss": 2.4671, "mean_token_accuracy": 0.43103447556495667, "step": 12465 }, { "epoch": 0.012559916441806896, "grad_norm": 25.26799897715249, "learning_rate": 1.255967608726306e-05, "loss": 2.0066, "mean_token_accuracy": 0.4433151841163635, "step": 12470 }, { "epoch": 0.012564952494911068, "grad_norm": 24.987728592457096, "learning_rate": 1.2564712043994117e-05, "loss": 2.1716, "mean_token_accuracy": 0.4724137902259827, "step": 12475 }, { "epoch": 0.012569988548015242, "grad_norm": 35.16232997894578, "learning_rate": 1.256974800072518e-05, "loss": 1.9862, "mean_token_accuracy": 0.5120992124080658, "step": 12480 }, { "epoch": 0.012575024601119414, "grad_norm": 24.259143302035962, "learning_rate": 1.2574783957456237e-05, "loss": 1.9787, "mean_token_accuracy": 0.458620685338974, "step": 12485 }, { "epoch": 0.012580060654223587, "grad_norm": 31.140449407356805, "learning_rate": 1.25798199141873e-05, "loss": 2.7262, "mean_token_accuracy": 0.35517241060733795, "step": 12490 }, { "epoch": 0.01258509670732776, "grad_norm": 29.54485719851589, "learning_rate": 1.2584855870918357e-05, "loss": 2.3806, "mean_token_accuracy": 0.4482758641242981, "step": 12495 }, { "epoch": 0.012590132760431931, "grad_norm": 28.347802255076466, "learning_rate": 1.2589891827649416e-05, "loss": 2.4771, "mean_token_accuracy": 0.44827585816383364, "step": 12500 }, { "epoch": 0.012595168813536105, "grad_norm": 27.13069768874934, "learning_rate": 1.2594927784380477e-05, "loss": 2.2474, "mean_token_accuracy": 0.42413793206214906, "step": 12505 }, { "epoch": 0.012600204866640277, "grad_norm": 26.013557359862748, "learning_rate": 1.2599963741111537e-05, "loss": 2.0707, "mean_token_accuracy": 0.4931034445762634, "step": 12510 }, { "epoch": 0.012605240919744451, "grad_norm": 23.211625934928932, "learning_rate": 1.2604999697842596e-05, "loss": 2.3397, "mean_token_accuracy": 0.42413792610168455, "step": 12515 }, { "epoch": 0.012610276972848623, "grad_norm": 31.799509596239663, "learning_rate": 1.2610035654573657e-05, "loss": 2.3452, "mean_token_accuracy": 0.45656382441520693, "step": 12520 }, { "epoch": 0.012615313025952797, "grad_norm": 28.174141859846408, "learning_rate": 1.2615071611304716e-05, "loss": 2.5538, "mean_token_accuracy": 0.38620689511299133, "step": 12525 }, { "epoch": 0.012620349079056969, "grad_norm": 31.155542413297635, "learning_rate": 1.2620107568035777e-05, "loss": 2.5918, "mean_token_accuracy": 0.412099215388298, "step": 12530 }, { "epoch": 0.01262538513216114, "grad_norm": 36.72536837237263, "learning_rate": 1.2625143524766836e-05, "loss": 2.6464, "mean_token_accuracy": 0.3689655065536499, "step": 12535 }, { "epoch": 0.012630421185265315, "grad_norm": 32.56709727600641, "learning_rate": 1.2630179481497894e-05, "loss": 2.3201, "mean_token_accuracy": 0.43448275327682495, "step": 12540 }, { "epoch": 0.012635457238369487, "grad_norm": 32.31279525695266, "learning_rate": 1.2635215438228956e-05, "loss": 2.2739, "mean_token_accuracy": 0.4034482777118683, "step": 12545 }, { "epoch": 0.01264049329147366, "grad_norm": 47.25737661831153, "learning_rate": 1.2640251394960016e-05, "loss": 2.5598, "mean_token_accuracy": 0.37241379022598264, "step": 12550 }, { "epoch": 0.012645529344577832, "grad_norm": 31.864885779048016, "learning_rate": 1.2645287351691073e-05, "loss": 2.2602, "mean_token_accuracy": 0.4655172288417816, "step": 12555 }, { "epoch": 0.012650565397682006, "grad_norm": 28.524852632538792, "learning_rate": 1.2650323308422136e-05, "loss": 2.3715, "mean_token_accuracy": 0.41724138259887694, "step": 12560 }, { "epoch": 0.012655601450786178, "grad_norm": 31.215855647245988, "learning_rate": 1.2655359265153193e-05, "loss": 2.6743, "mean_token_accuracy": 0.3517241358757019, "step": 12565 }, { "epoch": 0.01266063750389035, "grad_norm": 49.518531177360714, "learning_rate": 1.2660395221884256e-05, "loss": 2.5935, "mean_token_accuracy": 0.39999999701976774, "step": 12570 }, { "epoch": 0.012665673556994524, "grad_norm": 39.34856259495588, "learning_rate": 1.2665431178615314e-05, "loss": 2.4846, "mean_token_accuracy": 0.441379314661026, "step": 12575 }, { "epoch": 0.012670709610098696, "grad_norm": 32.406584460502856, "learning_rate": 1.2670467135346373e-05, "loss": 2.4491, "mean_token_accuracy": 0.37931033968925476, "step": 12580 }, { "epoch": 0.01267574566320287, "grad_norm": 24.96458692458947, "learning_rate": 1.2675503092077434e-05, "loss": 2.6967, "mean_token_accuracy": 0.3620689630508423, "step": 12585 }, { "epoch": 0.012680781716307042, "grad_norm": 28.173207219771577, "learning_rate": 1.2680539048808493e-05, "loss": 2.0474, "mean_token_accuracy": 0.4896551787853241, "step": 12590 }, { "epoch": 0.012685817769411216, "grad_norm": 37.32953389360052, "learning_rate": 1.2685575005539552e-05, "loss": 2.3558, "mean_token_accuracy": 0.38620689511299133, "step": 12595 }, { "epoch": 0.012690853822515388, "grad_norm": 30.089706154473134, "learning_rate": 1.2690610962270613e-05, "loss": 2.2442, "mean_token_accuracy": 0.4862068951129913, "step": 12600 }, { "epoch": 0.01269588987561956, "grad_norm": 27.694821905567846, "learning_rate": 1.2695646919001673e-05, "loss": 2.2743, "mean_token_accuracy": 0.46551724076271056, "step": 12605 }, { "epoch": 0.012700925928723733, "grad_norm": 31.646543090492017, "learning_rate": 1.2700682875732734e-05, "loss": 2.1807, "mean_token_accuracy": 0.42413793206214906, "step": 12610 }, { "epoch": 0.012705961981827905, "grad_norm": 36.95474372000229, "learning_rate": 1.2705718832463793e-05, "loss": 2.6449, "mean_token_accuracy": 0.41034482717514037, "step": 12615 }, { "epoch": 0.01271099803493208, "grad_norm": 29.810877420075435, "learning_rate": 1.271075478919485e-05, "loss": 2.363, "mean_token_accuracy": 0.4068965494632721, "step": 12620 }, { "epoch": 0.012716034088036251, "grad_norm": 31.577223317974738, "learning_rate": 1.2715790745925913e-05, "loss": 2.3964, "mean_token_accuracy": 0.40798547863960266, "step": 12625 }, { "epoch": 0.012721070141140425, "grad_norm": 30.18369857092084, "learning_rate": 1.272082670265697e-05, "loss": 2.4358, "mean_token_accuracy": 0.3965517163276672, "step": 12630 }, { "epoch": 0.012726106194244597, "grad_norm": 31.931394544713104, "learning_rate": 1.272586265938803e-05, "loss": 2.3843, "mean_token_accuracy": 0.417241370677948, "step": 12635 }, { "epoch": 0.012731142247348769, "grad_norm": 31.667023714276244, "learning_rate": 1.273089861611909e-05, "loss": 2.3703, "mean_token_accuracy": 0.3827586233615875, "step": 12640 }, { "epoch": 0.012736178300452943, "grad_norm": 33.01904718223163, "learning_rate": 1.273593457285015e-05, "loss": 2.5054, "mean_token_accuracy": 0.43623714447021483, "step": 12645 }, { "epoch": 0.012741214353557115, "grad_norm": 28.75349632298836, "learning_rate": 1.2740970529581211e-05, "loss": 2.2973, "mean_token_accuracy": 0.4206896543502808, "step": 12650 }, { "epoch": 0.012746250406661289, "grad_norm": 43.89957086085222, "learning_rate": 1.274600648631227e-05, "loss": 2.5134, "mean_token_accuracy": 0.38620689511299133, "step": 12655 }, { "epoch": 0.01275128645976546, "grad_norm": 35.192670025155444, "learning_rate": 1.275104244304333e-05, "loss": 2.4297, "mean_token_accuracy": 0.43103447556495667, "step": 12660 }, { "epoch": 0.012756322512869634, "grad_norm": 30.650579666006582, "learning_rate": 1.275607839977439e-05, "loss": 2.3771, "mean_token_accuracy": 0.42068966031074523, "step": 12665 }, { "epoch": 0.012761358565973806, "grad_norm": 31.011786103933645, "learning_rate": 1.276111435650545e-05, "loss": 2.4126, "mean_token_accuracy": 0.4086509466171265, "step": 12670 }, { "epoch": 0.012766394619077978, "grad_norm": 33.38479119722718, "learning_rate": 1.2766150313236507e-05, "loss": 2.3375, "mean_token_accuracy": 0.4448275864124298, "step": 12675 }, { "epoch": 0.012771430672182152, "grad_norm": 30.475820515807357, "learning_rate": 1.277118626996757e-05, "loss": 2.5579, "mean_token_accuracy": 0.3793103456497192, "step": 12680 }, { "epoch": 0.012776466725286324, "grad_norm": 25.73170396499636, "learning_rate": 1.2776222226698627e-05, "loss": 2.232, "mean_token_accuracy": 0.42068966031074523, "step": 12685 }, { "epoch": 0.012781502778390498, "grad_norm": 25.757602331231993, "learning_rate": 1.278125818342969e-05, "loss": 2.4408, "mean_token_accuracy": 0.4396249294281006, "step": 12690 }, { "epoch": 0.01278653883149467, "grad_norm": 32.45572523514291, "learning_rate": 1.2786294140160748e-05, "loss": 2.5287, "mean_token_accuracy": 0.44827585816383364, "step": 12695 }, { "epoch": 0.012791574884598844, "grad_norm": 28.769040816368303, "learning_rate": 1.2791330096891807e-05, "loss": 2.4698, "mean_token_accuracy": 0.42413793206214906, "step": 12700 }, { "epoch": 0.012796610937703016, "grad_norm": 37.55962852563274, "learning_rate": 1.2796366053622868e-05, "loss": 2.0511, "mean_token_accuracy": 0.4870689630508423, "step": 12705 }, { "epoch": 0.012801646990807188, "grad_norm": 27.589639816641036, "learning_rate": 1.2801402010353927e-05, "loss": 2.2718, "mean_token_accuracy": 0.4551724135875702, "step": 12710 }, { "epoch": 0.012806683043911362, "grad_norm": 39.01467674178639, "learning_rate": 1.2806437967084988e-05, "loss": 2.6747, "mean_token_accuracy": 0.38275861740112305, "step": 12715 }, { "epoch": 0.012811719097015534, "grad_norm": 31.54599311241794, "learning_rate": 1.2811473923816047e-05, "loss": 2.1155, "mean_token_accuracy": 0.4881427764892578, "step": 12720 }, { "epoch": 0.012816755150119707, "grad_norm": 38.41931270092057, "learning_rate": 1.2816509880547106e-05, "loss": 2.1786, "mean_token_accuracy": 0.42758620381355283, "step": 12725 }, { "epoch": 0.01282179120322388, "grad_norm": 32.52700224985398, "learning_rate": 1.2821545837278167e-05, "loss": 1.7954, "mean_token_accuracy": 0.5517241477966308, "step": 12730 }, { "epoch": 0.012826827256328053, "grad_norm": 30.782415251204366, "learning_rate": 1.2826581794009227e-05, "loss": 2.2334, "mean_token_accuracy": 0.42758620381355283, "step": 12735 }, { "epoch": 0.012831863309432225, "grad_norm": 44.07255498344601, "learning_rate": 1.2831617750740286e-05, "loss": 2.7697, "mean_token_accuracy": 0.4344827592372894, "step": 12740 }, { "epoch": 0.012836899362536397, "grad_norm": 30.566807063967055, "learning_rate": 1.2836653707471347e-05, "loss": 2.0987, "mean_token_accuracy": 0.458620685338974, "step": 12745 }, { "epoch": 0.012841935415640571, "grad_norm": 32.43777592863901, "learning_rate": 1.2841689664202406e-05, "loss": 2.3014, "mean_token_accuracy": 0.40689654350280763, "step": 12750 }, { "epoch": 0.012846971468744743, "grad_norm": 29.424119561405323, "learning_rate": 1.2846725620933467e-05, "loss": 2.5864, "mean_token_accuracy": 0.42068964838981626, "step": 12755 }, { "epoch": 0.012852007521848917, "grad_norm": 31.273473895404905, "learning_rate": 1.2851761577664526e-05, "loss": 2.319, "mean_token_accuracy": 0.4517241418361664, "step": 12760 }, { "epoch": 0.012857043574953089, "grad_norm": 37.668560355649866, "learning_rate": 1.2856797534395584e-05, "loss": 2.8225, "mean_token_accuracy": 0.417241370677948, "step": 12765 }, { "epoch": 0.012862079628057263, "grad_norm": 32.0196168050631, "learning_rate": 1.2861833491126647e-05, "loss": 2.7624, "mean_token_accuracy": 0.35862069129943847, "step": 12770 }, { "epoch": 0.012867115681161435, "grad_norm": 28.41207023695925, "learning_rate": 1.2866869447857704e-05, "loss": 2.4554, "mean_token_accuracy": 0.4379310369491577, "step": 12775 }, { "epoch": 0.012872151734265607, "grad_norm": 36.93349054722976, "learning_rate": 1.2871905404588763e-05, "loss": 2.4285, "mean_token_accuracy": 0.43793103098869324, "step": 12780 }, { "epoch": 0.01287718778736978, "grad_norm": 28.567774362389525, "learning_rate": 1.2876941361319824e-05, "loss": 2.3522, "mean_token_accuracy": 0.4448275864124298, "step": 12785 }, { "epoch": 0.012882223840473953, "grad_norm": 34.28002705071036, "learning_rate": 1.2881977318050884e-05, "loss": 2.6179, "mean_token_accuracy": 0.36896551549434664, "step": 12790 }, { "epoch": 0.012887259893578126, "grad_norm": 25.95669245715233, "learning_rate": 1.2887013274781945e-05, "loss": 2.2537, "mean_token_accuracy": 0.4172413766384125, "step": 12795 }, { "epoch": 0.012892295946682298, "grad_norm": 32.76418069538328, "learning_rate": 1.2892049231513004e-05, "loss": 2.439, "mean_token_accuracy": 0.4068965494632721, "step": 12800 }, { "epoch": 0.012897331999786472, "grad_norm": 30.483698834661528, "learning_rate": 1.2897085188244063e-05, "loss": 2.3958, "mean_token_accuracy": 0.4068965524435043, "step": 12805 }, { "epoch": 0.012902368052890644, "grad_norm": 23.111492257569815, "learning_rate": 1.2902121144975124e-05, "loss": 2.2623, "mean_token_accuracy": 0.42413792610168455, "step": 12810 }, { "epoch": 0.012907404105994816, "grad_norm": 28.560933429334074, "learning_rate": 1.2907157101706183e-05, "loss": 2.5017, "mean_token_accuracy": 0.4137930989265442, "step": 12815 }, { "epoch": 0.01291244015909899, "grad_norm": 25.093406735469227, "learning_rate": 1.291219305843724e-05, "loss": 2.2457, "mean_token_accuracy": 0.47586206197738645, "step": 12820 }, { "epoch": 0.012917476212203162, "grad_norm": 25.620166034413394, "learning_rate": 1.2917229015168303e-05, "loss": 2.0981, "mean_token_accuracy": 0.4896551609039307, "step": 12825 }, { "epoch": 0.012922512265307336, "grad_norm": 23.74110744119613, "learning_rate": 1.2922264971899361e-05, "loss": 2.2391, "mean_token_accuracy": 0.49522080421447756, "step": 12830 }, { "epoch": 0.012927548318411508, "grad_norm": 30.55861636771355, "learning_rate": 1.2927300928630424e-05, "loss": 2.2856, "mean_token_accuracy": 0.441379314661026, "step": 12835 }, { "epoch": 0.012932584371515682, "grad_norm": 26.0317251444116, "learning_rate": 1.2932336885361481e-05, "loss": 2.3061, "mean_token_accuracy": 0.4551724076271057, "step": 12840 }, { "epoch": 0.012937620424619854, "grad_norm": 26.985724445535347, "learning_rate": 1.293737284209254e-05, "loss": 2.5272, "mean_token_accuracy": 0.4344827592372894, "step": 12845 }, { "epoch": 0.012942656477724026, "grad_norm": 27.969385054594625, "learning_rate": 1.2942408798823601e-05, "loss": 2.5063, "mean_token_accuracy": 0.40344826579093934, "step": 12850 }, { "epoch": 0.0129476925308282, "grad_norm": 30.513456889761024, "learning_rate": 1.294744475555466e-05, "loss": 1.9497, "mean_token_accuracy": 0.4770935893058777, "step": 12855 }, { "epoch": 0.012952728583932371, "grad_norm": 46.75875363559398, "learning_rate": 1.295248071228572e-05, "loss": 2.7059, "mean_token_accuracy": 0.42758620381355283, "step": 12860 }, { "epoch": 0.012957764637036545, "grad_norm": 26.230855533020616, "learning_rate": 1.295751666901678e-05, "loss": 2.2372, "mean_token_accuracy": 0.47241378426551817, "step": 12865 }, { "epoch": 0.012962800690140717, "grad_norm": 28.802748603821968, "learning_rate": 1.296255262574784e-05, "loss": 2.1341, "mean_token_accuracy": 0.4798029541969299, "step": 12870 }, { "epoch": 0.012967836743244891, "grad_norm": 33.73548848255155, "learning_rate": 1.2967588582478901e-05, "loss": 2.4402, "mean_token_accuracy": 0.4068965554237366, "step": 12875 }, { "epoch": 0.012972872796349063, "grad_norm": 42.73450696683177, "learning_rate": 1.297262453920996e-05, "loss": 2.2558, "mean_token_accuracy": 0.43793103992939, "step": 12880 }, { "epoch": 0.012977908849453235, "grad_norm": 35.47754526219574, "learning_rate": 1.2977660495941018e-05, "loss": 2.426, "mean_token_accuracy": 0.42413792610168455, "step": 12885 }, { "epoch": 0.012982944902557409, "grad_norm": 38.609434301944674, "learning_rate": 1.298269645267208e-05, "loss": 2.3408, "mean_token_accuracy": 0.43854679465293883, "step": 12890 }, { "epoch": 0.01298798095566158, "grad_norm": 35.45261316088407, "learning_rate": 1.2987732409403138e-05, "loss": 2.402, "mean_token_accuracy": 0.4534180223941803, "step": 12895 }, { "epoch": 0.012993017008765755, "grad_norm": 33.4224987380504, "learning_rate": 1.2992768366134197e-05, "loss": 2.4698, "mean_token_accuracy": 0.4034482717514038, "step": 12900 }, { "epoch": 0.012998053061869927, "grad_norm": 35.609743099720625, "learning_rate": 1.2997804322865258e-05, "loss": 2.4653, "mean_token_accuracy": 0.41379310488700866, "step": 12905 }, { "epoch": 0.0130030891149741, "grad_norm": 24.919853466004504, "learning_rate": 1.3002840279596317e-05, "loss": 2.3445, "mean_token_accuracy": 0.43793103098869324, "step": 12910 }, { "epoch": 0.013008125168078272, "grad_norm": 25.4773996910296, "learning_rate": 1.300787623632738e-05, "loss": 2.3354, "mean_token_accuracy": 0.46551724672317507, "step": 12915 }, { "epoch": 0.013013161221182444, "grad_norm": 29.623283373632454, "learning_rate": 1.3012912193058438e-05, "loss": 2.3086, "mean_token_accuracy": 0.44137930274009707, "step": 12920 }, { "epoch": 0.013018197274286618, "grad_norm": 32.02303539201582, "learning_rate": 1.3017948149789497e-05, "loss": 2.1336, "mean_token_accuracy": 0.45517241954803467, "step": 12925 }, { "epoch": 0.01302323332739079, "grad_norm": 26.590419690457505, "learning_rate": 1.3022984106520558e-05, "loss": 2.286, "mean_token_accuracy": 0.43103447556495667, "step": 12930 }, { "epoch": 0.013028269380494964, "grad_norm": 24.97942720598132, "learning_rate": 1.3028020063251617e-05, "loss": 2.3042, "mean_token_accuracy": 0.44827585220336913, "step": 12935 }, { "epoch": 0.013033305433599136, "grad_norm": 30.217694894270398, "learning_rate": 1.3033056019982676e-05, "loss": 2.323, "mean_token_accuracy": 0.43793103098869324, "step": 12940 }, { "epoch": 0.01303834148670331, "grad_norm": 38.34318197705085, "learning_rate": 1.3038091976713737e-05, "loss": 2.3795, "mean_token_accuracy": 0.42631577849388125, "step": 12945 }, { "epoch": 0.013043377539807482, "grad_norm": 28.4063648755663, "learning_rate": 1.3043127933444797e-05, "loss": 2.1181, "mean_token_accuracy": 0.482758629322052, "step": 12950 }, { "epoch": 0.013048413592911654, "grad_norm": 30.205296249243396, "learning_rate": 1.3048163890175858e-05, "loss": 2.6092, "mean_token_accuracy": 0.4206896543502808, "step": 12955 }, { "epoch": 0.013053449646015828, "grad_norm": 28.583186441150584, "learning_rate": 1.3053199846906917e-05, "loss": 2.4494, "mean_token_accuracy": 0.44482758045196535, "step": 12960 }, { "epoch": 0.01305848569912, "grad_norm": 30.346269622031645, "learning_rate": 1.3058235803637974e-05, "loss": 2.3381, "mean_token_accuracy": 0.42758620381355283, "step": 12965 }, { "epoch": 0.013063521752224173, "grad_norm": 28.22640394302225, "learning_rate": 1.3063271760369037e-05, "loss": 2.3978, "mean_token_accuracy": 0.41379310190677643, "step": 12970 }, { "epoch": 0.013068557805328345, "grad_norm": 26.82382405332868, "learning_rate": 1.3068307717100095e-05, "loss": 2.5297, "mean_token_accuracy": 0.4517241418361664, "step": 12975 }, { "epoch": 0.01307359385843252, "grad_norm": 40.97766546169816, "learning_rate": 1.3073343673831154e-05, "loss": 2.4479, "mean_token_accuracy": 0.36896551847457887, "step": 12980 }, { "epoch": 0.013078629911536691, "grad_norm": 28.42873371062202, "learning_rate": 1.3078379630562215e-05, "loss": 2.2434, "mean_token_accuracy": 0.4758620738983154, "step": 12985 }, { "epoch": 0.013083665964640863, "grad_norm": 23.98547544977723, "learning_rate": 1.3083415587293274e-05, "loss": 2.1944, "mean_token_accuracy": 0.45517241954803467, "step": 12990 }, { "epoch": 0.013088702017745037, "grad_norm": 29.32057895859403, "learning_rate": 1.3088451544024335e-05, "loss": 2.4815, "mean_token_accuracy": 0.3793103456497192, "step": 12995 }, { "epoch": 0.013093738070849209, "grad_norm": 35.90334327283973, "learning_rate": 1.3093487500755394e-05, "loss": 2.4015, "mean_token_accuracy": 0.47931033968925474, "step": 13000 }, { "epoch": 0.013098774123953383, "grad_norm": 28.007809165784924, "learning_rate": 1.3098523457486453e-05, "loss": 2.8044, "mean_token_accuracy": 0.3758620619773865, "step": 13005 }, { "epoch": 0.013103810177057555, "grad_norm": 28.882632142862747, "learning_rate": 1.3103559414217514e-05, "loss": 2.2319, "mean_token_accuracy": 0.47586206197738645, "step": 13010 }, { "epoch": 0.013108846230161729, "grad_norm": 39.222945278278296, "learning_rate": 1.3108595370948574e-05, "loss": 2.6117, "mean_token_accuracy": 0.4159104645252228, "step": 13015 }, { "epoch": 0.0131138822832659, "grad_norm": 28.066340136477823, "learning_rate": 1.3113631327679631e-05, "loss": 2.5679, "mean_token_accuracy": 0.4344827651977539, "step": 13020 }, { "epoch": 0.013118918336370073, "grad_norm": 31.665754869067026, "learning_rate": 1.3118667284410694e-05, "loss": 2.0537, "mean_token_accuracy": 0.4620689690113068, "step": 13025 }, { "epoch": 0.013123954389474246, "grad_norm": 37.78623901562369, "learning_rate": 1.3123703241141751e-05, "loss": 2.4536, "mean_token_accuracy": 0.38620689511299133, "step": 13030 }, { "epoch": 0.013128990442578418, "grad_norm": 30.237792331384547, "learning_rate": 1.3128739197872814e-05, "loss": 2.6079, "mean_token_accuracy": 0.403448274731636, "step": 13035 }, { "epoch": 0.013134026495682592, "grad_norm": 27.212776761823513, "learning_rate": 1.3133775154603872e-05, "loss": 2.4359, "mean_token_accuracy": 0.41034482717514037, "step": 13040 }, { "epoch": 0.013139062548786764, "grad_norm": 28.091725473534964, "learning_rate": 1.3138811111334931e-05, "loss": 2.4651, "mean_token_accuracy": 0.40344826579093934, "step": 13045 }, { "epoch": 0.013144098601890938, "grad_norm": 25.78388333567489, "learning_rate": 1.3143847068065992e-05, "loss": 2.622, "mean_token_accuracy": 0.38965516686439516, "step": 13050 }, { "epoch": 0.01314913465499511, "grad_norm": 27.013001814895404, "learning_rate": 1.3148883024797051e-05, "loss": 2.5008, "mean_token_accuracy": 0.417241370677948, "step": 13055 }, { "epoch": 0.013154170708099282, "grad_norm": 24.611167133186306, "learning_rate": 1.315391898152811e-05, "loss": 2.2566, "mean_token_accuracy": 0.458620685338974, "step": 13060 }, { "epoch": 0.013159206761203456, "grad_norm": 21.589827906902283, "learning_rate": 1.3158954938259171e-05, "loss": 2.2282, "mean_token_accuracy": 0.4206896543502808, "step": 13065 }, { "epoch": 0.013164242814307628, "grad_norm": 27.028339947340495, "learning_rate": 1.316399089499023e-05, "loss": 2.491, "mean_token_accuracy": 0.4482758641242981, "step": 13070 }, { "epoch": 0.013169278867411802, "grad_norm": 30.11790167101707, "learning_rate": 1.3169026851721291e-05, "loss": 2.3151, "mean_token_accuracy": 0.4517241418361664, "step": 13075 }, { "epoch": 0.013174314920515974, "grad_norm": 36.839886245442564, "learning_rate": 1.317406280845235e-05, "loss": 2.5183, "mean_token_accuracy": 0.46551724076271056, "step": 13080 }, { "epoch": 0.013179350973620147, "grad_norm": 27.402049215157966, "learning_rate": 1.317909876518341e-05, "loss": 2.0824, "mean_token_accuracy": 0.4588626742362976, "step": 13085 }, { "epoch": 0.01318438702672432, "grad_norm": 32.58936476564454, "learning_rate": 1.3184134721914471e-05, "loss": 2.3953, "mean_token_accuracy": 0.4068965554237366, "step": 13090 }, { "epoch": 0.013189423079828492, "grad_norm": 37.75702389372153, "learning_rate": 1.318917067864553e-05, "loss": 2.3736, "mean_token_accuracy": 0.42413793206214906, "step": 13095 }, { "epoch": 0.013194459132932665, "grad_norm": 29.830997361558296, "learning_rate": 1.3194206635376591e-05, "loss": 2.0218, "mean_token_accuracy": 0.5251231491565704, "step": 13100 }, { "epoch": 0.013199495186036837, "grad_norm": 26.926482307655053, "learning_rate": 1.319924259210765e-05, "loss": 2.1943, "mean_token_accuracy": 0.42758620977401735, "step": 13105 }, { "epoch": 0.013204531239141011, "grad_norm": 24.51714622167604, "learning_rate": 1.3204278548838708e-05, "loss": 2.5788, "mean_token_accuracy": 0.3793103456497192, "step": 13110 }, { "epoch": 0.013209567292245183, "grad_norm": 24.711564779492285, "learning_rate": 1.320931450556977e-05, "loss": 2.3359, "mean_token_accuracy": 0.4448275864124298, "step": 13115 }, { "epoch": 0.013214603345349357, "grad_norm": 31.842543773740047, "learning_rate": 1.3214350462300828e-05, "loss": 2.4287, "mean_token_accuracy": 0.3493647873401642, "step": 13120 }, { "epoch": 0.013219639398453529, "grad_norm": 29.806671252443607, "learning_rate": 1.3219386419031887e-05, "loss": 2.2382, "mean_token_accuracy": 0.4620689690113068, "step": 13125 }, { "epoch": 0.013224675451557701, "grad_norm": 26.74280108147092, "learning_rate": 1.3224422375762948e-05, "loss": 2.2388, "mean_token_accuracy": 0.4517241358757019, "step": 13130 }, { "epoch": 0.013229711504661875, "grad_norm": 26.70603868985356, "learning_rate": 1.3229458332494008e-05, "loss": 2.4793, "mean_token_accuracy": 0.3931034505367279, "step": 13135 }, { "epoch": 0.013234747557766047, "grad_norm": 28.695697798680055, "learning_rate": 1.3234494289225069e-05, "loss": 2.0935, "mean_token_accuracy": 0.4551724076271057, "step": 13140 }, { "epoch": 0.01323978361087022, "grad_norm": 29.635497332223242, "learning_rate": 1.3239530245956128e-05, "loss": 2.7615, "mean_token_accuracy": 0.3983061194419861, "step": 13145 }, { "epoch": 0.013244819663974393, "grad_norm": 28.279503536811465, "learning_rate": 1.3244566202687187e-05, "loss": 2.3152, "mean_token_accuracy": 0.4640653431415558, "step": 13150 }, { "epoch": 0.013249855717078566, "grad_norm": 30.117343825351398, "learning_rate": 1.3249602159418248e-05, "loss": 2.4695, "mean_token_accuracy": 0.4068965554237366, "step": 13155 }, { "epoch": 0.013254891770182738, "grad_norm": 25.44118815330071, "learning_rate": 1.3254638116149307e-05, "loss": 2.3541, "mean_token_accuracy": 0.4172413766384125, "step": 13160 }, { "epoch": 0.01325992782328691, "grad_norm": 26.712037637891942, "learning_rate": 1.3259674072880365e-05, "loss": 2.3713, "mean_token_accuracy": 0.4275861978530884, "step": 13165 }, { "epoch": 0.013264963876391084, "grad_norm": 37.18679979972518, "learning_rate": 1.3264710029611427e-05, "loss": 2.3854, "mean_token_accuracy": 0.4517241418361664, "step": 13170 }, { "epoch": 0.013269999929495256, "grad_norm": 36.61800566398733, "learning_rate": 1.3269745986342485e-05, "loss": 2.5815, "mean_token_accuracy": 0.42413792610168455, "step": 13175 }, { "epoch": 0.01327503598259943, "grad_norm": 30.344365867524466, "learning_rate": 1.3274781943073548e-05, "loss": 2.2539, "mean_token_accuracy": 0.42413792610168455, "step": 13180 }, { "epoch": 0.013280072035703602, "grad_norm": 35.59478771723536, "learning_rate": 1.3279817899804605e-05, "loss": 2.9194, "mean_token_accuracy": 0.3586206823587418, "step": 13185 }, { "epoch": 0.013285108088807776, "grad_norm": 26.7564241636928, "learning_rate": 1.3284853856535664e-05, "loss": 2.6201, "mean_token_accuracy": 0.3931034505367279, "step": 13190 }, { "epoch": 0.013290144141911948, "grad_norm": 27.467643933222462, "learning_rate": 1.3289889813266725e-05, "loss": 2.5296, "mean_token_accuracy": 0.3999999940395355, "step": 13195 }, { "epoch": 0.01329518019501612, "grad_norm": 29.147119182875322, "learning_rate": 1.3294925769997785e-05, "loss": 2.0661, "mean_token_accuracy": 0.5000000059604645, "step": 13200 }, { "epoch": 0.013300216248120294, "grad_norm": 25.532278933057587, "learning_rate": 1.3299961726728844e-05, "loss": 2.3221, "mean_token_accuracy": 0.42758620381355283, "step": 13205 }, { "epoch": 0.013305252301224466, "grad_norm": 32.99655712921853, "learning_rate": 1.3304997683459905e-05, "loss": 2.3824, "mean_token_accuracy": 0.41548699140548706, "step": 13210 }, { "epoch": 0.01331028835432864, "grad_norm": 25.898519198369804, "learning_rate": 1.3310033640190964e-05, "loss": 2.491, "mean_token_accuracy": 0.4103448212146759, "step": 13215 }, { "epoch": 0.013315324407432811, "grad_norm": 29.607799723185142, "learning_rate": 1.3315069596922025e-05, "loss": 2.5533, "mean_token_accuracy": 0.42068966031074523, "step": 13220 }, { "epoch": 0.013320360460536985, "grad_norm": 26.91335893225431, "learning_rate": 1.3320105553653084e-05, "loss": 2.4274, "mean_token_accuracy": 0.42758620381355283, "step": 13225 }, { "epoch": 0.013325396513641157, "grad_norm": 37.029474253182556, "learning_rate": 1.3325141510384142e-05, "loss": 2.4071, "mean_token_accuracy": 0.43103448748588563, "step": 13230 }, { "epoch": 0.01333043256674533, "grad_norm": 38.49354046059756, "learning_rate": 1.3330177467115205e-05, "loss": 2.1807, "mean_token_accuracy": 0.4517241299152374, "step": 13235 }, { "epoch": 0.013335468619849503, "grad_norm": 21.079691866672174, "learning_rate": 1.3335213423846262e-05, "loss": 2.0981, "mean_token_accuracy": 0.5052026569843292, "step": 13240 }, { "epoch": 0.013340504672953675, "grad_norm": 31.24127807906519, "learning_rate": 1.3340249380577321e-05, "loss": 2.3168, "mean_token_accuracy": 0.42413792610168455, "step": 13245 }, { "epoch": 0.013345540726057849, "grad_norm": 31.35469483163652, "learning_rate": 1.3345285337308382e-05, "loss": 2.4527, "mean_token_accuracy": 0.4206896543502808, "step": 13250 }, { "epoch": 0.01335057677916202, "grad_norm": 29.570816697010443, "learning_rate": 1.3350321294039442e-05, "loss": 2.445, "mean_token_accuracy": 0.42068964838981626, "step": 13255 }, { "epoch": 0.013355612832266195, "grad_norm": 23.95500614024331, "learning_rate": 1.3355357250770502e-05, "loss": 2.3022, "mean_token_accuracy": 0.458620685338974, "step": 13260 }, { "epoch": 0.013360648885370367, "grad_norm": 21.812508768874007, "learning_rate": 1.3360393207501562e-05, "loss": 2.1632, "mean_token_accuracy": 0.4620689690113068, "step": 13265 }, { "epoch": 0.013365684938474539, "grad_norm": 31.20300093111897, "learning_rate": 1.3365429164232621e-05, "loss": 2.4955, "mean_token_accuracy": 0.42758620381355283, "step": 13270 }, { "epoch": 0.013370720991578712, "grad_norm": 33.92234758081913, "learning_rate": 1.3370465120963682e-05, "loss": 2.5696, "mean_token_accuracy": 0.3862069010734558, "step": 13275 }, { "epoch": 0.013375757044682884, "grad_norm": 37.19318737941734, "learning_rate": 1.3375501077694741e-05, "loss": 2.6097, "mean_token_accuracy": 0.3931034475564957, "step": 13280 }, { "epoch": 0.013380793097787058, "grad_norm": 26.389424915461618, "learning_rate": 1.33805370344258e-05, "loss": 2.0169, "mean_token_accuracy": 0.49100985527038576, "step": 13285 }, { "epoch": 0.01338582915089123, "grad_norm": 25.892190160599146, "learning_rate": 1.3385572991156861e-05, "loss": 2.1506, "mean_token_accuracy": 0.45862069725990295, "step": 13290 }, { "epoch": 0.013390865203995402, "grad_norm": 26.778048777594854, "learning_rate": 1.339060894788792e-05, "loss": 2.0451, "mean_token_accuracy": 0.4413793087005615, "step": 13295 }, { "epoch": 0.013395901257099576, "grad_norm": 25.86233147889207, "learning_rate": 1.3395644904618982e-05, "loss": 2.3476, "mean_token_accuracy": 0.4310344815254211, "step": 13300 }, { "epoch": 0.013400937310203748, "grad_norm": 31.839048903275426, "learning_rate": 1.340068086135004e-05, "loss": 1.9876, "mean_token_accuracy": 0.4863279044628143, "step": 13305 }, { "epoch": 0.013405973363307922, "grad_norm": 36.77715308947395, "learning_rate": 1.3405716818081098e-05, "loss": 2.0154, "mean_token_accuracy": 0.5021173536777497, "step": 13310 }, { "epoch": 0.013411009416412094, "grad_norm": 29.051248858598928, "learning_rate": 1.3410752774812161e-05, "loss": 2.2748, "mean_token_accuracy": 0.4931034445762634, "step": 13315 }, { "epoch": 0.013416045469516268, "grad_norm": 23.82582602335328, "learning_rate": 1.3415788731543219e-05, "loss": 2.2898, "mean_token_accuracy": 0.4310344815254211, "step": 13320 }, { "epoch": 0.01342108152262044, "grad_norm": 31.30941445438363, "learning_rate": 1.3420824688274278e-05, "loss": 2.6116, "mean_token_accuracy": 0.37241379022598264, "step": 13325 }, { "epoch": 0.013426117575724612, "grad_norm": 33.24636965502477, "learning_rate": 1.3425860645005339e-05, "loss": 2.4368, "mean_token_accuracy": 0.41209921836853025, "step": 13330 }, { "epoch": 0.013431153628828785, "grad_norm": 56.4758936308936, "learning_rate": 1.3430896601736398e-05, "loss": 2.4223, "mean_token_accuracy": 0.44482758045196535, "step": 13335 }, { "epoch": 0.013436189681932957, "grad_norm": 25.72549086319441, "learning_rate": 1.3435932558467459e-05, "loss": 2.61, "mean_token_accuracy": 0.37586206793785093, "step": 13340 }, { "epoch": 0.013441225735037131, "grad_norm": 42.08671318015093, "learning_rate": 1.3440968515198518e-05, "loss": 2.3626, "mean_token_accuracy": 0.4068965494632721, "step": 13345 }, { "epoch": 0.013446261788141303, "grad_norm": 43.64949811480978, "learning_rate": 1.3446004471929577e-05, "loss": 2.6187, "mean_token_accuracy": 0.3896551787853241, "step": 13350 }, { "epoch": 0.013451297841245477, "grad_norm": 21.56184740972458, "learning_rate": 1.3451040428660638e-05, "loss": 2.3611, "mean_token_accuracy": 0.4344827651977539, "step": 13355 }, { "epoch": 0.013456333894349649, "grad_norm": 28.561819188138596, "learning_rate": 1.3456076385391698e-05, "loss": 2.1824, "mean_token_accuracy": 0.49655171036720275, "step": 13360 }, { "epoch": 0.013461369947453821, "grad_norm": 24.101584654395488, "learning_rate": 1.3461112342122755e-05, "loss": 2.2919, "mean_token_accuracy": 0.47586206793785096, "step": 13365 }, { "epoch": 0.013466406000557995, "grad_norm": 35.34163390202499, "learning_rate": 1.3466148298853818e-05, "loss": 2.7604, "mean_token_accuracy": 0.40344828367233276, "step": 13370 }, { "epoch": 0.013471442053662167, "grad_norm": 26.35615471100323, "learning_rate": 1.3471184255584875e-05, "loss": 2.777, "mean_token_accuracy": 0.37241379618644715, "step": 13375 }, { "epoch": 0.01347647810676634, "grad_norm": 23.30053669527517, "learning_rate": 1.3476220212315938e-05, "loss": 2.6872, "mean_token_accuracy": 0.3827586203813553, "step": 13380 }, { "epoch": 0.013481514159870513, "grad_norm": 32.3574464927237, "learning_rate": 1.3481256169046996e-05, "loss": 2.487, "mean_token_accuracy": 0.42413792610168455, "step": 13385 }, { "epoch": 0.013486550212974686, "grad_norm": 29.957176784804318, "learning_rate": 1.3486292125778055e-05, "loss": 2.2204, "mean_token_accuracy": 0.45486992597579956, "step": 13390 }, { "epoch": 0.013491586266078858, "grad_norm": 32.34722239570284, "learning_rate": 1.3491328082509116e-05, "loss": 2.5156, "mean_token_accuracy": 0.4068965554237366, "step": 13395 }, { "epoch": 0.01349662231918303, "grad_norm": 28.08430323260744, "learning_rate": 1.3496364039240175e-05, "loss": 2.3849, "mean_token_accuracy": 0.42758620381355283, "step": 13400 }, { "epoch": 0.013501658372287204, "grad_norm": 32.73009089942878, "learning_rate": 1.3501399995971234e-05, "loss": 2.4437, "mean_token_accuracy": 0.4, "step": 13405 }, { "epoch": 0.013506694425391376, "grad_norm": 46.64258261808528, "learning_rate": 1.3506435952702295e-05, "loss": 2.3428, "mean_token_accuracy": 0.4620689630508423, "step": 13410 }, { "epoch": 0.01351173047849555, "grad_norm": 28.61483244079587, "learning_rate": 1.3511471909433355e-05, "loss": 2.2551, "mean_token_accuracy": 0.447005432844162, "step": 13415 }, { "epoch": 0.013516766531599722, "grad_norm": 40.331135864675595, "learning_rate": 1.3516507866164415e-05, "loss": 2.4398, "mean_token_accuracy": 0.3931034505367279, "step": 13420 }, { "epoch": 0.013521802584703896, "grad_norm": 26.52834992994393, "learning_rate": 1.3521543822895475e-05, "loss": 2.2539, "mean_token_accuracy": 0.4931034445762634, "step": 13425 }, { "epoch": 0.013526838637808068, "grad_norm": 25.29292943626565, "learning_rate": 1.3526579779626532e-05, "loss": 1.9428, "mean_token_accuracy": 0.501875388622284, "step": 13430 }, { "epoch": 0.01353187469091224, "grad_norm": 21.22945332741738, "learning_rate": 1.3531615736357595e-05, "loss": 2.1188, "mean_token_accuracy": 0.46896552443504336, "step": 13435 }, { "epoch": 0.013536910744016414, "grad_norm": 27.491291043643972, "learning_rate": 1.3536651693088653e-05, "loss": 2.1109, "mean_token_accuracy": 0.4758620738983154, "step": 13440 }, { "epoch": 0.013541946797120586, "grad_norm": 34.482057498307704, "learning_rate": 1.3541687649819712e-05, "loss": 2.1006, "mean_token_accuracy": 0.45517240166664125, "step": 13445 }, { "epoch": 0.01354698285022476, "grad_norm": 31.40567347381732, "learning_rate": 1.3546723606550773e-05, "loss": 2.6787, "mean_token_accuracy": 0.4137930989265442, "step": 13450 }, { "epoch": 0.013552018903328931, "grad_norm": 24.84582439916837, "learning_rate": 1.3551759563281832e-05, "loss": 2.3336, "mean_token_accuracy": 0.4413793087005615, "step": 13455 }, { "epoch": 0.013557054956433105, "grad_norm": 24.216317611258546, "learning_rate": 1.3556795520012895e-05, "loss": 2.3509, "mean_token_accuracy": 0.4172413766384125, "step": 13460 }, { "epoch": 0.013562091009537277, "grad_norm": 30.41956767424792, "learning_rate": 1.3561831476743952e-05, "loss": 2.3585, "mean_token_accuracy": 0.4159104585647583, "step": 13465 }, { "epoch": 0.01356712706264145, "grad_norm": 23.993589383061074, "learning_rate": 1.3566867433475011e-05, "loss": 2.1802, "mean_token_accuracy": 0.46896551847457885, "step": 13470 }, { "epoch": 0.013572163115745623, "grad_norm": 27.152218485375624, "learning_rate": 1.3571903390206072e-05, "loss": 2.1909, "mean_token_accuracy": 0.4448275864124298, "step": 13475 }, { "epoch": 0.013577199168849795, "grad_norm": 27.541940208110763, "learning_rate": 1.3576939346937132e-05, "loss": 2.1498, "mean_token_accuracy": 0.4586206912994385, "step": 13480 }, { "epoch": 0.013582235221953969, "grad_norm": 31.59059505205469, "learning_rate": 1.3581975303668191e-05, "loss": 2.5284, "mean_token_accuracy": 0.4620689630508423, "step": 13485 }, { "epoch": 0.013587271275058141, "grad_norm": 33.25011383514295, "learning_rate": 1.3587011260399252e-05, "loss": 2.462, "mean_token_accuracy": 0.44827585816383364, "step": 13490 }, { "epoch": 0.013592307328162315, "grad_norm": 30.642490137801477, "learning_rate": 1.3592047217130311e-05, "loss": 2.4603, "mean_token_accuracy": 0.441379314661026, "step": 13495 }, { "epoch": 0.013597343381266487, "grad_norm": 26.23554612018405, "learning_rate": 1.3597083173861372e-05, "loss": 2.1111, "mean_token_accuracy": 0.4517241358757019, "step": 13500 }, { "epoch": 0.013602379434370659, "grad_norm": 30.53288435029085, "learning_rate": 1.3602119130592431e-05, "loss": 2.49, "mean_token_accuracy": 0.41034482717514037, "step": 13505 }, { "epoch": 0.013607415487474833, "grad_norm": 27.50941733450289, "learning_rate": 1.3607155087323489e-05, "loss": 2.3081, "mean_token_accuracy": 0.4620689570903778, "step": 13510 }, { "epoch": 0.013612451540579005, "grad_norm": 27.830259864410383, "learning_rate": 1.3612191044054551e-05, "loss": 2.2092, "mean_token_accuracy": 0.46551724076271056, "step": 13515 }, { "epoch": 0.013617487593683178, "grad_norm": 23.233848429616888, "learning_rate": 1.3617227000785609e-05, "loss": 2.59, "mean_token_accuracy": 0.43793103098869324, "step": 13520 }, { "epoch": 0.01362252364678735, "grad_norm": 34.706070952462554, "learning_rate": 1.3622262957516672e-05, "loss": 2.6078, "mean_token_accuracy": 0.38620689511299133, "step": 13525 }, { "epoch": 0.013627559699891524, "grad_norm": 32.16073625179282, "learning_rate": 1.362729891424773e-05, "loss": 2.4252, "mean_token_accuracy": 0.43103448748588563, "step": 13530 }, { "epoch": 0.013632595752995696, "grad_norm": 28.321563610821467, "learning_rate": 1.3632334870978788e-05, "loss": 2.5101, "mean_token_accuracy": 0.4379310250282288, "step": 13535 }, { "epoch": 0.013637631806099868, "grad_norm": 38.784157737560314, "learning_rate": 1.363737082770985e-05, "loss": 2.4589, "mean_token_accuracy": 0.4379310250282288, "step": 13540 }, { "epoch": 0.013642667859204042, "grad_norm": 27.71392350651912, "learning_rate": 1.3642406784440909e-05, "loss": 2.4885, "mean_token_accuracy": 0.4034482717514038, "step": 13545 }, { "epoch": 0.013647703912308214, "grad_norm": 23.197070909643145, "learning_rate": 1.3647442741171968e-05, "loss": 2.4825, "mean_token_accuracy": 0.4137930989265442, "step": 13550 }, { "epoch": 0.013652739965412388, "grad_norm": 38.68560041527579, "learning_rate": 1.3652478697903029e-05, "loss": 2.178, "mean_token_accuracy": 0.4689655125141144, "step": 13555 }, { "epoch": 0.01365777601851656, "grad_norm": 30.43931275360943, "learning_rate": 1.3657514654634088e-05, "loss": 2.1141, "mean_token_accuracy": 0.441379314661026, "step": 13560 }, { "epoch": 0.013662812071620734, "grad_norm": 30.67907474851602, "learning_rate": 1.3662550611365149e-05, "loss": 2.4654, "mean_token_accuracy": 0.3965517163276672, "step": 13565 }, { "epoch": 0.013667848124724906, "grad_norm": 27.280492970470206, "learning_rate": 1.3667586568096208e-05, "loss": 2.1042, "mean_token_accuracy": 0.4620689690113068, "step": 13570 }, { "epoch": 0.013672884177829078, "grad_norm": 30.050951280900833, "learning_rate": 1.3672622524827266e-05, "loss": 2.2671, "mean_token_accuracy": 0.4016333997249603, "step": 13575 }, { "epoch": 0.013677920230933251, "grad_norm": 37.713115178803086, "learning_rate": 1.3677658481558329e-05, "loss": 2.5251, "mean_token_accuracy": 0.4, "step": 13580 }, { "epoch": 0.013682956284037423, "grad_norm": 33.27826098970429, "learning_rate": 1.3682694438289386e-05, "loss": 2.2968, "mean_token_accuracy": 0.4758620738983154, "step": 13585 }, { "epoch": 0.013687992337141597, "grad_norm": 27.662748828303325, "learning_rate": 1.3687730395020445e-05, "loss": 2.3947, "mean_token_accuracy": 0.4517241418361664, "step": 13590 }, { "epoch": 0.01369302839024577, "grad_norm": 36.71722543414404, "learning_rate": 1.3692766351751506e-05, "loss": 2.2264, "mean_token_accuracy": 0.4774349570274353, "step": 13595 }, { "epoch": 0.013698064443349943, "grad_norm": 34.30343247257018, "learning_rate": 1.3697802308482566e-05, "loss": 2.4774, "mean_token_accuracy": 0.41034482717514037, "step": 13600 }, { "epoch": 0.013703100496454115, "grad_norm": 25.973692331354307, "learning_rate": 1.3702838265213626e-05, "loss": 2.3587, "mean_token_accuracy": 0.4517241358757019, "step": 13605 }, { "epoch": 0.013708136549558287, "grad_norm": 27.022399535616298, "learning_rate": 1.3707874221944686e-05, "loss": 2.5817, "mean_token_accuracy": 0.4206896543502808, "step": 13610 }, { "epoch": 0.01371317260266246, "grad_norm": 26.085525309504455, "learning_rate": 1.3712910178675745e-05, "loss": 2.3602, "mean_token_accuracy": 0.4068965554237366, "step": 13615 }, { "epoch": 0.013718208655766633, "grad_norm": 35.99124576343834, "learning_rate": 1.3717946135406806e-05, "loss": 2.6556, "mean_token_accuracy": 0.3793103516101837, "step": 13620 }, { "epoch": 0.013723244708870807, "grad_norm": 30.092340639666475, "learning_rate": 1.3722982092137865e-05, "loss": 2.5238, "mean_token_accuracy": 0.37586206793785093, "step": 13625 }, { "epoch": 0.013728280761974979, "grad_norm": 29.52150213665843, "learning_rate": 1.3728018048868924e-05, "loss": 2.3419, "mean_token_accuracy": 0.4896551787853241, "step": 13630 }, { "epoch": 0.013733316815079152, "grad_norm": 29.549291110669905, "learning_rate": 1.3733054005599985e-05, "loss": 2.637, "mean_token_accuracy": 0.40689654350280763, "step": 13635 }, { "epoch": 0.013738352868183324, "grad_norm": 25.207613493411678, "learning_rate": 1.3738089962331045e-05, "loss": 2.3176, "mean_token_accuracy": 0.4724137902259827, "step": 13640 }, { "epoch": 0.013743388921287496, "grad_norm": 28.393334540972603, "learning_rate": 1.3743125919062106e-05, "loss": 2.0455, "mean_token_accuracy": 0.47241379618644713, "step": 13645 }, { "epoch": 0.01374842497439167, "grad_norm": 27.632355446797295, "learning_rate": 1.3748161875793165e-05, "loss": 2.0234, "mean_token_accuracy": 0.458620685338974, "step": 13650 }, { "epoch": 0.013753461027495842, "grad_norm": 26.20244205085071, "learning_rate": 1.3753197832524222e-05, "loss": 2.3586, "mean_token_accuracy": 0.45172414779663084, "step": 13655 }, { "epoch": 0.013758497080600016, "grad_norm": 63.60662309744961, "learning_rate": 1.3758233789255285e-05, "loss": 2.3394, "mean_token_accuracy": 0.4810042381286621, "step": 13660 }, { "epoch": 0.013763533133704188, "grad_norm": 33.41388476499537, "learning_rate": 1.3763269745986343e-05, "loss": 2.1838, "mean_token_accuracy": 0.4931034505367279, "step": 13665 }, { "epoch": 0.013768569186808362, "grad_norm": 33.727315823247686, "learning_rate": 1.3768305702717402e-05, "loss": 2.6535, "mean_token_accuracy": 0.37586206793785093, "step": 13670 }, { "epoch": 0.013773605239912534, "grad_norm": 24.28062834079656, "learning_rate": 1.3773341659448463e-05, "loss": 2.5132, "mean_token_accuracy": 0.39310344457626345, "step": 13675 }, { "epoch": 0.013778641293016706, "grad_norm": 30.0255864202641, "learning_rate": 1.3778377616179522e-05, "loss": 2.4625, "mean_token_accuracy": 0.4172413766384125, "step": 13680 }, { "epoch": 0.01378367734612088, "grad_norm": 28.381195466955294, "learning_rate": 1.3783413572910583e-05, "loss": 2.2771, "mean_token_accuracy": 0.4431336998939514, "step": 13685 }, { "epoch": 0.013788713399225052, "grad_norm": 24.081556394380694, "learning_rate": 1.3788449529641642e-05, "loss": 2.4156, "mean_token_accuracy": 0.4275862157344818, "step": 13690 }, { "epoch": 0.013793749452329225, "grad_norm": 27.61645154489071, "learning_rate": 1.3793485486372702e-05, "loss": 2.6592, "mean_token_accuracy": 0.36702964305877683, "step": 13695 }, { "epoch": 0.013798785505433397, "grad_norm": 21.93220612182478, "learning_rate": 1.3798521443103762e-05, "loss": 2.2208, "mean_token_accuracy": 0.441379314661026, "step": 13700 }, { "epoch": 0.013803821558537571, "grad_norm": 23.421907472730563, "learning_rate": 1.3803557399834822e-05, "loss": 2.2828, "mean_token_accuracy": 0.4379310429096222, "step": 13705 }, { "epoch": 0.013808857611641743, "grad_norm": 25.106479254007787, "learning_rate": 1.380859335656588e-05, "loss": 2.4229, "mean_token_accuracy": 0.4517241358757019, "step": 13710 }, { "epoch": 0.013813893664745915, "grad_norm": 27.098208304017174, "learning_rate": 1.3813629313296942e-05, "loss": 2.0094, "mean_token_accuracy": 0.4979431450366974, "step": 13715 }, { "epoch": 0.013818929717850089, "grad_norm": 22.250983551870384, "learning_rate": 1.3818665270028e-05, "loss": 1.8665, "mean_token_accuracy": 0.49866908192634585, "step": 13720 }, { "epoch": 0.013823965770954261, "grad_norm": 23.73972392007322, "learning_rate": 1.3823701226759062e-05, "loss": 2.4838, "mean_token_accuracy": 0.3862068891525269, "step": 13725 }, { "epoch": 0.013829001824058435, "grad_norm": 38.9274355090047, "learning_rate": 1.382873718349012e-05, "loss": 2.6813, "mean_token_accuracy": 0.3896551728248596, "step": 13730 }, { "epoch": 0.013834037877162607, "grad_norm": 23.65292742875833, "learning_rate": 1.3833773140221179e-05, "loss": 2.3639, "mean_token_accuracy": 0.4517241358757019, "step": 13735 }, { "epoch": 0.01383907393026678, "grad_norm": 22.436691589144164, "learning_rate": 1.383880909695224e-05, "loss": 2.5212, "mean_token_accuracy": 0.44482759237289426, "step": 13740 }, { "epoch": 0.013844109983370953, "grad_norm": 31.56529295102431, "learning_rate": 1.3843845053683299e-05, "loss": 2.2573, "mean_token_accuracy": 0.4620689690113068, "step": 13745 }, { "epoch": 0.013849146036475125, "grad_norm": 35.93501081823169, "learning_rate": 1.3848881010414358e-05, "loss": 2.265, "mean_token_accuracy": 0.41947974264621735, "step": 13750 }, { "epoch": 0.013854182089579298, "grad_norm": 27.34234921325077, "learning_rate": 1.385391696714542e-05, "loss": 2.5391, "mean_token_accuracy": 0.3896551728248596, "step": 13755 }, { "epoch": 0.01385921814268347, "grad_norm": 32.05644762769521, "learning_rate": 1.3858952923876479e-05, "loss": 2.3079, "mean_token_accuracy": 0.39310344457626345, "step": 13760 }, { "epoch": 0.013864254195787644, "grad_norm": 25.697503249893554, "learning_rate": 1.386398888060754e-05, "loss": 2.6425, "mean_token_accuracy": 0.41034482717514037, "step": 13765 }, { "epoch": 0.013869290248891816, "grad_norm": 31.76197281271624, "learning_rate": 1.3869024837338599e-05, "loss": 2.456, "mean_token_accuracy": 0.42413793206214906, "step": 13770 }, { "epoch": 0.01387432630199599, "grad_norm": 35.308250611662054, "learning_rate": 1.3874060794069656e-05, "loss": 2.586, "mean_token_accuracy": 0.3793103456497192, "step": 13775 }, { "epoch": 0.013879362355100162, "grad_norm": 50.70823971099878, "learning_rate": 1.3879096750800719e-05, "loss": 2.4025, "mean_token_accuracy": 0.4379310369491577, "step": 13780 }, { "epoch": 0.013884398408204334, "grad_norm": 25.038569938292934, "learning_rate": 1.3884132707531777e-05, "loss": 2.2659, "mean_token_accuracy": 0.4816092014312744, "step": 13785 }, { "epoch": 0.013889434461308508, "grad_norm": 36.3842874449946, "learning_rate": 1.3889168664262836e-05, "loss": 2.2339, "mean_token_accuracy": 0.4862069010734558, "step": 13790 }, { "epoch": 0.01389447051441268, "grad_norm": 25.89752160790185, "learning_rate": 1.3894204620993897e-05, "loss": 2.4768, "mean_token_accuracy": 0.44482758045196535, "step": 13795 }, { "epoch": 0.013899506567516854, "grad_norm": 26.974745755744706, "learning_rate": 1.3899240577724956e-05, "loss": 2.276, "mean_token_accuracy": 0.47931033968925474, "step": 13800 }, { "epoch": 0.013904542620621026, "grad_norm": 48.42921170210844, "learning_rate": 1.3904276534456017e-05, "loss": 2.2699, "mean_token_accuracy": 0.4880822777748108, "step": 13805 }, { "epoch": 0.0139095786737252, "grad_norm": 33.952280450743174, "learning_rate": 1.3909312491187076e-05, "loss": 2.1916, "mean_token_accuracy": 0.4640048384666443, "step": 13810 }, { "epoch": 0.013914614726829371, "grad_norm": 41.27832516759408, "learning_rate": 1.3914348447918135e-05, "loss": 2.3554, "mean_token_accuracy": 0.45517241954803467, "step": 13815 }, { "epoch": 0.013919650779933544, "grad_norm": 25.632922284676905, "learning_rate": 1.3919384404649196e-05, "loss": 2.5967, "mean_token_accuracy": 0.39310343861579894, "step": 13820 }, { "epoch": 0.013924686833037717, "grad_norm": 24.720269311162866, "learning_rate": 1.3924420361380256e-05, "loss": 2.3295, "mean_token_accuracy": 0.4448275864124298, "step": 13825 }, { "epoch": 0.01392972288614189, "grad_norm": 50.640605524944306, "learning_rate": 1.3929456318111315e-05, "loss": 2.4225, "mean_token_accuracy": 0.3965517282485962, "step": 13830 }, { "epoch": 0.013934758939246063, "grad_norm": 27.139377775582044, "learning_rate": 1.3934492274842376e-05, "loss": 2.3684, "mean_token_accuracy": 0.4034482717514038, "step": 13835 }, { "epoch": 0.013939794992350235, "grad_norm": 35.27235715048766, "learning_rate": 1.3939528231573435e-05, "loss": 2.0935, "mean_token_accuracy": 0.47586206197738645, "step": 13840 }, { "epoch": 0.013944831045454409, "grad_norm": 26.039475966100603, "learning_rate": 1.3944564188304496e-05, "loss": 2.5792, "mean_token_accuracy": 0.37241379618644715, "step": 13845 }, { "epoch": 0.013949867098558581, "grad_norm": 26.66515189012988, "learning_rate": 1.3949600145035555e-05, "loss": 2.2224, "mean_token_accuracy": 0.46896551847457885, "step": 13850 }, { "epoch": 0.013954903151662753, "grad_norm": 24.771804674537318, "learning_rate": 1.3954636101766613e-05, "loss": 2.1707, "mean_token_accuracy": 0.47586206793785096, "step": 13855 }, { "epoch": 0.013959939204766927, "grad_norm": 36.59570498183246, "learning_rate": 1.3959672058497675e-05, "loss": 2.0786, "mean_token_accuracy": 0.5068965554237366, "step": 13860 }, { "epoch": 0.013964975257871099, "grad_norm": 35.36675202574528, "learning_rate": 1.3964708015228733e-05, "loss": 2.7755, "mean_token_accuracy": 0.37586206793785093, "step": 13865 }, { "epoch": 0.013970011310975273, "grad_norm": 31.663274052818668, "learning_rate": 1.3969743971959792e-05, "loss": 2.4857, "mean_token_accuracy": 0.4413793087005615, "step": 13870 }, { "epoch": 0.013975047364079445, "grad_norm": 26.270126446720514, "learning_rate": 1.3974779928690853e-05, "loss": 2.1984, "mean_token_accuracy": 0.49655172824859617, "step": 13875 }, { "epoch": 0.013980083417183618, "grad_norm": 28.711550933625286, "learning_rate": 1.3979815885421913e-05, "loss": 2.5893, "mean_token_accuracy": 0.41034482717514037, "step": 13880 }, { "epoch": 0.01398511947028779, "grad_norm": 30.314419676094158, "learning_rate": 1.3984851842152973e-05, "loss": 2.335, "mean_token_accuracy": 0.4517241358757019, "step": 13885 }, { "epoch": 0.013990155523391962, "grad_norm": 23.226241161342884, "learning_rate": 1.3989887798884033e-05, "loss": 2.3859, "mean_token_accuracy": 0.46551724672317507, "step": 13890 }, { "epoch": 0.013995191576496136, "grad_norm": 23.09558356944471, "learning_rate": 1.3994923755615092e-05, "loss": 2.3441, "mean_token_accuracy": 0.4517241418361664, "step": 13895 }, { "epoch": 0.014000227629600308, "grad_norm": 101.74202625802418, "learning_rate": 1.3999959712346153e-05, "loss": 2.2123, "mean_token_accuracy": 0.4517241358757019, "step": 13900 }, { "epoch": 0.014005263682704482, "grad_norm": 30.492760460386002, "learning_rate": 1.4004995669077212e-05, "loss": 2.7457, "mean_token_accuracy": 0.3793103456497192, "step": 13905 }, { "epoch": 0.014010299735808654, "grad_norm": 34.712605638850874, "learning_rate": 1.4010031625808273e-05, "loss": 2.4381, "mean_token_accuracy": 0.43448275327682495, "step": 13910 }, { "epoch": 0.014015335788912828, "grad_norm": 27.227924619512613, "learning_rate": 1.4015067582539332e-05, "loss": 2.3635, "mean_token_accuracy": 0.42758620381355283, "step": 13915 }, { "epoch": 0.014020371842017, "grad_norm": 29.891136199318833, "learning_rate": 1.402010353927039e-05, "loss": 2.4869, "mean_token_accuracy": 0.4206896543502808, "step": 13920 }, { "epoch": 0.014025407895121172, "grad_norm": 31.899386988634994, "learning_rate": 1.4025139496001453e-05, "loss": 2.7688, "mean_token_accuracy": 0.3620689630508423, "step": 13925 }, { "epoch": 0.014030443948225346, "grad_norm": 26.29877775224825, "learning_rate": 1.403017545273251e-05, "loss": 2.3842, "mean_token_accuracy": 0.42413792610168455, "step": 13930 }, { "epoch": 0.014035480001329518, "grad_norm": 39.76180304652059, "learning_rate": 1.403521140946357e-05, "loss": 2.5543, "mean_token_accuracy": 0.4172413766384125, "step": 13935 }, { "epoch": 0.014040516054433691, "grad_norm": 22.592645527562233, "learning_rate": 1.404024736619463e-05, "loss": 2.2495, "mean_token_accuracy": 0.4379310250282288, "step": 13940 }, { "epoch": 0.014045552107537863, "grad_norm": 28.894902637027247, "learning_rate": 1.404528332292569e-05, "loss": 2.5621, "mean_token_accuracy": 0.38275861740112305, "step": 13945 }, { "epoch": 0.014050588160642037, "grad_norm": 25.78605113653275, "learning_rate": 1.405031927965675e-05, "loss": 2.48, "mean_token_accuracy": 0.44137930274009707, "step": 13950 }, { "epoch": 0.01405562421374621, "grad_norm": 30.14873450824353, "learning_rate": 1.405535523638781e-05, "loss": 2.4725, "mean_token_accuracy": 0.4344827592372894, "step": 13955 }, { "epoch": 0.014060660266850381, "grad_norm": 34.26628193605773, "learning_rate": 1.4060391193118869e-05, "loss": 2.2456, "mean_token_accuracy": 0.44482758045196535, "step": 13960 }, { "epoch": 0.014065696319954555, "grad_norm": 39.72013121838575, "learning_rate": 1.406542714984993e-05, "loss": 2.5008, "mean_token_accuracy": 0.41034482717514037, "step": 13965 }, { "epoch": 0.014070732373058727, "grad_norm": 38.271817513991074, "learning_rate": 1.407046310658099e-05, "loss": 2.1637, "mean_token_accuracy": 0.4776769518852234, "step": 13970 }, { "epoch": 0.0140757684261629, "grad_norm": 32.873694259727834, "learning_rate": 1.4075499063312047e-05, "loss": 2.4976, "mean_token_accuracy": 0.42238354682922363, "step": 13975 }, { "epoch": 0.014080804479267073, "grad_norm": 29.826439758053755, "learning_rate": 1.408053502004311e-05, "loss": 2.642, "mean_token_accuracy": 0.39310344457626345, "step": 13980 }, { "epoch": 0.014085840532371247, "grad_norm": 40.26036309612394, "learning_rate": 1.4085570976774167e-05, "loss": 2.4766, "mean_token_accuracy": 0.40344826579093934, "step": 13985 }, { "epoch": 0.014090876585475419, "grad_norm": 27.751350139391278, "learning_rate": 1.409060693350523e-05, "loss": 2.7553, "mean_token_accuracy": 0.35862069129943847, "step": 13990 }, { "epoch": 0.01409591263857959, "grad_norm": 23.466008050323317, "learning_rate": 1.4095642890236289e-05, "loss": 2.4024, "mean_token_accuracy": 0.4310344815254211, "step": 13995 }, { "epoch": 0.014100948691683764, "grad_norm": 35.17414276987657, "learning_rate": 1.4100678846967346e-05, "loss": 2.0949, "mean_token_accuracy": 0.4517241299152374, "step": 14000 }, { "epoch": 0.014105984744787936, "grad_norm": 32.25859286777294, "learning_rate": 1.4105714803698409e-05, "loss": 2.835, "mean_token_accuracy": 0.41379310488700866, "step": 14005 }, { "epoch": 0.01411102079789211, "grad_norm": 25.007506879940863, "learning_rate": 1.4110750760429467e-05, "loss": 2.2616, "mean_token_accuracy": 0.4517241418361664, "step": 14010 }, { "epoch": 0.014116056850996282, "grad_norm": 68.50716236505473, "learning_rate": 1.4115786717160526e-05, "loss": 2.2437, "mean_token_accuracy": 0.4551724135875702, "step": 14015 }, { "epoch": 0.014121092904100456, "grad_norm": 36.66102476353269, "learning_rate": 1.4120822673891587e-05, "loss": 2.6624, "mean_token_accuracy": 0.4000000059604645, "step": 14020 }, { "epoch": 0.014126128957204628, "grad_norm": 24.2480174756622, "learning_rate": 1.4125858630622646e-05, "loss": 2.0828, "mean_token_accuracy": 0.4814277052879333, "step": 14025 }, { "epoch": 0.0141311650103088, "grad_norm": 30.75320405712023, "learning_rate": 1.4130894587353707e-05, "loss": 2.1088, "mean_token_accuracy": 0.471203875541687, "step": 14030 }, { "epoch": 0.014136201063412974, "grad_norm": 23.89200080830402, "learning_rate": 1.4135930544084766e-05, "loss": 2.2859, "mean_token_accuracy": 0.4586206912994385, "step": 14035 }, { "epoch": 0.014141237116517146, "grad_norm": 29.945100949469442, "learning_rate": 1.4140966500815826e-05, "loss": 2.4027, "mean_token_accuracy": 0.42758620977401735, "step": 14040 }, { "epoch": 0.01414627316962132, "grad_norm": 26.65745870542326, "learning_rate": 1.4146002457546886e-05, "loss": 2.6715, "mean_token_accuracy": 0.38620689511299133, "step": 14045 }, { "epoch": 0.014151309222725492, "grad_norm": 25.343701912182958, "learning_rate": 1.4151038414277946e-05, "loss": 2.0758, "mean_token_accuracy": 0.4843920111656189, "step": 14050 }, { "epoch": 0.014156345275829665, "grad_norm": 28.916872677606936, "learning_rate": 1.4156074371009003e-05, "loss": 2.225, "mean_token_accuracy": 0.45172414779663084, "step": 14055 }, { "epoch": 0.014161381328933837, "grad_norm": 22.556662395783103, "learning_rate": 1.4161110327740066e-05, "loss": 2.6137, "mean_token_accuracy": 0.36896551847457887, "step": 14060 }, { "epoch": 0.01416641738203801, "grad_norm": 27.375479137320863, "learning_rate": 1.4166146284471123e-05, "loss": 2.3445, "mean_token_accuracy": 0.441379314661026, "step": 14065 }, { "epoch": 0.014171453435142183, "grad_norm": 54.781488321326066, "learning_rate": 1.4171182241202186e-05, "loss": 2.7439, "mean_token_accuracy": 0.37241379618644715, "step": 14070 }, { "epoch": 0.014176489488246355, "grad_norm": 246.07215370401684, "learning_rate": 1.4176218197933244e-05, "loss": 2.4367, "mean_token_accuracy": 0.44137930274009707, "step": 14075 }, { "epoch": 0.014181525541350529, "grad_norm": 27.776921535867178, "learning_rate": 1.4181254154664303e-05, "loss": 2.5066, "mean_token_accuracy": 0.4275861978530884, "step": 14080 }, { "epoch": 0.014186561594454701, "grad_norm": 58.933875145998734, "learning_rate": 1.4186290111395364e-05, "loss": 2.3849, "mean_token_accuracy": 0.4137930929660797, "step": 14085 }, { "epoch": 0.014191597647558875, "grad_norm": 35.05198756979633, "learning_rate": 1.4191326068126423e-05, "loss": 2.4113, "mean_token_accuracy": 0.3965517282485962, "step": 14090 }, { "epoch": 0.014196633700663047, "grad_norm": 26.46254026181598, "learning_rate": 1.4196362024857482e-05, "loss": 2.346, "mean_token_accuracy": 0.42413792610168455, "step": 14095 }, { "epoch": 0.014201669753767219, "grad_norm": 31.1211841860338, "learning_rate": 1.4201397981588543e-05, "loss": 2.6274, "mean_token_accuracy": 0.37586206793785093, "step": 14100 }, { "epoch": 0.014206705806871393, "grad_norm": 29.253925889450294, "learning_rate": 1.4206433938319603e-05, "loss": 2.2547, "mean_token_accuracy": 0.4896551787853241, "step": 14105 }, { "epoch": 0.014211741859975565, "grad_norm": 33.50981125265915, "learning_rate": 1.4211469895050664e-05, "loss": 2.2725, "mean_token_accuracy": 0.4448275864124298, "step": 14110 }, { "epoch": 0.014216777913079738, "grad_norm": 27.774604248723605, "learning_rate": 1.4216505851781723e-05, "loss": 2.6449, "mean_token_accuracy": 0.36551723778247835, "step": 14115 }, { "epoch": 0.01422181396618391, "grad_norm": 33.25420172122438, "learning_rate": 1.422154180851278e-05, "loss": 2.4876, "mean_token_accuracy": 0.4, "step": 14120 }, { "epoch": 0.014226850019288084, "grad_norm": 31.532518030740587, "learning_rate": 1.4226577765243843e-05, "loss": 2.6674, "mean_token_accuracy": 0.3862069010734558, "step": 14125 }, { "epoch": 0.014231886072392256, "grad_norm": 33.5155146342872, "learning_rate": 1.42316137219749e-05, "loss": 2.1178, "mean_token_accuracy": 0.4747126460075378, "step": 14130 }, { "epoch": 0.014236922125496428, "grad_norm": 28.715544392339666, "learning_rate": 1.423664967870596e-05, "loss": 2.233, "mean_token_accuracy": 0.47931034564971925, "step": 14135 }, { "epoch": 0.014241958178600602, "grad_norm": 30.460340240910913, "learning_rate": 1.424168563543702e-05, "loss": 2.6734, "mean_token_accuracy": 0.4000000059604645, "step": 14140 }, { "epoch": 0.014246994231704774, "grad_norm": 27.101519865271197, "learning_rate": 1.424672159216808e-05, "loss": 2.3561, "mean_token_accuracy": 0.4068965494632721, "step": 14145 }, { "epoch": 0.014252030284808948, "grad_norm": 27.227915795552246, "learning_rate": 1.4251757548899141e-05, "loss": 2.4843, "mean_token_accuracy": 0.358620685338974, "step": 14150 }, { "epoch": 0.01425706633791312, "grad_norm": 31.3813151807137, "learning_rate": 1.42567935056302e-05, "loss": 2.4392, "mean_token_accuracy": 0.38620689511299133, "step": 14155 }, { "epoch": 0.014262102391017294, "grad_norm": 25.214489497455098, "learning_rate": 1.426182946236126e-05, "loss": 2.4543, "mean_token_accuracy": 0.4137930989265442, "step": 14160 }, { "epoch": 0.014267138444121466, "grad_norm": 24.12913450499344, "learning_rate": 1.426686541909232e-05, "loss": 2.227, "mean_token_accuracy": 0.4620689630508423, "step": 14165 }, { "epoch": 0.014272174497225638, "grad_norm": 31.44419808272467, "learning_rate": 1.427190137582338e-05, "loss": 2.5371, "mean_token_accuracy": 0.43448275327682495, "step": 14170 }, { "epoch": 0.014277210550329811, "grad_norm": 28.22215443729513, "learning_rate": 1.4276937332554439e-05, "loss": 2.3255, "mean_token_accuracy": 0.4517241299152374, "step": 14175 }, { "epoch": 0.014282246603433984, "grad_norm": 26.528236688115253, "learning_rate": 1.42819732892855e-05, "loss": 2.2097, "mean_token_accuracy": 0.38620689511299133, "step": 14180 }, { "epoch": 0.014287282656538157, "grad_norm": 35.1016338446267, "learning_rate": 1.4287009246016559e-05, "loss": 2.5796, "mean_token_accuracy": 0.42758620381355283, "step": 14185 }, { "epoch": 0.01429231870964233, "grad_norm": 27.484766831262114, "learning_rate": 1.429204520274762e-05, "loss": 2.7045, "mean_token_accuracy": 0.39655172228813174, "step": 14190 }, { "epoch": 0.014297354762746503, "grad_norm": 32.40012238366505, "learning_rate": 1.429708115947868e-05, "loss": 2.4418, "mean_token_accuracy": 0.42068966031074523, "step": 14195 }, { "epoch": 0.014302390815850675, "grad_norm": 35.6998407352267, "learning_rate": 1.4302117116209737e-05, "loss": 2.6028, "mean_token_accuracy": 0.4117362380027771, "step": 14200 }, { "epoch": 0.014307426868954847, "grad_norm": 22.914058568561767, "learning_rate": 1.43071530729408e-05, "loss": 2.4783, "mean_token_accuracy": 0.38620689511299133, "step": 14205 }, { "epoch": 0.014312462922059021, "grad_norm": 32.16962207546506, "learning_rate": 1.4312189029671857e-05, "loss": 2.0697, "mean_token_accuracy": 0.5021778583526612, "step": 14210 }, { "epoch": 0.014317498975163193, "grad_norm": 24.869260572392864, "learning_rate": 1.4317224986402916e-05, "loss": 2.4156, "mean_token_accuracy": 0.4758620738983154, "step": 14215 }, { "epoch": 0.014322535028267367, "grad_norm": 22.23016165598873, "learning_rate": 1.4322260943133977e-05, "loss": 2.5263, "mean_token_accuracy": 0.4206896543502808, "step": 14220 }, { "epoch": 0.014327571081371539, "grad_norm": 26.071865742208946, "learning_rate": 1.4327296899865037e-05, "loss": 1.9892, "mean_token_accuracy": 0.47586206793785096, "step": 14225 }, { "epoch": 0.01433260713447571, "grad_norm": 33.19550897813042, "learning_rate": 1.4332332856596097e-05, "loss": 2.3962, "mean_token_accuracy": 0.4172413766384125, "step": 14230 }, { "epoch": 0.014337643187579885, "grad_norm": 21.19220314119713, "learning_rate": 1.4337368813327157e-05, "loss": 2.1142, "mean_token_accuracy": 0.4448275864124298, "step": 14235 }, { "epoch": 0.014342679240684057, "grad_norm": 23.877300425917774, "learning_rate": 1.4342404770058216e-05, "loss": 1.8484, "mean_token_accuracy": 0.509513008594513, "step": 14240 }, { "epoch": 0.01434771529378823, "grad_norm": 22.32873953528322, "learning_rate": 1.4347440726789277e-05, "loss": 2.9369, "mean_token_accuracy": 0.3620689660310745, "step": 14245 }, { "epoch": 0.014352751346892402, "grad_norm": 27.65259544978894, "learning_rate": 1.4352476683520336e-05, "loss": 2.5462, "mean_token_accuracy": 0.382758629322052, "step": 14250 }, { "epoch": 0.014357787399996576, "grad_norm": 22.313122434855202, "learning_rate": 1.4357512640251394e-05, "loss": 2.3461, "mean_token_accuracy": 0.3965517163276672, "step": 14255 }, { "epoch": 0.014362823453100748, "grad_norm": 29.83000271412299, "learning_rate": 1.4362548596982456e-05, "loss": 2.3808, "mean_token_accuracy": 0.4068965494632721, "step": 14260 }, { "epoch": 0.01436785950620492, "grad_norm": 35.86189348186604, "learning_rate": 1.4367584553713514e-05, "loss": 2.1064, "mean_token_accuracy": 0.4862069010734558, "step": 14265 }, { "epoch": 0.014372895559309094, "grad_norm": 41.13423038184262, "learning_rate": 1.4372620510444577e-05, "loss": 2.4613, "mean_token_accuracy": 0.39655172228813174, "step": 14270 }, { "epoch": 0.014377931612413266, "grad_norm": 33.188887788989035, "learning_rate": 1.4377656467175634e-05, "loss": 2.4464, "mean_token_accuracy": 0.39655172228813174, "step": 14275 }, { "epoch": 0.01438296766551744, "grad_norm": 25.35912386879697, "learning_rate": 1.4382692423906693e-05, "loss": 2.146, "mean_token_accuracy": 0.4586206912994385, "step": 14280 }, { "epoch": 0.014388003718621612, "grad_norm": 22.906730867776258, "learning_rate": 1.4387728380637754e-05, "loss": 2.6516, "mean_token_accuracy": 0.39310344457626345, "step": 14285 }, { "epoch": 0.014393039771725786, "grad_norm": 26.760427296554425, "learning_rate": 1.4392764337368814e-05, "loss": 2.6042, "mean_token_accuracy": 0.41379310488700866, "step": 14290 }, { "epoch": 0.014398075824829958, "grad_norm": 36.08217689005033, "learning_rate": 1.4397800294099873e-05, "loss": 2.5417, "mean_token_accuracy": 0.4153055131435394, "step": 14295 }, { "epoch": 0.01440311187793413, "grad_norm": 27.360802480662073, "learning_rate": 1.4402836250830934e-05, "loss": 2.4354, "mean_token_accuracy": 0.3793103456497192, "step": 14300 }, { "epoch": 0.014408147931038303, "grad_norm": 20.472226319165895, "learning_rate": 1.4407872207561993e-05, "loss": 2.636, "mean_token_accuracy": 0.42758620381355283, "step": 14305 }, { "epoch": 0.014413183984142475, "grad_norm": 29.23899429352142, "learning_rate": 1.4412908164293054e-05, "loss": 2.4251, "mean_token_accuracy": 0.4413793087005615, "step": 14310 }, { "epoch": 0.01441822003724665, "grad_norm": 24.67105306384765, "learning_rate": 1.4417944121024113e-05, "loss": 2.3114, "mean_token_accuracy": 0.44301270246505736, "step": 14315 }, { "epoch": 0.014423256090350821, "grad_norm": 34.37186566837699, "learning_rate": 1.442298007775517e-05, "loss": 2.6264, "mean_token_accuracy": 0.3620689630508423, "step": 14320 }, { "epoch": 0.014428292143454995, "grad_norm": 25.17078717340894, "learning_rate": 1.4428016034486233e-05, "loss": 2.1531, "mean_token_accuracy": 0.4931034505367279, "step": 14325 }, { "epoch": 0.014433328196559167, "grad_norm": 24.18417796685301, "learning_rate": 1.4433051991217291e-05, "loss": 2.4494, "mean_token_accuracy": 0.4068965494632721, "step": 14330 }, { "epoch": 0.014438364249663339, "grad_norm": 24.29239509295025, "learning_rate": 1.4438087947948354e-05, "loss": 2.4686, "mean_token_accuracy": 0.4103448331356049, "step": 14335 }, { "epoch": 0.014443400302767513, "grad_norm": 35.50455526908005, "learning_rate": 1.4443123904679411e-05, "loss": 2.6254, "mean_token_accuracy": 0.3827586233615875, "step": 14340 }, { "epoch": 0.014448436355871685, "grad_norm": 28.65470367774043, "learning_rate": 1.444815986141047e-05, "loss": 2.2081, "mean_token_accuracy": 0.4620689630508423, "step": 14345 }, { "epoch": 0.014453472408975859, "grad_norm": 30.071500613430047, "learning_rate": 1.4453195818141531e-05, "loss": 2.5225, "mean_token_accuracy": 0.4137930989265442, "step": 14350 }, { "epoch": 0.01445850846208003, "grad_norm": 28.25659719911171, "learning_rate": 1.445823177487259e-05, "loss": 2.2636, "mean_token_accuracy": 0.41379310488700866, "step": 14355 }, { "epoch": 0.014463544515184204, "grad_norm": 27.62098847917744, "learning_rate": 1.446326773160365e-05, "loss": 2.3036, "mean_token_accuracy": 0.41379311084747317, "step": 14360 }, { "epoch": 0.014468580568288376, "grad_norm": 29.7177333227106, "learning_rate": 1.4468303688334711e-05, "loss": 2.497, "mean_token_accuracy": 0.39655172228813174, "step": 14365 }, { "epoch": 0.014473616621392548, "grad_norm": 23.06481319136907, "learning_rate": 1.447333964506577e-05, "loss": 2.5153, "mean_token_accuracy": 0.41034482717514037, "step": 14370 }, { "epoch": 0.014478652674496722, "grad_norm": 40.948306461440474, "learning_rate": 1.4478375601796831e-05, "loss": 2.3496, "mean_token_accuracy": 0.42758620381355283, "step": 14375 }, { "epoch": 0.014483688727600894, "grad_norm": 26.10986851990998, "learning_rate": 1.448341155852789e-05, "loss": 2.129, "mean_token_accuracy": 0.47241379618644713, "step": 14380 }, { "epoch": 0.014488724780705068, "grad_norm": 26.34141056928323, "learning_rate": 1.448844751525895e-05, "loss": 2.2718, "mean_token_accuracy": 0.441379314661026, "step": 14385 }, { "epoch": 0.01449376083380924, "grad_norm": 34.06788425758033, "learning_rate": 1.449348347199001e-05, "loss": 2.6892, "mean_token_accuracy": 0.4068965494632721, "step": 14390 }, { "epoch": 0.014498796886913414, "grad_norm": 25.74411314356086, "learning_rate": 1.449851942872107e-05, "loss": 2.1606, "mean_token_accuracy": 0.482758617401123, "step": 14395 }, { "epoch": 0.014503832940017586, "grad_norm": 32.04970155871049, "learning_rate": 1.4503555385452127e-05, "loss": 2.526, "mean_token_accuracy": 0.4034482777118683, "step": 14400 }, { "epoch": 0.014508868993121758, "grad_norm": 40.42955901776003, "learning_rate": 1.450859134218319e-05, "loss": 2.3825, "mean_token_accuracy": 0.44137930274009707, "step": 14405 }, { "epoch": 0.014513905046225932, "grad_norm": 26.482696231713515, "learning_rate": 1.4513627298914248e-05, "loss": 2.2495, "mean_token_accuracy": 0.4448275864124298, "step": 14410 }, { "epoch": 0.014518941099330104, "grad_norm": 27.877491566202455, "learning_rate": 1.451866325564531e-05, "loss": 2.4241, "mean_token_accuracy": 0.43103448748588563, "step": 14415 }, { "epoch": 0.014523977152434277, "grad_norm": 32.811506623502666, "learning_rate": 1.4523699212376368e-05, "loss": 2.5715, "mean_token_accuracy": 0.4344827651977539, "step": 14420 }, { "epoch": 0.01452901320553845, "grad_norm": 27.172225710497397, "learning_rate": 1.4528735169107427e-05, "loss": 2.5742, "mean_token_accuracy": 0.4, "step": 14425 }, { "epoch": 0.014534049258642623, "grad_norm": 23.964519185889756, "learning_rate": 1.4533771125838488e-05, "loss": 2.4602, "mean_token_accuracy": 0.40689654350280763, "step": 14430 }, { "epoch": 0.014539085311746795, "grad_norm": 21.620465590987802, "learning_rate": 1.4538807082569547e-05, "loss": 2.2618, "mean_token_accuracy": 0.4667487680912018, "step": 14435 }, { "epoch": 0.014544121364850967, "grad_norm": 34.407022748929776, "learning_rate": 1.4543843039300606e-05, "loss": 2.741, "mean_token_accuracy": 0.358620685338974, "step": 14440 }, { "epoch": 0.014549157417955141, "grad_norm": 26.901566633859108, "learning_rate": 1.4548878996031667e-05, "loss": 2.5655, "mean_token_accuracy": 0.3551724165678024, "step": 14445 }, { "epoch": 0.014554193471059313, "grad_norm": 30.33823793701239, "learning_rate": 1.4553914952762727e-05, "loss": 2.2824, "mean_token_accuracy": 0.43103447556495667, "step": 14450 }, { "epoch": 0.014559229524163487, "grad_norm": 28.041826678771642, "learning_rate": 1.4558950909493788e-05, "loss": 2.3527, "mean_token_accuracy": 0.44506956934928893, "step": 14455 }, { "epoch": 0.014564265577267659, "grad_norm": 23.836348893852037, "learning_rate": 1.4563986866224847e-05, "loss": 2.5028, "mean_token_accuracy": 0.39310344457626345, "step": 14460 }, { "epoch": 0.014569301630371833, "grad_norm": 23.688286291447078, "learning_rate": 1.4569022822955904e-05, "loss": 2.186, "mean_token_accuracy": 0.4379310369491577, "step": 14465 }, { "epoch": 0.014574337683476005, "grad_norm": 25.848618829871278, "learning_rate": 1.4574058779686967e-05, "loss": 2.5572, "mean_token_accuracy": 0.4, "step": 14470 }, { "epoch": 0.014579373736580177, "grad_norm": 33.876501866356286, "learning_rate": 1.4579094736418025e-05, "loss": 2.3571, "mean_token_accuracy": 0.4517846405506134, "step": 14475 }, { "epoch": 0.01458440978968435, "grad_norm": 24.124623586229266, "learning_rate": 1.4584130693149084e-05, "loss": 2.1319, "mean_token_accuracy": 0.4774349629878998, "step": 14480 }, { "epoch": 0.014589445842788522, "grad_norm": 31.687703082018878, "learning_rate": 1.4589166649880145e-05, "loss": 2.1639, "mean_token_accuracy": 0.4620689570903778, "step": 14485 }, { "epoch": 0.014594481895892696, "grad_norm": 23.696390897799596, "learning_rate": 1.4594202606611204e-05, "loss": 2.2667, "mean_token_accuracy": 0.4572897732257843, "step": 14490 }, { "epoch": 0.014599517948996868, "grad_norm": 34.633908347115025, "learning_rate": 1.4599238563342265e-05, "loss": 2.5372, "mean_token_accuracy": 0.4310344934463501, "step": 14495 }, { "epoch": 0.014604554002101042, "grad_norm": 22.485813727402057, "learning_rate": 1.4604274520073324e-05, "loss": 2.3241, "mean_token_accuracy": 0.37931033968925476, "step": 14500 }, { "epoch": 0.014609590055205214, "grad_norm": 24.35050591572347, "learning_rate": 1.4609310476804383e-05, "loss": 2.5262, "mean_token_accuracy": 0.3896551787853241, "step": 14505 }, { "epoch": 0.014614626108309386, "grad_norm": 25.770059544374785, "learning_rate": 1.4614346433535444e-05, "loss": 2.106, "mean_token_accuracy": 0.47586206197738645, "step": 14510 }, { "epoch": 0.01461966216141356, "grad_norm": 26.824760960715643, "learning_rate": 1.4619382390266504e-05, "loss": 2.1678, "mean_token_accuracy": 0.47779794931411745, "step": 14515 }, { "epoch": 0.014624698214517732, "grad_norm": 23.679776358811104, "learning_rate": 1.4624418346997561e-05, "loss": 2.3399, "mean_token_accuracy": 0.4620689630508423, "step": 14520 }, { "epoch": 0.014629734267621906, "grad_norm": 24.441598109204982, "learning_rate": 1.4629454303728624e-05, "loss": 2.1747, "mean_token_accuracy": 0.4862069010734558, "step": 14525 }, { "epoch": 0.014634770320726078, "grad_norm": 25.110411719257563, "learning_rate": 1.4634490260459683e-05, "loss": 2.3748, "mean_token_accuracy": 0.44137930274009707, "step": 14530 }, { "epoch": 0.014639806373830251, "grad_norm": 34.40920774889646, "learning_rate": 1.4639526217190744e-05, "loss": 2.5975, "mean_token_accuracy": 0.41379310488700866, "step": 14535 }, { "epoch": 0.014644842426934424, "grad_norm": 27.70907043333397, "learning_rate": 1.4644562173921803e-05, "loss": 2.1599, "mean_token_accuracy": 0.458620673418045, "step": 14540 }, { "epoch": 0.014649878480038596, "grad_norm": 29.130907292101533, "learning_rate": 1.4649598130652861e-05, "loss": 2.0242, "mean_token_accuracy": 0.4776769518852234, "step": 14545 }, { "epoch": 0.01465491453314277, "grad_norm": 30.266866667366006, "learning_rate": 1.4654634087383924e-05, "loss": 2.3795, "mean_token_accuracy": 0.39655172228813174, "step": 14550 }, { "epoch": 0.014659950586246941, "grad_norm": 44.05107062953808, "learning_rate": 1.4659670044114981e-05, "loss": 2.5076, "mean_token_accuracy": 0.3896551728248596, "step": 14555 }, { "epoch": 0.014664986639351115, "grad_norm": 32.18245345330848, "learning_rate": 1.466470600084604e-05, "loss": 2.4041, "mean_token_accuracy": 0.4361766457557678, "step": 14560 }, { "epoch": 0.014670022692455287, "grad_norm": 24.60002828072494, "learning_rate": 1.4669741957577101e-05, "loss": 2.1617, "mean_token_accuracy": 0.4882637619972229, "step": 14565 }, { "epoch": 0.014675058745559461, "grad_norm": 21.926331825965136, "learning_rate": 1.467477791430816e-05, "loss": 2.2334, "mean_token_accuracy": 0.4137930989265442, "step": 14570 }, { "epoch": 0.014680094798663633, "grad_norm": 24.495448086931294, "learning_rate": 1.4679813871039222e-05, "loss": 2.3695, "mean_token_accuracy": 0.4310344815254211, "step": 14575 }, { "epoch": 0.014685130851767805, "grad_norm": 33.86289949768534, "learning_rate": 1.468484982777028e-05, "loss": 2.4642, "mean_token_accuracy": 0.4482758641242981, "step": 14580 }, { "epoch": 0.014690166904871979, "grad_norm": 23.648091612720897, "learning_rate": 1.468988578450134e-05, "loss": 2.3015, "mean_token_accuracy": 0.4689655125141144, "step": 14585 }, { "epoch": 0.01469520295797615, "grad_norm": 34.577433059607316, "learning_rate": 1.4694921741232401e-05, "loss": 2.478, "mean_token_accuracy": 0.48965516686439514, "step": 14590 }, { "epoch": 0.014700239011080325, "grad_norm": 29.463683004904315, "learning_rate": 1.469995769796346e-05, "loss": 1.9954, "mean_token_accuracy": 0.5448275923728942, "step": 14595 }, { "epoch": 0.014705275064184497, "grad_norm": 22.86993303535317, "learning_rate": 1.4704993654694518e-05, "loss": 2.4433, "mean_token_accuracy": 0.4620689630508423, "step": 14600 }, { "epoch": 0.01471031111728867, "grad_norm": 43.657261488453614, "learning_rate": 1.471002961142558e-05, "loss": 2.5465, "mean_token_accuracy": 0.41379310488700866, "step": 14605 }, { "epoch": 0.014715347170392842, "grad_norm": 26.46599634655422, "learning_rate": 1.4715065568156638e-05, "loss": 2.2547, "mean_token_accuracy": 0.4206896543502808, "step": 14610 }, { "epoch": 0.014720383223497014, "grad_norm": 27.900220529339226, "learning_rate": 1.47201015248877e-05, "loss": 2.6512, "mean_token_accuracy": 0.35862068831920624, "step": 14615 }, { "epoch": 0.014725419276601188, "grad_norm": 27.9573882940398, "learning_rate": 1.4725137481618758e-05, "loss": 2.3622, "mean_token_accuracy": 0.4000000059604645, "step": 14620 }, { "epoch": 0.01473045532970536, "grad_norm": 26.587399558194228, "learning_rate": 1.4730173438349817e-05, "loss": 2.2841, "mean_token_accuracy": 0.42413792610168455, "step": 14625 }, { "epoch": 0.014735491382809534, "grad_norm": 23.155787701986483, "learning_rate": 1.4735209395080878e-05, "loss": 2.352, "mean_token_accuracy": 0.42934058904647826, "step": 14630 }, { "epoch": 0.014740527435913706, "grad_norm": 22.477119018359964, "learning_rate": 1.4740245351811938e-05, "loss": 2.2796, "mean_token_accuracy": 0.46551724672317507, "step": 14635 }, { "epoch": 0.01474556348901788, "grad_norm": 27.54722202451458, "learning_rate": 1.4745281308542997e-05, "loss": 2.4387, "mean_token_accuracy": 0.482758617401123, "step": 14640 }, { "epoch": 0.014750599542122052, "grad_norm": 25.80632190041504, "learning_rate": 1.4750317265274058e-05, "loss": 2.6151, "mean_token_accuracy": 0.43793103098869324, "step": 14645 }, { "epoch": 0.014755635595226224, "grad_norm": 23.673409912815078, "learning_rate": 1.4755353222005117e-05, "loss": 2.6233, "mean_token_accuracy": 0.4137930929660797, "step": 14650 }, { "epoch": 0.014760671648330398, "grad_norm": 27.939991644356798, "learning_rate": 1.4760389178736178e-05, "loss": 2.4924, "mean_token_accuracy": 0.3793103456497192, "step": 14655 }, { "epoch": 0.01476570770143457, "grad_norm": 23.975817338639356, "learning_rate": 1.4765425135467237e-05, "loss": 2.5087, "mean_token_accuracy": 0.4137930989265442, "step": 14660 }, { "epoch": 0.014770743754538743, "grad_norm": 28.661051438121323, "learning_rate": 1.4770461092198295e-05, "loss": 2.1658, "mean_token_accuracy": 0.42068966031074523, "step": 14665 }, { "epoch": 0.014775779807642915, "grad_norm": 44.83158259002183, "learning_rate": 1.4775497048929357e-05, "loss": 2.1556, "mean_token_accuracy": 0.5137931048870087, "step": 14670 }, { "epoch": 0.01478081586074709, "grad_norm": 27.26092397373069, "learning_rate": 1.4780533005660415e-05, "loss": 2.7949, "mean_token_accuracy": 0.35862069129943847, "step": 14675 }, { "epoch": 0.014785851913851261, "grad_norm": 26.64347587068617, "learning_rate": 1.4785568962391474e-05, "loss": 2.2384, "mean_token_accuracy": 0.41034482717514037, "step": 14680 }, { "epoch": 0.014790887966955433, "grad_norm": 33.48874693375749, "learning_rate": 1.4790604919122535e-05, "loss": 2.4473, "mean_token_accuracy": 0.4620689630508423, "step": 14685 }, { "epoch": 0.014795924020059607, "grad_norm": 33.36040429142167, "learning_rate": 1.4795640875853594e-05, "loss": 2.3568, "mean_token_accuracy": 0.40000000298023225, "step": 14690 }, { "epoch": 0.014800960073163779, "grad_norm": 47.28622128639422, "learning_rate": 1.4800676832584655e-05, "loss": 2.6198, "mean_token_accuracy": 0.37586206793785093, "step": 14695 }, { "epoch": 0.014805996126267953, "grad_norm": 28.21091643531407, "learning_rate": 1.4805712789315715e-05, "loss": 2.3908, "mean_token_accuracy": 0.44137929677963256, "step": 14700 }, { "epoch": 0.014811032179372125, "grad_norm": 29.966442030392802, "learning_rate": 1.4810748746046774e-05, "loss": 2.3101, "mean_token_accuracy": 0.42758620381355283, "step": 14705 }, { "epoch": 0.014816068232476299, "grad_norm": 30.56615164110763, "learning_rate": 1.4815784702777835e-05, "loss": 2.6077, "mean_token_accuracy": 0.38275861740112305, "step": 14710 }, { "epoch": 0.01482110428558047, "grad_norm": 25.104057223093797, "learning_rate": 1.4820820659508894e-05, "loss": 2.194, "mean_token_accuracy": 0.44482758045196535, "step": 14715 }, { "epoch": 0.014826140338684643, "grad_norm": 29.51730075187755, "learning_rate": 1.4825856616239953e-05, "loss": 2.1224, "mean_token_accuracy": 0.458620685338974, "step": 14720 }, { "epoch": 0.014831176391788816, "grad_norm": 22.664552677527578, "learning_rate": 1.4830892572971014e-05, "loss": 1.8991, "mean_token_accuracy": 0.5137931048870087, "step": 14725 }, { "epoch": 0.014836212444892988, "grad_norm": 24.448382876717243, "learning_rate": 1.4835928529702074e-05, "loss": 2.3314, "mean_token_accuracy": 0.47241379618644713, "step": 14730 }, { "epoch": 0.014841248497997162, "grad_norm": 24.64328918290692, "learning_rate": 1.4840964486433135e-05, "loss": 2.6031, "mean_token_accuracy": 0.3965517282485962, "step": 14735 }, { "epoch": 0.014846284551101334, "grad_norm": 29.602939490499384, "learning_rate": 1.4846000443164194e-05, "loss": 2.4319, "mean_token_accuracy": 0.39655172228813174, "step": 14740 }, { "epoch": 0.014851320604205508, "grad_norm": 21.834560427448505, "learning_rate": 1.4851036399895251e-05, "loss": 2.3018, "mean_token_accuracy": 0.42758620381355283, "step": 14745 }, { "epoch": 0.01485635665730968, "grad_norm": 27.09831909643038, "learning_rate": 1.4856072356626314e-05, "loss": 2.4512, "mean_token_accuracy": 0.4344827592372894, "step": 14750 }, { "epoch": 0.014861392710413852, "grad_norm": 27.246073559468776, "learning_rate": 1.4861108313357372e-05, "loss": 2.8919, "mean_token_accuracy": 0.341379314661026, "step": 14755 }, { "epoch": 0.014866428763518026, "grad_norm": 32.53746534574877, "learning_rate": 1.4866144270088434e-05, "loss": 2.5639, "mean_token_accuracy": 0.41724138259887694, "step": 14760 }, { "epoch": 0.014871464816622198, "grad_norm": 32.11908456690238, "learning_rate": 1.4871180226819492e-05, "loss": 2.4317, "mean_token_accuracy": 0.4535390198230743, "step": 14765 }, { "epoch": 0.014876500869726372, "grad_norm": 21.0885052894486, "learning_rate": 1.4876216183550551e-05, "loss": 2.2866, "mean_token_accuracy": 0.4482758641242981, "step": 14770 }, { "epoch": 0.014881536922830544, "grad_norm": 31.71635540005267, "learning_rate": 1.4881252140281612e-05, "loss": 2.4607, "mean_token_accuracy": 0.43103448748588563, "step": 14775 }, { "epoch": 0.014886572975934717, "grad_norm": 26.551065702831263, "learning_rate": 1.4886288097012671e-05, "loss": 2.2315, "mean_token_accuracy": 0.4931034445762634, "step": 14780 }, { "epoch": 0.01489160902903889, "grad_norm": 26.31800167604234, "learning_rate": 1.489132405374373e-05, "loss": 2.1847, "mean_token_accuracy": 0.46551724076271056, "step": 14785 }, { "epoch": 0.014896645082143061, "grad_norm": 28.85586010118484, "learning_rate": 1.4896360010474791e-05, "loss": 2.3582, "mean_token_accuracy": 0.43103448748588563, "step": 14790 }, { "epoch": 0.014901681135247235, "grad_norm": 24.224798034892938, "learning_rate": 1.490139596720585e-05, "loss": 2.3948, "mean_token_accuracy": 0.44137930274009707, "step": 14795 }, { "epoch": 0.014906717188351407, "grad_norm": 25.305991708360004, "learning_rate": 1.4906431923936912e-05, "loss": 2.5472, "mean_token_accuracy": 0.37241379022598264, "step": 14800 }, { "epoch": 0.014911753241455581, "grad_norm": 22.11855014596968, "learning_rate": 1.4911467880667971e-05, "loss": 2.2594, "mean_token_accuracy": 0.42068964838981626, "step": 14805 }, { "epoch": 0.014916789294559753, "grad_norm": 31.92017127712201, "learning_rate": 1.4916503837399028e-05, "loss": 2.5865, "mean_token_accuracy": 0.37241379022598264, "step": 14810 }, { "epoch": 0.014921825347663927, "grad_norm": 22.662170426299145, "learning_rate": 1.4921539794130091e-05, "loss": 2.3608, "mean_token_accuracy": 0.42758620381355283, "step": 14815 }, { "epoch": 0.014926861400768099, "grad_norm": 25.223517984678654, "learning_rate": 1.4926575750861149e-05, "loss": 1.98, "mean_token_accuracy": 0.47931033968925474, "step": 14820 }, { "epoch": 0.014931897453872271, "grad_norm": 23.5384031661648, "learning_rate": 1.4931611707592208e-05, "loss": 2.1661, "mean_token_accuracy": 0.46551724076271056, "step": 14825 }, { "epoch": 0.014936933506976445, "grad_norm": 27.81598026744748, "learning_rate": 1.4936647664323269e-05, "loss": 2.4364, "mean_token_accuracy": 0.41724138259887694, "step": 14830 }, { "epoch": 0.014941969560080617, "grad_norm": 23.43711974743467, "learning_rate": 1.4941683621054328e-05, "loss": 2.2014, "mean_token_accuracy": 0.4482758641242981, "step": 14835 }, { "epoch": 0.01494700561318479, "grad_norm": 23.275180241035205, "learning_rate": 1.4946719577785389e-05, "loss": 2.636, "mean_token_accuracy": 0.39655172228813174, "step": 14840 }, { "epoch": 0.014952041666288962, "grad_norm": 21.630565260730904, "learning_rate": 1.4951755534516448e-05, "loss": 2.3176, "mean_token_accuracy": 0.488324248790741, "step": 14845 }, { "epoch": 0.014957077719393136, "grad_norm": 22.65031123196753, "learning_rate": 1.4956791491247508e-05, "loss": 2.2864, "mean_token_accuracy": 0.5344827592372894, "step": 14850 }, { "epoch": 0.014962113772497308, "grad_norm": 28.671975200109998, "learning_rate": 1.4961827447978568e-05, "loss": 2.8464, "mean_token_accuracy": 0.3862068921327591, "step": 14855 }, { "epoch": 0.01496714982560148, "grad_norm": 26.657981432517147, "learning_rate": 1.4966863404709628e-05, "loss": 2.4944, "mean_token_accuracy": 0.4103448331356049, "step": 14860 }, { "epoch": 0.014972185878705654, "grad_norm": 26.683902641254384, "learning_rate": 1.4971899361440685e-05, "loss": 2.3469, "mean_token_accuracy": 0.4344827592372894, "step": 14865 }, { "epoch": 0.014977221931809826, "grad_norm": 24.6457348562685, "learning_rate": 1.4976935318171748e-05, "loss": 2.5826, "mean_token_accuracy": 0.38275861740112305, "step": 14870 }, { "epoch": 0.014982257984914, "grad_norm": 19.37687188663943, "learning_rate": 1.4981971274902805e-05, "loss": 2.2248, "mean_token_accuracy": 0.43793103098869324, "step": 14875 }, { "epoch": 0.014987294038018172, "grad_norm": 29.402498225411847, "learning_rate": 1.4987007231633868e-05, "loss": 2.0806, "mean_token_accuracy": 0.4724137902259827, "step": 14880 }, { "epoch": 0.014992330091122346, "grad_norm": 31.0861367592149, "learning_rate": 1.4992043188364926e-05, "loss": 2.441, "mean_token_accuracy": 0.4068965375423431, "step": 14885 }, { "epoch": 0.014997366144226518, "grad_norm": 29.60742108065984, "learning_rate": 1.4997079145095985e-05, "loss": 2.2121, "mean_token_accuracy": 0.47931034564971925, "step": 14890 }, { "epoch": 0.01500240219733069, "grad_norm": 24.110506300490766, "learning_rate": 1.5002115101827046e-05, "loss": 2.4483, "mean_token_accuracy": 0.42758620977401735, "step": 14895 }, { "epoch": 0.015007438250434863, "grad_norm": 36.5036726395976, "learning_rate": 1.5007151058558105e-05, "loss": 2.2583, "mean_token_accuracy": 0.484359610080719, "step": 14900 }, { "epoch": 0.015012474303539036, "grad_norm": 22.055851039073005, "learning_rate": 1.5012187015289164e-05, "loss": 2.4453, "mean_token_accuracy": 0.44827585816383364, "step": 14905 }, { "epoch": 0.01501751035664321, "grad_norm": 34.09602018492731, "learning_rate": 1.5017222972020225e-05, "loss": 2.1159, "mean_token_accuracy": 0.5070780336856842, "step": 14910 }, { "epoch": 0.015022546409747381, "grad_norm": 24.787918165458834, "learning_rate": 1.5022258928751285e-05, "loss": 2.2127, "mean_token_accuracy": 0.4413793087005615, "step": 14915 }, { "epoch": 0.015027582462851555, "grad_norm": 26.47279743468467, "learning_rate": 1.5027294885482346e-05, "loss": 2.3159, "mean_token_accuracy": 0.45172412395477296, "step": 14920 }, { "epoch": 0.015032618515955727, "grad_norm": 31.033818205756386, "learning_rate": 1.5032330842213405e-05, "loss": 2.4719, "mean_token_accuracy": 0.4275862157344818, "step": 14925 }, { "epoch": 0.0150376545690599, "grad_norm": 26.94708550355271, "learning_rate": 1.5037366798944464e-05, "loss": 2.3457, "mean_token_accuracy": 0.4413793087005615, "step": 14930 }, { "epoch": 0.015042690622164073, "grad_norm": 25.169244647715573, "learning_rate": 1.5042402755675525e-05, "loss": 2.1845, "mean_token_accuracy": 0.4517241299152374, "step": 14935 }, { "epoch": 0.015047726675268245, "grad_norm": 40.10087759421944, "learning_rate": 1.5047438712406584e-05, "loss": 2.4953, "mean_token_accuracy": 0.4034482717514038, "step": 14940 }, { "epoch": 0.015052762728372419, "grad_norm": 29.313313283164337, "learning_rate": 1.5052474669137642e-05, "loss": 2.4427, "mean_token_accuracy": 0.4413793087005615, "step": 14945 }, { "epoch": 0.01505779878147659, "grad_norm": 22.470059260982964, "learning_rate": 1.5057510625868704e-05, "loss": 2.577, "mean_token_accuracy": 0.3862069010734558, "step": 14950 }, { "epoch": 0.015062834834580765, "grad_norm": 28.6797743395478, "learning_rate": 1.5062546582599762e-05, "loss": 2.6405, "mean_token_accuracy": 0.3965517163276672, "step": 14955 }, { "epoch": 0.015067870887684937, "grad_norm": 33.0424606347182, "learning_rate": 1.5067582539330825e-05, "loss": 1.9827, "mean_token_accuracy": 0.48523896336555483, "step": 14960 }, { "epoch": 0.015072906940789109, "grad_norm": 28.14559724508174, "learning_rate": 1.5072618496061882e-05, "loss": 2.6368, "mean_token_accuracy": 0.38275861740112305, "step": 14965 }, { "epoch": 0.015077942993893282, "grad_norm": 24.374999285284307, "learning_rate": 1.5077654452792941e-05, "loss": 2.3932, "mean_token_accuracy": 0.41724138259887694, "step": 14970 }, { "epoch": 0.015082979046997454, "grad_norm": 25.616286346354993, "learning_rate": 1.5082690409524002e-05, "loss": 2.3574, "mean_token_accuracy": 0.41379310488700866, "step": 14975 }, { "epoch": 0.015088015100101628, "grad_norm": 27.237489024038084, "learning_rate": 1.5087726366255062e-05, "loss": 2.2576, "mean_token_accuracy": 0.4551724135875702, "step": 14980 }, { "epoch": 0.0150930511532058, "grad_norm": 25.741042246992208, "learning_rate": 1.5092762322986121e-05, "loss": 2.4418, "mean_token_accuracy": 0.42758620977401735, "step": 14985 }, { "epoch": 0.015098087206309974, "grad_norm": 29.331359785369592, "learning_rate": 1.5097798279717182e-05, "loss": 2.4019, "mean_token_accuracy": 0.41379310488700866, "step": 14990 }, { "epoch": 0.015103123259414146, "grad_norm": 27.882007846329156, "learning_rate": 1.5102834236448241e-05, "loss": 2.4753, "mean_token_accuracy": 0.4275861978530884, "step": 14995 }, { "epoch": 0.015108159312518318, "grad_norm": 34.95171478184749, "learning_rate": 1.5107870193179302e-05, "loss": 2.5929, "mean_token_accuracy": 0.42413793206214906, "step": 15000 }, { "epoch": 0.015113195365622492, "grad_norm": 32.956074239610174, "learning_rate": 1.5112906149910361e-05, "loss": 2.3283, "mean_token_accuracy": 0.42068964838981626, "step": 15005 }, { "epoch": 0.015118231418726664, "grad_norm": 22.777803284158868, "learning_rate": 1.5117942106641419e-05, "loss": 2.1476, "mean_token_accuracy": 0.43793103098869324, "step": 15010 }, { "epoch": 0.015123267471830838, "grad_norm": 27.160889631129503, "learning_rate": 1.5122978063372481e-05, "loss": 2.3891, "mean_token_accuracy": 0.4482758641242981, "step": 15015 }, { "epoch": 0.01512830352493501, "grad_norm": 30.92468799538444, "learning_rate": 1.5128014020103539e-05, "loss": 2.3801, "mean_token_accuracy": 0.42068966031074523, "step": 15020 }, { "epoch": 0.015133339578039183, "grad_norm": 22.7474869503625, "learning_rate": 1.5133049976834598e-05, "loss": 2.2962, "mean_token_accuracy": 0.4137931078672409, "step": 15025 }, { "epoch": 0.015138375631143355, "grad_norm": 29.11117019798295, "learning_rate": 1.513808593356566e-05, "loss": 2.2306, "mean_token_accuracy": 0.493103438615799, "step": 15030 }, { "epoch": 0.015143411684247527, "grad_norm": 21.12295579576171, "learning_rate": 1.5143121890296719e-05, "loss": 2.1813, "mean_token_accuracy": 0.4689655125141144, "step": 15035 }, { "epoch": 0.015148447737351701, "grad_norm": 28.935505185650776, "learning_rate": 1.514815784702778e-05, "loss": 2.267, "mean_token_accuracy": 0.4931034564971924, "step": 15040 }, { "epoch": 0.015153483790455873, "grad_norm": 31.40162474672853, "learning_rate": 1.5153193803758839e-05, "loss": 2.3161, "mean_token_accuracy": 0.42413793206214906, "step": 15045 }, { "epoch": 0.015158519843560047, "grad_norm": 26.131745620735387, "learning_rate": 1.5158229760489898e-05, "loss": 2.3862, "mean_token_accuracy": 0.42413792610168455, "step": 15050 }, { "epoch": 0.015163555896664219, "grad_norm": 28.287149193031876, "learning_rate": 1.5163265717220959e-05, "loss": 2.5364, "mean_token_accuracy": 0.44482759237289426, "step": 15055 }, { "epoch": 0.015168591949768393, "grad_norm": 19.64781895533074, "learning_rate": 1.5168301673952018e-05, "loss": 2.4416, "mean_token_accuracy": 0.39655171930789945, "step": 15060 }, { "epoch": 0.015173628002872565, "grad_norm": 27.655730291447014, "learning_rate": 1.5173337630683076e-05, "loss": 2.504, "mean_token_accuracy": 0.417241370677948, "step": 15065 }, { "epoch": 0.015178664055976737, "grad_norm": 37.69673256513576, "learning_rate": 1.5178373587414138e-05, "loss": 2.4305, "mean_token_accuracy": 0.44827585816383364, "step": 15070 }, { "epoch": 0.01518370010908091, "grad_norm": 28.71471968505637, "learning_rate": 1.5183409544145198e-05, "loss": 2.2258, "mean_token_accuracy": 0.4637930989265442, "step": 15075 }, { "epoch": 0.015188736162185083, "grad_norm": 27.040804556333185, "learning_rate": 1.5188445500876259e-05, "loss": 2.4311, "mean_token_accuracy": 0.36896551847457887, "step": 15080 }, { "epoch": 0.015193772215289256, "grad_norm": 21.01676275131767, "learning_rate": 1.5193481457607318e-05, "loss": 2.291, "mean_token_accuracy": 0.3999999940395355, "step": 15085 }, { "epoch": 0.015198808268393428, "grad_norm": 26.14926082172177, "learning_rate": 1.5198517414338375e-05, "loss": 2.7127, "mean_token_accuracy": 0.3620689570903778, "step": 15090 }, { "epoch": 0.015203844321497602, "grad_norm": 30.068215386643843, "learning_rate": 1.5203553371069438e-05, "loss": 2.5639, "mean_token_accuracy": 0.44482759237289426, "step": 15095 }, { "epoch": 0.015208880374601774, "grad_norm": 30.376764799380407, "learning_rate": 1.5208589327800496e-05, "loss": 2.57, "mean_token_accuracy": 0.4620689690113068, "step": 15100 }, { "epoch": 0.015213916427705946, "grad_norm": 27.737970240050405, "learning_rate": 1.5213625284531555e-05, "loss": 2.508, "mean_token_accuracy": 0.44827585220336913, "step": 15105 }, { "epoch": 0.01521895248081012, "grad_norm": 31.12720254062167, "learning_rate": 1.5218661241262616e-05, "loss": 2.4977, "mean_token_accuracy": 0.42982456386089324, "step": 15110 }, { "epoch": 0.015223988533914292, "grad_norm": 26.92285197046569, "learning_rate": 1.5223697197993675e-05, "loss": 2.4711, "mean_token_accuracy": 0.4034482717514038, "step": 15115 }, { "epoch": 0.015229024587018466, "grad_norm": 32.305778791098305, "learning_rate": 1.5228733154724736e-05, "loss": 2.3784, "mean_token_accuracy": 0.44827585816383364, "step": 15120 }, { "epoch": 0.015234060640122638, "grad_norm": 21.90958700099828, "learning_rate": 1.5233769111455795e-05, "loss": 2.1262, "mean_token_accuracy": 0.4620689690113068, "step": 15125 }, { "epoch": 0.01523909669322681, "grad_norm": 28.276987843375537, "learning_rate": 1.5238805068186854e-05, "loss": 2.3825, "mean_token_accuracy": 0.4620689690113068, "step": 15130 }, { "epoch": 0.015244132746330984, "grad_norm": 33.500641313669036, "learning_rate": 1.5243841024917915e-05, "loss": 2.446, "mean_token_accuracy": 0.441379314661026, "step": 15135 }, { "epoch": 0.015249168799435156, "grad_norm": 26.76904883676346, "learning_rate": 1.5248876981648975e-05, "loss": 2.3168, "mean_token_accuracy": 0.441379314661026, "step": 15140 }, { "epoch": 0.01525420485253933, "grad_norm": 23.2680974431548, "learning_rate": 1.5253912938380036e-05, "loss": 2.4012, "mean_token_accuracy": 0.46206897497177124, "step": 15145 }, { "epoch": 0.015259240905643501, "grad_norm": 32.07531607208907, "learning_rate": 1.5258948895111095e-05, "loss": 2.6204, "mean_token_accuracy": 0.38965516686439516, "step": 15150 }, { "epoch": 0.015264276958747675, "grad_norm": 29.75079183643218, "learning_rate": 1.5263984851842154e-05, "loss": 2.6213, "mean_token_accuracy": 0.42413793206214906, "step": 15155 }, { "epoch": 0.015269313011851847, "grad_norm": 28.667975208877643, "learning_rate": 1.5269020808573213e-05, "loss": 2.3524, "mean_token_accuracy": 0.4517241418361664, "step": 15160 }, { "epoch": 0.01527434906495602, "grad_norm": 29.495643499924547, "learning_rate": 1.5274056765304273e-05, "loss": 2.4358, "mean_token_accuracy": 0.4413793087005615, "step": 15165 }, { "epoch": 0.015279385118060193, "grad_norm": 23.539930596724293, "learning_rate": 1.5279092722035332e-05, "loss": 1.9282, "mean_token_accuracy": 0.5275861978530884, "step": 15170 }, { "epoch": 0.015284421171164365, "grad_norm": 27.136937283318872, "learning_rate": 1.5284128678766395e-05, "loss": 2.4718, "mean_token_accuracy": 0.42758620977401735, "step": 15175 }, { "epoch": 0.015289457224268539, "grad_norm": 26.40953267002137, "learning_rate": 1.5289164635497454e-05, "loss": 2.3817, "mean_token_accuracy": 0.4597290694713593, "step": 15180 }, { "epoch": 0.015294493277372711, "grad_norm": 21.97792386095895, "learning_rate": 1.5294200592228513e-05, "loss": 2.4647, "mean_token_accuracy": 0.3862068891525269, "step": 15185 }, { "epoch": 0.015299529330476885, "grad_norm": 28.92035318736106, "learning_rate": 1.5299236548959572e-05, "loss": 2.5609, "mean_token_accuracy": 0.3862068891525269, "step": 15190 }, { "epoch": 0.015304565383581057, "grad_norm": 27.038182465843654, "learning_rate": 1.530427250569063e-05, "loss": 2.3196, "mean_token_accuracy": 0.4206896543502808, "step": 15195 }, { "epoch": 0.015309601436685229, "grad_norm": 23.57627638728257, "learning_rate": 1.5309308462421694e-05, "loss": 2.5406, "mean_token_accuracy": 0.38620689511299133, "step": 15200 }, { "epoch": 0.015314637489789402, "grad_norm": 32.39661561322554, "learning_rate": 1.531434441915275e-05, "loss": 2.2925, "mean_token_accuracy": 0.43793103098869324, "step": 15205 }, { "epoch": 0.015319673542893575, "grad_norm": 21.61241132847649, "learning_rate": 1.531938037588381e-05, "loss": 2.3827, "mean_token_accuracy": 0.41724138259887694, "step": 15210 }, { "epoch": 0.015324709595997748, "grad_norm": 62.22562315211066, "learning_rate": 1.5324416332614872e-05, "loss": 2.1067, "mean_token_accuracy": 0.4758620738983154, "step": 15215 }, { "epoch": 0.01532974564910192, "grad_norm": 33.307248233027074, "learning_rate": 1.532945228934593e-05, "loss": 2.9393, "mean_token_accuracy": 0.31379309892654417, "step": 15220 }, { "epoch": 0.015334781702206094, "grad_norm": 22.824698164028327, "learning_rate": 1.533448824607699e-05, "loss": 2.1646, "mean_token_accuracy": 0.42758620977401735, "step": 15225 }, { "epoch": 0.015339817755310266, "grad_norm": 39.60586148444596, "learning_rate": 1.533952420280805e-05, "loss": 2.4388, "mean_token_accuracy": 0.4034482717514038, "step": 15230 }, { "epoch": 0.015344853808414438, "grad_norm": 21.514431137827334, "learning_rate": 1.534456015953911e-05, "loss": 2.54, "mean_token_accuracy": 0.42068966031074523, "step": 15235 }, { "epoch": 0.015349889861518612, "grad_norm": 26.501008741817458, "learning_rate": 1.534959611627017e-05, "loss": 2.2307, "mean_token_accuracy": 0.4655172348022461, "step": 15240 }, { "epoch": 0.015354925914622784, "grad_norm": 30.964055113230447, "learning_rate": 1.535463207300123e-05, "loss": 2.4714, "mean_token_accuracy": 0.40689654350280763, "step": 15245 }, { "epoch": 0.015359961967726958, "grad_norm": 23.609834060334986, "learning_rate": 1.5359668029732287e-05, "loss": 2.4079, "mean_token_accuracy": 0.42068966031074523, "step": 15250 }, { "epoch": 0.01536499802083113, "grad_norm": 31.221222286855603, "learning_rate": 1.536470398646335e-05, "loss": 2.4158, "mean_token_accuracy": 0.4344827651977539, "step": 15255 }, { "epoch": 0.015370034073935303, "grad_norm": 26.52871738394261, "learning_rate": 1.536973994319441e-05, "loss": 2.5322, "mean_token_accuracy": 0.4620689690113068, "step": 15260 }, { "epoch": 0.015375070127039476, "grad_norm": 27.420888859961313, "learning_rate": 1.537477589992547e-05, "loss": 2.5503, "mean_token_accuracy": 0.41034482717514037, "step": 15265 }, { "epoch": 0.015380106180143648, "grad_norm": 27.623311514977516, "learning_rate": 1.5379811856656527e-05, "loss": 2.6551, "mean_token_accuracy": 0.37241379022598264, "step": 15270 }, { "epoch": 0.015385142233247821, "grad_norm": 28.179215674514353, "learning_rate": 1.5384847813387586e-05, "loss": 2.2328, "mean_token_accuracy": 0.43932244181632996, "step": 15275 }, { "epoch": 0.015390178286351993, "grad_norm": 23.4748892846769, "learning_rate": 1.538988377011865e-05, "loss": 2.2666, "mean_token_accuracy": 0.4517241358757019, "step": 15280 }, { "epoch": 0.015395214339456167, "grad_norm": 60.65185448627418, "learning_rate": 1.5394919726849708e-05, "loss": 2.6417, "mean_token_accuracy": 0.3827586233615875, "step": 15285 }, { "epoch": 0.01540025039256034, "grad_norm": 25.402593201443093, "learning_rate": 1.5399955683580768e-05, "loss": 2.5087, "mean_token_accuracy": 0.3931034505367279, "step": 15290 }, { "epoch": 0.015405286445664513, "grad_norm": 27.022524882760127, "learning_rate": 1.5404991640311827e-05, "loss": 2.4856, "mean_token_accuracy": 0.3931034505367279, "step": 15295 }, { "epoch": 0.015410322498768685, "grad_norm": 26.340769506923586, "learning_rate": 1.5410027597042886e-05, "loss": 2.478, "mean_token_accuracy": 0.4068965494632721, "step": 15300 }, { "epoch": 0.015415358551872857, "grad_norm": 31.93989389902504, "learning_rate": 1.541506355377395e-05, "loss": 2.2626, "mean_token_accuracy": 0.4931034505367279, "step": 15305 }, { "epoch": 0.01542039460497703, "grad_norm": 25.94858099628174, "learning_rate": 1.5420099510505008e-05, "loss": 2.4934, "mean_token_accuracy": 0.4051421582698822, "step": 15310 }, { "epoch": 0.015425430658081203, "grad_norm": 38.07859351500177, "learning_rate": 1.5425135467236064e-05, "loss": 2.3503, "mean_token_accuracy": 0.4620689630508423, "step": 15315 }, { "epoch": 0.015430466711185377, "grad_norm": 33.9922370450758, "learning_rate": 1.5430171423967126e-05, "loss": 2.5306, "mean_token_accuracy": 0.3965517282485962, "step": 15320 }, { "epoch": 0.015435502764289549, "grad_norm": 27.94677952657774, "learning_rate": 1.5435207380698186e-05, "loss": 2.7147, "mean_token_accuracy": 0.320689657330513, "step": 15325 }, { "epoch": 0.015440538817393722, "grad_norm": 26.254540596488283, "learning_rate": 1.5440243337429245e-05, "loss": 2.3457, "mean_token_accuracy": 0.4344827592372894, "step": 15330 }, { "epoch": 0.015445574870497894, "grad_norm": 26.858089823420904, "learning_rate": 1.5445279294160304e-05, "loss": 2.5259, "mean_token_accuracy": 0.3896551728248596, "step": 15335 }, { "epoch": 0.015450610923602066, "grad_norm": 25.678576939756653, "learning_rate": 1.5450315250891363e-05, "loss": 2.3543, "mean_token_accuracy": 0.4344827651977539, "step": 15340 }, { "epoch": 0.01545564697670624, "grad_norm": 26.635519574655135, "learning_rate": 1.5455351207622426e-05, "loss": 2.2627, "mean_token_accuracy": 0.4172413766384125, "step": 15345 }, { "epoch": 0.015460683029810412, "grad_norm": 22.938107403922594, "learning_rate": 1.5460387164353485e-05, "loss": 2.3581, "mean_token_accuracy": 0.4137930989265442, "step": 15350 }, { "epoch": 0.015465719082914586, "grad_norm": 30.32331116088612, "learning_rate": 1.5465423121084545e-05, "loss": 2.4135, "mean_token_accuracy": 0.39655172228813174, "step": 15355 }, { "epoch": 0.015470755136018758, "grad_norm": 29.976195194096814, "learning_rate": 1.5470459077815604e-05, "loss": 2.313, "mean_token_accuracy": 0.39655172228813174, "step": 15360 }, { "epoch": 0.015475791189122932, "grad_norm": 23.365203161684644, "learning_rate": 1.5475495034546663e-05, "loss": 2.4102, "mean_token_accuracy": 0.4586206912994385, "step": 15365 }, { "epoch": 0.015480827242227104, "grad_norm": 29.183489873934562, "learning_rate": 1.5480530991277722e-05, "loss": 2.1398, "mean_token_accuracy": 0.4310344934463501, "step": 15370 }, { "epoch": 0.015485863295331276, "grad_norm": 27.940375902532523, "learning_rate": 1.5485566948008785e-05, "loss": 2.1217, "mean_token_accuracy": 0.4620689630508423, "step": 15375 }, { "epoch": 0.01549089934843545, "grad_norm": 32.49007707356087, "learning_rate": 1.5490602904739844e-05, "loss": 2.2558, "mean_token_accuracy": 0.4620689630508423, "step": 15380 }, { "epoch": 0.015495935401539622, "grad_norm": 25.252953640610716, "learning_rate": 1.5495638861470903e-05, "loss": 2.2966, "mean_token_accuracy": 0.40689654350280763, "step": 15385 }, { "epoch": 0.015500971454643795, "grad_norm": 27.663082739947956, "learning_rate": 1.5500674818201963e-05, "loss": 2.4797, "mean_token_accuracy": 0.4034482777118683, "step": 15390 }, { "epoch": 0.015506007507747967, "grad_norm": 43.82563328270075, "learning_rate": 1.5505710774933022e-05, "loss": 2.2844, "mean_token_accuracy": 0.4137930989265442, "step": 15395 }, { "epoch": 0.015511043560852141, "grad_norm": 29.145020488591964, "learning_rate": 1.5510746731664085e-05, "loss": 2.0937, "mean_token_accuracy": 0.47241379618644713, "step": 15400 }, { "epoch": 0.015516079613956313, "grad_norm": 22.83227662007667, "learning_rate": 1.551578268839514e-05, "loss": 2.2476, "mean_token_accuracy": 0.4517241358757019, "step": 15405 }, { "epoch": 0.015521115667060485, "grad_norm": 26.132016761693365, "learning_rate": 1.55208186451262e-05, "loss": 2.3593, "mean_token_accuracy": 0.441379314661026, "step": 15410 }, { "epoch": 0.015526151720164659, "grad_norm": 19.341227683769997, "learning_rate": 1.5525854601857262e-05, "loss": 2.1409, "mean_token_accuracy": 0.4931034445762634, "step": 15415 }, { "epoch": 0.015531187773268831, "grad_norm": 29.721415781920566, "learning_rate": 1.553089055858832e-05, "loss": 2.4372, "mean_token_accuracy": 0.417241370677948, "step": 15420 }, { "epoch": 0.015536223826373005, "grad_norm": 22.50283103005277, "learning_rate": 1.553592651531938e-05, "loss": 2.2973, "mean_token_accuracy": 0.4448275864124298, "step": 15425 }, { "epoch": 0.015541259879477177, "grad_norm": 23.985986542567865, "learning_rate": 1.554096247205044e-05, "loss": 2.2876, "mean_token_accuracy": 0.46551724076271056, "step": 15430 }, { "epoch": 0.01554629593258135, "grad_norm": 25.27321331074455, "learning_rate": 1.55459984287815e-05, "loss": 2.2492, "mean_token_accuracy": 0.47586206793785096, "step": 15435 }, { "epoch": 0.015551331985685523, "grad_norm": 29.489918811966035, "learning_rate": 1.5551034385512562e-05, "loss": 2.0868, "mean_token_accuracy": 0.48337438702583313, "step": 15440 }, { "epoch": 0.015556368038789695, "grad_norm": 24.33428711106757, "learning_rate": 1.555607034224362e-05, "loss": 2.4336, "mean_token_accuracy": 0.39310343861579894, "step": 15445 }, { "epoch": 0.015561404091893868, "grad_norm": 28.539339088876677, "learning_rate": 1.5561106298974677e-05, "loss": 2.4891, "mean_token_accuracy": 0.38620689511299133, "step": 15450 }, { "epoch": 0.01556644014499804, "grad_norm": 24.705792318123912, "learning_rate": 1.556614225570574e-05, "loss": 2.2383, "mean_token_accuracy": 0.43103447556495667, "step": 15455 }, { "epoch": 0.015571476198102214, "grad_norm": 21.822106373407035, "learning_rate": 1.55711782124368e-05, "loss": 2.0719, "mean_token_accuracy": 0.48965516686439514, "step": 15460 }, { "epoch": 0.015576512251206386, "grad_norm": 31.90504845301236, "learning_rate": 1.5576214169167862e-05, "loss": 2.2195, "mean_token_accuracy": 0.4862068951129913, "step": 15465 }, { "epoch": 0.01558154830431056, "grad_norm": 27.086377007185842, "learning_rate": 1.5581250125898918e-05, "loss": 2.2882, "mean_token_accuracy": 0.45862067937850953, "step": 15470 }, { "epoch": 0.015586584357414732, "grad_norm": 22.299810823910082, "learning_rate": 1.5586286082629977e-05, "loss": 2.3129, "mean_token_accuracy": 0.4758620738983154, "step": 15475 }, { "epoch": 0.015591620410518904, "grad_norm": 25.69934904554766, "learning_rate": 1.559132203936104e-05, "loss": 2.4365, "mean_token_accuracy": 0.39310345649719236, "step": 15480 }, { "epoch": 0.015596656463623078, "grad_norm": 33.522117494510724, "learning_rate": 1.55963579960921e-05, "loss": 2.3073, "mean_token_accuracy": 0.46896551847457885, "step": 15485 }, { "epoch": 0.01560169251672725, "grad_norm": 31.277892949129864, "learning_rate": 1.5601393952823158e-05, "loss": 2.3262, "mean_token_accuracy": 0.4275861978530884, "step": 15490 }, { "epoch": 0.015606728569831424, "grad_norm": 28.30338192441034, "learning_rate": 1.5606429909554217e-05, "loss": 2.4136, "mean_token_accuracy": 0.39999998807907106, "step": 15495 }, { "epoch": 0.015611764622935596, "grad_norm": 26.097227251949477, "learning_rate": 1.5611465866285276e-05, "loss": 2.1959, "mean_token_accuracy": 0.46896552443504336, "step": 15500 }, { "epoch": 0.01561680067603977, "grad_norm": 26.142762877336704, "learning_rate": 1.561650182301634e-05, "loss": 2.481, "mean_token_accuracy": 0.4344827592372894, "step": 15505 }, { "epoch": 0.015621836729143941, "grad_norm": 34.112400762998185, "learning_rate": 1.56215377797474e-05, "loss": 2.5313, "mean_token_accuracy": 0.38620689511299133, "step": 15510 }, { "epoch": 0.015626872782248113, "grad_norm": 35.81910547070459, "learning_rate": 1.5626573736478454e-05, "loss": 2.509, "mean_token_accuracy": 0.4206896543502808, "step": 15515 }, { "epoch": 0.015631908835352287, "grad_norm": 26.876770466531983, "learning_rate": 1.5631609693209517e-05, "loss": 1.9901, "mean_token_accuracy": 0.4689655125141144, "step": 15520 }, { "epoch": 0.01563694488845646, "grad_norm": 30.964368393603834, "learning_rate": 1.5636645649940576e-05, "loss": 2.2874, "mean_token_accuracy": 0.45517241954803467, "step": 15525 }, { "epoch": 0.01564198094156063, "grad_norm": 27.4545831418006, "learning_rate": 1.5641681606671635e-05, "loss": 2.2759, "mean_token_accuracy": 0.4379310250282288, "step": 15530 }, { "epoch": 0.015647016994664805, "grad_norm": 24.67646371041598, "learning_rate": 1.5646717563402698e-05, "loss": 2.2276, "mean_token_accuracy": 0.482758617401123, "step": 15535 }, { "epoch": 0.01565205304776898, "grad_norm": 33.70781072486016, "learning_rate": 1.5651753520133754e-05, "loss": 2.5344, "mean_token_accuracy": 0.4586206912994385, "step": 15540 }, { "epoch": 0.015657089100873153, "grad_norm": 25.208622671873613, "learning_rate": 1.5656789476864817e-05, "loss": 2.6548, "mean_token_accuracy": 0.3896551728248596, "step": 15545 }, { "epoch": 0.015662125153977323, "grad_norm": 30.92790838738424, "learning_rate": 1.5661825433595876e-05, "loss": 3.016, "mean_token_accuracy": 0.37241379618644715, "step": 15550 }, { "epoch": 0.015667161207081497, "grad_norm": 24.351642725436907, "learning_rate": 1.5666861390326935e-05, "loss": 1.849, "mean_token_accuracy": 0.5224440395832062, "step": 15555 }, { "epoch": 0.01567219726018567, "grad_norm": 20.30646079720107, "learning_rate": 1.5671897347057994e-05, "loss": 2.2394, "mean_token_accuracy": 0.42758620381355283, "step": 15560 }, { "epoch": 0.01567723331328984, "grad_norm": 28.533673083955637, "learning_rate": 1.5676933303789054e-05, "loss": 2.569, "mean_token_accuracy": 0.41379310488700866, "step": 15565 }, { "epoch": 0.015682269366394015, "grad_norm": 24.948961473221317, "learning_rate": 1.5681969260520116e-05, "loss": 2.4286, "mean_token_accuracy": 0.4206896543502808, "step": 15570 }, { "epoch": 0.01568730541949819, "grad_norm": 24.21568089839342, "learning_rate": 1.5687005217251175e-05, "loss": 1.9818, "mean_token_accuracy": 0.4448275864124298, "step": 15575 }, { "epoch": 0.01569234147260236, "grad_norm": 31.979436338504815, "learning_rate": 1.5692041173982235e-05, "loss": 2.5227, "mean_token_accuracy": 0.40344826579093934, "step": 15580 }, { "epoch": 0.015697377525706532, "grad_norm": 24.92234295687272, "learning_rate": 1.5697077130713294e-05, "loss": 2.5614, "mean_token_accuracy": 0.39310344457626345, "step": 15585 }, { "epoch": 0.015702413578810706, "grad_norm": 20.689721415656155, "learning_rate": 1.5702113087444353e-05, "loss": 2.1045, "mean_token_accuracy": 0.5068965494632721, "step": 15590 }, { "epoch": 0.01570744963191488, "grad_norm": 28.801084846283672, "learning_rate": 1.5707149044175412e-05, "loss": 2.5776, "mean_token_accuracy": 0.37931033968925476, "step": 15595 }, { "epoch": 0.01571248568501905, "grad_norm": 26.269572739616184, "learning_rate": 1.5712185000906475e-05, "loss": 2.2005, "mean_token_accuracy": 0.45517241954803467, "step": 15600 }, { "epoch": 0.015717521738123224, "grad_norm": 29.64383672673831, "learning_rate": 1.571722095763753e-05, "loss": 2.3336, "mean_token_accuracy": 0.43103447556495667, "step": 15605 }, { "epoch": 0.015722557791227398, "grad_norm": 21.780053690574633, "learning_rate": 1.5722256914368594e-05, "loss": 2.4922, "mean_token_accuracy": 0.38275861740112305, "step": 15610 }, { "epoch": 0.015727593844331568, "grad_norm": 27.077007276609294, "learning_rate": 1.5727292871099653e-05, "loss": 2.5873, "mean_token_accuracy": 0.3724137932062149, "step": 15615 }, { "epoch": 0.015732629897435742, "grad_norm": 23.68917863219754, "learning_rate": 1.5732328827830712e-05, "loss": 2.4491, "mean_token_accuracy": 0.4354679822921753, "step": 15620 }, { "epoch": 0.015737665950539916, "grad_norm": 18.837769585846583, "learning_rate": 1.573736478456177e-05, "loss": 2.5415, "mean_token_accuracy": 0.43793103098869324, "step": 15625 }, { "epoch": 0.01574270200364409, "grad_norm": 29.55895038432199, "learning_rate": 1.574240074129283e-05, "loss": 2.3969, "mean_token_accuracy": 0.41379310488700866, "step": 15630 }, { "epoch": 0.01574773805674826, "grad_norm": 23.305301382521773, "learning_rate": 1.574743669802389e-05, "loss": 2.2443, "mean_token_accuracy": 0.5137930929660797, "step": 15635 }, { "epoch": 0.015752774109852433, "grad_norm": 22.872085066464045, "learning_rate": 1.5752472654754952e-05, "loss": 2.4616, "mean_token_accuracy": 0.44482759237289426, "step": 15640 }, { "epoch": 0.015757810162956607, "grad_norm": 31.678905506034983, "learning_rate": 1.5757508611486012e-05, "loss": 2.8223, "mean_token_accuracy": 0.35983060896396635, "step": 15645 }, { "epoch": 0.015762846216060777, "grad_norm": 26.29372465340689, "learning_rate": 1.576254456821707e-05, "loss": 2.111, "mean_token_accuracy": 0.47931034564971925, "step": 15650 }, { "epoch": 0.01576788226916495, "grad_norm": 24.979648008473138, "learning_rate": 1.576758052494813e-05, "loss": 2.3929, "mean_token_accuracy": 0.4360556542873383, "step": 15655 }, { "epoch": 0.015772918322269125, "grad_norm": 20.083755440355905, "learning_rate": 1.577261648167919e-05, "loss": 2.1284, "mean_token_accuracy": 0.4931034505367279, "step": 15660 }, { "epoch": 0.0157779543753733, "grad_norm": 25.446001775037097, "learning_rate": 1.5777652438410252e-05, "loss": 2.8251, "mean_token_accuracy": 0.3655172407627106, "step": 15665 }, { "epoch": 0.01578299042847747, "grad_norm": 24.14010394513108, "learning_rate": 1.5782688395141308e-05, "loss": 2.1684, "mean_token_accuracy": 0.4918935298919678, "step": 15670 }, { "epoch": 0.015788026481581643, "grad_norm": 40.70765718398589, "learning_rate": 1.5787724351872367e-05, "loss": 2.4998, "mean_token_accuracy": 0.4379310369491577, "step": 15675 }, { "epoch": 0.015793062534685817, "grad_norm": 46.79036510826806, "learning_rate": 1.579276030860343e-05, "loss": 2.6341, "mean_token_accuracy": 0.4379310369491577, "step": 15680 }, { "epoch": 0.015798098587789987, "grad_norm": 25.23462316227281, "learning_rate": 1.579779626533449e-05, "loss": 2.1441, "mean_token_accuracy": 0.46067755222320556, "step": 15685 }, { "epoch": 0.01580313464089416, "grad_norm": 27.96837390920119, "learning_rate": 1.580283222206555e-05, "loss": 2.5682, "mean_token_accuracy": 0.4068965494632721, "step": 15690 }, { "epoch": 0.015808170693998334, "grad_norm": 28.13620690300876, "learning_rate": 1.5807868178796608e-05, "loss": 2.6107, "mean_token_accuracy": 0.3931034505367279, "step": 15695 }, { "epoch": 0.015813206747102508, "grad_norm": 26.96325482019206, "learning_rate": 1.5812904135527667e-05, "loss": 2.4025, "mean_token_accuracy": 0.4379310220479965, "step": 15700 }, { "epoch": 0.01581824280020668, "grad_norm": 30.18764897032944, "learning_rate": 1.581794009225873e-05, "loss": 2.728, "mean_token_accuracy": 0.37931033968925476, "step": 15705 }, { "epoch": 0.015823278853310852, "grad_norm": 20.77718634717397, "learning_rate": 1.582297604898979e-05, "loss": 2.3238, "mean_token_accuracy": 0.458620685338974, "step": 15710 }, { "epoch": 0.015828314906415026, "grad_norm": 23.93752695363553, "learning_rate": 1.5828012005720848e-05, "loss": 2.3833, "mean_token_accuracy": 0.4758620738983154, "step": 15715 }, { "epoch": 0.015833350959519196, "grad_norm": 25.19312957998834, "learning_rate": 1.5833047962451907e-05, "loss": 2.05, "mean_token_accuracy": 0.47410768866539, "step": 15720 }, { "epoch": 0.01583838701262337, "grad_norm": 28.501528142587986, "learning_rate": 1.5838083919182967e-05, "loss": 2.3783, "mean_token_accuracy": 0.43103448748588563, "step": 15725 }, { "epoch": 0.015843423065727544, "grad_norm": 23.766406540567694, "learning_rate": 1.584311987591403e-05, "loss": 2.4806, "mean_token_accuracy": 0.41899576783180237, "step": 15730 }, { "epoch": 0.015848459118831718, "grad_norm": 27.67939898804874, "learning_rate": 1.584815583264509e-05, "loss": 2.2029, "mean_token_accuracy": 0.458620685338974, "step": 15735 }, { "epoch": 0.015853495171935888, "grad_norm": 24.19068887361558, "learning_rate": 1.5853191789376144e-05, "loss": 2.2707, "mean_token_accuracy": 0.4172413766384125, "step": 15740 }, { "epoch": 0.01585853122504006, "grad_norm": 26.408644458985073, "learning_rate": 1.5858227746107207e-05, "loss": 2.4602, "mean_token_accuracy": 0.35172413289546967, "step": 15745 }, { "epoch": 0.015863567278144235, "grad_norm": 18.221737377816677, "learning_rate": 1.5863263702838266e-05, "loss": 2.1366, "mean_token_accuracy": 0.49655171632766726, "step": 15750 }, { "epoch": 0.015868603331248406, "grad_norm": 25.412962717365165, "learning_rate": 1.5868299659569325e-05, "loss": 2.1603, "mean_token_accuracy": 0.4724137902259827, "step": 15755 }, { "epoch": 0.01587363938435258, "grad_norm": 29.229194550437484, "learning_rate": 1.5873335616300385e-05, "loss": 2.4608, "mean_token_accuracy": 0.4068965494632721, "step": 15760 }, { "epoch": 0.015878675437456753, "grad_norm": 24.21488829858154, "learning_rate": 1.5878371573031444e-05, "loss": 2.1328, "mean_token_accuracy": 0.48965516686439514, "step": 15765 }, { "epoch": 0.015883711490560927, "grad_norm": 29.291146162527546, "learning_rate": 1.5883407529762507e-05, "loss": 2.6086, "mean_token_accuracy": 0.4068965554237366, "step": 15770 }, { "epoch": 0.015888747543665097, "grad_norm": 35.60298936591672, "learning_rate": 1.5888443486493566e-05, "loss": 2.7648, "mean_token_accuracy": 0.4, "step": 15775 }, { "epoch": 0.01589378359676927, "grad_norm": 23.381797296031145, "learning_rate": 1.5893479443224625e-05, "loss": 2.3552, "mean_token_accuracy": 0.4655172348022461, "step": 15780 }, { "epoch": 0.015898819649873445, "grad_norm": 25.299462976071695, "learning_rate": 1.5898515399955684e-05, "loss": 2.4132, "mean_token_accuracy": 0.4517241358757019, "step": 15785 }, { "epoch": 0.015903855702977615, "grad_norm": 25.921225227680083, "learning_rate": 1.5903551356686744e-05, "loss": 2.2967, "mean_token_accuracy": 0.45246304869651793, "step": 15790 }, { "epoch": 0.01590889175608179, "grad_norm": 29.139199157012364, "learning_rate": 1.5908587313417803e-05, "loss": 2.3918, "mean_token_accuracy": 0.4083484590053558, "step": 15795 }, { "epoch": 0.015913927809185963, "grad_norm": 22.595354253050626, "learning_rate": 1.5913623270148866e-05, "loss": 2.4594, "mean_token_accuracy": 0.4310344934463501, "step": 15800 }, { "epoch": 0.015918963862290136, "grad_norm": 26.59467412170808, "learning_rate": 1.591865922687992e-05, "loss": 2.273, "mean_token_accuracy": 0.4221415579319, "step": 15805 }, { "epoch": 0.015923999915394307, "grad_norm": 31.676023878385156, "learning_rate": 1.5923695183610984e-05, "loss": 2.6697, "mean_token_accuracy": 0.36896551251411436, "step": 15810 }, { "epoch": 0.01592903596849848, "grad_norm": 22.191416333855827, "learning_rate": 1.5928731140342043e-05, "loss": 2.0939, "mean_token_accuracy": 0.46982758641242983, "step": 15815 }, { "epoch": 0.015934072021602654, "grad_norm": 23.957835292298817, "learning_rate": 1.5933767097073103e-05, "loss": 2.3493, "mean_token_accuracy": 0.4310344815254211, "step": 15820 }, { "epoch": 0.015939108074706825, "grad_norm": 22.71593701763225, "learning_rate": 1.5938803053804162e-05, "loss": 2.382, "mean_token_accuracy": 0.4528325080871582, "step": 15825 }, { "epoch": 0.015944144127811, "grad_norm": 24.683184400372657, "learning_rate": 1.594383901053522e-05, "loss": 2.4161, "mean_token_accuracy": 0.3931034475564957, "step": 15830 }, { "epoch": 0.015949180180915172, "grad_norm": 32.61330315995067, "learning_rate": 1.594887496726628e-05, "loss": 2.2772, "mean_token_accuracy": 0.4620689630508423, "step": 15835 }, { "epoch": 0.015954216234019346, "grad_norm": 20.358648699825046, "learning_rate": 1.5953910923997343e-05, "loss": 2.2344, "mean_token_accuracy": 0.4068965494632721, "step": 15840 }, { "epoch": 0.015959252287123516, "grad_norm": 25.81643722807954, "learning_rate": 1.5958946880728402e-05, "loss": 2.4167, "mean_token_accuracy": 0.4344827592372894, "step": 15845 }, { "epoch": 0.01596428834022769, "grad_norm": 24.8563934157008, "learning_rate": 1.596398283745946e-05, "loss": 2.3589, "mean_token_accuracy": 0.4344827592372894, "step": 15850 }, { "epoch": 0.015969324393331864, "grad_norm": 35.92665779226816, "learning_rate": 1.596901879419052e-05, "loss": 2.564, "mean_token_accuracy": 0.4034482717514038, "step": 15855 }, { "epoch": 0.015974360446436034, "grad_norm": 27.996171339057042, "learning_rate": 1.597405475092158e-05, "loss": 2.6749, "mean_token_accuracy": 0.40859044194221494, "step": 15860 }, { "epoch": 0.015979396499540208, "grad_norm": 40.1817657857375, "learning_rate": 1.5979090707652643e-05, "loss": 2.3577, "mean_token_accuracy": 0.4379310369491577, "step": 15865 }, { "epoch": 0.01598443255264438, "grad_norm": 23.95405503722282, "learning_rate": 1.59841266643837e-05, "loss": 2.2048, "mean_token_accuracy": 0.4551724076271057, "step": 15870 }, { "epoch": 0.015989468605748555, "grad_norm": 28.213042977269204, "learning_rate": 1.5989162621114758e-05, "loss": 2.3296, "mean_token_accuracy": 0.45408348441123964, "step": 15875 }, { "epoch": 0.015994504658852726, "grad_norm": 26.55424489285476, "learning_rate": 1.599419857784582e-05, "loss": 2.6034, "mean_token_accuracy": 0.42068966031074523, "step": 15880 }, { "epoch": 0.0159995407119569, "grad_norm": 22.234568172741824, "learning_rate": 1.599923453457688e-05, "loss": 2.5275, "mean_token_accuracy": 0.41034482717514037, "step": 15885 }, { "epoch": 0.016004576765061073, "grad_norm": 23.47598604754119, "learning_rate": 1.600427049130794e-05, "loss": 2.1826, "mean_token_accuracy": 0.4137930929660797, "step": 15890 }, { "epoch": 0.016009612818165243, "grad_norm": 24.63537524759791, "learning_rate": 1.6009306448038998e-05, "loss": 2.1615, "mean_token_accuracy": 0.4465819776058197, "step": 15895 }, { "epoch": 0.016014648871269417, "grad_norm": 24.126278888168514, "learning_rate": 1.6014342404770057e-05, "loss": 2.5908, "mean_token_accuracy": 0.4053236603736877, "step": 15900 }, { "epoch": 0.01601968492437359, "grad_norm": 25.096587721640475, "learning_rate": 1.601937836150112e-05, "loss": 2.6339, "mean_token_accuracy": 0.334482753276825, "step": 15905 }, { "epoch": 0.016024720977477765, "grad_norm": 23.967545157840846, "learning_rate": 1.602441431823218e-05, "loss": 2.185, "mean_token_accuracy": 0.48784029483795166, "step": 15910 }, { "epoch": 0.016029757030581935, "grad_norm": 35.22146616714449, "learning_rate": 1.602945027496324e-05, "loss": 2.5509, "mean_token_accuracy": 0.40689654350280763, "step": 15915 }, { "epoch": 0.01603479308368611, "grad_norm": 27.901641006268484, "learning_rate": 1.6034486231694298e-05, "loss": 2.5059, "mean_token_accuracy": 0.4034482777118683, "step": 15920 }, { "epoch": 0.016039829136790282, "grad_norm": 25.70249742300798, "learning_rate": 1.6039522188425357e-05, "loss": 2.5246, "mean_token_accuracy": 0.4310344815254211, "step": 15925 }, { "epoch": 0.016044865189894453, "grad_norm": 24.771958903437202, "learning_rate": 1.604455814515642e-05, "loss": 2.5717, "mean_token_accuracy": 0.41724138259887694, "step": 15930 }, { "epoch": 0.016049901242998627, "grad_norm": 29.341028345696607, "learning_rate": 1.604959410188748e-05, "loss": 2.3848, "mean_token_accuracy": 0.43333333134651186, "step": 15935 }, { "epoch": 0.0160549372961028, "grad_norm": 27.489158950671673, "learning_rate": 1.6054630058618535e-05, "loss": 2.6895, "mean_token_accuracy": 0.37586207389831544, "step": 15940 }, { "epoch": 0.016059973349206974, "grad_norm": 26.6104073922447, "learning_rate": 1.6059666015349597e-05, "loss": 2.4997, "mean_token_accuracy": 0.37931033968925476, "step": 15945 }, { "epoch": 0.016065009402311144, "grad_norm": 24.3907766332011, "learning_rate": 1.6064701972080657e-05, "loss": 2.1662, "mean_token_accuracy": 0.46896551847457885, "step": 15950 }, { "epoch": 0.016070045455415318, "grad_norm": 21.092949351361437, "learning_rate": 1.606973792881172e-05, "loss": 2.2405, "mean_token_accuracy": 0.4379310369491577, "step": 15955 }, { "epoch": 0.016075081508519492, "grad_norm": 27.576884189386345, "learning_rate": 1.6074773885542775e-05, "loss": 2.513, "mean_token_accuracy": 0.45359951853752134, "step": 15960 }, { "epoch": 0.016080117561623662, "grad_norm": 24.217079033603405, "learning_rate": 1.6079809842273834e-05, "loss": 2.132, "mean_token_accuracy": 0.4724137902259827, "step": 15965 }, { "epoch": 0.016085153614727836, "grad_norm": 32.857801870504055, "learning_rate": 1.6084845799004897e-05, "loss": 2.3211, "mean_token_accuracy": 0.43103447556495667, "step": 15970 }, { "epoch": 0.01609018966783201, "grad_norm": 26.15977713710439, "learning_rate": 1.6089881755735956e-05, "loss": 2.0777, "mean_token_accuracy": 0.4310344815254211, "step": 15975 }, { "epoch": 0.016095225720936183, "grad_norm": 29.713938267381558, "learning_rate": 1.6094917712467016e-05, "loss": 2.5705, "mean_token_accuracy": 0.3931034475564957, "step": 15980 }, { "epoch": 0.016100261774040354, "grad_norm": 27.288597723397817, "learning_rate": 1.6099953669198075e-05, "loss": 2.1992, "mean_token_accuracy": 0.4103448331356049, "step": 15985 }, { "epoch": 0.016105297827144528, "grad_norm": 25.287181102055378, "learning_rate": 1.6104989625929134e-05, "loss": 2.2514, "mean_token_accuracy": 0.4379310369491577, "step": 15990 }, { "epoch": 0.0161103338802487, "grad_norm": 26.393151576672576, "learning_rate": 1.6110025582660197e-05, "loss": 2.3617, "mean_token_accuracy": 0.42758620977401735, "step": 15995 }, { "epoch": 0.01611536993335287, "grad_norm": 35.85765610785674, "learning_rate": 1.6115061539391256e-05, "loss": 2.1501, "mean_token_accuracy": 0.46551724672317507, "step": 16000 }, { "epoch": 0.016120405986457045, "grad_norm": 19.665480402671054, "learning_rate": 1.6120097496122312e-05, "loss": 2.3023, "mean_token_accuracy": 0.41034482717514037, "step": 16005 }, { "epoch": 0.01612544203956122, "grad_norm": 31.828767051237428, "learning_rate": 1.6125133452853374e-05, "loss": 2.4721, "mean_token_accuracy": 0.4241379380226135, "step": 16010 }, { "epoch": 0.016130478092665393, "grad_norm": 20.483178545507872, "learning_rate": 1.6130169409584434e-05, "loss": 2.2288, "mean_token_accuracy": 0.4689655125141144, "step": 16015 }, { "epoch": 0.016135514145769563, "grad_norm": 23.6914917792498, "learning_rate": 1.6135205366315493e-05, "loss": 2.6866, "mean_token_accuracy": 0.3517241418361664, "step": 16020 }, { "epoch": 0.016140550198873737, "grad_norm": 23.035257388334024, "learning_rate": 1.6140241323046552e-05, "loss": 2.3521, "mean_token_accuracy": 0.43103448748588563, "step": 16025 }, { "epoch": 0.01614558625197791, "grad_norm": 28.04894710546101, "learning_rate": 1.614527727977761e-05, "loss": 2.6548, "mean_token_accuracy": 0.3620689570903778, "step": 16030 }, { "epoch": 0.01615062230508208, "grad_norm": 26.162963037248577, "learning_rate": 1.6150313236508674e-05, "loss": 2.4051, "mean_token_accuracy": 0.38620689511299133, "step": 16035 }, { "epoch": 0.016155658358186255, "grad_norm": 18.915327482976778, "learning_rate": 1.6155349193239733e-05, "loss": 2.4538, "mean_token_accuracy": 0.43986691236495973, "step": 16040 }, { "epoch": 0.01616069441129043, "grad_norm": 20.01961908454445, "learning_rate": 1.6160385149970793e-05, "loss": 2.3729, "mean_token_accuracy": 0.4068965494632721, "step": 16045 }, { "epoch": 0.016165730464394602, "grad_norm": 25.79690408161775, "learning_rate": 1.6165421106701852e-05, "loss": 2.6078, "mean_token_accuracy": 0.32413792610168457, "step": 16050 }, { "epoch": 0.016170766517498773, "grad_norm": 30.41181267883847, "learning_rate": 1.617045706343291e-05, "loss": 2.2503, "mean_token_accuracy": 0.46551724076271056, "step": 16055 }, { "epoch": 0.016175802570602946, "grad_norm": 22.211689171292427, "learning_rate": 1.617549302016397e-05, "loss": 2.1925, "mean_token_accuracy": 0.4413793087005615, "step": 16060 }, { "epoch": 0.01618083862370712, "grad_norm": 25.39401378285009, "learning_rate": 1.6180528976895033e-05, "loss": 2.5749, "mean_token_accuracy": 0.3482758581638336, "step": 16065 }, { "epoch": 0.01618587467681129, "grad_norm": 26.027902502935834, "learning_rate": 1.6185564933626092e-05, "loss": 2.2322, "mean_token_accuracy": 0.4586206912994385, "step": 16070 }, { "epoch": 0.016190910729915464, "grad_norm": 27.80210475261293, "learning_rate": 1.619060089035715e-05, "loss": 2.8137, "mean_token_accuracy": 0.3620689630508423, "step": 16075 }, { "epoch": 0.016195946783019638, "grad_norm": 31.22774236205814, "learning_rate": 1.619563684708821e-05, "loss": 2.6171, "mean_token_accuracy": 0.3931034505367279, "step": 16080 }, { "epoch": 0.016200982836123812, "grad_norm": 26.954897007995886, "learning_rate": 1.620067280381927e-05, "loss": 2.751, "mean_token_accuracy": 0.4413793087005615, "step": 16085 }, { "epoch": 0.016206018889227982, "grad_norm": 19.33062368570597, "learning_rate": 1.6205708760550333e-05, "loss": 2.3301, "mean_token_accuracy": 0.43103448748588563, "step": 16090 }, { "epoch": 0.016211054942332156, "grad_norm": 24.381737230547273, "learning_rate": 1.621074471728139e-05, "loss": 2.2871, "mean_token_accuracy": 0.42413793206214906, "step": 16095 }, { "epoch": 0.01621609099543633, "grad_norm": 23.581354748241136, "learning_rate": 1.6215780674012448e-05, "loss": 2.4416, "mean_token_accuracy": 0.38620689511299133, "step": 16100 }, { "epoch": 0.0162211270485405, "grad_norm": 23.858839747082953, "learning_rate": 1.622081663074351e-05, "loss": 2.282, "mean_token_accuracy": 0.43793103098869324, "step": 16105 }, { "epoch": 0.016226163101644674, "grad_norm": 17.098119307804446, "learning_rate": 1.622585258747457e-05, "loss": 1.8045, "mean_token_accuracy": 0.5241379380226135, "step": 16110 }, { "epoch": 0.016231199154748847, "grad_norm": 26.61697143943336, "learning_rate": 1.623088854420563e-05, "loss": 2.4687, "mean_token_accuracy": 0.42413793206214906, "step": 16115 }, { "epoch": 0.01623623520785302, "grad_norm": 25.163525739575366, "learning_rate": 1.6235924500936688e-05, "loss": 2.2006, "mean_token_accuracy": 0.44289171099662783, "step": 16120 }, { "epoch": 0.01624127126095719, "grad_norm": 25.925235005717155, "learning_rate": 1.6240960457667747e-05, "loss": 2.3186, "mean_token_accuracy": 0.4310344815254211, "step": 16125 }, { "epoch": 0.016246307314061365, "grad_norm": 35.864643725650254, "learning_rate": 1.624599641439881e-05, "loss": 2.6501, "mean_token_accuracy": 0.4034482777118683, "step": 16130 }, { "epoch": 0.01625134336716554, "grad_norm": 20.2869681221461, "learning_rate": 1.625103237112987e-05, "loss": 2.1231, "mean_token_accuracy": 0.44482757449150084, "step": 16135 }, { "epoch": 0.01625637942026971, "grad_norm": 27.069352429360414, "learning_rate": 1.6256068327860925e-05, "loss": 2.5896, "mean_token_accuracy": 0.3896551728248596, "step": 16140 }, { "epoch": 0.016261415473373883, "grad_norm": 25.99181200103059, "learning_rate": 1.6261104284591988e-05, "loss": 2.4997, "mean_token_accuracy": 0.40689654350280763, "step": 16145 }, { "epoch": 0.016266451526478057, "grad_norm": 21.896147953216605, "learning_rate": 1.6266140241323047e-05, "loss": 2.2066, "mean_token_accuracy": 0.4482758641242981, "step": 16150 }, { "epoch": 0.01627148757958223, "grad_norm": 32.932150703178834, "learning_rate": 1.627117619805411e-05, "loss": 2.3894, "mean_token_accuracy": 0.4068965554237366, "step": 16155 }, { "epoch": 0.0162765236326864, "grad_norm": 26.24690299402618, "learning_rate": 1.6276212154785166e-05, "loss": 2.0487, "mean_token_accuracy": 0.4482758641242981, "step": 16160 }, { "epoch": 0.016281559685790575, "grad_norm": 24.052027266493173, "learning_rate": 1.6281248111516225e-05, "loss": 2.7884, "mean_token_accuracy": 0.36896551847457887, "step": 16165 }, { "epoch": 0.01628659573889475, "grad_norm": 27.82847029412751, "learning_rate": 1.6286284068247288e-05, "loss": 2.378, "mean_token_accuracy": 0.42068964838981626, "step": 16170 }, { "epoch": 0.01629163179199892, "grad_norm": 31.713487553074717, "learning_rate": 1.6291320024978347e-05, "loss": 2.7123, "mean_token_accuracy": 0.3724137842655182, "step": 16175 }, { "epoch": 0.016296667845103092, "grad_norm": 22.310470652978424, "learning_rate": 1.6296355981709406e-05, "loss": 2.5887, "mean_token_accuracy": 0.37931033968925476, "step": 16180 }, { "epoch": 0.016301703898207266, "grad_norm": 18.85288515149046, "learning_rate": 1.6301391938440465e-05, "loss": 2.2143, "mean_token_accuracy": 0.4344827651977539, "step": 16185 }, { "epoch": 0.01630673995131144, "grad_norm": 26.4457753751189, "learning_rate": 1.6306427895171525e-05, "loss": 2.6684, "mean_token_accuracy": 0.39655172228813174, "step": 16190 }, { "epoch": 0.01631177600441561, "grad_norm": 21.654323523382995, "learning_rate": 1.6311463851902587e-05, "loss": 1.9999, "mean_token_accuracy": 0.5379310369491577, "step": 16195 }, { "epoch": 0.016316812057519784, "grad_norm": 28.013841265705846, "learning_rate": 1.6316499808633646e-05, "loss": 2.7621, "mean_token_accuracy": 0.40689654350280763, "step": 16200 }, { "epoch": 0.016321848110623958, "grad_norm": 28.508231338281014, "learning_rate": 1.6321535765364702e-05, "loss": 2.5796, "mean_token_accuracy": 0.37586206793785093, "step": 16205 }, { "epoch": 0.016326884163728128, "grad_norm": 27.295748122135567, "learning_rate": 1.6326571722095765e-05, "loss": 2.6954, "mean_token_accuracy": 0.4068965494632721, "step": 16210 }, { "epoch": 0.016331920216832302, "grad_norm": 31.794370345097523, "learning_rate": 1.6331607678826824e-05, "loss": 2.2109, "mean_token_accuracy": 0.42758620381355283, "step": 16215 }, { "epoch": 0.016336956269936476, "grad_norm": 25.54876472381247, "learning_rate": 1.6336643635557883e-05, "loss": 2.585, "mean_token_accuracy": 0.41578946709632875, "step": 16220 }, { "epoch": 0.01634199232304065, "grad_norm": 18.84254917369886, "learning_rate": 1.6341679592288943e-05, "loss": 2.2133, "mean_token_accuracy": 0.441379314661026, "step": 16225 }, { "epoch": 0.01634702837614482, "grad_norm": 25.790788684532103, "learning_rate": 1.6346715549020002e-05, "loss": 2.177, "mean_token_accuracy": 0.4896551609039307, "step": 16230 }, { "epoch": 0.016352064429248993, "grad_norm": 36.49193846036397, "learning_rate": 1.6351751505751065e-05, "loss": 2.5195, "mean_token_accuracy": 0.4206896543502808, "step": 16235 }, { "epoch": 0.016357100482353167, "grad_norm": 29.154792125619448, "learning_rate": 1.6356787462482124e-05, "loss": 2.4637, "mean_token_accuracy": 0.4137931078672409, "step": 16240 }, { "epoch": 0.016362136535457338, "grad_norm": 23.948759302242753, "learning_rate": 1.6361823419213183e-05, "loss": 2.1343, "mean_token_accuracy": 0.4982456147670746, "step": 16245 }, { "epoch": 0.01636717258856151, "grad_norm": 30.949804075670286, "learning_rate": 1.6366859375944242e-05, "loss": 2.2667, "mean_token_accuracy": 0.482758617401123, "step": 16250 }, { "epoch": 0.016372208641665685, "grad_norm": 25.926615500117563, "learning_rate": 1.63718953326753e-05, "loss": 2.5455, "mean_token_accuracy": 0.4137930989265442, "step": 16255 }, { "epoch": 0.01637724469476986, "grad_norm": 27.231062987404485, "learning_rate": 1.637693128940636e-05, "loss": 2.3902, "mean_token_accuracy": 0.42758620381355283, "step": 16260 }, { "epoch": 0.01638228074787403, "grad_norm": 21.267875106576117, "learning_rate": 1.6381967246137423e-05, "loss": 2.2121, "mean_token_accuracy": 0.4344827651977539, "step": 16265 }, { "epoch": 0.016387316800978203, "grad_norm": 23.679544749161433, "learning_rate": 1.6387003202868483e-05, "loss": 2.5736, "mean_token_accuracy": 0.3931034505367279, "step": 16270 }, { "epoch": 0.016392352854082377, "grad_norm": 34.90922485444246, "learning_rate": 1.6392039159599542e-05, "loss": 2.5651, "mean_token_accuracy": 0.3655172407627106, "step": 16275 }, { "epoch": 0.016397388907186547, "grad_norm": 27.663061049582694, "learning_rate": 1.63970751163306e-05, "loss": 2.3567, "mean_token_accuracy": 0.42758620977401735, "step": 16280 }, { "epoch": 0.01640242496029072, "grad_norm": 24.40515639467733, "learning_rate": 1.640211107306166e-05, "loss": 2.694, "mean_token_accuracy": 0.4379310369491577, "step": 16285 }, { "epoch": 0.016407461013394894, "grad_norm": 30.254389304531294, "learning_rate": 1.6407147029792723e-05, "loss": 2.0815, "mean_token_accuracy": 0.42068964838981626, "step": 16290 }, { "epoch": 0.016412497066499068, "grad_norm": 33.502771661139455, "learning_rate": 1.641218298652378e-05, "loss": 2.6024, "mean_token_accuracy": 0.41379310488700866, "step": 16295 }, { "epoch": 0.01641753311960324, "grad_norm": 33.652929006096365, "learning_rate": 1.6417218943254838e-05, "loss": 2.3518, "mean_token_accuracy": 0.44658197164535524, "step": 16300 }, { "epoch": 0.016422569172707412, "grad_norm": 18.406245554811697, "learning_rate": 1.64222548999859e-05, "loss": 2.2587, "mean_token_accuracy": 0.42413792610168455, "step": 16305 }, { "epoch": 0.016427605225811586, "grad_norm": 28.860191862911947, "learning_rate": 1.642729085671696e-05, "loss": 2.2777, "mean_token_accuracy": 0.44482759237289426, "step": 16310 }, { "epoch": 0.016432641278915756, "grad_norm": 24.594454152829588, "learning_rate": 1.643232681344802e-05, "loss": 2.499, "mean_token_accuracy": 0.4034482717514038, "step": 16315 }, { "epoch": 0.01643767733201993, "grad_norm": 31.766144385372044, "learning_rate": 1.643736277017908e-05, "loss": 2.1385, "mean_token_accuracy": 0.46896551847457885, "step": 16320 }, { "epoch": 0.016442713385124104, "grad_norm": 31.475708951014706, "learning_rate": 1.6442398726910138e-05, "loss": 2.3606, "mean_token_accuracy": 0.4310344815254211, "step": 16325 }, { "epoch": 0.016447749438228278, "grad_norm": 22.84072727117869, "learning_rate": 1.64474346836412e-05, "loss": 2.1678, "mean_token_accuracy": 0.4172413766384125, "step": 16330 }, { "epoch": 0.016452785491332448, "grad_norm": 22.68620768044315, "learning_rate": 1.645247064037226e-05, "loss": 2.2783, "mean_token_accuracy": 0.41379310488700866, "step": 16335 }, { "epoch": 0.016457821544436622, "grad_norm": 25.42061216455319, "learning_rate": 1.6457506597103316e-05, "loss": 2.5925, "mean_token_accuracy": 0.4241379380226135, "step": 16340 }, { "epoch": 0.016462857597540796, "grad_norm": 21.347266167333927, "learning_rate": 1.6462542553834378e-05, "loss": 2.2483, "mean_token_accuracy": 0.4724137902259827, "step": 16345 }, { "epoch": 0.016467893650644966, "grad_norm": 23.976569145358116, "learning_rate": 1.6467578510565438e-05, "loss": 2.3479, "mean_token_accuracy": 0.41724138557910917, "step": 16350 }, { "epoch": 0.01647292970374914, "grad_norm": 22.645448624959368, "learning_rate": 1.64726144672965e-05, "loss": 2.3178, "mean_token_accuracy": 0.4517241358757019, "step": 16355 }, { "epoch": 0.016477965756853313, "grad_norm": 33.61058309427705, "learning_rate": 1.6477650424027556e-05, "loss": 3.0406, "mean_token_accuracy": 0.36206896007061007, "step": 16360 }, { "epoch": 0.016483001809957487, "grad_norm": 18.299279680385496, "learning_rate": 1.6482686380758615e-05, "loss": 2.414, "mean_token_accuracy": 0.4206896543502808, "step": 16365 }, { "epoch": 0.016488037863061657, "grad_norm": 26.716352235897503, "learning_rate": 1.6487722337489678e-05, "loss": 2.6868, "mean_token_accuracy": 0.3482758641242981, "step": 16370 }, { "epoch": 0.01649307391616583, "grad_norm": 25.03171105871525, "learning_rate": 1.6492758294220737e-05, "loss": 2.444, "mean_token_accuracy": 0.46551724672317507, "step": 16375 }, { "epoch": 0.016498109969270005, "grad_norm": 30.903829283500382, "learning_rate": 1.6497794250951796e-05, "loss": 2.4255, "mean_token_accuracy": 0.4655172348022461, "step": 16380 }, { "epoch": 0.016503146022374175, "grad_norm": 31.945931610683598, "learning_rate": 1.6502830207682856e-05, "loss": 2.5491, "mean_token_accuracy": 0.42068965137004855, "step": 16385 }, { "epoch": 0.01650818207547835, "grad_norm": 21.596168913162956, "learning_rate": 1.6507866164413915e-05, "loss": 2.3104, "mean_token_accuracy": 0.4517241358757019, "step": 16390 }, { "epoch": 0.016513218128582523, "grad_norm": 26.8934510304877, "learning_rate": 1.6512902121144978e-05, "loss": 2.3333, "mean_token_accuracy": 0.42068966031074523, "step": 16395 }, { "epoch": 0.016518254181686697, "grad_norm": 27.573908605537593, "learning_rate": 1.6517938077876037e-05, "loss": 2.3879, "mean_token_accuracy": 0.4310344815254211, "step": 16400 }, { "epoch": 0.016523290234790867, "grad_norm": 28.291771619159178, "learning_rate": 1.6522974034607093e-05, "loss": 2.3594, "mean_token_accuracy": 0.4620689570903778, "step": 16405 }, { "epoch": 0.01652832628789504, "grad_norm": 23.505598476403605, "learning_rate": 1.6528009991338155e-05, "loss": 2.569, "mean_token_accuracy": 0.4172413766384125, "step": 16410 }, { "epoch": 0.016533362340999214, "grad_norm": 31.102384970986943, "learning_rate": 1.6533045948069215e-05, "loss": 2.132, "mean_token_accuracy": 0.441379314661026, "step": 16415 }, { "epoch": 0.016538398394103385, "grad_norm": 31.089726723547216, "learning_rate": 1.6538081904800277e-05, "loss": 2.5162, "mean_token_accuracy": 0.3931034505367279, "step": 16420 }, { "epoch": 0.01654343444720756, "grad_norm": 28.617932566672557, "learning_rate": 1.6543117861531333e-05, "loss": 2.804, "mean_token_accuracy": 0.3724137932062149, "step": 16425 }, { "epoch": 0.016548470500311732, "grad_norm": 19.99098296598657, "learning_rate": 1.6548153818262392e-05, "loss": 2.158, "mean_token_accuracy": 0.4931034564971924, "step": 16430 }, { "epoch": 0.016553506553415906, "grad_norm": 25.670661093428624, "learning_rate": 1.6553189774993455e-05, "loss": 2.518, "mean_token_accuracy": 0.41379310488700866, "step": 16435 }, { "epoch": 0.016558542606520076, "grad_norm": 22.060563784602415, "learning_rate": 1.6558225731724514e-05, "loss": 2.5407, "mean_token_accuracy": 0.39655172228813174, "step": 16440 }, { "epoch": 0.01656357865962425, "grad_norm": 18.920481955920355, "learning_rate": 1.6563261688455574e-05, "loss": 2.5141, "mean_token_accuracy": 0.43587417602539064, "step": 16445 }, { "epoch": 0.016568614712728424, "grad_norm": 20.766823787273278, "learning_rate": 1.6568297645186633e-05, "loss": 2.2358, "mean_token_accuracy": 0.3896551787853241, "step": 16450 }, { "epoch": 0.016573650765832594, "grad_norm": 23.010608740906083, "learning_rate": 1.6573333601917692e-05, "loss": 2.6799, "mean_token_accuracy": 0.37586206793785093, "step": 16455 }, { "epoch": 0.016578686818936768, "grad_norm": 20.460391920551803, "learning_rate": 1.6578369558648755e-05, "loss": 2.6053, "mean_token_accuracy": 0.4, "step": 16460 }, { "epoch": 0.01658372287204094, "grad_norm": 24.95671465400697, "learning_rate": 1.6583405515379814e-05, "loss": 2.273, "mean_token_accuracy": 0.5018753707408905, "step": 16465 }, { "epoch": 0.016588758925145115, "grad_norm": 25.306175338911693, "learning_rate": 1.6588441472110873e-05, "loss": 1.933, "mean_token_accuracy": 0.46551724076271056, "step": 16470 }, { "epoch": 0.016593794978249286, "grad_norm": 25.10691877592691, "learning_rate": 1.6593477428841932e-05, "loss": 2.341, "mean_token_accuracy": 0.45172412395477296, "step": 16475 }, { "epoch": 0.01659883103135346, "grad_norm": 20.494545115912963, "learning_rate": 1.659851338557299e-05, "loss": 2.5433, "mean_token_accuracy": 0.41034482717514037, "step": 16480 }, { "epoch": 0.016603867084457633, "grad_norm": 29.08519778130725, "learning_rate": 1.660354934230405e-05, "loss": 2.552, "mean_token_accuracy": 0.3896551728248596, "step": 16485 }, { "epoch": 0.016608903137561803, "grad_norm": 38.19694701792948, "learning_rate": 1.6608585299035114e-05, "loss": 2.4155, "mean_token_accuracy": 0.4172413766384125, "step": 16490 }, { "epoch": 0.016613939190665977, "grad_norm": 30.46599049650939, "learning_rate": 1.661362125576617e-05, "loss": 2.4659, "mean_token_accuracy": 0.3965517282485962, "step": 16495 }, { "epoch": 0.01661897524377015, "grad_norm": 25.081106031779544, "learning_rate": 1.6618657212497232e-05, "loss": 2.4273, "mean_token_accuracy": 0.4344827592372894, "step": 16500 }, { "epoch": 0.016624011296874325, "grad_norm": 22.516074121367467, "learning_rate": 1.662369316922829e-05, "loss": 2.3528, "mean_token_accuracy": 0.441379314661026, "step": 16505 }, { "epoch": 0.016629047349978495, "grad_norm": 25.04593411553964, "learning_rate": 1.662872912595935e-05, "loss": 2.2382, "mean_token_accuracy": 0.4689655125141144, "step": 16510 }, { "epoch": 0.01663408340308267, "grad_norm": 26.22121289626016, "learning_rate": 1.663376508269041e-05, "loss": 2.3972, "mean_token_accuracy": 0.4551724135875702, "step": 16515 }, { "epoch": 0.016639119456186843, "grad_norm": 26.259929173846718, "learning_rate": 1.663880103942147e-05, "loss": 2.5201, "mean_token_accuracy": 0.42758620977401735, "step": 16520 }, { "epoch": 0.016644155509291013, "grad_norm": 26.960765378942927, "learning_rate": 1.664383699615253e-05, "loss": 2.2683, "mean_token_accuracy": 0.48275862336158754, "step": 16525 }, { "epoch": 0.016649191562395187, "grad_norm": 25.882408101246885, "learning_rate": 1.664887295288359e-05, "loss": 2.2574, "mean_token_accuracy": 0.42758620381355283, "step": 16530 }, { "epoch": 0.01665422761549936, "grad_norm": 32.53124626375646, "learning_rate": 1.665390890961465e-05, "loss": 2.1684, "mean_token_accuracy": 0.5344827592372894, "step": 16535 }, { "epoch": 0.016659263668603534, "grad_norm": 23.457614859671715, "learning_rate": 1.665894486634571e-05, "loss": 2.2992, "mean_token_accuracy": 0.43103448748588563, "step": 16540 }, { "epoch": 0.016664299721707704, "grad_norm": 22.57790746846105, "learning_rate": 1.666398082307677e-05, "loss": 2.5344, "mean_token_accuracy": 0.43793103098869324, "step": 16545 }, { "epoch": 0.016669335774811878, "grad_norm": 23.55183037302489, "learning_rate": 1.6669016779807828e-05, "loss": 2.1971, "mean_token_accuracy": 0.4363581418991089, "step": 16550 }, { "epoch": 0.016674371827916052, "grad_norm": 31.813446379687424, "learning_rate": 1.667405273653889e-05, "loss": 2.3633, "mean_token_accuracy": 0.4330308437347412, "step": 16555 }, { "epoch": 0.016679407881020222, "grad_norm": 28.010272178958477, "learning_rate": 1.6679088693269947e-05, "loss": 2.1711, "mean_token_accuracy": 0.44482758045196535, "step": 16560 }, { "epoch": 0.016684443934124396, "grad_norm": 24.766228205349133, "learning_rate": 1.6684124650001006e-05, "loss": 2.531, "mean_token_accuracy": 0.46551724076271056, "step": 16565 }, { "epoch": 0.01668947998722857, "grad_norm": 32.626409140469605, "learning_rate": 1.668916060673207e-05, "loss": 2.3574, "mean_token_accuracy": 0.41034482717514037, "step": 16570 }, { "epoch": 0.016694516040332744, "grad_norm": 28.68895241745584, "learning_rate": 1.6694196563463128e-05, "loss": 2.2842, "mean_token_accuracy": 0.41379310488700866, "step": 16575 }, { "epoch": 0.016699552093436914, "grad_norm": 24.786791611751994, "learning_rate": 1.6699232520194187e-05, "loss": 2.2697, "mean_token_accuracy": 0.4448275864124298, "step": 16580 }, { "epoch": 0.016704588146541088, "grad_norm": 22.849364945602453, "learning_rate": 1.6704268476925246e-05, "loss": 2.2101, "mean_token_accuracy": 0.4206896543502808, "step": 16585 }, { "epoch": 0.01670962419964526, "grad_norm": 28.815823615461213, "learning_rate": 1.6709304433656305e-05, "loss": 2.3716, "mean_token_accuracy": 0.441379314661026, "step": 16590 }, { "epoch": 0.016714660252749432, "grad_norm": 25.67189319869878, "learning_rate": 1.6714340390387368e-05, "loss": 2.4791, "mean_token_accuracy": 0.3827586233615875, "step": 16595 }, { "epoch": 0.016719696305853606, "grad_norm": 20.250273980546684, "learning_rate": 1.6719376347118427e-05, "loss": 2.1972, "mean_token_accuracy": 0.4241379201412201, "step": 16600 }, { "epoch": 0.01672473235895778, "grad_norm": 27.812835609853447, "learning_rate": 1.6724412303849487e-05, "loss": 2.4992, "mean_token_accuracy": 0.39655171930789945, "step": 16605 }, { "epoch": 0.016729768412061953, "grad_norm": 27.02153546562052, "learning_rate": 1.6729448260580546e-05, "loss": 2.2211, "mean_token_accuracy": 0.42068966031074523, "step": 16610 }, { "epoch": 0.016734804465166123, "grad_norm": 25.07533165800941, "learning_rate": 1.6734484217311605e-05, "loss": 2.691, "mean_token_accuracy": 0.3862068891525269, "step": 16615 }, { "epoch": 0.016739840518270297, "grad_norm": 19.262066873221702, "learning_rate": 1.6739520174042668e-05, "loss": 2.5882, "mean_token_accuracy": 0.4034482777118683, "step": 16620 }, { "epoch": 0.01674487657137447, "grad_norm": 27.630239723381568, "learning_rate": 1.6744556130773727e-05, "loss": 2.4266, "mean_token_accuracy": 0.4330308556556702, "step": 16625 }, { "epoch": 0.01674991262447864, "grad_norm": 28.69394637716775, "learning_rate": 1.6749592087504783e-05, "loss": 2.4544, "mean_token_accuracy": 0.4034482777118683, "step": 16630 }, { "epoch": 0.016754948677582815, "grad_norm": 26.138878231836326, "learning_rate": 1.6754628044235845e-05, "loss": 2.3082, "mean_token_accuracy": 0.4980641186237335, "step": 16635 }, { "epoch": 0.01675998473068699, "grad_norm": 24.53681925560233, "learning_rate": 1.6759664000966905e-05, "loss": 2.2614, "mean_token_accuracy": 0.43103448748588563, "step": 16640 }, { "epoch": 0.016765020783791162, "grad_norm": 21.98524845882193, "learning_rate": 1.6764699957697964e-05, "loss": 2.0146, "mean_token_accuracy": 0.5160314559936523, "step": 16645 }, { "epoch": 0.016770056836895333, "grad_norm": 25.55489798229527, "learning_rate": 1.6769735914429023e-05, "loss": 2.0501, "mean_token_accuracy": 0.5034482717514038, "step": 16650 }, { "epoch": 0.016775092889999507, "grad_norm": 29.455896722420146, "learning_rate": 1.6774771871160082e-05, "loss": 2.4353, "mean_token_accuracy": 0.36896551847457887, "step": 16655 }, { "epoch": 0.01678012894310368, "grad_norm": 31.003228995006612, "learning_rate": 1.6779807827891145e-05, "loss": 2.4644, "mean_token_accuracy": 0.4034482717514038, "step": 16660 }, { "epoch": 0.01678516499620785, "grad_norm": 25.38053425177616, "learning_rate": 1.6784843784622204e-05, "loss": 2.3268, "mean_token_accuracy": 0.4551724135875702, "step": 16665 }, { "epoch": 0.016790201049312024, "grad_norm": 21.760982674378607, "learning_rate": 1.6789879741353264e-05, "loss": 2.6763, "mean_token_accuracy": 0.3551724076271057, "step": 16670 }, { "epoch": 0.016795237102416198, "grad_norm": 24.404175760634832, "learning_rate": 1.6794915698084323e-05, "loss": 2.4728, "mean_token_accuracy": 0.41724138259887694, "step": 16675 }, { "epoch": 0.016800273155520372, "grad_norm": 21.991555110528008, "learning_rate": 1.6799951654815382e-05, "loss": 2.2598, "mean_token_accuracy": 0.4551724135875702, "step": 16680 }, { "epoch": 0.016805309208624542, "grad_norm": 22.362492828740233, "learning_rate": 1.680498761154644e-05, "loss": 2.3174, "mean_token_accuracy": 0.4, "step": 16685 }, { "epoch": 0.016810345261728716, "grad_norm": 19.596537434292678, "learning_rate": 1.6810023568277504e-05, "loss": 2.2956, "mean_token_accuracy": 0.43793103098869324, "step": 16690 }, { "epoch": 0.01681538131483289, "grad_norm": 18.386247426360306, "learning_rate": 1.681505952500856e-05, "loss": 2.5653, "mean_token_accuracy": 0.3551724135875702, "step": 16695 }, { "epoch": 0.01682041736793706, "grad_norm": 20.27802176513931, "learning_rate": 1.6820095481739623e-05, "loss": 2.3197, "mean_token_accuracy": 0.41379310488700866, "step": 16700 }, { "epoch": 0.016825453421041234, "grad_norm": 23.94985538201386, "learning_rate": 1.6825131438470682e-05, "loss": 2.3415, "mean_token_accuracy": 0.4068965554237366, "step": 16705 }, { "epoch": 0.016830489474145408, "grad_norm": 23.136199548446854, "learning_rate": 1.683016739520174e-05, "loss": 2.1842, "mean_token_accuracy": 0.4379310369491577, "step": 16710 }, { "epoch": 0.01683552552724958, "grad_norm": 22.392366648969645, "learning_rate": 1.68352033519328e-05, "loss": 2.3128, "mean_token_accuracy": 0.44137930274009707, "step": 16715 }, { "epoch": 0.01684056158035375, "grad_norm": 20.79343688099604, "learning_rate": 1.684023930866386e-05, "loss": 2.3726, "mean_token_accuracy": 0.4034482777118683, "step": 16720 }, { "epoch": 0.016845597633457925, "grad_norm": 23.53411563648727, "learning_rate": 1.684527526539492e-05, "loss": 2.3836, "mean_token_accuracy": 0.4448275864124298, "step": 16725 }, { "epoch": 0.0168506336865621, "grad_norm": 23.404719460434745, "learning_rate": 1.685031122212598e-05, "loss": 2.2104, "mean_token_accuracy": 0.47931033968925474, "step": 16730 }, { "epoch": 0.01685566973966627, "grad_norm": 23.173083679408165, "learning_rate": 1.685534717885704e-05, "loss": 2.2963, "mean_token_accuracy": 0.41724138259887694, "step": 16735 }, { "epoch": 0.016860705792770443, "grad_norm": 30.797005470254117, "learning_rate": 1.68603831355881e-05, "loss": 2.3107, "mean_token_accuracy": 0.43103447556495667, "step": 16740 }, { "epoch": 0.016865741845874617, "grad_norm": 24.228966362661172, "learning_rate": 1.686541909231916e-05, "loss": 2.5924, "mean_token_accuracy": 0.36896551847457887, "step": 16745 }, { "epoch": 0.01687077789897879, "grad_norm": 29.877716458941702, "learning_rate": 1.687045504905022e-05, "loss": 2.5948, "mean_token_accuracy": 0.4448275864124298, "step": 16750 }, { "epoch": 0.01687581395208296, "grad_norm": 23.175622911123604, "learning_rate": 1.687549100578128e-05, "loss": 2.4684, "mean_token_accuracy": 0.4206896543502808, "step": 16755 }, { "epoch": 0.016880850005187135, "grad_norm": 29.47379565480641, "learning_rate": 1.6880526962512337e-05, "loss": 2.5385, "mean_token_accuracy": 0.4034482777118683, "step": 16760 }, { "epoch": 0.01688588605829131, "grad_norm": 25.63048320923515, "learning_rate": 1.6885562919243396e-05, "loss": 2.2866, "mean_token_accuracy": 0.441379314661026, "step": 16765 }, { "epoch": 0.01689092211139548, "grad_norm": 25.083368180138745, "learning_rate": 1.689059887597446e-05, "loss": 2.5079, "mean_token_accuracy": 0.37586206793785093, "step": 16770 }, { "epoch": 0.016895958164499653, "grad_norm": 31.985448268481715, "learning_rate": 1.6895634832705518e-05, "loss": 2.4864, "mean_token_accuracy": 0.3965517282485962, "step": 16775 }, { "epoch": 0.016900994217603826, "grad_norm": 25.796786094649132, "learning_rate": 1.6900670789436577e-05, "loss": 2.6663, "mean_token_accuracy": 0.3551724076271057, "step": 16780 }, { "epoch": 0.016906030270708, "grad_norm": 21.709930426023988, "learning_rate": 1.6905706746167637e-05, "loss": 2.7189, "mean_token_accuracy": 0.3793103456497192, "step": 16785 }, { "epoch": 0.01691106632381217, "grad_norm": 19.617598631316774, "learning_rate": 1.6910742702898696e-05, "loss": 1.9605, "mean_token_accuracy": 0.5172413766384125, "step": 16790 }, { "epoch": 0.016916102376916344, "grad_norm": 21.846556545721675, "learning_rate": 1.691577865962976e-05, "loss": 2.3074, "mean_token_accuracy": 0.46551724672317507, "step": 16795 }, { "epoch": 0.016921138430020518, "grad_norm": 23.557834718940086, "learning_rate": 1.6920814616360818e-05, "loss": 2.3278, "mean_token_accuracy": 0.4620689690113068, "step": 16800 }, { "epoch": 0.016926174483124688, "grad_norm": 18.266984969749736, "learning_rate": 1.6925850573091877e-05, "loss": 2.2223, "mean_token_accuracy": 0.42413792610168455, "step": 16805 }, { "epoch": 0.016931210536228862, "grad_norm": 28.987995605777982, "learning_rate": 1.6930886529822936e-05, "loss": 2.3678, "mean_token_accuracy": 0.4103448212146759, "step": 16810 }, { "epoch": 0.016936246589333036, "grad_norm": 19.09859453193959, "learning_rate": 1.6935922486553996e-05, "loss": 2.1327, "mean_token_accuracy": 0.4931034445762634, "step": 16815 }, { "epoch": 0.01694128264243721, "grad_norm": 28.642916482497466, "learning_rate": 1.6940958443285058e-05, "loss": 2.7749, "mean_token_accuracy": 0.3620689630508423, "step": 16820 }, { "epoch": 0.01694631869554138, "grad_norm": 28.281008802403854, "learning_rate": 1.6945994400016117e-05, "loss": 2.5941, "mean_token_accuracy": 0.3896551728248596, "step": 16825 }, { "epoch": 0.016951354748645554, "grad_norm": 23.616357843621316, "learning_rate": 1.6951030356747173e-05, "loss": 2.2922, "mean_token_accuracy": 0.4344827592372894, "step": 16830 }, { "epoch": 0.016956390801749727, "grad_norm": 23.848832177595586, "learning_rate": 1.6956066313478236e-05, "loss": 2.6654, "mean_token_accuracy": 0.38275861740112305, "step": 16835 }, { "epoch": 0.016961426854853898, "grad_norm": 22.375460521922786, "learning_rate": 1.6961102270209295e-05, "loss": 2.527, "mean_token_accuracy": 0.39655172228813174, "step": 16840 }, { "epoch": 0.01696646290795807, "grad_norm": 22.49239676855279, "learning_rate": 1.6966138226940358e-05, "loss": 2.5873, "mean_token_accuracy": 0.39794312715530394, "step": 16845 }, { "epoch": 0.016971498961062245, "grad_norm": 22.330940870463852, "learning_rate": 1.6971174183671414e-05, "loss": 2.778, "mean_token_accuracy": 0.38021778464317324, "step": 16850 }, { "epoch": 0.01697653501416642, "grad_norm": 21.86718311836748, "learning_rate": 1.6976210140402473e-05, "loss": 2.0557, "mean_token_accuracy": 0.4517241358757019, "step": 16855 }, { "epoch": 0.01698157106727059, "grad_norm": 29.518516609639338, "learning_rate": 1.6981246097133536e-05, "loss": 2.4277, "mean_token_accuracy": 0.37392619252204895, "step": 16860 }, { "epoch": 0.016986607120374763, "grad_norm": 22.75331959358362, "learning_rate": 1.6986282053864595e-05, "loss": 2.7001, "mean_token_accuracy": 0.4068965494632721, "step": 16865 }, { "epoch": 0.016991643173478937, "grad_norm": 27.48321845168011, "learning_rate": 1.6991318010595654e-05, "loss": 2.8149, "mean_token_accuracy": 0.36551723480224607, "step": 16870 }, { "epoch": 0.016996679226583107, "grad_norm": 25.53383939797929, "learning_rate": 1.6996353967326713e-05, "loss": 2.5689, "mean_token_accuracy": 0.3931034505367279, "step": 16875 }, { "epoch": 0.01700171527968728, "grad_norm": 23.4979280092709, "learning_rate": 1.7001389924057773e-05, "loss": 2.4146, "mean_token_accuracy": 0.47931033968925474, "step": 16880 }, { "epoch": 0.017006751332791455, "grad_norm": 30.884072616511304, "learning_rate": 1.7006425880788835e-05, "loss": 2.3744, "mean_token_accuracy": 0.443254679441452, "step": 16885 }, { "epoch": 0.01701178738589563, "grad_norm": 18.901462006544367, "learning_rate": 1.7011461837519894e-05, "loss": 2.2586, "mean_token_accuracy": 0.41724138259887694, "step": 16890 }, { "epoch": 0.0170168234389998, "grad_norm": 23.643674359115106, "learning_rate": 1.701649779425095e-05, "loss": 2.4209, "mean_token_accuracy": 0.44827585816383364, "step": 16895 }, { "epoch": 0.017021859492103972, "grad_norm": 18.502886715526195, "learning_rate": 1.7021533750982013e-05, "loss": 2.1947, "mean_token_accuracy": 0.4862069010734558, "step": 16900 }, { "epoch": 0.017026895545208146, "grad_norm": 23.80808223912398, "learning_rate": 1.7026569707713072e-05, "loss": 1.9763, "mean_token_accuracy": 0.4896551728248596, "step": 16905 }, { "epoch": 0.017031931598312317, "grad_norm": 16.292167289333896, "learning_rate": 1.703160566444413e-05, "loss": 2.0484, "mean_token_accuracy": 0.482758617401123, "step": 16910 }, { "epoch": 0.01703696765141649, "grad_norm": 24.26451475533685, "learning_rate": 1.703664162117519e-05, "loss": 2.4793, "mean_token_accuracy": 0.41379310488700866, "step": 16915 }, { "epoch": 0.017042003704520664, "grad_norm": 26.12217445671135, "learning_rate": 1.704167757790625e-05, "loss": 2.8941, "mean_token_accuracy": 0.3896551728248596, "step": 16920 }, { "epoch": 0.017047039757624838, "grad_norm": 28.03840535394834, "learning_rate": 1.7046713534637313e-05, "loss": 2.6204, "mean_token_accuracy": 0.4413793087005615, "step": 16925 }, { "epoch": 0.017052075810729008, "grad_norm": 20.946667814989045, "learning_rate": 1.7051749491368372e-05, "loss": 2.3908, "mean_token_accuracy": 0.3896551787853241, "step": 16930 }, { "epoch": 0.017057111863833182, "grad_norm": 21.573774119361257, "learning_rate": 1.705678544809943e-05, "loss": 2.5358, "mean_token_accuracy": 0.43103447556495667, "step": 16935 }, { "epoch": 0.017062147916937356, "grad_norm": 32.16166612648354, "learning_rate": 1.706182140483049e-05, "loss": 2.6069, "mean_token_accuracy": 0.39655172228813174, "step": 16940 }, { "epoch": 0.017067183970041526, "grad_norm": 21.356594774685313, "learning_rate": 1.706685736156155e-05, "loss": 2.5144, "mean_token_accuracy": 0.42820197343826294, "step": 16945 }, { "epoch": 0.0170722200231457, "grad_norm": 26.64612832961511, "learning_rate": 1.707189331829261e-05, "loss": 2.439, "mean_token_accuracy": 0.3620689630508423, "step": 16950 }, { "epoch": 0.017077256076249873, "grad_norm": 28.609755436671982, "learning_rate": 1.707692927502367e-05, "loss": 2.6326, "mean_token_accuracy": 0.41893526911735535, "step": 16955 }, { "epoch": 0.017082292129354047, "grad_norm": 21.62245765998265, "learning_rate": 1.7081965231754727e-05, "loss": 2.0171, "mean_token_accuracy": 0.5110837399959565, "step": 16960 }, { "epoch": 0.017087328182458218, "grad_norm": 25.860257412667455, "learning_rate": 1.708700118848579e-05, "loss": 2.3232, "mean_token_accuracy": 0.4137930989265442, "step": 16965 }, { "epoch": 0.01709236423556239, "grad_norm": 23.80630529834784, "learning_rate": 1.709203714521685e-05, "loss": 2.3485, "mean_token_accuracy": 0.41379310488700866, "step": 16970 }, { "epoch": 0.017097400288666565, "grad_norm": 24.002206389771395, "learning_rate": 1.709707310194791e-05, "loss": 2.2951, "mean_token_accuracy": 0.46896551847457885, "step": 16975 }, { "epoch": 0.017102436341770735, "grad_norm": 20.28646346479148, "learning_rate": 1.710210905867897e-05, "loss": 2.194, "mean_token_accuracy": 0.4551724076271057, "step": 16980 }, { "epoch": 0.01710747239487491, "grad_norm": 26.826628437996515, "learning_rate": 1.7107145015410027e-05, "loss": 2.738, "mean_token_accuracy": 0.441379314661026, "step": 16985 }, { "epoch": 0.017112508447979083, "grad_norm": 25.704531254648305, "learning_rate": 1.7112180972141086e-05, "loss": 2.1682, "mean_token_accuracy": 0.4241379380226135, "step": 16990 }, { "epoch": 0.017117544501083257, "grad_norm": 22.8919726188464, "learning_rate": 1.711721692887215e-05, "loss": 2.4086, "mean_token_accuracy": 0.4551724135875702, "step": 16995 }, { "epoch": 0.017122580554187427, "grad_norm": 23.351421984432573, "learning_rate": 1.7122252885603208e-05, "loss": 2.3203, "mean_token_accuracy": 0.4482758641242981, "step": 17000 }, { "epoch": 0.0171276166072916, "grad_norm": 37.58523548924887, "learning_rate": 1.7127288842334267e-05, "loss": 2.1699, "mean_token_accuracy": 0.4689655065536499, "step": 17005 }, { "epoch": 0.017132652660395774, "grad_norm": 19.83621903528445, "learning_rate": 1.7132324799065327e-05, "loss": 2.2801, "mean_token_accuracy": 0.4586206912994385, "step": 17010 }, { "epoch": 0.017137688713499945, "grad_norm": 29.990762500032883, "learning_rate": 1.7137360755796386e-05, "loss": 2.3877, "mean_token_accuracy": 0.3724137842655182, "step": 17015 }, { "epoch": 0.01714272476660412, "grad_norm": 21.395193560502197, "learning_rate": 1.714239671252745e-05, "loss": 2.6039, "mean_token_accuracy": 0.42068966031074523, "step": 17020 }, { "epoch": 0.017147760819708292, "grad_norm": 22.821579564968065, "learning_rate": 1.7147432669258508e-05, "loss": 2.4418, "mean_token_accuracy": 0.4206896543502808, "step": 17025 }, { "epoch": 0.017152796872812466, "grad_norm": 26.608042043880413, "learning_rate": 1.7152468625989564e-05, "loss": 2.4207, "mean_token_accuracy": 0.43793103098869324, "step": 17030 }, { "epoch": 0.017157832925916636, "grad_norm": 25.625257156422766, "learning_rate": 1.7157504582720626e-05, "loss": 2.479, "mean_token_accuracy": 0.4188747763633728, "step": 17035 }, { "epoch": 0.01716286897902081, "grad_norm": 23.757593294716646, "learning_rate": 1.7162540539451686e-05, "loss": 2.1516, "mean_token_accuracy": 0.4517241358757019, "step": 17040 }, { "epoch": 0.017167905032124984, "grad_norm": 29.63098332835245, "learning_rate": 1.7167576496182748e-05, "loss": 2.7424, "mean_token_accuracy": 0.3495462715625763, "step": 17045 }, { "epoch": 0.017172941085229154, "grad_norm": 24.27220567025843, "learning_rate": 1.7172612452913804e-05, "loss": 2.2286, "mean_token_accuracy": 0.47241379618644713, "step": 17050 }, { "epoch": 0.017177977138333328, "grad_norm": 21.539020120510976, "learning_rate": 1.7177648409644863e-05, "loss": 2.3071, "mean_token_accuracy": 0.42758620977401735, "step": 17055 }, { "epoch": 0.0171830131914375, "grad_norm": 21.68058802859244, "learning_rate": 1.7182684366375926e-05, "loss": 2.2475, "mean_token_accuracy": 0.46551724672317507, "step": 17060 }, { "epoch": 0.017188049244541675, "grad_norm": 28.396285901412426, "learning_rate": 1.7187720323106985e-05, "loss": 1.9916, "mean_token_accuracy": 0.4758620738983154, "step": 17065 }, { "epoch": 0.017193085297645846, "grad_norm": 25.752053143382444, "learning_rate": 1.7192756279838045e-05, "loss": 2.3177, "mean_token_accuracy": 0.4586206912994385, "step": 17070 }, { "epoch": 0.01719812135075002, "grad_norm": 24.691942460316515, "learning_rate": 1.7197792236569104e-05, "loss": 2.1456, "mean_token_accuracy": 0.46896551847457885, "step": 17075 }, { "epoch": 0.017203157403854193, "grad_norm": 16.54078023280488, "learning_rate": 1.7202828193300163e-05, "loss": 2.1242, "mean_token_accuracy": 0.4379310369491577, "step": 17080 }, { "epoch": 0.017208193456958364, "grad_norm": 22.855771907560182, "learning_rate": 1.7207864150031226e-05, "loss": 2.2227, "mean_token_accuracy": 0.4517241358757019, "step": 17085 }, { "epoch": 0.017213229510062537, "grad_norm": 33.074964765988945, "learning_rate": 1.7212900106762285e-05, "loss": 2.4396, "mean_token_accuracy": 0.4551724076271057, "step": 17090 }, { "epoch": 0.01721826556316671, "grad_norm": 22.62862453504766, "learning_rate": 1.721793606349334e-05, "loss": 2.0233, "mean_token_accuracy": 0.48620688915252686, "step": 17095 }, { "epoch": 0.017223301616270885, "grad_norm": 20.273131121523456, "learning_rate": 1.7222972020224403e-05, "loss": 2.6155, "mean_token_accuracy": 0.3827586144208908, "step": 17100 }, { "epoch": 0.017228337669375055, "grad_norm": 22.51211578881003, "learning_rate": 1.7228007976955463e-05, "loss": 2.4178, "mean_token_accuracy": 0.3965517163276672, "step": 17105 }, { "epoch": 0.01723337372247923, "grad_norm": 84.16959554946303, "learning_rate": 1.7233043933686522e-05, "loss": 2.3054, "mean_token_accuracy": 0.4551724135875702, "step": 17110 }, { "epoch": 0.017238409775583403, "grad_norm": 20.50101782438021, "learning_rate": 1.723807989041758e-05, "loss": 2.5777, "mean_token_accuracy": 0.3724137842655182, "step": 17115 }, { "epoch": 0.017243445828687573, "grad_norm": 27.33558168710615, "learning_rate": 1.724311584714864e-05, "loss": 2.0983, "mean_token_accuracy": 0.4620689630508423, "step": 17120 }, { "epoch": 0.017248481881791747, "grad_norm": 27.514305515855735, "learning_rate": 1.7248151803879703e-05, "loss": 2.3523, "mean_token_accuracy": 0.4399273991584778, "step": 17125 }, { "epoch": 0.01725351793489592, "grad_norm": 23.18210368859295, "learning_rate": 1.7253187760610762e-05, "loss": 2.2599, "mean_token_accuracy": 0.4034482777118683, "step": 17130 }, { "epoch": 0.017258553988000094, "grad_norm": 17.704120273365863, "learning_rate": 1.725822371734182e-05, "loss": 2.2296, "mean_token_accuracy": 0.44137930274009707, "step": 17135 }, { "epoch": 0.017263590041104265, "grad_norm": 21.017394238514395, "learning_rate": 1.726325967407288e-05, "loss": 2.1386, "mean_token_accuracy": 0.46551724672317507, "step": 17140 }, { "epoch": 0.01726862609420844, "grad_norm": 27.582861840428897, "learning_rate": 1.726829563080394e-05, "loss": 2.3705, "mean_token_accuracy": 0.3965517282485962, "step": 17145 }, { "epoch": 0.017273662147312612, "grad_norm": 26.01625521275371, "learning_rate": 1.7273331587535e-05, "loss": 2.4477, "mean_token_accuracy": 0.4068965494632721, "step": 17150 }, { "epoch": 0.017278698200416782, "grad_norm": 36.17817015082944, "learning_rate": 1.7278367544266062e-05, "loss": 2.3759, "mean_token_accuracy": 0.4551724135875702, "step": 17155 }, { "epoch": 0.017283734253520956, "grad_norm": 21.422984387095664, "learning_rate": 1.728340350099712e-05, "loss": 2.1965, "mean_token_accuracy": 0.46896551847457885, "step": 17160 }, { "epoch": 0.01728877030662513, "grad_norm": 20.580260013866525, "learning_rate": 1.728843945772818e-05, "loss": 2.4614, "mean_token_accuracy": 0.4, "step": 17165 }, { "epoch": 0.017293806359729304, "grad_norm": 19.256983666487866, "learning_rate": 1.729347541445924e-05, "loss": 2.5576, "mean_token_accuracy": 0.4103448212146759, "step": 17170 }, { "epoch": 0.017298842412833474, "grad_norm": 27.353384892700625, "learning_rate": 1.72985113711903e-05, "loss": 2.5757, "mean_token_accuracy": 0.37241379618644715, "step": 17175 }, { "epoch": 0.017303878465937648, "grad_norm": 23.615407544767102, "learning_rate": 1.730354732792136e-05, "loss": 2.2754, "mean_token_accuracy": 0.43448275327682495, "step": 17180 }, { "epoch": 0.01730891451904182, "grad_norm": 26.244350652956335, "learning_rate": 1.7308583284652417e-05, "loss": 2.1651, "mean_token_accuracy": 0.47586206793785096, "step": 17185 }, { "epoch": 0.017313950572145992, "grad_norm": 19.305258312131684, "learning_rate": 1.731361924138348e-05, "loss": 2.4327, "mean_token_accuracy": 0.40086206793785095, "step": 17190 }, { "epoch": 0.017318986625250166, "grad_norm": 30.19412163179365, "learning_rate": 1.731865519811454e-05, "loss": 2.5088, "mean_token_accuracy": 0.42758620381355283, "step": 17195 }, { "epoch": 0.01732402267835434, "grad_norm": 30.747832867794486, "learning_rate": 1.73236911548456e-05, "loss": 2.5431, "mean_token_accuracy": 0.4172413766384125, "step": 17200 }, { "epoch": 0.017329058731458513, "grad_norm": 23.1037889960755, "learning_rate": 1.7328727111576658e-05, "loss": 2.5489, "mean_token_accuracy": 0.3862069010734558, "step": 17205 }, { "epoch": 0.017334094784562683, "grad_norm": 25.941128439981238, "learning_rate": 1.7333763068307717e-05, "loss": 2.4251, "mean_token_accuracy": 0.4137930989265442, "step": 17210 }, { "epoch": 0.017339130837666857, "grad_norm": 23.869162905032915, "learning_rate": 1.7338799025038776e-05, "loss": 2.533, "mean_token_accuracy": 0.3551724135875702, "step": 17215 }, { "epoch": 0.01734416689077103, "grad_norm": 26.36224671369415, "learning_rate": 1.734383498176984e-05, "loss": 2.7179, "mean_token_accuracy": 0.36896551251411436, "step": 17220 }, { "epoch": 0.0173492029438752, "grad_norm": 21.00237111569138, "learning_rate": 1.7348870938500898e-05, "loss": 2.1971, "mean_token_accuracy": 0.4689655125141144, "step": 17225 }, { "epoch": 0.017354238996979375, "grad_norm": 22.121329078886497, "learning_rate": 1.7353906895231958e-05, "loss": 2.2713, "mean_token_accuracy": 0.4482758641242981, "step": 17230 }, { "epoch": 0.01735927505008355, "grad_norm": 35.590469996180964, "learning_rate": 1.7358942851963017e-05, "loss": 2.6176, "mean_token_accuracy": 0.4137930989265442, "step": 17235 }, { "epoch": 0.017364311103187723, "grad_norm": 27.8567394318579, "learning_rate": 1.7363978808694076e-05, "loss": 2.4839, "mean_token_accuracy": 0.4172413766384125, "step": 17240 }, { "epoch": 0.017369347156291893, "grad_norm": 35.18304278404932, "learning_rate": 1.736901476542514e-05, "loss": 2.312, "mean_token_accuracy": 0.4862068951129913, "step": 17245 }, { "epoch": 0.017374383209396067, "grad_norm": 103.94002807813789, "learning_rate": 1.7374050722156195e-05, "loss": 2.5213, "mean_token_accuracy": 0.358620685338974, "step": 17250 }, { "epoch": 0.01737941926250024, "grad_norm": 26.540669579160614, "learning_rate": 1.7379086678887254e-05, "loss": 2.6305, "mean_token_accuracy": 0.4000000059604645, "step": 17255 }, { "epoch": 0.01738445531560441, "grad_norm": 24.15332022067708, "learning_rate": 1.7384122635618316e-05, "loss": 2.1583, "mean_token_accuracy": 0.45710828304290774, "step": 17260 }, { "epoch": 0.017389491368708584, "grad_norm": 35.886360997256936, "learning_rate": 1.7389158592349376e-05, "loss": 2.6766, "mean_token_accuracy": 0.38076224327087405, "step": 17265 }, { "epoch": 0.017394527421812758, "grad_norm": 23.556475145192334, "learning_rate": 1.7394194549080435e-05, "loss": 2.4301, "mean_token_accuracy": 0.38965516686439516, "step": 17270 }, { "epoch": 0.017399563474916932, "grad_norm": 25.85148136055674, "learning_rate": 1.7399230505811494e-05, "loss": 2.3706, "mean_token_accuracy": 0.42758620381355283, "step": 17275 }, { "epoch": 0.017404599528021102, "grad_norm": 23.049517198126363, "learning_rate": 1.7404266462542553e-05, "loss": 2.1388, "mean_token_accuracy": 0.49491833448410033, "step": 17280 }, { "epoch": 0.017409635581125276, "grad_norm": 18.43971040666176, "learning_rate": 1.7409302419273616e-05, "loss": 2.0456, "mean_token_accuracy": 0.4620689630508423, "step": 17285 }, { "epoch": 0.01741467163422945, "grad_norm": 23.837235878500735, "learning_rate": 1.7414338376004675e-05, "loss": 2.3365, "mean_token_accuracy": 0.43793103098869324, "step": 17290 }, { "epoch": 0.01741970768733362, "grad_norm": 18.68997577120234, "learning_rate": 1.741937433273573e-05, "loss": 2.1335, "mean_token_accuracy": 0.45517241954803467, "step": 17295 }, { "epoch": 0.017424743740437794, "grad_norm": 20.104010768017094, "learning_rate": 1.7424410289466794e-05, "loss": 2.116, "mean_token_accuracy": 0.47931033968925474, "step": 17300 }, { "epoch": 0.017429779793541968, "grad_norm": 26.76951987270601, "learning_rate": 1.7429446246197853e-05, "loss": 2.4479, "mean_token_accuracy": 0.43793103098869324, "step": 17305 }, { "epoch": 0.01743481584664614, "grad_norm": 25.869988185983974, "learning_rate": 1.7434482202928916e-05, "loss": 2.3298, "mean_token_accuracy": 0.4448275864124298, "step": 17310 }, { "epoch": 0.01743985189975031, "grad_norm": 19.647809220096583, "learning_rate": 1.743951815965997e-05, "loss": 2.3403, "mean_token_accuracy": 0.4344827502965927, "step": 17315 }, { "epoch": 0.017444887952854485, "grad_norm": 21.341376187528436, "learning_rate": 1.744455411639103e-05, "loss": 2.2894, "mean_token_accuracy": 0.4189352750778198, "step": 17320 }, { "epoch": 0.01744992400595866, "grad_norm": 34.67212526756585, "learning_rate": 1.7449590073122094e-05, "loss": 2.8367, "mean_token_accuracy": 0.39310344457626345, "step": 17325 }, { "epoch": 0.01745496005906283, "grad_norm": 23.27076907558333, "learning_rate": 1.7454626029853153e-05, "loss": 2.2101, "mean_token_accuracy": 0.4551724076271057, "step": 17330 }, { "epoch": 0.017459996112167003, "grad_norm": 23.911707460278514, "learning_rate": 1.7459661986584212e-05, "loss": 2.5651, "mean_token_accuracy": 0.37586207389831544, "step": 17335 }, { "epoch": 0.017465032165271177, "grad_norm": 25.603640452219295, "learning_rate": 1.746469794331527e-05, "loss": 2.3273, "mean_token_accuracy": 0.4862069070339203, "step": 17340 }, { "epoch": 0.01747006821837535, "grad_norm": 26.569056738424994, "learning_rate": 1.746973390004633e-05, "loss": 2.0427, "mean_token_accuracy": 0.44482758045196535, "step": 17345 }, { "epoch": 0.01747510427147952, "grad_norm": 24.032783190066525, "learning_rate": 1.7474769856777393e-05, "loss": 2.611, "mean_token_accuracy": 0.4344827592372894, "step": 17350 }, { "epoch": 0.017480140324583695, "grad_norm": 24.773190508745046, "learning_rate": 1.7479805813508452e-05, "loss": 2.3345, "mean_token_accuracy": 0.44482759237289426, "step": 17355 }, { "epoch": 0.01748517637768787, "grad_norm": 26.556699719132038, "learning_rate": 1.748484177023951e-05, "loss": 2.6376, "mean_token_accuracy": 0.3896551787853241, "step": 17360 }, { "epoch": 0.01749021243079204, "grad_norm": 26.31064089327591, "learning_rate": 1.748987772697057e-05, "loss": 2.5097, "mean_token_accuracy": 0.43103447556495667, "step": 17365 }, { "epoch": 0.017495248483896213, "grad_norm": 21.454088842914057, "learning_rate": 1.749491368370163e-05, "loss": 1.9923, "mean_token_accuracy": 0.4712038815021515, "step": 17370 }, { "epoch": 0.017500284537000386, "grad_norm": 18.55862458351415, "learning_rate": 1.749994964043269e-05, "loss": 2.5636, "mean_token_accuracy": 0.3827586144208908, "step": 17375 }, { "epoch": 0.01750532059010456, "grad_norm": 28.418284016942984, "learning_rate": 1.7504985597163752e-05, "loss": 2.0485, "mean_token_accuracy": 0.47852389216423036, "step": 17380 }, { "epoch": 0.01751035664320873, "grad_norm": 27.532273746794147, "learning_rate": 1.7510021553894808e-05, "loss": 2.283, "mean_token_accuracy": 0.47931034564971925, "step": 17385 }, { "epoch": 0.017515392696312904, "grad_norm": 23.28607582655699, "learning_rate": 1.751505751062587e-05, "loss": 2.6172, "mean_token_accuracy": 0.4329098641872406, "step": 17390 }, { "epoch": 0.017520428749417078, "grad_norm": 31.3212357837227, "learning_rate": 1.752009346735693e-05, "loss": 3.0013, "mean_token_accuracy": 0.32413792610168457, "step": 17395 }, { "epoch": 0.01752546480252125, "grad_norm": 30.656970365145632, "learning_rate": 1.752512942408799e-05, "loss": 2.4152, "mean_token_accuracy": 0.4413793087005615, "step": 17400 }, { "epoch": 0.017530500855625422, "grad_norm": 28.62431970737396, "learning_rate": 1.753016538081905e-05, "loss": 2.3463, "mean_token_accuracy": 0.4724137902259827, "step": 17405 }, { "epoch": 0.017535536908729596, "grad_norm": 25.44910255275347, "learning_rate": 1.7535201337550108e-05, "loss": 2.8187, "mean_token_accuracy": 0.3758620619773865, "step": 17410 }, { "epoch": 0.017540572961833766, "grad_norm": 22.715702431006928, "learning_rate": 1.7540237294281167e-05, "loss": 2.448, "mean_token_accuracy": 0.4172413766384125, "step": 17415 }, { "epoch": 0.01754560901493794, "grad_norm": 22.497223643230594, "learning_rate": 1.754527325101223e-05, "loss": 2.4341, "mean_token_accuracy": 0.46376285552978513, "step": 17420 }, { "epoch": 0.017550645068042114, "grad_norm": 21.959151439427185, "learning_rate": 1.755030920774329e-05, "loss": 2.651, "mean_token_accuracy": 0.37931035459041595, "step": 17425 }, { "epoch": 0.017555681121146288, "grad_norm": 40.84754050533766, "learning_rate": 1.7555345164474348e-05, "loss": 2.4802, "mean_token_accuracy": 0.4275862008333206, "step": 17430 }, { "epoch": 0.017560717174250458, "grad_norm": 22.6856804952366, "learning_rate": 1.7560381121205407e-05, "loss": 2.2713, "mean_token_accuracy": 0.42413792610168455, "step": 17435 }, { "epoch": 0.01756575322735463, "grad_norm": 33.437174409592515, "learning_rate": 1.7565417077936466e-05, "loss": 2.5717, "mean_token_accuracy": 0.4034482717514038, "step": 17440 }, { "epoch": 0.017570789280458805, "grad_norm": 30.44284986700725, "learning_rate": 1.757045303466753e-05, "loss": 2.4759, "mean_token_accuracy": 0.4068965494632721, "step": 17445 }, { "epoch": 0.017575825333562976, "grad_norm": 42.91800692760815, "learning_rate": 1.7575488991398585e-05, "loss": 2.7056, "mean_token_accuracy": 0.4344827592372894, "step": 17450 }, { "epoch": 0.01758086138666715, "grad_norm": 24.418330753413446, "learning_rate": 1.7580524948129644e-05, "loss": 2.3593, "mean_token_accuracy": 0.43448275327682495, "step": 17455 }, { "epoch": 0.017585897439771323, "grad_norm": 25.518142057620672, "learning_rate": 1.7585560904860707e-05, "loss": 2.4157, "mean_token_accuracy": 0.45517241060733793, "step": 17460 }, { "epoch": 0.017590933492875497, "grad_norm": 18.98461253847586, "learning_rate": 1.7590596861591766e-05, "loss": 2.337, "mean_token_accuracy": 0.4241379201412201, "step": 17465 }, { "epoch": 0.017595969545979667, "grad_norm": 35.89824019991377, "learning_rate": 1.7595632818322825e-05, "loss": 2.2511, "mean_token_accuracy": 0.42758620381355283, "step": 17470 }, { "epoch": 0.01760100559908384, "grad_norm": 22.815412389329182, "learning_rate": 1.7600668775053885e-05, "loss": 2.5754, "mean_token_accuracy": 0.45517241954803467, "step": 17475 }, { "epoch": 0.017606041652188015, "grad_norm": 24.80619368605664, "learning_rate": 1.7605704731784944e-05, "loss": 2.3215, "mean_token_accuracy": 0.4572897732257843, "step": 17480 }, { "epoch": 0.017611077705292185, "grad_norm": 29.686113140259838, "learning_rate": 1.7610740688516007e-05, "loss": 2.3438, "mean_token_accuracy": 0.4344827592372894, "step": 17485 }, { "epoch": 0.01761611375839636, "grad_norm": 27.604209203992585, "learning_rate": 1.7615776645247066e-05, "loss": 2.4472, "mean_token_accuracy": 0.42413793206214906, "step": 17490 }, { "epoch": 0.017621149811500533, "grad_norm": 24.714328471228647, "learning_rate": 1.762081260197812e-05, "loss": 2.4688, "mean_token_accuracy": 0.43793103098869324, "step": 17495 }, { "epoch": 0.017626185864604706, "grad_norm": 19.88228375594727, "learning_rate": 1.7625848558709184e-05, "loss": 2.2342, "mean_token_accuracy": 0.4551724076271057, "step": 17500 }, { "epoch": 0.017631221917708877, "grad_norm": 18.768651688850156, "learning_rate": 1.7630884515440244e-05, "loss": 2.1795, "mean_token_accuracy": 0.4517241418361664, "step": 17505 }, { "epoch": 0.01763625797081305, "grad_norm": 25.87447357424474, "learning_rate": 1.7635920472171306e-05, "loss": 2.5555, "mean_token_accuracy": 0.39655172228813174, "step": 17510 }, { "epoch": 0.017641294023917224, "grad_norm": 20.611190898242114, "learning_rate": 1.7640956428902365e-05, "loss": 2.2705, "mean_token_accuracy": 0.4088324248790741, "step": 17515 }, { "epoch": 0.017646330077021394, "grad_norm": 24.30741609002821, "learning_rate": 1.764599238563342e-05, "loss": 1.943, "mean_token_accuracy": 0.5155172407627105, "step": 17520 }, { "epoch": 0.017651366130125568, "grad_norm": 28.197705245983673, "learning_rate": 1.7651028342364484e-05, "loss": 2.7748, "mean_token_accuracy": 0.3620689570903778, "step": 17525 }, { "epoch": 0.017656402183229742, "grad_norm": 24.178148949146955, "learning_rate": 1.7656064299095543e-05, "loss": 2.417, "mean_token_accuracy": 0.4103448331356049, "step": 17530 }, { "epoch": 0.017661438236333916, "grad_norm": 24.384424738049628, "learning_rate": 1.7661100255826602e-05, "loss": 2.2878, "mean_token_accuracy": 0.44827585816383364, "step": 17535 }, { "epoch": 0.017666474289438086, "grad_norm": 32.82125709249734, "learning_rate": 1.7666136212557662e-05, "loss": 2.4782, "mean_token_accuracy": 0.43103448748588563, "step": 17540 }, { "epoch": 0.01767151034254226, "grad_norm": 20.304794758538325, "learning_rate": 1.767117216928872e-05, "loss": 2.4222, "mean_token_accuracy": 0.4206896543502808, "step": 17545 }, { "epoch": 0.017676546395646434, "grad_norm": 19.461983219121986, "learning_rate": 1.7676208126019784e-05, "loss": 2.5778, "mean_token_accuracy": 0.39310344457626345, "step": 17550 }, { "epoch": 0.017681582448750604, "grad_norm": 18.5774299106097, "learning_rate": 1.7681244082750843e-05, "loss": 2.1944, "mean_token_accuracy": 0.42068964838981626, "step": 17555 }, { "epoch": 0.017686618501854778, "grad_norm": 30.991594543549862, "learning_rate": 1.7686280039481902e-05, "loss": 2.1665, "mean_token_accuracy": 0.44482759237289426, "step": 17560 }, { "epoch": 0.01769165455495895, "grad_norm": 26.435215446921944, "learning_rate": 1.769131599621296e-05, "loss": 2.7565, "mean_token_accuracy": 0.3862068921327591, "step": 17565 }, { "epoch": 0.017696690608063125, "grad_norm": 23.76161114727331, "learning_rate": 1.769635195294402e-05, "loss": 2.2272, "mean_token_accuracy": 0.43103448748588563, "step": 17570 }, { "epoch": 0.017701726661167295, "grad_norm": 23.746848862187342, "learning_rate": 1.770138790967508e-05, "loss": 2.2432, "mean_token_accuracy": 0.43189655542373656, "step": 17575 }, { "epoch": 0.01770676271427147, "grad_norm": 24.675746764587743, "learning_rate": 1.7706423866406143e-05, "loss": 2.377, "mean_token_accuracy": 0.4413793087005615, "step": 17580 }, { "epoch": 0.017711798767375643, "grad_norm": 28.94297930018188, "learning_rate": 1.77114598231372e-05, "loss": 2.2313, "mean_token_accuracy": 0.4772167444229126, "step": 17585 }, { "epoch": 0.017716834820479813, "grad_norm": 22.398562915053283, "learning_rate": 1.771649577986826e-05, "loss": 2.3615, "mean_token_accuracy": 0.47453115582466127, "step": 17590 }, { "epoch": 0.017721870873583987, "grad_norm": 31.97321389502253, "learning_rate": 1.772153173659932e-05, "loss": 2.4503, "mean_token_accuracy": 0.463520872592926, "step": 17595 }, { "epoch": 0.01772690692668816, "grad_norm": 20.57668206466762, "learning_rate": 1.772656769333038e-05, "loss": 2.3387, "mean_token_accuracy": 0.482758617401123, "step": 17600 }, { "epoch": 0.017731942979792335, "grad_norm": 19.505591317719347, "learning_rate": 1.773160365006144e-05, "loss": 2.3006, "mean_token_accuracy": 0.4586206912994385, "step": 17605 }, { "epoch": 0.017736979032896505, "grad_norm": 40.07010102083105, "learning_rate": 1.7736639606792498e-05, "loss": 2.6011, "mean_token_accuracy": 0.42928009629249575, "step": 17610 }, { "epoch": 0.01774201508600068, "grad_norm": 31.149344924268654, "learning_rate": 1.774167556352356e-05, "loss": 2.3403, "mean_token_accuracy": 0.41034482717514037, "step": 17615 }, { "epoch": 0.017747051139104852, "grad_norm": 22.999002334619874, "learning_rate": 1.774671152025462e-05, "loss": 2.558, "mean_token_accuracy": 0.4328493714332581, "step": 17620 }, { "epoch": 0.017752087192209023, "grad_norm": 19.465299523840663, "learning_rate": 1.775174747698568e-05, "loss": 2.197, "mean_token_accuracy": 0.4708409011363983, "step": 17625 }, { "epoch": 0.017757123245313196, "grad_norm": 27.630794938445447, "learning_rate": 1.775678343371674e-05, "loss": 2.2774, "mean_token_accuracy": 0.4034482717514038, "step": 17630 }, { "epoch": 0.01776215929841737, "grad_norm": 26.03809983600311, "learning_rate": 1.7761819390447798e-05, "loss": 2.615, "mean_token_accuracy": 0.4137930989265442, "step": 17635 }, { "epoch": 0.017767195351521544, "grad_norm": 30.731470789434216, "learning_rate": 1.7766855347178857e-05, "loss": 2.2515, "mean_token_accuracy": 0.4689655303955078, "step": 17640 }, { "epoch": 0.017772231404625714, "grad_norm": 22.528022437516356, "learning_rate": 1.777189130390992e-05, "loss": 2.3947, "mean_token_accuracy": 0.41379310488700866, "step": 17645 }, { "epoch": 0.017777267457729888, "grad_norm": 20.055747158686895, "learning_rate": 1.7776927260640975e-05, "loss": 2.2321, "mean_token_accuracy": 0.441379314661026, "step": 17650 }, { "epoch": 0.017782303510834062, "grad_norm": 25.88682399334647, "learning_rate": 1.7781963217372038e-05, "loss": 2.5413, "mean_token_accuracy": 0.4068965494632721, "step": 17655 }, { "epoch": 0.017787339563938232, "grad_norm": 26.07787959697268, "learning_rate": 1.7786999174103097e-05, "loss": 2.522, "mean_token_accuracy": 0.4034482777118683, "step": 17660 }, { "epoch": 0.017792375617042406, "grad_norm": 31.559282927008837, "learning_rate": 1.7792035130834157e-05, "loss": 2.1127, "mean_token_accuracy": 0.4482758641242981, "step": 17665 }, { "epoch": 0.01779741167014658, "grad_norm": 28.65068090976905, "learning_rate": 1.7797071087565216e-05, "loss": 2.4776, "mean_token_accuracy": 0.4103448212146759, "step": 17670 }, { "epoch": 0.017802447723250753, "grad_norm": 27.93920190299055, "learning_rate": 1.7802107044296275e-05, "loss": 2.3597, "mean_token_accuracy": 0.44664247035980226, "step": 17675 }, { "epoch": 0.017807483776354924, "grad_norm": 22.397118637496995, "learning_rate": 1.7807143001027334e-05, "loss": 2.0955, "mean_token_accuracy": 0.4896551787853241, "step": 17680 }, { "epoch": 0.017812519829459098, "grad_norm": 22.853642386216666, "learning_rate": 1.7812178957758397e-05, "loss": 2.5238, "mean_token_accuracy": 0.3551724076271057, "step": 17685 }, { "epoch": 0.01781755588256327, "grad_norm": 26.36908877903871, "learning_rate": 1.7817214914489456e-05, "loss": 2.7587, "mean_token_accuracy": 0.41724138259887694, "step": 17690 }, { "epoch": 0.01782259193566744, "grad_norm": 24.9865067076988, "learning_rate": 1.7822250871220516e-05, "loss": 2.4114, "mean_token_accuracy": 0.4068965554237366, "step": 17695 }, { "epoch": 0.017827627988771615, "grad_norm": 24.312096621812806, "learning_rate": 1.7827286827951575e-05, "loss": 2.2751, "mean_token_accuracy": 0.4620689630508423, "step": 17700 }, { "epoch": 0.01783266404187579, "grad_norm": 26.887038696163156, "learning_rate": 1.7832322784682634e-05, "loss": 2.5744, "mean_token_accuracy": 0.38275861740112305, "step": 17705 }, { "epoch": 0.017837700094979963, "grad_norm": 28.059780291831554, "learning_rate": 1.7837358741413697e-05, "loss": 2.4029, "mean_token_accuracy": 0.45172414779663084, "step": 17710 }, { "epoch": 0.017842736148084133, "grad_norm": 21.021218327454225, "learning_rate": 1.7842394698144756e-05, "loss": 2.1719, "mean_token_accuracy": 0.4586206912994385, "step": 17715 }, { "epoch": 0.017847772201188307, "grad_norm": 25.718807589788288, "learning_rate": 1.7847430654875812e-05, "loss": 2.2782, "mean_token_accuracy": 0.41917725205421447, "step": 17720 }, { "epoch": 0.01785280825429248, "grad_norm": 29.280621724606597, "learning_rate": 1.7852466611606874e-05, "loss": 2.3456, "mean_token_accuracy": 0.44827585816383364, "step": 17725 }, { "epoch": 0.01785784430739665, "grad_norm": 26.999977608066605, "learning_rate": 1.7857502568337934e-05, "loss": 2.4852, "mean_token_accuracy": 0.40689654350280763, "step": 17730 }, { "epoch": 0.017862880360500825, "grad_norm": 22.89249060597913, "learning_rate": 1.7862538525068996e-05, "loss": 2.4806, "mean_token_accuracy": 0.41034482717514037, "step": 17735 }, { "epoch": 0.017867916413605, "grad_norm": 32.99612133088084, "learning_rate": 1.7867574481800052e-05, "loss": 2.9538, "mean_token_accuracy": 0.34482758343219755, "step": 17740 }, { "epoch": 0.017872952466709172, "grad_norm": 19.215433956640425, "learning_rate": 1.787261043853111e-05, "loss": 2.1701, "mean_token_accuracy": 0.5084089517593384, "step": 17745 }, { "epoch": 0.017877988519813343, "grad_norm": 21.83348193967266, "learning_rate": 1.7877646395262174e-05, "loss": 2.4333, "mean_token_accuracy": 0.41004234552383423, "step": 17750 }, { "epoch": 0.017883024572917516, "grad_norm": 20.536419297082414, "learning_rate": 1.7882682351993233e-05, "loss": 2.3946, "mean_token_accuracy": 0.4172413766384125, "step": 17755 }, { "epoch": 0.01788806062602169, "grad_norm": 22.407979233843204, "learning_rate": 1.7887718308724293e-05, "loss": 2.1646, "mean_token_accuracy": 0.4689655125141144, "step": 17760 }, { "epoch": 0.01789309667912586, "grad_norm": 47.956054068011184, "learning_rate": 1.7892754265455352e-05, "loss": 2.4651, "mean_token_accuracy": 0.43103448748588563, "step": 17765 }, { "epoch": 0.017898132732230034, "grad_norm": 20.930335234278594, "learning_rate": 1.789779022218641e-05, "loss": 2.4341, "mean_token_accuracy": 0.4189957737922668, "step": 17770 }, { "epoch": 0.017903168785334208, "grad_norm": 21.71411443575081, "learning_rate": 1.7902826178917474e-05, "loss": 2.2481, "mean_token_accuracy": 0.43641863465309144, "step": 17775 }, { "epoch": 0.01790820483843838, "grad_norm": 28.7102381154154, "learning_rate": 1.7907862135648533e-05, "loss": 2.7496, "mean_token_accuracy": 0.3931034505367279, "step": 17780 }, { "epoch": 0.017913240891542552, "grad_norm": 21.781497393774814, "learning_rate": 1.791289809237959e-05, "loss": 2.5741, "mean_token_accuracy": 0.4103448331356049, "step": 17785 }, { "epoch": 0.017918276944646726, "grad_norm": 24.781937344331205, "learning_rate": 1.791793404911065e-05, "loss": 2.8346, "mean_token_accuracy": 0.3896551728248596, "step": 17790 }, { "epoch": 0.0179233129977509, "grad_norm": 29.059807852533513, "learning_rate": 1.792297000584171e-05, "loss": 2.5855, "mean_token_accuracy": 0.43103447556495667, "step": 17795 }, { "epoch": 0.01792834905085507, "grad_norm": 25.407786555534514, "learning_rate": 1.792800596257277e-05, "loss": 2.4015, "mean_token_accuracy": 0.44827587008476255, "step": 17800 }, { "epoch": 0.017933385103959244, "grad_norm": 17.973880950806024, "learning_rate": 1.793304191930383e-05, "loss": 2.2747, "mean_token_accuracy": 0.45396249890327456, "step": 17805 }, { "epoch": 0.017938421157063417, "grad_norm": 17.283264375485086, "learning_rate": 1.793807787603489e-05, "loss": 2.3358, "mean_token_accuracy": 0.4517241358757019, "step": 17810 }, { "epoch": 0.01794345721016759, "grad_norm": 28.150657409318192, "learning_rate": 1.794311383276595e-05, "loss": 2.4233, "mean_token_accuracy": 0.4310344845056534, "step": 17815 }, { "epoch": 0.01794849326327176, "grad_norm": 21.39567686871591, "learning_rate": 1.794814978949701e-05, "loss": 2.5214, "mean_token_accuracy": 0.41034482717514037, "step": 17820 }, { "epoch": 0.017953529316375935, "grad_norm": 26.30556551873422, "learning_rate": 1.795318574622807e-05, "loss": 2.3908, "mean_token_accuracy": 0.3793103456497192, "step": 17825 }, { "epoch": 0.01795856536948011, "grad_norm": 19.253761101351866, "learning_rate": 1.795822170295913e-05, "loss": 2.3134, "mean_token_accuracy": 0.45517241954803467, "step": 17830 }, { "epoch": 0.01796360142258428, "grad_norm": 17.509717985428647, "learning_rate": 1.7963257659690188e-05, "loss": 2.6476, "mean_token_accuracy": 0.37241379022598264, "step": 17835 }, { "epoch": 0.017968637475688453, "grad_norm": 25.57963889440956, "learning_rate": 1.7968293616421247e-05, "loss": 2.4963, "mean_token_accuracy": 0.43448275327682495, "step": 17840 }, { "epoch": 0.017973673528792627, "grad_norm": 21.712329938231694, "learning_rate": 1.797332957315231e-05, "loss": 2.4557, "mean_token_accuracy": 0.4517241299152374, "step": 17845 }, { "epoch": 0.0179787095818968, "grad_norm": 22.276125501252693, "learning_rate": 1.7978365529883366e-05, "loss": 2.5801, "mean_token_accuracy": 0.4344827592372894, "step": 17850 }, { "epoch": 0.01798374563500097, "grad_norm": 25.400250930627106, "learning_rate": 1.798340148661443e-05, "loss": 2.4426, "mean_token_accuracy": 0.42413793206214906, "step": 17855 }, { "epoch": 0.017988781688105145, "grad_norm": 21.432619555026445, "learning_rate": 1.7988437443345488e-05, "loss": 2.6549, "mean_token_accuracy": 0.41379310488700866, "step": 17860 }, { "epoch": 0.01799381774120932, "grad_norm": 25.12524210740627, "learning_rate": 1.7993473400076547e-05, "loss": 2.5328, "mean_token_accuracy": 0.41724137365818026, "step": 17865 }, { "epoch": 0.01799885379431349, "grad_norm": 23.899818435308884, "learning_rate": 1.7998509356807606e-05, "loss": 2.4895, "mean_token_accuracy": 0.47586206793785096, "step": 17870 }, { "epoch": 0.018003889847417662, "grad_norm": 24.689313656253333, "learning_rate": 1.8003545313538666e-05, "loss": 2.3117, "mean_token_accuracy": 0.4448275864124298, "step": 17875 }, { "epoch": 0.018008925900521836, "grad_norm": 24.852767428053237, "learning_rate": 1.8008581270269725e-05, "loss": 2.3515, "mean_token_accuracy": 0.44482758045196535, "step": 17880 }, { "epoch": 0.01801396195362601, "grad_norm": 25.045935697856397, "learning_rate": 1.8013617227000787e-05, "loss": 2.7134, "mean_token_accuracy": 0.37931033968925476, "step": 17885 }, { "epoch": 0.01801899800673018, "grad_norm": 22.09563764144968, "learning_rate": 1.8018653183731847e-05, "loss": 2.3021, "mean_token_accuracy": 0.42758620977401735, "step": 17890 }, { "epoch": 0.018024034059834354, "grad_norm": 23.96496201047184, "learning_rate": 1.8023689140462906e-05, "loss": 2.4955, "mean_token_accuracy": 0.43103448748588563, "step": 17895 }, { "epoch": 0.018029070112938528, "grad_norm": 22.62156259127355, "learning_rate": 1.8028725097193965e-05, "loss": 2.6392, "mean_token_accuracy": 0.4034482717514038, "step": 17900 }, { "epoch": 0.018034106166042698, "grad_norm": 21.977156595515346, "learning_rate": 1.8033761053925024e-05, "loss": 2.179, "mean_token_accuracy": 0.46896552443504336, "step": 17905 }, { "epoch": 0.018039142219146872, "grad_norm": 26.5977938469768, "learning_rate": 1.8038797010656087e-05, "loss": 2.2387, "mean_token_accuracy": 0.4551724076271057, "step": 17910 }, { "epoch": 0.018044178272251046, "grad_norm": 24.20540638808236, "learning_rate": 1.8043832967387146e-05, "loss": 2.3446, "mean_token_accuracy": 0.4379310369491577, "step": 17915 }, { "epoch": 0.01804921432535522, "grad_norm": 19.484822030973742, "learning_rate": 1.8048868924118202e-05, "loss": 2.2816, "mean_token_accuracy": 0.4344827592372894, "step": 17920 }, { "epoch": 0.01805425037845939, "grad_norm": 27.28241072551932, "learning_rate": 1.8053904880849265e-05, "loss": 2.4013, "mean_token_accuracy": 0.4517241418361664, "step": 17925 }, { "epoch": 0.018059286431563563, "grad_norm": 20.45482198487239, "learning_rate": 1.8058940837580324e-05, "loss": 2.4581, "mean_token_accuracy": 0.42758620977401735, "step": 17930 }, { "epoch": 0.018064322484667737, "grad_norm": 26.813351799703437, "learning_rate": 1.8063976794311387e-05, "loss": 2.4552, "mean_token_accuracy": 0.4068965494632721, "step": 17935 }, { "epoch": 0.018069358537771908, "grad_norm": 19.369171272788428, "learning_rate": 1.8069012751042443e-05, "loss": 2.1522, "mean_token_accuracy": 0.42413793206214906, "step": 17940 }, { "epoch": 0.01807439459087608, "grad_norm": 22.992413625878434, "learning_rate": 1.8074048707773502e-05, "loss": 2.2247, "mean_token_accuracy": 0.4517241299152374, "step": 17945 }, { "epoch": 0.018079430643980255, "grad_norm": 22.051474908139603, "learning_rate": 1.8079084664504565e-05, "loss": 2.5226, "mean_token_accuracy": 0.36896551847457887, "step": 17950 }, { "epoch": 0.01808446669708443, "grad_norm": 25.30049048940708, "learning_rate": 1.8084120621235624e-05, "loss": 2.3189, "mean_token_accuracy": 0.4379310429096222, "step": 17955 }, { "epoch": 0.0180895027501886, "grad_norm": 20.92183462199487, "learning_rate": 1.8089156577966683e-05, "loss": 2.5142, "mean_token_accuracy": 0.4206896543502808, "step": 17960 }, { "epoch": 0.018094538803292773, "grad_norm": 24.861238493924773, "learning_rate": 1.8094192534697742e-05, "loss": 2.2847, "mean_token_accuracy": 0.4724137902259827, "step": 17965 }, { "epoch": 0.018099574856396947, "grad_norm": 26.07495989185928, "learning_rate": 1.80992284914288e-05, "loss": 2.5871, "mean_token_accuracy": 0.41379310488700866, "step": 17970 }, { "epoch": 0.018104610909501117, "grad_norm": 87.31115130334746, "learning_rate": 1.8104264448159864e-05, "loss": 2.4175, "mean_token_accuracy": 0.42413792610168455, "step": 17975 }, { "epoch": 0.01810964696260529, "grad_norm": 24.69459049240753, "learning_rate": 1.8109300404890923e-05, "loss": 2.2178, "mean_token_accuracy": 0.44827585816383364, "step": 17980 }, { "epoch": 0.018114683015709464, "grad_norm": 23.56723405687756, "learning_rate": 1.811433636162198e-05, "loss": 2.4496, "mean_token_accuracy": 0.39655172228813174, "step": 17985 }, { "epoch": 0.018119719068813638, "grad_norm": 24.335418420927656, "learning_rate": 1.8119372318353042e-05, "loss": 2.3083, "mean_token_accuracy": 0.4862068951129913, "step": 17990 }, { "epoch": 0.01812475512191781, "grad_norm": 22.404727098820032, "learning_rate": 1.81244082750841e-05, "loss": 2.0647, "mean_token_accuracy": 0.4707199037075043, "step": 17995 }, { "epoch": 0.018129791175021982, "grad_norm": 21.666225047985428, "learning_rate": 1.8129444231815164e-05, "loss": 2.5004, "mean_token_accuracy": 0.41379310488700866, "step": 18000 }, { "epoch": 0.018134827228126156, "grad_norm": 24.688246564524185, "learning_rate": 1.813448018854622e-05, "loss": 2.3677, "mean_token_accuracy": 0.44827585816383364, "step": 18005 }, { "epoch": 0.018139863281230326, "grad_norm": 37.11071613560086, "learning_rate": 1.813951614527728e-05, "loss": 2.6779, "mean_token_accuracy": 0.4137930989265442, "step": 18010 }, { "epoch": 0.0181448993343345, "grad_norm": 29.42426974889362, "learning_rate": 1.814455210200834e-05, "loss": 2.3591, "mean_token_accuracy": 0.46896551847457885, "step": 18015 }, { "epoch": 0.018149935387438674, "grad_norm": 26.212084878253762, "learning_rate": 1.81495880587394e-05, "loss": 2.2063, "mean_token_accuracy": 0.4586206912994385, "step": 18020 }, { "epoch": 0.018154971440542848, "grad_norm": 30.990568639230204, "learning_rate": 1.815462401547046e-05, "loss": 2.749, "mean_token_accuracy": 0.4172413766384125, "step": 18025 }, { "epoch": 0.018160007493647018, "grad_norm": 31.099745693694484, "learning_rate": 1.815965997220152e-05, "loss": 2.2452, "mean_token_accuracy": 0.47586206793785096, "step": 18030 }, { "epoch": 0.01816504354675119, "grad_norm": 22.46318922164754, "learning_rate": 1.816469592893258e-05, "loss": 2.3509, "mean_token_accuracy": 0.5051421642303466, "step": 18035 }, { "epoch": 0.018170079599855365, "grad_norm": 19.35005212412641, "learning_rate": 1.816973188566364e-05, "loss": 2.2841, "mean_token_accuracy": 0.4344827592372894, "step": 18040 }, { "epoch": 0.018175115652959536, "grad_norm": 21.684310936835846, "learning_rate": 1.81747678423947e-05, "loss": 2.4035, "mean_token_accuracy": 0.4448275864124298, "step": 18045 }, { "epoch": 0.01818015170606371, "grad_norm": 33.086371182428195, "learning_rate": 1.817980379912576e-05, "loss": 2.3478, "mean_token_accuracy": 0.4689655125141144, "step": 18050 }, { "epoch": 0.018185187759167883, "grad_norm": 19.07403892507646, "learning_rate": 1.818483975585682e-05, "loss": 1.9568, "mean_token_accuracy": 0.4640653312206268, "step": 18055 }, { "epoch": 0.018190223812272057, "grad_norm": 34.102307363480314, "learning_rate": 1.8189875712587878e-05, "loss": 2.3472, "mean_token_accuracy": 0.46551724076271056, "step": 18060 }, { "epoch": 0.018195259865376227, "grad_norm": 23.171587807785667, "learning_rate": 1.8194911669318937e-05, "loss": 2.5169, "mean_token_accuracy": 0.3931034505367279, "step": 18065 }, { "epoch": 0.0182002959184804, "grad_norm": 17.733018611376394, "learning_rate": 1.819994762605e-05, "loss": 2.0571, "mean_token_accuracy": 0.4689655125141144, "step": 18070 }, { "epoch": 0.018205331971584575, "grad_norm": 22.644821568901595, "learning_rate": 1.8204983582781056e-05, "loss": 2.5359, "mean_token_accuracy": 0.4068965554237366, "step": 18075 }, { "epoch": 0.018210368024688745, "grad_norm": 20.264727304480328, "learning_rate": 1.821001953951212e-05, "loss": 2.3684, "mean_token_accuracy": 0.4517241358757019, "step": 18080 }, { "epoch": 0.01821540407779292, "grad_norm": 262.09946279066895, "learning_rate": 1.8215055496243178e-05, "loss": 2.2312, "mean_token_accuracy": 0.47586206793785096, "step": 18085 }, { "epoch": 0.018220440130897093, "grad_norm": 31.818670940558857, "learning_rate": 1.8220091452974237e-05, "loss": 2.4037, "mean_token_accuracy": 0.42758620977401735, "step": 18090 }, { "epoch": 0.018225476184001266, "grad_norm": 23.209558515353002, "learning_rate": 1.8225127409705296e-05, "loss": 2.3897, "mean_token_accuracy": 0.4344827592372894, "step": 18095 }, { "epoch": 0.018230512237105437, "grad_norm": 28.612889002732594, "learning_rate": 1.8230163366436356e-05, "loss": 2.4921, "mean_token_accuracy": 0.37931033968925476, "step": 18100 }, { "epoch": 0.01823554829020961, "grad_norm": 23.55292949986719, "learning_rate": 1.8235199323167415e-05, "loss": 2.2141, "mean_token_accuracy": 0.4448275864124298, "step": 18105 }, { "epoch": 0.018240584343313784, "grad_norm": 23.948456099648727, "learning_rate": 1.8240235279898478e-05, "loss": 2.7445, "mean_token_accuracy": 0.4103448212146759, "step": 18110 }, { "epoch": 0.018245620396417955, "grad_norm": 31.177019492375045, "learning_rate": 1.8245271236629537e-05, "loss": 2.2587, "mean_token_accuracy": 0.49655171632766726, "step": 18115 }, { "epoch": 0.01825065644952213, "grad_norm": 20.244547132110107, "learning_rate": 1.8250307193360596e-05, "loss": 2.3337, "mean_token_accuracy": 0.4379310369491577, "step": 18120 }, { "epoch": 0.018255692502626302, "grad_norm": 20.66116424283987, "learning_rate": 1.8255343150091655e-05, "loss": 2.2243, "mean_token_accuracy": 0.48275862336158754, "step": 18125 }, { "epoch": 0.018260728555730476, "grad_norm": 20.0904786657892, "learning_rate": 1.8260379106822715e-05, "loss": 2.4443, "mean_token_accuracy": 0.3827586233615875, "step": 18130 }, { "epoch": 0.018265764608834646, "grad_norm": 22.21856935231415, "learning_rate": 1.8265415063553777e-05, "loss": 2.2486, "mean_token_accuracy": 0.44827585816383364, "step": 18135 }, { "epoch": 0.01827080066193882, "grad_norm": 21.22679228510795, "learning_rate": 1.8270451020284833e-05, "loss": 2.2839, "mean_token_accuracy": 0.43103448748588563, "step": 18140 }, { "epoch": 0.018275836715042994, "grad_norm": 20.21340258007444, "learning_rate": 1.8275486977015892e-05, "loss": 2.5993, "mean_token_accuracy": 0.4068965494632721, "step": 18145 }, { "epoch": 0.018280872768147164, "grad_norm": 24.636832974506714, "learning_rate": 1.8280522933746955e-05, "loss": 2.5697, "mean_token_accuracy": 0.4310344815254211, "step": 18150 }, { "epoch": 0.018285908821251338, "grad_norm": 25.73286651674442, "learning_rate": 1.8285558890478014e-05, "loss": 2.3321, "mean_token_accuracy": 0.441379314661026, "step": 18155 }, { "epoch": 0.01829094487435551, "grad_norm": 20.64224988299182, "learning_rate": 1.8290594847209073e-05, "loss": 2.165, "mean_token_accuracy": 0.4172413766384125, "step": 18160 }, { "epoch": 0.018295980927459685, "grad_norm": 32.5435038140328, "learning_rate": 1.8295630803940133e-05, "loss": 2.5579, "mean_token_accuracy": 0.4379310250282288, "step": 18165 }, { "epoch": 0.018301016980563856, "grad_norm": 23.232260552158536, "learning_rate": 1.8300666760671192e-05, "loss": 2.7553, "mean_token_accuracy": 0.38620689511299133, "step": 18170 }, { "epoch": 0.01830605303366803, "grad_norm": 21.16191819070584, "learning_rate": 1.8305702717402255e-05, "loss": 2.4844, "mean_token_accuracy": 0.4049606740474701, "step": 18175 }, { "epoch": 0.018311089086772203, "grad_norm": 21.791003867464912, "learning_rate": 1.8310738674133314e-05, "loss": 2.4461, "mean_token_accuracy": 0.4676346004009247, "step": 18180 }, { "epoch": 0.018316125139876373, "grad_norm": 25.34037013791446, "learning_rate": 1.831577463086437e-05, "loss": 2.2647, "mean_token_accuracy": 0.4655172348022461, "step": 18185 }, { "epoch": 0.018321161192980547, "grad_norm": 22.080552835509003, "learning_rate": 1.8320810587595432e-05, "loss": 2.0999, "mean_token_accuracy": 0.441379314661026, "step": 18190 }, { "epoch": 0.01832619724608472, "grad_norm": 27.581194693649795, "learning_rate": 1.832584654432649e-05, "loss": 2.5972, "mean_token_accuracy": 0.417241370677948, "step": 18195 }, { "epoch": 0.018331233299188895, "grad_norm": 23.409531578584257, "learning_rate": 1.8330882501057554e-05, "loss": 2.0593, "mean_token_accuracy": 0.4931034505367279, "step": 18200 }, { "epoch": 0.018336269352293065, "grad_norm": 24.20278832127102, "learning_rate": 1.833591845778861e-05, "loss": 2.8432, "mean_token_accuracy": 0.3896551728248596, "step": 18205 }, { "epoch": 0.01834130540539724, "grad_norm": 29.951294493477235, "learning_rate": 1.834095441451967e-05, "loss": 2.397, "mean_token_accuracy": 0.42758620381355283, "step": 18210 }, { "epoch": 0.018346341458501413, "grad_norm": 24.22736942119158, "learning_rate": 1.8345990371250732e-05, "loss": 2.2892, "mean_token_accuracy": 0.4517241358757019, "step": 18215 }, { "epoch": 0.018351377511605583, "grad_norm": 19.50679996040326, "learning_rate": 1.835102632798179e-05, "loss": 2.5672, "mean_token_accuracy": 0.38275861740112305, "step": 18220 }, { "epoch": 0.018356413564709757, "grad_norm": 25.986453028308706, "learning_rate": 1.835606228471285e-05, "loss": 2.4904, "mean_token_accuracy": 0.4344827592372894, "step": 18225 }, { "epoch": 0.01836144961781393, "grad_norm": 21.306181038151017, "learning_rate": 1.836109824144391e-05, "loss": 2.1886, "mean_token_accuracy": 0.4551724076271057, "step": 18230 }, { "epoch": 0.018366485670918104, "grad_norm": 25.16754083291629, "learning_rate": 1.836613419817497e-05, "loss": 2.2746, "mean_token_accuracy": 0.458620685338974, "step": 18235 }, { "epoch": 0.018371521724022274, "grad_norm": 24.206454312674186, "learning_rate": 1.837117015490603e-05, "loss": 2.5027, "mean_token_accuracy": 0.4206896543502808, "step": 18240 }, { "epoch": 0.018376557777126448, "grad_norm": 23.248629177404304, "learning_rate": 1.837620611163709e-05, "loss": 2.7173, "mean_token_accuracy": 0.4034482777118683, "step": 18245 }, { "epoch": 0.018381593830230622, "grad_norm": 25.15605550080502, "learning_rate": 1.838124206836815e-05, "loss": 2.135, "mean_token_accuracy": 0.4465819776058197, "step": 18250 }, { "epoch": 0.018386629883334792, "grad_norm": 26.67414574265421, "learning_rate": 1.838627802509921e-05, "loss": 2.5828, "mean_token_accuracy": 0.41034482717514037, "step": 18255 }, { "epoch": 0.018391665936438966, "grad_norm": 36.44143395868395, "learning_rate": 1.839131398183027e-05, "loss": 2.1803, "mean_token_accuracy": 0.4807881832122803, "step": 18260 }, { "epoch": 0.01839670198954314, "grad_norm": 21.887312294518598, "learning_rate": 1.8396349938561328e-05, "loss": 2.122, "mean_token_accuracy": 0.4724137902259827, "step": 18265 }, { "epoch": 0.018401738042647314, "grad_norm": 24.12697173675327, "learning_rate": 1.840138589529239e-05, "loss": 2.3038, "mean_token_accuracy": 0.4172413766384125, "step": 18270 }, { "epoch": 0.018406774095751484, "grad_norm": 20.76582688741353, "learning_rate": 1.8406421852023446e-05, "loss": 2.4333, "mean_token_accuracy": 0.4034482777118683, "step": 18275 }, { "epoch": 0.018411810148855658, "grad_norm": 19.30215815795281, "learning_rate": 1.841145780875451e-05, "loss": 2.3007, "mean_token_accuracy": 0.4172413766384125, "step": 18280 }, { "epoch": 0.01841684620195983, "grad_norm": 18.933301816078163, "learning_rate": 1.841649376548557e-05, "loss": 2.449, "mean_token_accuracy": 0.4206896543502808, "step": 18285 }, { "epoch": 0.018421882255064, "grad_norm": 25.17672256729281, "learning_rate": 1.8421529722216628e-05, "loss": 2.4051, "mean_token_accuracy": 0.39655172228813174, "step": 18290 }, { "epoch": 0.018426918308168175, "grad_norm": 20.298991493644095, "learning_rate": 1.8426565678947687e-05, "loss": 2.548, "mean_token_accuracy": 0.4620689570903778, "step": 18295 }, { "epoch": 0.01843195436127235, "grad_norm": 33.72948507543238, "learning_rate": 1.8431601635678746e-05, "loss": 2.4262, "mean_token_accuracy": 0.417241370677948, "step": 18300 }, { "epoch": 0.018436990414376523, "grad_norm": 24.376558557746463, "learning_rate": 1.8436637592409805e-05, "loss": 2.4937, "mean_token_accuracy": 0.36896551251411436, "step": 18305 }, { "epoch": 0.018442026467480693, "grad_norm": 24.105309686406684, "learning_rate": 1.8441673549140868e-05, "loss": 2.447, "mean_token_accuracy": 0.4206896543502808, "step": 18310 }, { "epoch": 0.018447062520584867, "grad_norm": 17.683494157745173, "learning_rate": 1.8446709505871927e-05, "loss": 2.4752, "mean_token_accuracy": 0.4172413766384125, "step": 18315 }, { "epoch": 0.01845209857368904, "grad_norm": 24.057333381526337, "learning_rate": 1.8451745462602986e-05, "loss": 2.3932, "mean_token_accuracy": 0.41379310488700866, "step": 18320 }, { "epoch": 0.01845713462679321, "grad_norm": 17.076969746205037, "learning_rate": 1.8456781419334046e-05, "loss": 2.1679, "mean_token_accuracy": 0.47084090709686277, "step": 18325 }, { "epoch": 0.018462170679897385, "grad_norm": 33.926869220634046, "learning_rate": 1.8461817376065105e-05, "loss": 2.2014, "mean_token_accuracy": 0.38620689511299133, "step": 18330 }, { "epoch": 0.01846720673300156, "grad_norm": 26.747349952152167, "learning_rate": 1.8466853332796168e-05, "loss": 2.6312, "mean_token_accuracy": 0.3482758641242981, "step": 18335 }, { "epoch": 0.018472242786105732, "grad_norm": 19.441859540332494, "learning_rate": 1.8471889289527224e-05, "loss": 2.7548, "mean_token_accuracy": 0.4068965494632721, "step": 18340 }, { "epoch": 0.018477278839209903, "grad_norm": 18.43005966056521, "learning_rate": 1.8476925246258283e-05, "loss": 2.2123, "mean_token_accuracy": 0.4206896543502808, "step": 18345 }, { "epoch": 0.018482314892314076, "grad_norm": 24.168112688403802, "learning_rate": 1.8481961202989345e-05, "loss": 2.5376, "mean_token_accuracy": 0.4551724135875702, "step": 18350 }, { "epoch": 0.01848735094541825, "grad_norm": 35.506522947631794, "learning_rate": 1.8486997159720405e-05, "loss": 2.541, "mean_token_accuracy": 0.4118572294712067, "step": 18355 }, { "epoch": 0.01849238699852242, "grad_norm": 16.67603722301561, "learning_rate": 1.8492033116451464e-05, "loss": 2.4443, "mean_token_accuracy": 0.4344827592372894, "step": 18360 }, { "epoch": 0.018497423051626594, "grad_norm": 23.02598946078082, "learning_rate": 1.8497069073182523e-05, "loss": 2.5876, "mean_token_accuracy": 0.42068966031074523, "step": 18365 }, { "epoch": 0.018502459104730768, "grad_norm": 29.81051655374456, "learning_rate": 1.8502105029913582e-05, "loss": 2.3613, "mean_token_accuracy": 0.4464609742164612, "step": 18370 }, { "epoch": 0.018507495157834942, "grad_norm": 24.071169605696266, "learning_rate": 1.8507140986644645e-05, "loss": 2.6156, "mean_token_accuracy": 0.4206896543502808, "step": 18375 }, { "epoch": 0.018512531210939112, "grad_norm": 17.924317123757, "learning_rate": 1.8512176943375704e-05, "loss": 2.4325, "mean_token_accuracy": 0.42413792610168455, "step": 18380 }, { "epoch": 0.018517567264043286, "grad_norm": 23.6614030272707, "learning_rate": 1.851721290010676e-05, "loss": 2.1647, "mean_token_accuracy": 0.4662561535835266, "step": 18385 }, { "epoch": 0.01852260331714746, "grad_norm": 24.344594508760775, "learning_rate": 1.8522248856837823e-05, "loss": 2.8046, "mean_token_accuracy": 0.3965517282485962, "step": 18390 }, { "epoch": 0.01852763937025163, "grad_norm": 29.05329447343585, "learning_rate": 1.8527284813568882e-05, "loss": 2.795, "mean_token_accuracy": 0.38965516686439516, "step": 18395 }, { "epoch": 0.018532675423355804, "grad_norm": 23.44668463015132, "learning_rate": 1.8532320770299945e-05, "loss": 2.6074, "mean_token_accuracy": 0.4172413766384125, "step": 18400 }, { "epoch": 0.018537711476459977, "grad_norm": 25.68964458542458, "learning_rate": 1.8537356727031e-05, "loss": 2.4078, "mean_token_accuracy": 0.3999999940395355, "step": 18405 }, { "epoch": 0.01854274752956415, "grad_norm": 19.79096826839757, "learning_rate": 1.854239268376206e-05, "loss": 2.4656, "mean_token_accuracy": 0.4, "step": 18410 }, { "epoch": 0.01854778358266832, "grad_norm": 25.116766876971045, "learning_rate": 1.8547428640493122e-05, "loss": 2.4208, "mean_token_accuracy": 0.4655172348022461, "step": 18415 }, { "epoch": 0.018552819635772495, "grad_norm": 21.10956035204785, "learning_rate": 1.8552464597224182e-05, "loss": 2.7123, "mean_token_accuracy": 0.39999998807907106, "step": 18420 }, { "epoch": 0.01855785568887667, "grad_norm": 28.40286701907159, "learning_rate": 1.8557500553955244e-05, "loss": 2.2128, "mean_token_accuracy": 0.4263157844543457, "step": 18425 }, { "epoch": 0.01856289174198084, "grad_norm": 20.320657102430992, "learning_rate": 1.85625365106863e-05, "loss": 2.2389, "mean_token_accuracy": 0.43224440813064574, "step": 18430 }, { "epoch": 0.018567927795085013, "grad_norm": 26.43193860882125, "learning_rate": 1.856757246741736e-05, "loss": 2.179, "mean_token_accuracy": 0.4620689690113068, "step": 18435 }, { "epoch": 0.018572963848189187, "grad_norm": 26.029482516303847, "learning_rate": 1.8572608424148422e-05, "loss": 2.4367, "mean_token_accuracy": 0.4448275864124298, "step": 18440 }, { "epoch": 0.01857799990129336, "grad_norm": 26.36269070776917, "learning_rate": 1.857764438087948e-05, "loss": 2.5901, "mean_token_accuracy": 0.42758620381355283, "step": 18445 }, { "epoch": 0.01858303595439753, "grad_norm": 24.25198266824291, "learning_rate": 1.858268033761054e-05, "loss": 2.277, "mean_token_accuracy": 0.4517241299152374, "step": 18450 }, { "epoch": 0.018588072007501705, "grad_norm": 20.965138881577037, "learning_rate": 1.85877162943416e-05, "loss": 2.1715, "mean_token_accuracy": 0.4517241418361664, "step": 18455 }, { "epoch": 0.01859310806060588, "grad_norm": 27.964006325279833, "learning_rate": 1.859275225107266e-05, "loss": 2.6605, "mean_token_accuracy": 0.3827586233615875, "step": 18460 }, { "epoch": 0.01859814411371005, "grad_norm": 19.795094147467587, "learning_rate": 1.8597788207803722e-05, "loss": 2.0803, "mean_token_accuracy": 0.45359951853752134, "step": 18465 }, { "epoch": 0.018603180166814223, "grad_norm": 23.69274929922452, "learning_rate": 1.860282416453478e-05, "loss": 2.2592, "mean_token_accuracy": 0.4620689630508423, "step": 18470 }, { "epoch": 0.018608216219918396, "grad_norm": 25.75426115793313, "learning_rate": 1.8607860121265837e-05, "loss": 2.7545, "mean_token_accuracy": 0.4034482777118683, "step": 18475 }, { "epoch": 0.01861325227302257, "grad_norm": 25.138947021390557, "learning_rate": 1.86128960779969e-05, "loss": 2.608, "mean_token_accuracy": 0.41542649269104004, "step": 18480 }, { "epoch": 0.01861828832612674, "grad_norm": 20.300810854767338, "learning_rate": 1.861793203472796e-05, "loss": 2.4421, "mean_token_accuracy": 0.45033273100852966, "step": 18485 }, { "epoch": 0.018623324379230914, "grad_norm": 20.708512153951613, "learning_rate": 1.8622967991459018e-05, "loss": 2.3171, "mean_token_accuracy": 0.4137930989265442, "step": 18490 }, { "epoch": 0.018628360432335088, "grad_norm": 18.960310294395807, "learning_rate": 1.8628003948190077e-05, "loss": 2.3082, "mean_token_accuracy": 0.47586206197738645, "step": 18495 }, { "epoch": 0.018633396485439258, "grad_norm": 28.23100233338147, "learning_rate": 1.8633039904921137e-05, "loss": 2.5978, "mean_token_accuracy": 0.39655172228813174, "step": 18500 }, { "epoch": 0.018638432538543432, "grad_norm": 22.786989551854223, "learning_rate": 1.86380758616522e-05, "loss": 2.4694, "mean_token_accuracy": 0.4172413766384125, "step": 18505 }, { "epoch": 0.018643468591647606, "grad_norm": 18.117899626868255, "learning_rate": 1.864311181838326e-05, "loss": 2.1059, "mean_token_accuracy": 0.47241379618644713, "step": 18510 }, { "epoch": 0.01864850464475178, "grad_norm": 25.602680263941036, "learning_rate": 1.8648147775114318e-05, "loss": 2.4637, "mean_token_accuracy": 0.37586207389831544, "step": 18515 }, { "epoch": 0.01865354069785595, "grad_norm": 24.191454159131858, "learning_rate": 1.8653183731845377e-05, "loss": 2.5345, "mean_token_accuracy": 0.3896551728248596, "step": 18520 }, { "epoch": 0.018658576750960124, "grad_norm": 23.13403813802116, "learning_rate": 1.8658219688576436e-05, "loss": 2.4672, "mean_token_accuracy": 0.39655172228813174, "step": 18525 }, { "epoch": 0.018663612804064297, "grad_norm": 23.663311339393506, "learning_rate": 1.8663255645307495e-05, "loss": 2.3559, "mean_token_accuracy": 0.42068964838981626, "step": 18530 }, { "epoch": 0.018668648857168468, "grad_norm": 27.471039374875893, "learning_rate": 1.8668291602038558e-05, "loss": 2.4905, "mean_token_accuracy": 0.4137930989265442, "step": 18535 }, { "epoch": 0.01867368491027264, "grad_norm": 23.03935742764747, "learning_rate": 1.8673327558769614e-05, "loss": 2.5281, "mean_token_accuracy": 0.3655172407627106, "step": 18540 }, { "epoch": 0.018678720963376815, "grad_norm": 20.12579561621323, "learning_rate": 1.8678363515500677e-05, "loss": 2.0713, "mean_token_accuracy": 0.4571082890033722, "step": 18545 }, { "epoch": 0.01868375701648099, "grad_norm": 21.439037772772547, "learning_rate": 1.8683399472231736e-05, "loss": 2.3707, "mean_token_accuracy": 0.4433151841163635, "step": 18550 }, { "epoch": 0.01868879306958516, "grad_norm": 30.227392675033723, "learning_rate": 1.8688435428962795e-05, "loss": 2.3508, "mean_token_accuracy": 0.4310344815254211, "step": 18555 }, { "epoch": 0.018693829122689333, "grad_norm": 18.40296840728748, "learning_rate": 1.8693471385693854e-05, "loss": 2.4781, "mean_token_accuracy": 0.43774954676628114, "step": 18560 }, { "epoch": 0.018698865175793507, "grad_norm": 25.110820555236238, "learning_rate": 1.8698507342424914e-05, "loss": 2.7454, "mean_token_accuracy": 0.39171203672885896, "step": 18565 }, { "epoch": 0.018703901228897677, "grad_norm": 19.20230503447168, "learning_rate": 1.8703543299155973e-05, "loss": 2.3663, "mean_token_accuracy": 0.4655172348022461, "step": 18570 }, { "epoch": 0.01870893728200185, "grad_norm": 20.745675559486628, "learning_rate": 1.8708579255887035e-05, "loss": 2.4282, "mean_token_accuracy": 0.42613430619239806, "step": 18575 }, { "epoch": 0.018713973335106025, "grad_norm": 20.523347873253048, "learning_rate": 1.8713615212618095e-05, "loss": 2.6128, "mean_token_accuracy": 0.41034482717514037, "step": 18580 }, { "epoch": 0.0187190093882102, "grad_norm": 26.002638657581258, "learning_rate": 1.8718651169349154e-05, "loss": 2.5821, "mean_token_accuracy": 0.4034482777118683, "step": 18585 }, { "epoch": 0.01872404544131437, "grad_norm": 19.715752976807146, "learning_rate": 1.8723687126080213e-05, "loss": 2.6948, "mean_token_accuracy": 0.3807622492313385, "step": 18590 }, { "epoch": 0.018729081494418542, "grad_norm": 18.18784159170998, "learning_rate": 1.8728723082811273e-05, "loss": 2.6336, "mean_token_accuracy": 0.441379314661026, "step": 18595 }, { "epoch": 0.018734117547522716, "grad_norm": 19.16161870410463, "learning_rate": 1.8733759039542335e-05, "loss": 2.2031, "mean_token_accuracy": 0.4551724135875702, "step": 18600 }, { "epoch": 0.018739153600626886, "grad_norm": 23.118294381776135, "learning_rate": 1.8738794996273394e-05, "loss": 2.4119, "mean_token_accuracy": 0.47586206197738645, "step": 18605 }, { "epoch": 0.01874418965373106, "grad_norm": 17.85852863510062, "learning_rate": 1.874383095300445e-05, "loss": 2.6663, "mean_token_accuracy": 0.4103448212146759, "step": 18610 }, { "epoch": 0.018749225706835234, "grad_norm": 18.90992499439653, "learning_rate": 1.8748866909735513e-05, "loss": 2.3938, "mean_token_accuracy": 0.43103447556495667, "step": 18615 }, { "epoch": 0.018754261759939408, "grad_norm": 24.132692004407723, "learning_rate": 1.8753902866466572e-05, "loss": 2.3201, "mean_token_accuracy": 0.42068964838981626, "step": 18620 }, { "epoch": 0.018759297813043578, "grad_norm": 24.32289904119032, "learning_rate": 1.8758938823197635e-05, "loss": 2.4022, "mean_token_accuracy": 0.4471264362335205, "step": 18625 }, { "epoch": 0.018764333866147752, "grad_norm": 20.352546763306705, "learning_rate": 1.876397477992869e-05, "loss": 2.3471, "mean_token_accuracy": 0.4137930989265442, "step": 18630 }, { "epoch": 0.018769369919251926, "grad_norm": 20.132813035546505, "learning_rate": 1.876901073665975e-05, "loss": 2.2938, "mean_token_accuracy": 0.47931033968925474, "step": 18635 }, { "epoch": 0.018774405972356096, "grad_norm": 20.924008918925406, "learning_rate": 1.8774046693390813e-05, "loss": 2.7575, "mean_token_accuracy": 0.3517241388559341, "step": 18640 }, { "epoch": 0.01877944202546027, "grad_norm": 22.971882668830713, "learning_rate": 1.8779082650121872e-05, "loss": 2.9397, "mean_token_accuracy": 0.3655172407627106, "step": 18645 }, { "epoch": 0.018784478078564443, "grad_norm": 20.384143348866996, "learning_rate": 1.878411860685293e-05, "loss": 2.205, "mean_token_accuracy": 0.45172412395477296, "step": 18650 }, { "epoch": 0.018789514131668617, "grad_norm": 20.90598635261647, "learning_rate": 1.878915456358399e-05, "loss": 2.2686, "mean_token_accuracy": 0.44137930274009707, "step": 18655 }, { "epoch": 0.018794550184772787, "grad_norm": 18.391497744588737, "learning_rate": 1.879419052031505e-05, "loss": 2.4152, "mean_token_accuracy": 0.3862069010734558, "step": 18660 }, { "epoch": 0.01879958623787696, "grad_norm": 22.68095456033905, "learning_rate": 1.8799226477046112e-05, "loss": 2.353, "mean_token_accuracy": 0.4413793087005615, "step": 18665 }, { "epoch": 0.018804622290981135, "grad_norm": 27.357647063369004, "learning_rate": 1.880426243377717e-05, "loss": 2.6484, "mean_token_accuracy": 0.41724138855934145, "step": 18670 }, { "epoch": 0.018809658344085305, "grad_norm": 19.81846864631288, "learning_rate": 1.8809298390508227e-05, "loss": 2.4199, "mean_token_accuracy": 0.4448275864124298, "step": 18675 }, { "epoch": 0.01881469439718948, "grad_norm": 26.03359990295359, "learning_rate": 1.881433434723929e-05, "loss": 2.2594, "mean_token_accuracy": 0.40344828367233276, "step": 18680 }, { "epoch": 0.018819730450293653, "grad_norm": 19.701604642943927, "learning_rate": 1.881937030397035e-05, "loss": 2.3354, "mean_token_accuracy": 0.41724138259887694, "step": 18685 }, { "epoch": 0.018824766503397827, "grad_norm": 20.66082461782884, "learning_rate": 1.882440626070141e-05, "loss": 2.3089, "mean_token_accuracy": 0.42413793206214906, "step": 18690 }, { "epoch": 0.018829802556501997, "grad_norm": 24.63902668536542, "learning_rate": 1.8829442217432468e-05, "loss": 2.2573, "mean_token_accuracy": 0.4295220851898193, "step": 18695 }, { "epoch": 0.01883483860960617, "grad_norm": 30.656894829562354, "learning_rate": 1.8834478174163527e-05, "loss": 3.0471, "mean_token_accuracy": 0.41379310488700866, "step": 18700 }, { "epoch": 0.018839874662710344, "grad_norm": 20.04233789310212, "learning_rate": 1.883951413089459e-05, "loss": 2.3474, "mean_token_accuracy": 0.3827586203813553, "step": 18705 }, { "epoch": 0.018844910715814515, "grad_norm": 30.93024479269929, "learning_rate": 1.884455008762565e-05, "loss": 2.5824, "mean_token_accuracy": 0.3758620649576187, "step": 18710 }, { "epoch": 0.01884994676891869, "grad_norm": 20.213321690357215, "learning_rate": 1.8849586044356708e-05, "loss": 2.4802, "mean_token_accuracy": 0.46745312213897705, "step": 18715 }, { "epoch": 0.018854982822022862, "grad_norm": 28.4021018595473, "learning_rate": 1.8854622001087767e-05, "loss": 3.0217, "mean_token_accuracy": 0.38965517580509185, "step": 18720 }, { "epoch": 0.018860018875127036, "grad_norm": 30.664398158741548, "learning_rate": 1.8859657957818827e-05, "loss": 2.5236, "mean_token_accuracy": 0.42758620977401735, "step": 18725 }, { "epoch": 0.018865054928231206, "grad_norm": 18.888463410119773, "learning_rate": 1.8864693914549886e-05, "loss": 2.3902, "mean_token_accuracy": 0.44827587008476255, "step": 18730 }, { "epoch": 0.01887009098133538, "grad_norm": 23.481759982635758, "learning_rate": 1.886972987128095e-05, "loss": 2.3706, "mean_token_accuracy": 0.3862068891525269, "step": 18735 }, { "epoch": 0.018875127034439554, "grad_norm": 24.201531574535828, "learning_rate": 1.8874765828012004e-05, "loss": 2.4099, "mean_token_accuracy": 0.43103447556495667, "step": 18740 }, { "epoch": 0.018880163087543724, "grad_norm": 17.2269666441305, "learning_rate": 1.8879801784743067e-05, "loss": 2.0047, "mean_token_accuracy": 0.47586206793785096, "step": 18745 }, { "epoch": 0.018885199140647898, "grad_norm": 37.174245614286264, "learning_rate": 1.8884837741474126e-05, "loss": 2.6555, "mean_token_accuracy": 0.4050211727619171, "step": 18750 }, { "epoch": 0.01889023519375207, "grad_norm": 17.887613203658898, "learning_rate": 1.8889873698205186e-05, "loss": 2.462, "mean_token_accuracy": 0.4517241358757019, "step": 18755 }, { "epoch": 0.018895271246856245, "grad_norm": 22.39999922272541, "learning_rate": 1.8894909654936245e-05, "loss": 2.1091, "mean_token_accuracy": 0.5137931108474731, "step": 18760 }, { "epoch": 0.018900307299960416, "grad_norm": 29.83249442873979, "learning_rate": 1.8899945611667304e-05, "loss": 2.4745, "mean_token_accuracy": 0.39655172228813174, "step": 18765 }, { "epoch": 0.01890534335306459, "grad_norm": 23.89755745016707, "learning_rate": 1.8904981568398363e-05, "loss": 2.7878, "mean_token_accuracy": 0.3655172407627106, "step": 18770 }, { "epoch": 0.018910379406168763, "grad_norm": 20.68289422700322, "learning_rate": 1.8910017525129426e-05, "loss": 2.4851, "mean_token_accuracy": 0.3999999940395355, "step": 18775 }, { "epoch": 0.018915415459272934, "grad_norm": 18.21338794531444, "learning_rate": 1.8915053481860485e-05, "loss": 2.2646, "mean_token_accuracy": 0.4896551609039307, "step": 18780 }, { "epoch": 0.018920451512377107, "grad_norm": 22.000402602411857, "learning_rate": 1.8920089438591544e-05, "loss": 2.3004, "mean_token_accuracy": 0.48275861144065857, "step": 18785 }, { "epoch": 0.01892548756548128, "grad_norm": 22.879675315635378, "learning_rate": 1.8925125395322604e-05, "loss": 2.5838, "mean_token_accuracy": 0.41379310488700866, "step": 18790 }, { "epoch": 0.018930523618585455, "grad_norm": 23.42500362417901, "learning_rate": 1.8930161352053663e-05, "loss": 2.7684, "mean_token_accuracy": 0.3482758641242981, "step": 18795 }, { "epoch": 0.018935559671689625, "grad_norm": 23.188705991115167, "learning_rate": 1.8935197308784726e-05, "loss": 2.3294, "mean_token_accuracy": 0.42068964838981626, "step": 18800 }, { "epoch": 0.0189405957247938, "grad_norm": 20.350090717460624, "learning_rate": 1.8940233265515785e-05, "loss": 2.4954, "mean_token_accuracy": 0.43793103098869324, "step": 18805 }, { "epoch": 0.018945631777897973, "grad_norm": 20.2595044789961, "learning_rate": 1.894526922224684e-05, "loss": 2.5942, "mean_token_accuracy": 0.400369456410408, "step": 18810 }, { "epoch": 0.018950667831002143, "grad_norm": 31.09178734581473, "learning_rate": 1.8950305178977903e-05, "loss": 2.6437, "mean_token_accuracy": 0.33448275923728943, "step": 18815 }, { "epoch": 0.018955703884106317, "grad_norm": 24.66167751656284, "learning_rate": 1.8955341135708963e-05, "loss": 2.1062, "mean_token_accuracy": 0.4344827592372894, "step": 18820 }, { "epoch": 0.01896073993721049, "grad_norm": 20.15465544497304, "learning_rate": 1.8960377092440025e-05, "loss": 2.4724, "mean_token_accuracy": 0.41034482717514037, "step": 18825 }, { "epoch": 0.018965775990314664, "grad_norm": 29.108649941448636, "learning_rate": 1.896541304917108e-05, "loss": 2.5572, "mean_token_accuracy": 0.42413793206214906, "step": 18830 }, { "epoch": 0.018970812043418835, "grad_norm": 16.53741994988761, "learning_rate": 1.897044900590214e-05, "loss": 2.591, "mean_token_accuracy": 0.4068965554237366, "step": 18835 }, { "epoch": 0.01897584809652301, "grad_norm": 20.388984480647625, "learning_rate": 1.8975484962633203e-05, "loss": 2.6244, "mean_token_accuracy": 0.441379314661026, "step": 18840 }, { "epoch": 0.018980884149627182, "grad_norm": 23.652446278266744, "learning_rate": 1.8980520919364262e-05, "loss": 2.4188, "mean_token_accuracy": 0.4448275864124298, "step": 18845 }, { "epoch": 0.018985920202731352, "grad_norm": 19.742141238148445, "learning_rate": 1.898555687609532e-05, "loss": 2.4013, "mean_token_accuracy": 0.4034482777118683, "step": 18850 }, { "epoch": 0.018990956255835526, "grad_norm": 20.60123636793245, "learning_rate": 1.899059283282638e-05, "loss": 2.0559, "mean_token_accuracy": 0.5206896603107453, "step": 18855 }, { "epoch": 0.0189959923089397, "grad_norm": 21.468944623244187, "learning_rate": 1.899562878955744e-05, "loss": 2.3469, "mean_token_accuracy": 0.4896551728248596, "step": 18860 }, { "epoch": 0.019001028362043874, "grad_norm": 17.353235706003375, "learning_rate": 1.9000664746288503e-05, "loss": 2.0922, "mean_token_accuracy": 0.49655171632766726, "step": 18865 }, { "epoch": 0.019006064415148044, "grad_norm": 28.14462167623131, "learning_rate": 1.9005700703019562e-05, "loss": 3.0209, "mean_token_accuracy": 0.32413792610168457, "step": 18870 }, { "epoch": 0.019011100468252218, "grad_norm": 18.975555536316858, "learning_rate": 1.9010736659750618e-05, "loss": 2.0986, "mean_token_accuracy": 0.4862069010734558, "step": 18875 }, { "epoch": 0.01901613652135639, "grad_norm": 22.76994866800488, "learning_rate": 1.901577261648168e-05, "loss": 2.411, "mean_token_accuracy": 0.4758620738983154, "step": 18880 }, { "epoch": 0.019021172574460562, "grad_norm": 20.948865374630078, "learning_rate": 1.902080857321274e-05, "loss": 2.4945, "mean_token_accuracy": 0.4068965494632721, "step": 18885 }, { "epoch": 0.019026208627564736, "grad_norm": 30.26253072264103, "learning_rate": 1.9025844529943802e-05, "loss": 2.855, "mean_token_accuracy": 0.379310342669487, "step": 18890 }, { "epoch": 0.01903124468066891, "grad_norm": 25.749921093712405, "learning_rate": 1.9030880486674858e-05, "loss": 2.3108, "mean_token_accuracy": 0.4413793087005615, "step": 18895 }, { "epoch": 0.019036280733773083, "grad_norm": 24.526414610094402, "learning_rate": 1.9035916443405917e-05, "loss": 2.363, "mean_token_accuracy": 0.41379310488700866, "step": 18900 }, { "epoch": 0.019041316786877253, "grad_norm": 25.657819075648135, "learning_rate": 1.904095240013698e-05, "loss": 2.2318, "mean_token_accuracy": 0.4620689690113068, "step": 18905 }, { "epoch": 0.019046352839981427, "grad_norm": 28.70816140072034, "learning_rate": 1.904598835686804e-05, "loss": 2.6287, "mean_token_accuracy": 0.3827586233615875, "step": 18910 }, { "epoch": 0.0190513888930856, "grad_norm": 34.635211628022624, "learning_rate": 1.90510243135991e-05, "loss": 2.5462, "mean_token_accuracy": 0.4413793087005615, "step": 18915 }, { "epoch": 0.01905642494618977, "grad_norm": 18.94225028225776, "learning_rate": 1.9056060270330158e-05, "loss": 2.084, "mean_token_accuracy": 0.4689655125141144, "step": 18920 }, { "epoch": 0.019061460999293945, "grad_norm": 18.62035792038576, "learning_rate": 1.9061096227061217e-05, "loss": 2.6458, "mean_token_accuracy": 0.3896551728248596, "step": 18925 }, { "epoch": 0.01906649705239812, "grad_norm": 34.229517364856164, "learning_rate": 1.906613218379228e-05, "loss": 2.6708, "mean_token_accuracy": 0.4517241418361664, "step": 18930 }, { "epoch": 0.019071533105502293, "grad_norm": 25.769185841767303, "learning_rate": 1.907116814052334e-05, "loss": 2.6084, "mean_token_accuracy": 0.3827586233615875, "step": 18935 }, { "epoch": 0.019076569158606463, "grad_norm": 22.22799869039467, "learning_rate": 1.9076204097254395e-05, "loss": 2.2299, "mean_token_accuracy": 0.4034482777118683, "step": 18940 }, { "epoch": 0.019081605211710637, "grad_norm": 25.64921687545923, "learning_rate": 1.9081240053985457e-05, "loss": 2.3967, "mean_token_accuracy": 0.42068966031074523, "step": 18945 }, { "epoch": 0.01908664126481481, "grad_norm": 28.145675705745326, "learning_rate": 1.9086276010716517e-05, "loss": 2.4984, "mean_token_accuracy": 0.42758620381355283, "step": 18950 }, { "epoch": 0.01909167731791898, "grad_norm": 23.058502847850253, "learning_rate": 1.9091311967447576e-05, "loss": 2.5672, "mean_token_accuracy": 0.4344827651977539, "step": 18955 }, { "epoch": 0.019096713371023154, "grad_norm": 21.459581212472585, "learning_rate": 1.909634792417864e-05, "loss": 2.4628, "mean_token_accuracy": 0.3965517163276672, "step": 18960 }, { "epoch": 0.019101749424127328, "grad_norm": 31.939235127973767, "learning_rate": 1.9101383880909694e-05, "loss": 2.4405, "mean_token_accuracy": 0.4206896543502808, "step": 18965 }, { "epoch": 0.019106785477231502, "grad_norm": 20.528203750990738, "learning_rate": 1.9106419837640757e-05, "loss": 2.2257, "mean_token_accuracy": 0.4620689630508423, "step": 18970 }, { "epoch": 0.019111821530335672, "grad_norm": 21.453631207307506, "learning_rate": 1.9111455794371816e-05, "loss": 2.3287, "mean_token_accuracy": 0.44482759237289426, "step": 18975 }, { "epoch": 0.019116857583439846, "grad_norm": 64.85160328271458, "learning_rate": 1.9116491751102876e-05, "loss": 2.497, "mean_token_accuracy": 0.4448275864124298, "step": 18980 }, { "epoch": 0.01912189363654402, "grad_norm": 24.049164964505476, "learning_rate": 1.9121527707833935e-05, "loss": 2.3524, "mean_token_accuracy": 0.441379314661026, "step": 18985 }, { "epoch": 0.01912692968964819, "grad_norm": 23.076237318845802, "learning_rate": 1.9126563664564994e-05, "loss": 2.6937, "mean_token_accuracy": 0.3896551728248596, "step": 18990 }, { "epoch": 0.019131965742752364, "grad_norm": 24.031773041287398, "learning_rate": 1.9131599621296053e-05, "loss": 2.3618, "mean_token_accuracy": 0.4034482717514038, "step": 18995 }, { "epoch": 0.019137001795856538, "grad_norm": 24.385140679476745, "learning_rate": 1.9136635578027116e-05, "loss": 2.3517, "mean_token_accuracy": 0.4275861978530884, "step": 19000 }, { "epoch": 0.01914203784896071, "grad_norm": 21.35885024541804, "learning_rate": 1.9141671534758175e-05, "loss": 2.2411, "mean_token_accuracy": 0.38965516686439516, "step": 19005 }, { "epoch": 0.01914707390206488, "grad_norm": 21.07186337358999, "learning_rate": 1.9146707491489235e-05, "loss": 2.0067, "mean_token_accuracy": 0.5021173536777497, "step": 19010 }, { "epoch": 0.019152109955169055, "grad_norm": 18.488088610319316, "learning_rate": 1.9151743448220294e-05, "loss": 2.7663, "mean_token_accuracy": 0.38620689511299133, "step": 19015 }, { "epoch": 0.01915714600827323, "grad_norm": 17.356027015500718, "learning_rate": 1.9156779404951353e-05, "loss": 2.0081, "mean_token_accuracy": 0.48694581389427183, "step": 19020 }, { "epoch": 0.0191621820613774, "grad_norm": 20.176422967642363, "learning_rate": 1.9161815361682416e-05, "loss": 2.5009, "mean_token_accuracy": 0.33448275923728943, "step": 19025 }, { "epoch": 0.019167218114481573, "grad_norm": 19.321236228404917, "learning_rate": 1.916685131841347e-05, "loss": 2.3869, "mean_token_accuracy": 0.4310344815254211, "step": 19030 }, { "epoch": 0.019172254167585747, "grad_norm": 20.58533999038996, "learning_rate": 1.917188727514453e-05, "loss": 2.5406, "mean_token_accuracy": 0.4000000059604645, "step": 19035 }, { "epoch": 0.01917729022068992, "grad_norm": 23.345554869767714, "learning_rate": 1.9176923231875593e-05, "loss": 2.3113, "mean_token_accuracy": 0.48275862336158754, "step": 19040 }, { "epoch": 0.01918232627379409, "grad_norm": 25.62405401274355, "learning_rate": 1.9181959188606653e-05, "loss": 2.6555, "mean_token_accuracy": 0.382758629322052, "step": 19045 }, { "epoch": 0.019187362326898265, "grad_norm": 22.634158233986092, "learning_rate": 1.9186995145337712e-05, "loss": 2.3265, "mean_token_accuracy": 0.44827585816383364, "step": 19050 }, { "epoch": 0.01919239838000244, "grad_norm": 19.071023605774617, "learning_rate": 1.919203110206877e-05, "loss": 2.5948, "mean_token_accuracy": 0.4344827592372894, "step": 19055 }, { "epoch": 0.01919743443310661, "grad_norm": 30.103983955026525, "learning_rate": 1.919706705879983e-05, "loss": 2.465, "mean_token_accuracy": 0.41034482717514037, "step": 19060 }, { "epoch": 0.019202470486210783, "grad_norm": 23.749913222399545, "learning_rate": 1.9202103015530893e-05, "loss": 2.7265, "mean_token_accuracy": 0.4275862157344818, "step": 19065 }, { "epoch": 0.019207506539314956, "grad_norm": 28.846360838484934, "learning_rate": 1.9207138972261952e-05, "loss": 2.5446, "mean_token_accuracy": 0.4379310250282288, "step": 19070 }, { "epoch": 0.01921254259241913, "grad_norm": 21.115389956683682, "learning_rate": 1.9212174928993008e-05, "loss": 2.6631, "mean_token_accuracy": 0.3827586233615875, "step": 19075 }, { "epoch": 0.0192175786455233, "grad_norm": 27.022949371441303, "learning_rate": 1.921721088572407e-05, "loss": 2.7361, "mean_token_accuracy": 0.3793103516101837, "step": 19080 }, { "epoch": 0.019222614698627474, "grad_norm": 28.72932577451179, "learning_rate": 1.922224684245513e-05, "loss": 2.4192, "mean_token_accuracy": 0.39310344457626345, "step": 19085 }, { "epoch": 0.019227650751731648, "grad_norm": 21.33406037284613, "learning_rate": 1.9227282799186193e-05, "loss": 2.7183, "mean_token_accuracy": 0.334482753276825, "step": 19090 }, { "epoch": 0.01923268680483582, "grad_norm": 23.305160789540917, "learning_rate": 1.923231875591725e-05, "loss": 2.399, "mean_token_accuracy": 0.43793103098869324, "step": 19095 }, { "epoch": 0.019237722857939992, "grad_norm": 34.73831446218085, "learning_rate": 1.9237354712648308e-05, "loss": 2.7472, "mean_token_accuracy": 0.38620689511299133, "step": 19100 }, { "epoch": 0.019242758911044166, "grad_norm": 21.614758806338134, "learning_rate": 1.924239066937937e-05, "loss": 2.4642, "mean_token_accuracy": 0.3931034505367279, "step": 19105 }, { "epoch": 0.01924779496414834, "grad_norm": 25.36006241470092, "learning_rate": 1.924742662611043e-05, "loss": 2.3311, "mean_token_accuracy": 0.4344827651977539, "step": 19110 }, { "epoch": 0.01925283101725251, "grad_norm": 18.226525431525708, "learning_rate": 1.925246258284149e-05, "loss": 2.2973, "mean_token_accuracy": 0.4620689690113068, "step": 19115 }, { "epoch": 0.019257867070356684, "grad_norm": 24.492660683601795, "learning_rate": 1.9257498539572548e-05, "loss": 2.135, "mean_token_accuracy": 0.4448275864124298, "step": 19120 }, { "epoch": 0.019262903123460857, "grad_norm": 21.68615301058674, "learning_rate": 1.9262534496303608e-05, "loss": 2.3263, "mean_token_accuracy": 0.4586206912994385, "step": 19125 }, { "epoch": 0.019267939176565028, "grad_norm": 23.94939291291365, "learning_rate": 1.926757045303467e-05, "loss": 2.3115, "mean_token_accuracy": 0.4517241418361664, "step": 19130 }, { "epoch": 0.0192729752296692, "grad_norm": 21.485230371257042, "learning_rate": 1.927260640976573e-05, "loss": 2.4975, "mean_token_accuracy": 0.42068964838981626, "step": 19135 }, { "epoch": 0.019278011282773375, "grad_norm": 21.19594214676024, "learning_rate": 1.927764236649679e-05, "loss": 2.5566, "mean_token_accuracy": 0.36896551847457887, "step": 19140 }, { "epoch": 0.01928304733587755, "grad_norm": 25.499974404046863, "learning_rate": 1.9282678323227848e-05, "loss": 2.8967, "mean_token_accuracy": 0.3620689630508423, "step": 19145 }, { "epoch": 0.01928808338898172, "grad_norm": 20.904588169592742, "learning_rate": 1.9287714279958907e-05, "loss": 2.4021, "mean_token_accuracy": 0.4504537105560303, "step": 19150 }, { "epoch": 0.019293119442085893, "grad_norm": 22.25344689644324, "learning_rate": 1.9292750236689966e-05, "loss": 3.0055, "mean_token_accuracy": 0.3482758641242981, "step": 19155 }, { "epoch": 0.019298155495190067, "grad_norm": 20.132923013800138, "learning_rate": 1.929778619342103e-05, "loss": 2.0817, "mean_token_accuracy": 0.47931034564971925, "step": 19160 }, { "epoch": 0.019303191548294237, "grad_norm": 21.354640319349592, "learning_rate": 1.9302822150152085e-05, "loss": 2.1367, "mean_token_accuracy": 0.4517241299152374, "step": 19165 }, { "epoch": 0.01930822760139841, "grad_norm": 19.3046963605477, "learning_rate": 1.9307858106883148e-05, "loss": 2.5955, "mean_token_accuracy": 0.3862068891525269, "step": 19170 }, { "epoch": 0.019313263654502585, "grad_norm": 22.097695379788654, "learning_rate": 1.9312894063614207e-05, "loss": 2.2049, "mean_token_accuracy": 0.4724137902259827, "step": 19175 }, { "epoch": 0.01931829970760676, "grad_norm": 22.31561560021954, "learning_rate": 1.9317930020345266e-05, "loss": 2.7319, "mean_token_accuracy": 0.41034482717514037, "step": 19180 }, { "epoch": 0.01932333576071093, "grad_norm": 26.417886084764117, "learning_rate": 1.9322965977076325e-05, "loss": 2.329, "mean_token_accuracy": 0.46896551847457885, "step": 19185 }, { "epoch": 0.019328371813815103, "grad_norm": 19.866357520399784, "learning_rate": 1.9328001933807385e-05, "loss": 2.7874, "mean_token_accuracy": 0.33103448152542114, "step": 19190 }, { "epoch": 0.019333407866919276, "grad_norm": 18.360339379830954, "learning_rate": 1.9333037890538444e-05, "loss": 2.3254, "mean_token_accuracy": 0.4448275864124298, "step": 19195 }, { "epoch": 0.019338443920023447, "grad_norm": 25.473783650744327, "learning_rate": 1.9338073847269506e-05, "loss": 2.3748, "mean_token_accuracy": 0.43448275327682495, "step": 19200 }, { "epoch": 0.01934347997312762, "grad_norm": 22.726667607811105, "learning_rate": 1.9343109804000566e-05, "loss": 2.4949, "mean_token_accuracy": 0.4, "step": 19205 }, { "epoch": 0.019348516026231794, "grad_norm": 21.143753892616008, "learning_rate": 1.9348145760731625e-05, "loss": 2.4894, "mean_token_accuracy": 0.38275861740112305, "step": 19210 }, { "epoch": 0.019353552079335968, "grad_norm": 24.746164793535748, "learning_rate": 1.9353181717462684e-05, "loss": 2.324, "mean_token_accuracy": 0.43103447556495667, "step": 19215 }, { "epoch": 0.019358588132440138, "grad_norm": 21.17050979255156, "learning_rate": 1.9358217674193743e-05, "loss": 2.6149, "mean_token_accuracy": 0.3999999940395355, "step": 19220 }, { "epoch": 0.019363624185544312, "grad_norm": 23.853296853406036, "learning_rate": 1.9363253630924806e-05, "loss": 2.6862, "mean_token_accuracy": 0.3862068891525269, "step": 19225 }, { "epoch": 0.019368660238648486, "grad_norm": 27.02083369670501, "learning_rate": 1.9368289587655862e-05, "loss": 2.5642, "mean_token_accuracy": 0.3965517282485962, "step": 19230 }, { "epoch": 0.019373696291752656, "grad_norm": 22.91106839604228, "learning_rate": 1.9373325544386925e-05, "loss": 2.2112, "mean_token_accuracy": 0.4448275864124298, "step": 19235 }, { "epoch": 0.01937873234485683, "grad_norm": 28.464968452712085, "learning_rate": 1.9378361501117984e-05, "loss": 2.6346, "mean_token_accuracy": 0.39655172228813174, "step": 19240 }, { "epoch": 0.019383768397961004, "grad_norm": 20.30334656201811, "learning_rate": 1.9383397457849043e-05, "loss": 2.3556, "mean_token_accuracy": 0.41379310488700866, "step": 19245 }, { "epoch": 0.019388804451065174, "grad_norm": 23.40595938767292, "learning_rate": 1.9388433414580102e-05, "loss": 2.2606, "mean_token_accuracy": 0.4241379380226135, "step": 19250 }, { "epoch": 0.019393840504169348, "grad_norm": 35.80985472161722, "learning_rate": 1.939346937131116e-05, "loss": 2.7535, "mean_token_accuracy": 0.4103448331356049, "step": 19255 }, { "epoch": 0.01939887655727352, "grad_norm": 25.64783571019983, "learning_rate": 1.939850532804222e-05, "loss": 2.456, "mean_token_accuracy": 0.40344828367233276, "step": 19260 }, { "epoch": 0.019403912610377695, "grad_norm": 22.055052769733614, "learning_rate": 1.9403541284773284e-05, "loss": 2.5328, "mean_token_accuracy": 0.42413793206214906, "step": 19265 }, { "epoch": 0.019408948663481865, "grad_norm": 30.061260574863393, "learning_rate": 1.9408577241504343e-05, "loss": 2.3038, "mean_token_accuracy": 0.43103448748588563, "step": 19270 }, { "epoch": 0.01941398471658604, "grad_norm": 23.492195740144325, "learning_rate": 1.9413613198235402e-05, "loss": 2.418, "mean_token_accuracy": 0.4571082890033722, "step": 19275 }, { "epoch": 0.019419020769690213, "grad_norm": 23.17806247370198, "learning_rate": 1.941864915496646e-05, "loss": 2.2688, "mean_token_accuracy": 0.46551724076271056, "step": 19280 }, { "epoch": 0.019424056822794383, "grad_norm": 25.987372865088407, "learning_rate": 1.942368511169752e-05, "loss": 2.4035, "mean_token_accuracy": 0.4395644307136536, "step": 19285 }, { "epoch": 0.019429092875898557, "grad_norm": 18.559552602524132, "learning_rate": 1.9428721068428583e-05, "loss": 2.6534, "mean_token_accuracy": 0.3931034505367279, "step": 19290 }, { "epoch": 0.01943412892900273, "grad_norm": 23.03042873979043, "learning_rate": 1.943375702515964e-05, "loss": 2.5958, "mean_token_accuracy": 0.4206896424293518, "step": 19295 }, { "epoch": 0.019439164982106905, "grad_norm": 24.926087738102524, "learning_rate": 1.94387929818907e-05, "loss": 2.3143, "mean_token_accuracy": 0.40689654350280763, "step": 19300 }, { "epoch": 0.019444201035211075, "grad_norm": 21.260535807099927, "learning_rate": 1.944382893862176e-05, "loss": 2.5112, "mean_token_accuracy": 0.4068965554237366, "step": 19305 }, { "epoch": 0.01944923708831525, "grad_norm": 20.591896190660602, "learning_rate": 1.944886489535282e-05, "loss": 2.328, "mean_token_accuracy": 0.4482758641242981, "step": 19310 }, { "epoch": 0.019454273141419422, "grad_norm": 20.233529813523838, "learning_rate": 1.945390085208388e-05, "loss": 2.2581, "mean_token_accuracy": 0.48523896336555483, "step": 19315 }, { "epoch": 0.019459309194523593, "grad_norm": 18.27460801311187, "learning_rate": 1.945893680881494e-05, "loss": 2.7165, "mean_token_accuracy": 0.4034482777118683, "step": 19320 }, { "epoch": 0.019464345247627766, "grad_norm": 21.45706905987862, "learning_rate": 1.9463972765545998e-05, "loss": 2.7051, "mean_token_accuracy": 0.3999999940395355, "step": 19325 }, { "epoch": 0.01946938130073194, "grad_norm": 22.056454640252664, "learning_rate": 1.946900872227706e-05, "loss": 2.4943, "mean_token_accuracy": 0.44482759237289426, "step": 19330 }, { "epoch": 0.019474417353836114, "grad_norm": 26.35043528970658, "learning_rate": 1.947404467900812e-05, "loss": 2.816, "mean_token_accuracy": 0.3862069010734558, "step": 19335 }, { "epoch": 0.019479453406940284, "grad_norm": 29.277167580017586, "learning_rate": 1.947908063573918e-05, "loss": 2.3923, "mean_token_accuracy": 0.4068965554237366, "step": 19340 }, { "epoch": 0.019484489460044458, "grad_norm": 22.660510664728218, "learning_rate": 1.948411659247024e-05, "loss": 2.5806, "mean_token_accuracy": 0.40344828367233276, "step": 19345 }, { "epoch": 0.019489525513148632, "grad_norm": 16.08706833776135, "learning_rate": 1.9489152549201298e-05, "loss": 2.2493, "mean_token_accuracy": 0.4571082890033722, "step": 19350 }, { "epoch": 0.019494561566252802, "grad_norm": 26.745840492307664, "learning_rate": 1.949418850593236e-05, "loss": 2.763, "mean_token_accuracy": 0.39999998807907106, "step": 19355 }, { "epoch": 0.019499597619356976, "grad_norm": 20.477973285395915, "learning_rate": 1.949922446266342e-05, "loss": 2.6096, "mean_token_accuracy": 0.4, "step": 19360 }, { "epoch": 0.01950463367246115, "grad_norm": 17.145100040157523, "learning_rate": 1.9504260419394475e-05, "loss": 2.2553, "mean_token_accuracy": 0.4379310369491577, "step": 19365 }, { "epoch": 0.019509669725565323, "grad_norm": 25.0147791770493, "learning_rate": 1.9509296376125538e-05, "loss": 2.0779, "mean_token_accuracy": 0.5367211163043976, "step": 19370 }, { "epoch": 0.019514705778669494, "grad_norm": 25.196541997571952, "learning_rate": 1.9514332332856597e-05, "loss": 2.3969, "mean_token_accuracy": 0.4206896543502808, "step": 19375 }, { "epoch": 0.019519741831773667, "grad_norm": 24.755288927952, "learning_rate": 1.9519368289587657e-05, "loss": 2.3844, "mean_token_accuracy": 0.42232305407524107, "step": 19380 }, { "epoch": 0.01952477788487784, "grad_norm": 19.04459836152872, "learning_rate": 1.9524404246318716e-05, "loss": 2.3192, "mean_token_accuracy": 0.43103448748588563, "step": 19385 }, { "epoch": 0.01952981393798201, "grad_norm": 21.558449894243214, "learning_rate": 1.9529440203049775e-05, "loss": 2.1703, "mean_token_accuracy": 0.4413793087005615, "step": 19390 }, { "epoch": 0.019534849991086185, "grad_norm": 19.781588443586138, "learning_rate": 1.9534476159780838e-05, "loss": 2.3806, "mean_token_accuracy": 0.43793103098869324, "step": 19395 }, { "epoch": 0.01953988604419036, "grad_norm": 23.29396094065663, "learning_rate": 1.9539512116511897e-05, "loss": 2.5847, "mean_token_accuracy": 0.33793103098869326, "step": 19400 }, { "epoch": 0.019544922097294533, "grad_norm": 20.581060755594958, "learning_rate": 1.9544548073242956e-05, "loss": 2.4807, "mean_token_accuracy": 0.4344827651977539, "step": 19405 }, { "epoch": 0.019549958150398703, "grad_norm": 23.24665183560334, "learning_rate": 1.9549584029974015e-05, "loss": 2.7321, "mean_token_accuracy": 0.35862069129943847, "step": 19410 }, { "epoch": 0.019554994203502877, "grad_norm": 23.020710398695247, "learning_rate": 1.9554619986705075e-05, "loss": 2.3401, "mean_token_accuracy": 0.4448275864124298, "step": 19415 }, { "epoch": 0.01956003025660705, "grad_norm": 23.059783685177063, "learning_rate": 1.9559655943436134e-05, "loss": 2.2026, "mean_token_accuracy": 0.41034482717514037, "step": 19420 }, { "epoch": 0.01956506630971122, "grad_norm": 18.711438378302265, "learning_rate": 1.9564691900167197e-05, "loss": 2.2146, "mean_token_accuracy": 0.44482758045196535, "step": 19425 }, { "epoch": 0.019570102362815395, "grad_norm": 22.450052771439, "learning_rate": 1.9569727856898252e-05, "loss": 2.4209, "mean_token_accuracy": 0.42589232325553894, "step": 19430 }, { "epoch": 0.01957513841591957, "grad_norm": 22.64552852095471, "learning_rate": 1.9574763813629315e-05, "loss": 2.2356, "mean_token_accuracy": 0.4448275864124298, "step": 19435 }, { "epoch": 0.019580174469023742, "grad_norm": 18.9687169723688, "learning_rate": 1.9579799770360374e-05, "loss": 3.0165, "mean_token_accuracy": 0.417241370677948, "step": 19440 }, { "epoch": 0.019585210522127913, "grad_norm": 22.678322909084194, "learning_rate": 1.9584835727091434e-05, "loss": 2.5673, "mean_token_accuracy": 0.38965517580509185, "step": 19445 }, { "epoch": 0.019590246575232086, "grad_norm": 44.06056001444506, "learning_rate": 1.9589871683822493e-05, "loss": 2.4834, "mean_token_accuracy": 0.3827586233615875, "step": 19450 }, { "epoch": 0.01959528262833626, "grad_norm": 25.35535799068325, "learning_rate": 1.9594907640553552e-05, "loss": 2.4644, "mean_token_accuracy": 0.4448275864124298, "step": 19455 }, { "epoch": 0.01960031868144043, "grad_norm": 20.428422659854686, "learning_rate": 1.959994359728461e-05, "loss": 2.5562, "mean_token_accuracy": 0.3931034505367279, "step": 19460 }, { "epoch": 0.019605354734544604, "grad_norm": 21.926657100495863, "learning_rate": 1.9604979554015674e-05, "loss": 2.4568, "mean_token_accuracy": 0.37241379022598264, "step": 19465 }, { "epoch": 0.019610390787648778, "grad_norm": 18.571065640052613, "learning_rate": 1.9610015510746733e-05, "loss": 2.1219, "mean_token_accuracy": 0.5000000119209289, "step": 19470 }, { "epoch": 0.01961542684075295, "grad_norm": 19.99694544236865, "learning_rate": 1.9615051467477792e-05, "loss": 2.5688, "mean_token_accuracy": 0.4413793087005615, "step": 19475 }, { "epoch": 0.019620462893857122, "grad_norm": 23.29830442478529, "learning_rate": 1.9620087424208852e-05, "loss": 2.6918, "mean_token_accuracy": 0.3758620619773865, "step": 19480 }, { "epoch": 0.019625498946961296, "grad_norm": 28.014318131290587, "learning_rate": 1.962512338093991e-05, "loss": 2.7607, "mean_token_accuracy": 0.3965517282485962, "step": 19485 }, { "epoch": 0.01963053500006547, "grad_norm": 24.013684902351578, "learning_rate": 1.9630159337670974e-05, "loss": 2.4681, "mean_token_accuracy": 0.38620689511299133, "step": 19490 }, { "epoch": 0.01963557105316964, "grad_norm": 19.9055592544922, "learning_rate": 1.9635195294402033e-05, "loss": 2.2672, "mean_token_accuracy": 0.4379310369491577, "step": 19495 }, { "epoch": 0.019640607106273814, "grad_norm": 22.794552668528056, "learning_rate": 1.964023125113309e-05, "loss": 2.4731, "mean_token_accuracy": 0.39655172526836396, "step": 19500 }, { "epoch": 0.019645643159377987, "grad_norm": 26.673363214235547, "learning_rate": 1.964526720786415e-05, "loss": 2.5865, "mean_token_accuracy": 0.40163338780403135, "step": 19505 }, { "epoch": 0.01965067921248216, "grad_norm": 23.1969981139211, "learning_rate": 1.965030316459521e-05, "loss": 2.619, "mean_token_accuracy": 0.35862069129943847, "step": 19510 }, { "epoch": 0.01965571526558633, "grad_norm": 20.98372477459728, "learning_rate": 1.9655339121326273e-05, "loss": 2.4192, "mean_token_accuracy": 0.43793103098869324, "step": 19515 }, { "epoch": 0.019660751318690505, "grad_norm": 18.775265012060057, "learning_rate": 1.966037507805733e-05, "loss": 2.2992, "mean_token_accuracy": 0.4379310369491577, "step": 19520 }, { "epoch": 0.01966578737179468, "grad_norm": 20.382438064492526, "learning_rate": 1.966541103478839e-05, "loss": 2.4878, "mean_token_accuracy": 0.3999999940395355, "step": 19525 }, { "epoch": 0.01967082342489885, "grad_norm": 22.513270262911472, "learning_rate": 1.967044699151945e-05, "loss": 2.6284, "mean_token_accuracy": 0.43629764318466185, "step": 19530 }, { "epoch": 0.019675859478003023, "grad_norm": 21.774391755695813, "learning_rate": 1.967548294825051e-05, "loss": 2.4784, "mean_token_accuracy": 0.4275862157344818, "step": 19535 }, { "epoch": 0.019680895531107197, "grad_norm": 22.771118313889335, "learning_rate": 1.968051890498157e-05, "loss": 2.654, "mean_token_accuracy": 0.36896551847457887, "step": 19540 }, { "epoch": 0.01968593158421137, "grad_norm": 21.313805036033248, "learning_rate": 1.968555486171263e-05, "loss": 2.2671, "mean_token_accuracy": 0.4896551609039307, "step": 19545 }, { "epoch": 0.01969096763731554, "grad_norm": 28.38610331227566, "learning_rate": 1.9690590818443688e-05, "loss": 2.5704, "mean_token_accuracy": 0.3655172407627106, "step": 19550 }, { "epoch": 0.019696003690419715, "grad_norm": 21.180729103172418, "learning_rate": 1.969562677517475e-05, "loss": 2.2293, "mean_token_accuracy": 0.4068965494632721, "step": 19555 }, { "epoch": 0.01970103974352389, "grad_norm": 24.115690333598778, "learning_rate": 1.970066273190581e-05, "loss": 2.5892, "mean_token_accuracy": 0.43103447556495667, "step": 19560 }, { "epoch": 0.01970607579662806, "grad_norm": 25.41951153973096, "learning_rate": 1.9705698688636866e-05, "loss": 2.4177, "mean_token_accuracy": 0.4678161025047302, "step": 19565 }, { "epoch": 0.019711111849732232, "grad_norm": 22.571575768541788, "learning_rate": 1.971073464536793e-05, "loss": 2.3521, "mean_token_accuracy": 0.3965517282485962, "step": 19570 }, { "epoch": 0.019716147902836406, "grad_norm": 25.122398212189243, "learning_rate": 1.9715770602098988e-05, "loss": 2.429, "mean_token_accuracy": 0.4137930989265442, "step": 19575 }, { "epoch": 0.01972118395594058, "grad_norm": 18.59421859271088, "learning_rate": 1.9720806558830047e-05, "loss": 2.299, "mean_token_accuracy": 0.4379310369491577, "step": 19580 }, { "epoch": 0.01972622000904475, "grad_norm": 30.89035485234619, "learning_rate": 1.9725842515561106e-05, "loss": 2.159, "mean_token_accuracy": 0.4488916337490082, "step": 19585 }, { "epoch": 0.019731256062148924, "grad_norm": 22.616968370618622, "learning_rate": 1.9730878472292165e-05, "loss": 2.6236, "mean_token_accuracy": 0.4206896543502808, "step": 19590 }, { "epoch": 0.019736292115253098, "grad_norm": 21.267021295042294, "learning_rate": 1.9735914429023228e-05, "loss": 2.2882, "mean_token_accuracy": 0.43956443667411804, "step": 19595 }, { "epoch": 0.019741328168357268, "grad_norm": 19.578745283777145, "learning_rate": 1.9740950385754287e-05, "loss": 2.633, "mean_token_accuracy": 0.38620689511299133, "step": 19600 }, { "epoch": 0.019746364221461442, "grad_norm": 25.107282899046538, "learning_rate": 1.9745986342485347e-05, "loss": 2.5779, "mean_token_accuracy": 0.36896551251411436, "step": 19605 }, { "epoch": 0.019751400274565616, "grad_norm": 22.184511800268798, "learning_rate": 1.9751022299216406e-05, "loss": 2.7231, "mean_token_accuracy": 0.3827586233615875, "step": 19610 }, { "epoch": 0.01975643632766979, "grad_norm": 20.477292563075824, "learning_rate": 1.9756058255947465e-05, "loss": 2.7844, "mean_token_accuracy": 0.38620689511299133, "step": 19615 }, { "epoch": 0.01976147238077396, "grad_norm": 22.64414698638484, "learning_rate": 1.9761094212678524e-05, "loss": 2.5353, "mean_token_accuracy": 0.420689657330513, "step": 19620 }, { "epoch": 0.019766508433878133, "grad_norm": 28.672234040709238, "learning_rate": 1.9766130169409587e-05, "loss": 2.7729, "mean_token_accuracy": 0.38620689511299133, "step": 19625 }, { "epoch": 0.019771544486982307, "grad_norm": 23.000095257899765, "learning_rate": 1.9771166126140643e-05, "loss": 2.5256, "mean_token_accuracy": 0.43793103098869324, "step": 19630 }, { "epoch": 0.019776580540086477, "grad_norm": 21.79197923301158, "learning_rate": 1.9776202082871706e-05, "loss": 2.1846, "mean_token_accuracy": 0.40344828367233276, "step": 19635 }, { "epoch": 0.01978161659319065, "grad_norm": 32.78900397382135, "learning_rate": 1.9781238039602765e-05, "loss": 2.4213, "mean_token_accuracy": 0.4363581359386444, "step": 19640 }, { "epoch": 0.019786652646294825, "grad_norm": 24.728391006801576, "learning_rate": 1.9786273996333824e-05, "loss": 2.2754, "mean_token_accuracy": 0.458620685338974, "step": 19645 }, { "epoch": 0.019791688699399, "grad_norm": 28.035853431038824, "learning_rate": 1.9791309953064883e-05, "loss": 2.5163, "mean_token_accuracy": 0.4379310429096222, "step": 19650 }, { "epoch": 0.01979672475250317, "grad_norm": 18.822381505457027, "learning_rate": 1.9796345909795943e-05, "loss": 2.4849, "mean_token_accuracy": 0.4344827592372894, "step": 19655 }, { "epoch": 0.019801760805607343, "grad_norm": 19.354404502416187, "learning_rate": 1.9801381866527005e-05, "loss": 2.3816, "mean_token_accuracy": 0.42758620381355283, "step": 19660 }, { "epoch": 0.019806796858711517, "grad_norm": 22.63016659594927, "learning_rate": 1.9806417823258064e-05, "loss": 2.4781, "mean_token_accuracy": 0.45517241954803467, "step": 19665 }, { "epoch": 0.019811832911815687, "grad_norm": 24.21606805861793, "learning_rate": 1.9811453779989124e-05, "loss": 2.3656, "mean_token_accuracy": 0.41724138259887694, "step": 19670 }, { "epoch": 0.01981686896491986, "grad_norm": 25.636286240418784, "learning_rate": 1.9816489736720183e-05, "loss": 2.6066, "mean_token_accuracy": 0.4310344815254211, "step": 19675 }, { "epoch": 0.019821905018024034, "grad_norm": 19.697027037679458, "learning_rate": 1.9821525693451242e-05, "loss": 2.2553, "mean_token_accuracy": 0.3827586114406586, "step": 19680 }, { "epoch": 0.019826941071128208, "grad_norm": 20.22678751788821, "learning_rate": 1.98265616501823e-05, "loss": 2.5872, "mean_token_accuracy": 0.3793103456497192, "step": 19685 }, { "epoch": 0.01983197712423238, "grad_norm": 17.132820605132977, "learning_rate": 1.9831597606913364e-05, "loss": 2.3199, "mean_token_accuracy": 0.42758620977401735, "step": 19690 }, { "epoch": 0.019837013177336552, "grad_norm": 29.307599096242587, "learning_rate": 1.9836633563644423e-05, "loss": 2.6303, "mean_token_accuracy": 0.4068965494632721, "step": 19695 }, { "epoch": 0.019842049230440726, "grad_norm": 31.20471407504354, "learning_rate": 1.9841669520375483e-05, "loss": 2.4529, "mean_token_accuracy": 0.42413793206214906, "step": 19700 }, { "epoch": 0.019847085283544896, "grad_norm": 18.829203985718003, "learning_rate": 1.9846705477106542e-05, "loss": 2.4723, "mean_token_accuracy": 0.42068966031074523, "step": 19705 }, { "epoch": 0.01985212133664907, "grad_norm": 21.99221809137749, "learning_rate": 1.98517414338376e-05, "loss": 2.4622, "mean_token_accuracy": 0.4034482777118683, "step": 19710 }, { "epoch": 0.019857157389753244, "grad_norm": 31.271867027813663, "learning_rate": 1.9856777390568664e-05, "loss": 2.3963, "mean_token_accuracy": 0.4413793087005615, "step": 19715 }, { "epoch": 0.019862193442857418, "grad_norm": 25.27460252796519, "learning_rate": 1.986181334729972e-05, "loss": 2.3298, "mean_token_accuracy": 0.46206897497177124, "step": 19720 }, { "epoch": 0.019867229495961588, "grad_norm": 25.23595392014646, "learning_rate": 1.986684930403078e-05, "loss": 2.1099, "mean_token_accuracy": 0.4448275864124298, "step": 19725 }, { "epoch": 0.01987226554906576, "grad_norm": 18.890023522028038, "learning_rate": 1.987188526076184e-05, "loss": 2.1577, "mean_token_accuracy": 0.4482758641242981, "step": 19730 }, { "epoch": 0.019877301602169935, "grad_norm": 19.92335395801169, "learning_rate": 1.98769212174929e-05, "loss": 2.8164, "mean_token_accuracy": 0.3551724165678024, "step": 19735 }, { "epoch": 0.019882337655274106, "grad_norm": 21.354098540231103, "learning_rate": 1.988195717422396e-05, "loss": 2.2178, "mean_token_accuracy": 0.4862068951129913, "step": 19740 }, { "epoch": 0.01988737370837828, "grad_norm": 19.111345276088578, "learning_rate": 1.988699313095502e-05, "loss": 2.4499, "mean_token_accuracy": 0.42413792610168455, "step": 19745 }, { "epoch": 0.019892409761482453, "grad_norm": 19.940269924189423, "learning_rate": 1.989202908768608e-05, "loss": 2.3744, "mean_token_accuracy": 0.43448275327682495, "step": 19750 }, { "epoch": 0.019897445814586627, "grad_norm": 24.00192066203954, "learning_rate": 1.989706504441714e-05, "loss": 2.0404, "mean_token_accuracy": 0.49999999403953554, "step": 19755 }, { "epoch": 0.019902481867690797, "grad_norm": 22.85741513974964, "learning_rate": 1.99021010011482e-05, "loss": 2.1737, "mean_token_accuracy": 0.46551724672317507, "step": 19760 }, { "epoch": 0.01990751792079497, "grad_norm": 24.031069465769527, "learning_rate": 1.9907136957879256e-05, "loss": 2.4035, "mean_token_accuracy": 0.441379314661026, "step": 19765 }, { "epoch": 0.019912553973899145, "grad_norm": 21.910986262870132, "learning_rate": 1.991217291461032e-05, "loss": 2.6481, "mean_token_accuracy": 0.4034482777118683, "step": 19770 }, { "epoch": 0.019917590027003315, "grad_norm": 23.957067736517022, "learning_rate": 1.9917208871341378e-05, "loss": 2.5237, "mean_token_accuracy": 0.38275861740112305, "step": 19775 }, { "epoch": 0.01992262608010749, "grad_norm": 23.42427466199256, "learning_rate": 1.992224482807244e-05, "loss": 2.4608, "mean_token_accuracy": 0.4068965554237366, "step": 19780 }, { "epoch": 0.019927662133211663, "grad_norm": 27.26501196993155, "learning_rate": 1.9927280784803497e-05, "loss": 2.6888, "mean_token_accuracy": 0.358620685338974, "step": 19785 }, { "epoch": 0.019932698186315836, "grad_norm": 27.30677704137981, "learning_rate": 1.9932316741534556e-05, "loss": 2.445, "mean_token_accuracy": 0.4050211668014526, "step": 19790 }, { "epoch": 0.019937734239420007, "grad_norm": 23.788990132045406, "learning_rate": 1.993735269826562e-05, "loss": 2.5053, "mean_token_accuracy": 0.4310344815254211, "step": 19795 }, { "epoch": 0.01994277029252418, "grad_norm": 19.99802614459865, "learning_rate": 1.9942388654996678e-05, "loss": 2.469, "mean_token_accuracy": 0.4, "step": 19800 }, { "epoch": 0.019947806345628354, "grad_norm": 22.36445313189385, "learning_rate": 1.9947424611727737e-05, "loss": 2.4429, "mean_token_accuracy": 0.39479734301567077, "step": 19805 }, { "epoch": 0.019952842398732525, "grad_norm": 21.32394750571172, "learning_rate": 1.9952460568458796e-05, "loss": 2.0264, "mean_token_accuracy": 0.4344827592372894, "step": 19810 }, { "epoch": 0.0199578784518367, "grad_norm": 22.272840630467616, "learning_rate": 1.9957496525189856e-05, "loss": 2.4318, "mean_token_accuracy": 0.4310344815254211, "step": 19815 }, { "epoch": 0.019962914504940872, "grad_norm": 26.755959642632238, "learning_rate": 1.9962532481920918e-05, "loss": 2.2747, "mean_token_accuracy": 0.4328493714332581, "step": 19820 }, { "epoch": 0.019967950558045046, "grad_norm": 24.81101266374396, "learning_rate": 1.9967568438651977e-05, "loss": 2.4354, "mean_token_accuracy": 0.4620689690113068, "step": 19825 }, { "epoch": 0.019972986611149216, "grad_norm": 30.033593269499878, "learning_rate": 1.9972604395383033e-05, "loss": 2.2176, "mean_token_accuracy": 0.46551724672317507, "step": 19830 }, { "epoch": 0.01997802266425339, "grad_norm": 19.281394208094138, "learning_rate": 1.9977640352114096e-05, "loss": 2.1782, "mean_token_accuracy": 0.4689655125141144, "step": 19835 }, { "epoch": 0.019983058717357564, "grad_norm": 27.686479717412116, "learning_rate": 1.9982676308845155e-05, "loss": 2.6764, "mean_token_accuracy": 0.4034482777118683, "step": 19840 }, { "epoch": 0.019988094770461734, "grad_norm": 21.337179671378916, "learning_rate": 1.9987712265576214e-05, "loss": 2.329, "mean_token_accuracy": 0.4413793087005615, "step": 19845 }, { "epoch": 0.019993130823565908, "grad_norm": 21.74332707671995, "learning_rate": 1.9992748222307274e-05, "loss": 2.3135, "mean_token_accuracy": 0.4310344815254211, "step": 19850 }, { "epoch": 0.01999816687667008, "grad_norm": 22.31467039186713, "learning_rate": 1.9997784179038333e-05, "loss": 2.438, "mean_token_accuracy": 0.3808832406997681, "step": 19855 }, { "epoch": 0.020003202929774255, "grad_norm": 25.04168673486434, "learning_rate": 2.0002820135769396e-05, "loss": 2.5857, "mean_token_accuracy": 0.4034482717514038, "step": 19860 }, { "epoch": 0.020008238982878426, "grad_norm": 17.856824867718597, "learning_rate": 2.0007856092500455e-05, "loss": 2.2978, "mean_token_accuracy": 0.46551724076271056, "step": 19865 }, { "epoch": 0.0200132750359826, "grad_norm": 16.683259631887143, "learning_rate": 2.0012892049231514e-05, "loss": 2.4661, "mean_token_accuracy": 0.41724138259887694, "step": 19870 }, { "epoch": 0.020018311089086773, "grad_norm": 24.762002397014577, "learning_rate": 2.0017928005962573e-05, "loss": 2.2516, "mean_token_accuracy": 0.46896552443504336, "step": 19875 }, { "epoch": 0.020023347142190943, "grad_norm": 26.204284897366747, "learning_rate": 2.0022963962693633e-05, "loss": 2.4559, "mean_token_accuracy": 0.37586206793785093, "step": 19880 }, { "epoch": 0.020028383195295117, "grad_norm": 19.24675451553648, "learning_rate": 2.0027999919424692e-05, "loss": 2.6628, "mean_token_accuracy": 0.4034482777118683, "step": 19885 }, { "epoch": 0.02003341924839929, "grad_norm": 20.522969947818638, "learning_rate": 2.0033035876155755e-05, "loss": 2.2211, "mean_token_accuracy": 0.46896551847457885, "step": 19890 }, { "epoch": 0.020038455301503465, "grad_norm": 22.243001438453426, "learning_rate": 2.0038071832886814e-05, "loss": 2.6968, "mean_token_accuracy": 0.41034482717514037, "step": 19895 }, { "epoch": 0.020043491354607635, "grad_norm": 20.73232941888443, "learning_rate": 2.0043107789617873e-05, "loss": 2.1047, "mean_token_accuracy": 0.46551724672317507, "step": 19900 }, { "epoch": 0.02004852740771181, "grad_norm": 22.283220168231363, "learning_rate": 2.0048143746348932e-05, "loss": 2.4339, "mean_token_accuracy": 0.42413793206214906, "step": 19905 }, { "epoch": 0.020053563460815983, "grad_norm": 19.39015337331024, "learning_rate": 2.005317970307999e-05, "loss": 2.1464, "mean_token_accuracy": 0.4551724076271057, "step": 19910 }, { "epoch": 0.020058599513920153, "grad_norm": 26.57323772574109, "learning_rate": 2.0058215659811054e-05, "loss": 2.3459, "mean_token_accuracy": 0.4151845097541809, "step": 19915 }, { "epoch": 0.020063635567024327, "grad_norm": 20.181797350457796, "learning_rate": 2.006325161654211e-05, "loss": 2.4108, "mean_token_accuracy": 0.4068965494632721, "step": 19920 }, { "epoch": 0.0200686716201285, "grad_norm": 19.82957670016997, "learning_rate": 2.006828757327317e-05, "loss": 2.4968, "mean_token_accuracy": 0.4669950783252716, "step": 19925 }, { "epoch": 0.020073707673232674, "grad_norm": 22.735002622728427, "learning_rate": 2.0073323530004232e-05, "loss": 2.4244, "mean_token_accuracy": 0.4103448212146759, "step": 19930 }, { "epoch": 0.020078743726336844, "grad_norm": 29.210612347748405, "learning_rate": 2.007835948673529e-05, "loss": 2.582, "mean_token_accuracy": 0.37931033968925476, "step": 19935 }, { "epoch": 0.020083779779441018, "grad_norm": 21.203336744598115, "learning_rate": 2.008339544346635e-05, "loss": 2.22, "mean_token_accuracy": 0.4655172348022461, "step": 19940 }, { "epoch": 0.020088815832545192, "grad_norm": 28.502244458367624, "learning_rate": 2.008843140019741e-05, "loss": 2.4644, "mean_token_accuracy": 0.3965517282485962, "step": 19945 }, { "epoch": 0.020093851885649362, "grad_norm": 17.4466692547439, "learning_rate": 2.009346735692847e-05, "loss": 2.5001, "mean_token_accuracy": 0.4482758641242981, "step": 19950 }, { "epoch": 0.020098887938753536, "grad_norm": 22.91752071433038, "learning_rate": 2.009850331365953e-05, "loss": 2.5429, "mean_token_accuracy": 0.4482758641242981, "step": 19955 }, { "epoch": 0.02010392399185771, "grad_norm": 27.732540239252913, "learning_rate": 2.010353927039059e-05, "loss": 2.5477, "mean_token_accuracy": 0.4413793087005615, "step": 19960 }, { "epoch": 0.020108960044961884, "grad_norm": 20.60362071143817, "learning_rate": 2.0108575227121647e-05, "loss": 2.4125, "mean_token_accuracy": 0.4758620738983154, "step": 19965 }, { "epoch": 0.020113996098066054, "grad_norm": 17.461400081581683, "learning_rate": 2.011361118385271e-05, "loss": 2.3917, "mean_token_accuracy": 0.4275861978530884, "step": 19970 }, { "epoch": 0.020119032151170228, "grad_norm": 21.196968315768604, "learning_rate": 2.011864714058377e-05, "loss": 2.6275, "mean_token_accuracy": 0.3758620709180832, "step": 19975 }, { "epoch": 0.0201240682042744, "grad_norm": 22.67066390019272, "learning_rate": 2.012368309731483e-05, "loss": 2.2504, "mean_token_accuracy": 0.43284936547279357, "step": 19980 }, { "epoch": 0.02012910425737857, "grad_norm": 19.720872828690645, "learning_rate": 2.0128719054045887e-05, "loss": 2.0867, "mean_token_accuracy": 0.45517241954803467, "step": 19985 }, { "epoch": 0.020134140310482745, "grad_norm": 21.14831266732385, "learning_rate": 2.0133755010776946e-05, "loss": 2.529, "mean_token_accuracy": 0.42758620381355283, "step": 19990 }, { "epoch": 0.02013917636358692, "grad_norm": 29.826248086436237, "learning_rate": 2.013879096750801e-05, "loss": 2.7157, "mean_token_accuracy": 0.3896551728248596, "step": 19995 }, { "epoch": 0.020144212416691093, "grad_norm": 22.348135691696463, "learning_rate": 2.0143826924239068e-05, "loss": 2.4669, "mean_token_accuracy": 0.458620685338974, "step": 20000 }, { "epoch": 0.020149248469795263, "grad_norm": 20.312473209514764, "learning_rate": 2.0148862880970128e-05, "loss": 2.2812, "mean_token_accuracy": 0.417241370677948, "step": 20005 }, { "epoch": 0.020154284522899437, "grad_norm": 18.128967472374892, "learning_rate": 2.0153898837701187e-05, "loss": 2.2357, "mean_token_accuracy": 0.4344827592372894, "step": 20010 }, { "epoch": 0.02015932057600361, "grad_norm": 25.972831262973227, "learning_rate": 2.0158934794432246e-05, "loss": 2.6056, "mean_token_accuracy": 0.4068965554237366, "step": 20015 }, { "epoch": 0.02016435662910778, "grad_norm": 21.698010994170193, "learning_rate": 2.016397075116331e-05, "loss": 2.2518, "mean_token_accuracy": 0.4222625494003296, "step": 20020 }, { "epoch": 0.020169392682211955, "grad_norm": 20.49595400193641, "learning_rate": 2.0169006707894368e-05, "loss": 2.3718, "mean_token_accuracy": 0.458620685338974, "step": 20025 }, { "epoch": 0.02017442873531613, "grad_norm": 19.545832859740564, "learning_rate": 2.0174042664625424e-05, "loss": 2.2899, "mean_token_accuracy": 0.44827585220336913, "step": 20030 }, { "epoch": 0.020179464788420302, "grad_norm": 25.627378483006126, "learning_rate": 2.0179078621356486e-05, "loss": 2.6059, "mean_token_accuracy": 0.3896551728248596, "step": 20035 }, { "epoch": 0.020184500841524473, "grad_norm": 14.454305060601428, "learning_rate": 2.0184114578087546e-05, "loss": 2.122, "mean_token_accuracy": 0.45983060598373415, "step": 20040 }, { "epoch": 0.020189536894628646, "grad_norm": 23.900280511559952, "learning_rate": 2.018915053481861e-05, "loss": 2.5806, "mean_token_accuracy": 0.3965517282485962, "step": 20045 }, { "epoch": 0.02019457294773282, "grad_norm": 17.19209105736644, "learning_rate": 2.0194186491549668e-05, "loss": 2.0879, "mean_token_accuracy": 0.4931034564971924, "step": 20050 }, { "epoch": 0.02019960900083699, "grad_norm": 27.12404573786952, "learning_rate": 2.0199222448280723e-05, "loss": 2.5582, "mean_token_accuracy": 0.39310344457626345, "step": 20055 }, { "epoch": 0.020204645053941164, "grad_norm": 29.18327620572081, "learning_rate": 2.0204258405011786e-05, "loss": 2.7166, "mean_token_accuracy": 0.39655172228813174, "step": 20060 }, { "epoch": 0.020209681107045338, "grad_norm": 20.9754133748773, "learning_rate": 2.0209294361742845e-05, "loss": 2.2913, "mean_token_accuracy": 0.44137929677963256, "step": 20065 }, { "epoch": 0.020214717160149512, "grad_norm": 16.682171951543346, "learning_rate": 2.0214330318473905e-05, "loss": 2.1102, "mean_token_accuracy": 0.46896551847457885, "step": 20070 }, { "epoch": 0.020219753213253682, "grad_norm": 20.855804478755104, "learning_rate": 2.0219366275204964e-05, "loss": 2.1519, "mean_token_accuracy": 0.458620685338974, "step": 20075 }, { "epoch": 0.020224789266357856, "grad_norm": 19.195625308820834, "learning_rate": 2.0224402231936023e-05, "loss": 2.4016, "mean_token_accuracy": 0.4536600112915039, "step": 20080 }, { "epoch": 0.02022982531946203, "grad_norm": 25.41693500861622, "learning_rate": 2.0229438188667086e-05, "loss": 2.2716, "mean_token_accuracy": 0.4292196035385132, "step": 20085 }, { "epoch": 0.0202348613725662, "grad_norm": 16.122117355877883, "learning_rate": 2.0234474145398145e-05, "loss": 2.0698, "mean_token_accuracy": 0.49999999403953554, "step": 20090 }, { "epoch": 0.020239897425670374, "grad_norm": 24.694727606681806, "learning_rate": 2.0239510102129204e-05, "loss": 2.7991, "mean_token_accuracy": 0.33793103098869326, "step": 20095 }, { "epoch": 0.020244933478774547, "grad_norm": 21.772912849525827, "learning_rate": 2.0244546058860263e-05, "loss": 2.4242, "mean_token_accuracy": 0.42068966031074523, "step": 20100 }, { "epoch": 0.02024996953187872, "grad_norm": 24.77716630187442, "learning_rate": 2.0249582015591323e-05, "loss": 2.4497, "mean_token_accuracy": 0.42068966031074523, "step": 20105 }, { "epoch": 0.02025500558498289, "grad_norm": 25.288169202397047, "learning_rate": 2.0254617972322382e-05, "loss": 2.6914, "mean_token_accuracy": 0.4034482777118683, "step": 20110 }, { "epoch": 0.020260041638087065, "grad_norm": 26.315086993608244, "learning_rate": 2.0259653929053445e-05, "loss": 2.2057, "mean_token_accuracy": 0.45529341101646426, "step": 20115 }, { "epoch": 0.02026507769119124, "grad_norm": 26.5018000994247, "learning_rate": 2.02646898857845e-05, "loss": 2.9612, "mean_token_accuracy": 0.32758620381355286, "step": 20120 }, { "epoch": 0.02027011374429541, "grad_norm": 23.329797054303814, "learning_rate": 2.0269725842515563e-05, "loss": 2.3103, "mean_token_accuracy": 0.4379310369491577, "step": 20125 }, { "epoch": 0.020275149797399583, "grad_norm": 20.105331378118663, "learning_rate": 2.0274761799246622e-05, "loss": 2.3334, "mean_token_accuracy": 0.3551724135875702, "step": 20130 }, { "epoch": 0.020280185850503757, "grad_norm": 25.72764941586213, "learning_rate": 2.027979775597768e-05, "loss": 2.39, "mean_token_accuracy": 0.42758620381355283, "step": 20135 }, { "epoch": 0.02028522190360793, "grad_norm": 25.904955880847428, "learning_rate": 2.028483371270874e-05, "loss": 2.5929, "mean_token_accuracy": 0.36551723480224607, "step": 20140 }, { "epoch": 0.0202902579567121, "grad_norm": 19.082821335083086, "learning_rate": 2.02898696694398e-05, "loss": 2.172, "mean_token_accuracy": 0.4448275864124298, "step": 20145 }, { "epoch": 0.020295294009816275, "grad_norm": 24.65784182761241, "learning_rate": 2.029490562617086e-05, "loss": 2.6988, "mean_token_accuracy": 0.4068965554237366, "step": 20150 }, { "epoch": 0.02030033006292045, "grad_norm": 21.390529974095937, "learning_rate": 2.0299941582901922e-05, "loss": 2.3091, "mean_token_accuracy": 0.4551724135875702, "step": 20155 }, { "epoch": 0.02030536611602462, "grad_norm": 23.214021808635774, "learning_rate": 2.030497753963298e-05, "loss": 2.3324, "mean_token_accuracy": 0.39310344457626345, "step": 20160 }, { "epoch": 0.020310402169128793, "grad_norm": 24.718262858775972, "learning_rate": 2.031001349636404e-05, "loss": 2.2154, "mean_token_accuracy": 0.46745312213897705, "step": 20165 }, { "epoch": 0.020315438222232966, "grad_norm": 26.881671772445134, "learning_rate": 2.03150494530951e-05, "loss": 2.6324, "mean_token_accuracy": 0.3999999940395355, "step": 20170 }, { "epoch": 0.02032047427533714, "grad_norm": 26.623659235145304, "learning_rate": 2.032008540982616e-05, "loss": 2.6511, "mean_token_accuracy": 0.42068966031074523, "step": 20175 }, { "epoch": 0.02032551032844131, "grad_norm": 23.388010906064196, "learning_rate": 2.0325121366557222e-05, "loss": 2.5614, "mean_token_accuracy": 0.4255898356437683, "step": 20180 }, { "epoch": 0.020330546381545484, "grad_norm": 17.768880869214286, "learning_rate": 2.0330157323288278e-05, "loss": 2.5482, "mean_token_accuracy": 0.44482758045196535, "step": 20185 }, { "epoch": 0.020335582434649658, "grad_norm": 27.100302762358506, "learning_rate": 2.0335193280019337e-05, "loss": 2.6876, "mean_token_accuracy": 0.3931034505367279, "step": 20190 }, { "epoch": 0.020340618487753828, "grad_norm": 22.183201716737187, "learning_rate": 2.03402292367504e-05, "loss": 2.5313, "mean_token_accuracy": 0.41034482717514037, "step": 20195 }, { "epoch": 0.020345654540858002, "grad_norm": 19.279018153046746, "learning_rate": 2.034526519348146e-05, "loss": 2.402, "mean_token_accuracy": 0.41379310488700866, "step": 20200 }, { "epoch": 0.020350690593962176, "grad_norm": 21.74218347014969, "learning_rate": 2.0350301150212518e-05, "loss": 2.459, "mean_token_accuracy": 0.43067150712013247, "step": 20205 }, { "epoch": 0.02035572664706635, "grad_norm": 28.277716378251696, "learning_rate": 2.0355337106943577e-05, "loss": 2.2686, "mean_token_accuracy": 0.4620689630508423, "step": 20210 }, { "epoch": 0.02036076270017052, "grad_norm": 26.340418280170695, "learning_rate": 2.0360373063674636e-05, "loss": 2.6408, "mean_token_accuracy": 0.38620689809322356, "step": 20215 }, { "epoch": 0.020365798753274694, "grad_norm": 20.68237692618053, "learning_rate": 2.03654090204057e-05, "loss": 2.1616, "mean_token_accuracy": 0.4940108895301819, "step": 20220 }, { "epoch": 0.020370834806378867, "grad_norm": 18.233475076546913, "learning_rate": 2.037044497713676e-05, "loss": 2.5348, "mean_token_accuracy": 0.3931034505367279, "step": 20225 }, { "epoch": 0.020375870859483038, "grad_norm": 22.631162411150207, "learning_rate": 2.0375480933867818e-05, "loss": 2.2392, "mean_token_accuracy": 0.4620689630508423, "step": 20230 }, { "epoch": 0.02038090691258721, "grad_norm": 20.58237313282624, "learning_rate": 2.0380516890598877e-05, "loss": 2.3934, "mean_token_accuracy": 0.4190562665462494, "step": 20235 }, { "epoch": 0.020385942965691385, "grad_norm": 19.87636273252363, "learning_rate": 2.0385552847329936e-05, "loss": 2.4813, "mean_token_accuracy": 0.4172413766384125, "step": 20240 }, { "epoch": 0.02039097901879556, "grad_norm": 22.159772173630532, "learning_rate": 2.0390588804061e-05, "loss": 2.2794, "mean_token_accuracy": 0.43793103098869324, "step": 20245 }, { "epoch": 0.02039601507189973, "grad_norm": 20.003481403413694, "learning_rate": 2.0395624760792058e-05, "loss": 2.6248, "mean_token_accuracy": 0.38965516686439516, "step": 20250 }, { "epoch": 0.020401051125003903, "grad_norm": 27.02614037097411, "learning_rate": 2.0400660717523114e-05, "loss": 2.4206, "mean_token_accuracy": 0.4413793087005615, "step": 20255 }, { "epoch": 0.020406087178108077, "grad_norm": 20.61797433414269, "learning_rate": 2.0405696674254177e-05, "loss": 2.5768, "mean_token_accuracy": 0.4310344815254211, "step": 20260 }, { "epoch": 0.020411123231212247, "grad_norm": 27.116937568094198, "learning_rate": 2.0410732630985236e-05, "loss": 3.1021, "mean_token_accuracy": 0.3379310369491577, "step": 20265 }, { "epoch": 0.02041615928431642, "grad_norm": 28.933521363285905, "learning_rate": 2.0415768587716295e-05, "loss": 2.3532, "mean_token_accuracy": 0.43448275327682495, "step": 20270 }, { "epoch": 0.020421195337420595, "grad_norm": 17.38801833294216, "learning_rate": 2.0420804544447354e-05, "loss": 2.3373, "mean_token_accuracy": 0.4310344934463501, "step": 20275 }, { "epoch": 0.02042623139052477, "grad_norm": 20.51271345922397, "learning_rate": 2.0425840501178414e-05, "loss": 2.2784, "mean_token_accuracy": 0.4344827592372894, "step": 20280 }, { "epoch": 0.02043126744362894, "grad_norm": 22.602800097757285, "learning_rate": 2.0430876457909476e-05, "loss": 2.4874, "mean_token_accuracy": 0.3482758641242981, "step": 20285 }, { "epoch": 0.020436303496733112, "grad_norm": 18.903154608885608, "learning_rate": 2.0435912414640535e-05, "loss": 2.2618, "mean_token_accuracy": 0.4517241418361664, "step": 20290 }, { "epoch": 0.020441339549837286, "grad_norm": 20.892725119984043, "learning_rate": 2.0440948371371595e-05, "loss": 2.2489, "mean_token_accuracy": 0.4241379380226135, "step": 20295 }, { "epoch": 0.020446375602941456, "grad_norm": 22.96702302124522, "learning_rate": 2.0445984328102654e-05, "loss": 2.7542, "mean_token_accuracy": 0.3882032573223114, "step": 20300 }, { "epoch": 0.02045141165604563, "grad_norm": 18.4462427780884, "learning_rate": 2.0451020284833713e-05, "loss": 2.6832, "mean_token_accuracy": 0.41379311084747317, "step": 20305 }, { "epoch": 0.020456447709149804, "grad_norm": 24.544093617629084, "learning_rate": 2.0456056241564772e-05, "loss": 2.2292, "mean_token_accuracy": 0.4206896543502808, "step": 20310 }, { "epoch": 0.020461483762253978, "grad_norm": 22.46487817436657, "learning_rate": 2.0461092198295835e-05, "loss": 2.5283, "mean_token_accuracy": 0.4034482717514038, "step": 20315 }, { "epoch": 0.020466519815358148, "grad_norm": 22.26812629777027, "learning_rate": 2.046612815502689e-05, "loss": 2.309, "mean_token_accuracy": 0.4206896543502808, "step": 20320 }, { "epoch": 0.020471555868462322, "grad_norm": 22.410231858747288, "learning_rate": 2.0471164111757954e-05, "loss": 2.6072, "mean_token_accuracy": 0.3862068891525269, "step": 20325 }, { "epoch": 0.020476591921566496, "grad_norm": 21.763726941931004, "learning_rate": 2.0476200068489013e-05, "loss": 2.2369, "mean_token_accuracy": 0.4206896543502808, "step": 20330 }, { "epoch": 0.020481627974670666, "grad_norm": 25.267250731705264, "learning_rate": 2.0481236025220072e-05, "loss": 2.2505, "mean_token_accuracy": 0.4551724135875702, "step": 20335 }, { "epoch": 0.02048666402777484, "grad_norm": 21.002614873531986, "learning_rate": 2.048627198195113e-05, "loss": 2.6855, "mean_token_accuracy": 0.38620689511299133, "step": 20340 }, { "epoch": 0.020491700080879013, "grad_norm": 21.750525760714734, "learning_rate": 2.049130793868219e-05, "loss": 2.4442, "mean_token_accuracy": 0.4103448212146759, "step": 20345 }, { "epoch": 0.020496736133983187, "grad_norm": 17.003556925783922, "learning_rate": 2.049634389541325e-05, "loss": 2.5639, "mean_token_accuracy": 0.43103448748588563, "step": 20350 }, { "epoch": 0.020501772187087357, "grad_norm": 23.950653001718205, "learning_rate": 2.0501379852144312e-05, "loss": 2.3697, "mean_token_accuracy": 0.41911675333976744, "step": 20355 }, { "epoch": 0.02050680824019153, "grad_norm": 20.740732031594927, "learning_rate": 2.0506415808875372e-05, "loss": 2.2549, "mean_token_accuracy": 0.4551724135875702, "step": 20360 }, { "epoch": 0.020511844293295705, "grad_norm": 22.145651452968355, "learning_rate": 2.051145176560643e-05, "loss": 2.6835, "mean_token_accuracy": 0.4068965554237366, "step": 20365 }, { "epoch": 0.020516880346399875, "grad_norm": 23.640407822414623, "learning_rate": 2.051648772233749e-05, "loss": 2.5363, "mean_token_accuracy": 0.43103448748588563, "step": 20370 }, { "epoch": 0.02052191639950405, "grad_norm": 25.09349317688804, "learning_rate": 2.052152367906855e-05, "loss": 2.5742, "mean_token_accuracy": 0.4172413766384125, "step": 20375 }, { "epoch": 0.020526952452608223, "grad_norm": 23.22167483946741, "learning_rate": 2.0526559635799612e-05, "loss": 2.5049, "mean_token_accuracy": 0.40344826579093934, "step": 20380 }, { "epoch": 0.020531988505712397, "grad_norm": 20.365829939790775, "learning_rate": 2.0531595592530668e-05, "loss": 2.4667, "mean_token_accuracy": 0.41724138259887694, "step": 20385 }, { "epoch": 0.020537024558816567, "grad_norm": 21.19800753381962, "learning_rate": 2.0536631549261727e-05, "loss": 2.2691, "mean_token_accuracy": 0.42758620381355283, "step": 20390 }, { "epoch": 0.02054206061192074, "grad_norm": 21.900326403813818, "learning_rate": 2.054166750599279e-05, "loss": 2.2727, "mean_token_accuracy": 0.47931034564971925, "step": 20395 }, { "epoch": 0.020547096665024914, "grad_norm": 28.93848201795326, "learning_rate": 2.054670346272385e-05, "loss": 2.451, "mean_token_accuracy": 0.4517241358757019, "step": 20400 }, { "epoch": 0.020552132718129085, "grad_norm": 23.891048107482828, "learning_rate": 2.0551739419454912e-05, "loss": 2.6828, "mean_token_accuracy": 0.37586207389831544, "step": 20405 }, { "epoch": 0.02055716877123326, "grad_norm": 21.840176469710574, "learning_rate": 2.0556775376185968e-05, "loss": 2.627, "mean_token_accuracy": 0.43103447556495667, "step": 20410 }, { "epoch": 0.020562204824337432, "grad_norm": 18.922850606996075, "learning_rate": 2.0561811332917027e-05, "loss": 2.4214, "mean_token_accuracy": 0.42758620381355283, "step": 20415 }, { "epoch": 0.020567240877441606, "grad_norm": 18.174470785782198, "learning_rate": 2.056684728964809e-05, "loss": 2.2255, "mean_token_accuracy": 0.4898366630077362, "step": 20420 }, { "epoch": 0.020572276930545776, "grad_norm": 27.226391043954685, "learning_rate": 2.057188324637915e-05, "loss": 2.8804, "mean_token_accuracy": 0.36551723480224607, "step": 20425 }, { "epoch": 0.02057731298364995, "grad_norm": 17.44236010108542, "learning_rate": 2.0576919203110208e-05, "loss": 2.4531, "mean_token_accuracy": 0.40471869707107544, "step": 20430 }, { "epoch": 0.020582349036754124, "grad_norm": 22.27273076169086, "learning_rate": 2.0581955159841267e-05, "loss": 2.6766, "mean_token_accuracy": 0.43103447556495667, "step": 20435 }, { "epoch": 0.020587385089858294, "grad_norm": 17.693952333965473, "learning_rate": 2.0586991116572327e-05, "loss": 2.2798, "mean_token_accuracy": 0.4396854221820831, "step": 20440 }, { "epoch": 0.020592421142962468, "grad_norm": 18.023586223898967, "learning_rate": 2.059202707330339e-05, "loss": 2.5281, "mean_token_accuracy": 0.4137930989265442, "step": 20445 }, { "epoch": 0.02059745719606664, "grad_norm": 25.29147616398832, "learning_rate": 2.059706303003445e-05, "loss": 2.645, "mean_token_accuracy": 0.3965517163276672, "step": 20450 }, { "epoch": 0.020602493249170815, "grad_norm": 18.916044560677673, "learning_rate": 2.0602098986765504e-05, "loss": 2.4573, "mean_token_accuracy": 0.4275861978530884, "step": 20455 }, { "epoch": 0.020607529302274986, "grad_norm": 19.329200011984057, "learning_rate": 2.0607134943496567e-05, "loss": 2.3013, "mean_token_accuracy": 0.44652147889137267, "step": 20460 }, { "epoch": 0.02061256535537916, "grad_norm": 32.913470360769075, "learning_rate": 2.0612170900227626e-05, "loss": 2.4724, "mean_token_accuracy": 0.37701149582862853, "step": 20465 }, { "epoch": 0.020617601408483333, "grad_norm": 24.584539090771973, "learning_rate": 2.061720685695869e-05, "loss": 2.3344, "mean_token_accuracy": 0.44349666237831115, "step": 20470 }, { "epoch": 0.020622637461587504, "grad_norm": 23.468866691562596, "learning_rate": 2.0622242813689745e-05, "loss": 2.3436, "mean_token_accuracy": 0.42413793206214906, "step": 20475 }, { "epoch": 0.020627673514691677, "grad_norm": 25.082168456220924, "learning_rate": 2.0627278770420804e-05, "loss": 2.2851, "mean_token_accuracy": 0.4551724076271057, "step": 20480 }, { "epoch": 0.02063270956779585, "grad_norm": 20.65713368804598, "learning_rate": 2.0632314727151867e-05, "loss": 1.9743, "mean_token_accuracy": 0.48965516686439514, "step": 20485 }, { "epoch": 0.020637745620900025, "grad_norm": 20.22227763404955, "learning_rate": 2.0637350683882926e-05, "loss": 2.4053, "mean_token_accuracy": 0.41379310488700866, "step": 20490 }, { "epoch": 0.020642781674004195, "grad_norm": 22.313221064851298, "learning_rate": 2.0642386640613985e-05, "loss": 2.3204, "mean_token_accuracy": 0.42068964838981626, "step": 20495 }, { "epoch": 0.02064781772710837, "grad_norm": 23.042933047901666, "learning_rate": 2.0647422597345044e-05, "loss": 2.4391, "mean_token_accuracy": 0.4206896543502808, "step": 20500 }, { "epoch": 0.020652853780212543, "grad_norm": 19.087961230640815, "learning_rate": 2.0652458554076104e-05, "loss": 2.2903, "mean_token_accuracy": 0.4379310250282288, "step": 20505 }, { "epoch": 0.020657889833316713, "grad_norm": 17.33476864845292, "learning_rate": 2.0657494510807166e-05, "loss": 2.2708, "mean_token_accuracy": 0.46412583589553835, "step": 20510 }, { "epoch": 0.020662925886420887, "grad_norm": 17.156648512367376, "learning_rate": 2.0662530467538226e-05, "loss": 2.3583, "mean_token_accuracy": 0.43103448748588563, "step": 20515 }, { "epoch": 0.02066796193952506, "grad_norm": 20.317100966878122, "learning_rate": 2.066756642426928e-05, "loss": 2.6848, "mean_token_accuracy": 0.39655172228813174, "step": 20520 }, { "epoch": 0.020672997992629234, "grad_norm": 26.520492216235837, "learning_rate": 2.0672602381000344e-05, "loss": 2.2614, "mean_token_accuracy": 0.46394434571266174, "step": 20525 }, { "epoch": 0.020678034045733405, "grad_norm": 20.12590707674502, "learning_rate": 2.0677638337731403e-05, "loss": 2.3983, "mean_token_accuracy": 0.4344827651977539, "step": 20530 }, { "epoch": 0.02068307009883758, "grad_norm": 28.39348166787303, "learning_rate": 2.0682674294462463e-05, "loss": 2.6283, "mean_token_accuracy": 0.4137930989265442, "step": 20535 }, { "epoch": 0.020688106151941752, "grad_norm": 21.616073684971788, "learning_rate": 2.0687710251193522e-05, "loss": 2.2901, "mean_token_accuracy": 0.417241370677948, "step": 20540 }, { "epoch": 0.020693142205045922, "grad_norm": 24.01636217342845, "learning_rate": 2.069274620792458e-05, "loss": 2.7583, "mean_token_accuracy": 0.4068965494632721, "step": 20545 }, { "epoch": 0.020698178258150096, "grad_norm": 24.980828653272724, "learning_rate": 2.0697782164655644e-05, "loss": 2.6273, "mean_token_accuracy": 0.37931033968925476, "step": 20550 }, { "epoch": 0.02070321431125427, "grad_norm": 22.826711065736834, "learning_rate": 2.0702818121386703e-05, "loss": 2.3888, "mean_token_accuracy": 0.4413793087005615, "step": 20555 }, { "epoch": 0.020708250364358444, "grad_norm": 18.506444929152813, "learning_rate": 2.0707854078117762e-05, "loss": 2.5037, "mean_token_accuracy": 0.3551724135875702, "step": 20560 }, { "epoch": 0.020713286417462614, "grad_norm": 21.239680886285672, "learning_rate": 2.071289003484882e-05, "loss": 2.4648, "mean_token_accuracy": 0.4172413766384125, "step": 20565 }, { "epoch": 0.020718322470566788, "grad_norm": 23.471894097945487, "learning_rate": 2.071792599157988e-05, "loss": 2.2872, "mean_token_accuracy": 0.44827585816383364, "step": 20570 }, { "epoch": 0.02072335852367096, "grad_norm": 22.43242279166281, "learning_rate": 2.072296194831094e-05, "loss": 2.8487, "mean_token_accuracy": 0.3482758581638336, "step": 20575 }, { "epoch": 0.020728394576775132, "grad_norm": 19.04518189750828, "learning_rate": 2.0727997905042003e-05, "loss": 2.0623, "mean_token_accuracy": 0.43103448748588563, "step": 20580 }, { "epoch": 0.020733430629879306, "grad_norm": 24.175859135965364, "learning_rate": 2.0733033861773062e-05, "loss": 2.2581, "mean_token_accuracy": 0.4401088893413544, "step": 20585 }, { "epoch": 0.02073846668298348, "grad_norm": 28.21371826847323, "learning_rate": 2.073806981850412e-05, "loss": 2.8496, "mean_token_accuracy": 0.3601935803890228, "step": 20590 }, { "epoch": 0.020743502736087653, "grad_norm": 24.159271640617533, "learning_rate": 2.074310577523518e-05, "loss": 2.4508, "mean_token_accuracy": 0.4206896543502808, "step": 20595 }, { "epoch": 0.020748538789191823, "grad_norm": 21.871630064081565, "learning_rate": 2.074814173196624e-05, "loss": 2.5846, "mean_token_accuracy": 0.3947973370552063, "step": 20600 }, { "epoch": 0.020753574842295997, "grad_norm": 21.57414972492069, "learning_rate": 2.0753177688697302e-05, "loss": 2.1858, "mean_token_accuracy": 0.43448275327682495, "step": 20605 }, { "epoch": 0.02075861089540017, "grad_norm": 20.689410453152817, "learning_rate": 2.0758213645428358e-05, "loss": 2.4268, "mean_token_accuracy": 0.45027223229408264, "step": 20610 }, { "epoch": 0.02076364694850434, "grad_norm": 18.94062976006235, "learning_rate": 2.0763249602159417e-05, "loss": 2.3732, "mean_token_accuracy": 0.42413792610168455, "step": 20615 }, { "epoch": 0.020768683001608515, "grad_norm": 21.162725631034164, "learning_rate": 2.076828555889048e-05, "loss": 2.0892, "mean_token_accuracy": 0.4862068951129913, "step": 20620 }, { "epoch": 0.02077371905471269, "grad_norm": 19.343405837960226, "learning_rate": 2.077332151562154e-05, "loss": 2.1845, "mean_token_accuracy": 0.4448275864124298, "step": 20625 }, { "epoch": 0.020778755107816863, "grad_norm": 26.657340398656682, "learning_rate": 2.07783574723526e-05, "loss": 2.459, "mean_token_accuracy": 0.43236539959907533, "step": 20630 }, { "epoch": 0.020783791160921033, "grad_norm": 23.297529860383225, "learning_rate": 2.0783393429083658e-05, "loss": 2.5583, "mean_token_accuracy": 0.4103448182344437, "step": 20635 }, { "epoch": 0.020788827214025207, "grad_norm": 24.375311232146665, "learning_rate": 2.0788429385814717e-05, "loss": 2.7381, "mean_token_accuracy": 0.40689654648303986, "step": 20640 }, { "epoch": 0.02079386326712938, "grad_norm": 25.910400053753975, "learning_rate": 2.079346534254578e-05, "loss": 2.3787, "mean_token_accuracy": 0.4137930989265442, "step": 20645 }, { "epoch": 0.02079889932023355, "grad_norm": 18.392707396541727, "learning_rate": 2.079850129927684e-05, "loss": 2.3692, "mean_token_accuracy": 0.47586207985877993, "step": 20650 }, { "epoch": 0.020803935373337724, "grad_norm": 25.560505360356665, "learning_rate": 2.0803537256007895e-05, "loss": 2.8649, "mean_token_accuracy": 0.35862069129943847, "step": 20655 }, { "epoch": 0.020808971426441898, "grad_norm": 20.969388023639176, "learning_rate": 2.0808573212738957e-05, "loss": 2.6647, "mean_token_accuracy": 0.37586205899715425, "step": 20660 }, { "epoch": 0.020814007479546072, "grad_norm": 21.408324197549252, "learning_rate": 2.0813609169470017e-05, "loss": 2.2042, "mean_token_accuracy": 0.4275861978530884, "step": 20665 }, { "epoch": 0.020819043532650242, "grad_norm": 22.89885398256849, "learning_rate": 2.081864512620108e-05, "loss": 2.4562, "mean_token_accuracy": 0.4310344815254211, "step": 20670 }, { "epoch": 0.020824079585754416, "grad_norm": 25.6720580402118, "learning_rate": 2.0823681082932135e-05, "loss": 2.2407, "mean_token_accuracy": 0.3896551787853241, "step": 20675 }, { "epoch": 0.02082911563885859, "grad_norm": 20.391907160039267, "learning_rate": 2.0828717039663194e-05, "loss": 2.6567, "mean_token_accuracy": 0.4103448212146759, "step": 20680 }, { "epoch": 0.02083415169196276, "grad_norm": 19.72446517063746, "learning_rate": 2.0833752996394257e-05, "loss": 2.4718, "mean_token_accuracy": 0.3896551728248596, "step": 20685 }, { "epoch": 0.020839187745066934, "grad_norm": 33.62630801668942, "learning_rate": 2.0838788953125316e-05, "loss": 2.8378, "mean_token_accuracy": 0.38965516686439516, "step": 20690 }, { "epoch": 0.020844223798171108, "grad_norm": 18.19741363651444, "learning_rate": 2.0843824909856376e-05, "loss": 2.3302, "mean_token_accuracy": 0.44827585816383364, "step": 20695 }, { "epoch": 0.02084925985127528, "grad_norm": 26.038475731701535, "learning_rate": 2.0848860866587435e-05, "loss": 2.4227, "mean_token_accuracy": 0.458620685338974, "step": 20700 }, { "epoch": 0.02085429590437945, "grad_norm": 20.694083873134705, "learning_rate": 2.0853896823318494e-05, "loss": 2.3401, "mean_token_accuracy": 0.46551724076271056, "step": 20705 }, { "epoch": 0.020859331957483625, "grad_norm": 20.517144448968768, "learning_rate": 2.0858932780049557e-05, "loss": 2.6501, "mean_token_accuracy": 0.40859044194221494, "step": 20710 }, { "epoch": 0.0208643680105878, "grad_norm": 20.15282612144898, "learning_rate": 2.0863968736780616e-05, "loss": 2.7406, "mean_token_accuracy": 0.43260737657547, "step": 20715 }, { "epoch": 0.02086940406369197, "grad_norm": 21.926990376297955, "learning_rate": 2.0869004693511672e-05, "loss": 2.7712, "mean_token_accuracy": 0.3551724076271057, "step": 20720 }, { "epoch": 0.020874440116796143, "grad_norm": 32.90672899016541, "learning_rate": 2.0874040650242734e-05, "loss": 2.5821, "mean_token_accuracy": 0.4172413766384125, "step": 20725 }, { "epoch": 0.020879476169900317, "grad_norm": 24.75315313920941, "learning_rate": 2.0879076606973794e-05, "loss": 2.1809, "mean_token_accuracy": 0.4896551549434662, "step": 20730 }, { "epoch": 0.02088451222300449, "grad_norm": 29.915871870629, "learning_rate": 2.0884112563704853e-05, "loss": 2.4106, "mean_token_accuracy": 0.44827585816383364, "step": 20735 }, { "epoch": 0.02088954827610866, "grad_norm": 23.027456020227316, "learning_rate": 2.0889148520435912e-05, "loss": 2.4046, "mean_token_accuracy": 0.41379310488700866, "step": 20740 }, { "epoch": 0.020894584329212835, "grad_norm": 21.455565044689756, "learning_rate": 2.089418447716697e-05, "loss": 2.303, "mean_token_accuracy": 0.4620689690113068, "step": 20745 }, { "epoch": 0.02089962038231701, "grad_norm": 22.19652830621377, "learning_rate": 2.0899220433898034e-05, "loss": 2.3625, "mean_token_accuracy": 0.42758620381355283, "step": 20750 }, { "epoch": 0.02090465643542118, "grad_norm": 19.064880914822027, "learning_rate": 2.0904256390629093e-05, "loss": 2.2502, "mean_token_accuracy": 0.4206896543502808, "step": 20755 }, { "epoch": 0.020909692488525353, "grad_norm": 21.06840897155441, "learning_rate": 2.0909292347360153e-05, "loss": 2.6852, "mean_token_accuracy": 0.40689654648303986, "step": 20760 }, { "epoch": 0.020914728541629526, "grad_norm": 17.433829716573808, "learning_rate": 2.0914328304091212e-05, "loss": 2.3182, "mean_token_accuracy": 0.47586206793785096, "step": 20765 }, { "epoch": 0.0209197645947337, "grad_norm": 18.32584598886545, "learning_rate": 2.091936426082227e-05, "loss": 2.3205, "mean_token_accuracy": 0.41724138259887694, "step": 20770 }, { "epoch": 0.02092480064783787, "grad_norm": 20.27329927322556, "learning_rate": 2.092440021755333e-05, "loss": 2.6067, "mean_token_accuracy": 0.4275862157344818, "step": 20775 }, { "epoch": 0.020929836700942044, "grad_norm": 19.079933779732045, "learning_rate": 2.0929436174284393e-05, "loss": 2.47, "mean_token_accuracy": 0.3896551728248596, "step": 20780 }, { "epoch": 0.020934872754046218, "grad_norm": 24.529186463683658, "learning_rate": 2.0934472131015452e-05, "loss": 2.7102, "mean_token_accuracy": 0.37586206793785093, "step": 20785 }, { "epoch": 0.02093990880715039, "grad_norm": 18.22434578596645, "learning_rate": 2.093950808774651e-05, "loss": 2.2973, "mean_token_accuracy": 0.46958128809928895, "step": 20790 }, { "epoch": 0.020944944860254562, "grad_norm": 23.97462592483251, "learning_rate": 2.094454404447757e-05, "loss": 2.5887, "mean_token_accuracy": 0.35862068831920624, "step": 20795 }, { "epoch": 0.020949980913358736, "grad_norm": 18.229944357014688, "learning_rate": 2.094958000120863e-05, "loss": 1.9787, "mean_token_accuracy": 0.5, "step": 20800 }, { "epoch": 0.02095501696646291, "grad_norm": 26.823473252745288, "learning_rate": 2.0954615957939693e-05, "loss": 2.2683, "mean_token_accuracy": 0.4570477843284607, "step": 20805 }, { "epoch": 0.02096005301956708, "grad_norm": 24.404108843190482, "learning_rate": 2.095965191467075e-05, "loss": 2.6987, "mean_token_accuracy": 0.38965516686439516, "step": 20810 }, { "epoch": 0.020965089072671254, "grad_norm": 15.250059427661341, "learning_rate": 2.0964687871401808e-05, "loss": 2.3291, "mean_token_accuracy": 0.4744101703166962, "step": 20815 }, { "epoch": 0.020970125125775427, "grad_norm": 20.08263411313411, "learning_rate": 2.096972382813287e-05, "loss": 2.3474, "mean_token_accuracy": 0.4620689690113068, "step": 20820 }, { "epoch": 0.020975161178879598, "grad_norm": 35.39941870480334, "learning_rate": 2.097475978486393e-05, "loss": 2.6609, "mean_token_accuracy": 0.4310344815254211, "step": 20825 }, { "epoch": 0.02098019723198377, "grad_norm": 22.391267340765804, "learning_rate": 2.097979574159499e-05, "loss": 2.4936, "mean_token_accuracy": 0.4034482777118683, "step": 20830 }, { "epoch": 0.020985233285087945, "grad_norm": 20.109065682505307, "learning_rate": 2.0984831698326048e-05, "loss": 2.8539, "mean_token_accuracy": 0.32758620083332063, "step": 20835 }, { "epoch": 0.02099026933819212, "grad_norm": 31.136650218679605, "learning_rate": 2.0989867655057107e-05, "loss": 2.4236, "mean_token_accuracy": 0.3896551698446274, "step": 20840 }, { "epoch": 0.02099530539129629, "grad_norm": 28.374319694473026, "learning_rate": 2.099490361178817e-05, "loss": 2.3838, "mean_token_accuracy": 0.4379310369491577, "step": 20845 }, { "epoch": 0.021000341444400463, "grad_norm": 19.199579244580416, "learning_rate": 2.099993956851923e-05, "loss": 2.3744, "mean_token_accuracy": 0.4241379380226135, "step": 20850 }, { "epoch": 0.021005377497504637, "grad_norm": 20.131444451548532, "learning_rate": 2.100497552525029e-05, "loss": 2.1426, "mean_token_accuracy": 0.4724137902259827, "step": 20855 }, { "epoch": 0.021010413550608807, "grad_norm": 23.244707857595394, "learning_rate": 2.1010011481981348e-05, "loss": 2.558, "mean_token_accuracy": 0.4034482777118683, "step": 20860 }, { "epoch": 0.02101544960371298, "grad_norm": 132.14670814511433, "learning_rate": 2.1015047438712407e-05, "loss": 2.4918, "mean_token_accuracy": 0.41034482717514037, "step": 20865 }, { "epoch": 0.021020485656817155, "grad_norm": 27.211363773870946, "learning_rate": 2.102008339544347e-05, "loss": 2.6126, "mean_token_accuracy": 0.38965516686439516, "step": 20870 }, { "epoch": 0.02102552170992133, "grad_norm": 17.86001900291278, "learning_rate": 2.1025119352174526e-05, "loss": 2.1934, "mean_token_accuracy": 0.4517241358757019, "step": 20875 }, { "epoch": 0.0210305577630255, "grad_norm": 20.966328468401194, "learning_rate": 2.1030155308905585e-05, "loss": 2.4641, "mean_token_accuracy": 0.38965516686439516, "step": 20880 }, { "epoch": 0.021035593816129673, "grad_norm": 22.163251619676686, "learning_rate": 2.1035191265636648e-05, "loss": 2.5332, "mean_token_accuracy": 0.36551723480224607, "step": 20885 }, { "epoch": 0.021040629869233846, "grad_norm": 15.45488073309816, "learning_rate": 2.1040227222367707e-05, "loss": 2.4469, "mean_token_accuracy": 0.42068964838981626, "step": 20890 }, { "epoch": 0.021045665922338017, "grad_norm": 22.2710628132038, "learning_rate": 2.1045263179098766e-05, "loss": 2.4395, "mean_token_accuracy": 0.4448275864124298, "step": 20895 }, { "epoch": 0.02105070197544219, "grad_norm": 30.7929429454058, "learning_rate": 2.1050299135829825e-05, "loss": 2.6728, "mean_token_accuracy": 0.3655172407627106, "step": 20900 }, { "epoch": 0.021055738028546364, "grad_norm": 16.014108532331385, "learning_rate": 2.1055335092560885e-05, "loss": 2.3556, "mean_token_accuracy": 0.41724138259887694, "step": 20905 }, { "epoch": 0.021060774081650538, "grad_norm": 21.14752554231236, "learning_rate": 2.1060371049291947e-05, "loss": 2.4167, "mean_token_accuracy": 0.41034482717514037, "step": 20910 }, { "epoch": 0.021065810134754708, "grad_norm": 19.346628894314094, "learning_rate": 2.1065407006023006e-05, "loss": 2.3921, "mean_token_accuracy": 0.43793103098869324, "step": 20915 }, { "epoch": 0.021070846187858882, "grad_norm": 28.703939352884596, "learning_rate": 2.1070442962754062e-05, "loss": 2.7912, "mean_token_accuracy": 0.4068965554237366, "step": 20920 }, { "epoch": 0.021075882240963056, "grad_norm": 24.01577725670423, "learning_rate": 2.1075478919485125e-05, "loss": 2.6339, "mean_token_accuracy": 0.3655172437429428, "step": 20925 }, { "epoch": 0.021080918294067226, "grad_norm": 36.243478886755774, "learning_rate": 2.1080514876216184e-05, "loss": 2.2843, "mean_token_accuracy": 0.4261947929859161, "step": 20930 }, { "epoch": 0.0210859543471714, "grad_norm": 20.60866818430531, "learning_rate": 2.1085550832947247e-05, "loss": 2.6057, "mean_token_accuracy": 0.38620689511299133, "step": 20935 }, { "epoch": 0.021090990400275574, "grad_norm": 18.582219388872534, "learning_rate": 2.1090586789678303e-05, "loss": 2.4615, "mean_token_accuracy": 0.4517241418361664, "step": 20940 }, { "epoch": 0.021096026453379747, "grad_norm": 27.594770077376182, "learning_rate": 2.1095622746409362e-05, "loss": 2.8242, "mean_token_accuracy": 0.3776164650917053, "step": 20945 }, { "epoch": 0.021101062506483918, "grad_norm": 24.24602305086245, "learning_rate": 2.1100658703140425e-05, "loss": 2.8683, "mean_token_accuracy": 0.3517241358757019, "step": 20950 }, { "epoch": 0.02110609855958809, "grad_norm": 16.263725098729793, "learning_rate": 2.1105694659871484e-05, "loss": 2.5608, "mean_token_accuracy": 0.401875376701355, "step": 20955 }, { "epoch": 0.021111134612692265, "grad_norm": 19.569998824677764, "learning_rate": 2.1110730616602543e-05, "loss": 2.477, "mean_token_accuracy": 0.391349071264267, "step": 20960 }, { "epoch": 0.021116170665796435, "grad_norm": 21.781077255744773, "learning_rate": 2.1115766573333602e-05, "loss": 2.7433, "mean_token_accuracy": 0.4000000059604645, "step": 20965 }, { "epoch": 0.02112120671890061, "grad_norm": 18.40128037980539, "learning_rate": 2.112080253006466e-05, "loss": 2.4836, "mean_token_accuracy": 0.3655172407627106, "step": 20970 }, { "epoch": 0.021126242772004783, "grad_norm": 46.492749617498546, "learning_rate": 2.1125838486795724e-05, "loss": 2.3891, "mean_token_accuracy": 0.4551724135875702, "step": 20975 }, { "epoch": 0.021131278825108957, "grad_norm": 19.349739978910076, "learning_rate": 2.1130874443526783e-05, "loss": 2.8626, "mean_token_accuracy": 0.3517241358757019, "step": 20980 }, { "epoch": 0.021136314878213127, "grad_norm": 20.97780408660785, "learning_rate": 2.1135910400257843e-05, "loss": 2.4613, "mean_token_accuracy": 0.37931033968925476, "step": 20985 }, { "epoch": 0.0211413509313173, "grad_norm": 24.445144217630627, "learning_rate": 2.1140946356988902e-05, "loss": 2.2161, "mean_token_accuracy": 0.4620689690113068, "step": 20990 }, { "epoch": 0.021146386984421475, "grad_norm": 21.6366577428185, "learning_rate": 2.114598231371996e-05, "loss": 2.1493, "mean_token_accuracy": 0.4620689630508423, "step": 20995 }, { "epoch": 0.021151423037525645, "grad_norm": 24.427408832294397, "learning_rate": 2.115101827045102e-05, "loss": 2.3802, "mean_token_accuracy": 0.40344828367233276, "step": 21000 }, { "epoch": 0.02115645909062982, "grad_norm": 23.111088936342323, "learning_rate": 2.1156054227182083e-05, "loss": 2.3372, "mean_token_accuracy": 0.4310344815254211, "step": 21005 }, { "epoch": 0.021161495143733992, "grad_norm": 17.82266980970074, "learning_rate": 2.116109018391314e-05, "loss": 1.8099, "mean_token_accuracy": 0.5241379261016845, "step": 21010 }, { "epoch": 0.021166531196838166, "grad_norm": 23.38331723872994, "learning_rate": 2.11661261406442e-05, "loss": 2.8492, "mean_token_accuracy": 0.3620689630508423, "step": 21015 }, { "epoch": 0.021171567249942336, "grad_norm": 25.22452911055958, "learning_rate": 2.117116209737526e-05, "loss": 2.5648, "mean_token_accuracy": 0.417241370677948, "step": 21020 }, { "epoch": 0.02117660330304651, "grad_norm": 19.44492501026059, "learning_rate": 2.117619805410632e-05, "loss": 2.4373, "mean_token_accuracy": 0.4172413766384125, "step": 21025 }, { "epoch": 0.021181639356150684, "grad_norm": 26.642112492184182, "learning_rate": 2.118123401083738e-05, "loss": 2.9754, "mean_token_accuracy": 0.33103448152542114, "step": 21030 }, { "epoch": 0.021186675409254854, "grad_norm": 18.614317069537332, "learning_rate": 2.118626996756844e-05, "loss": 2.4687, "mean_token_accuracy": 0.41379310488700866, "step": 21035 }, { "epoch": 0.021191711462359028, "grad_norm": 22.057150559882654, "learning_rate": 2.1191305924299498e-05, "loss": 2.4049, "mean_token_accuracy": 0.4344827592372894, "step": 21040 }, { "epoch": 0.021196747515463202, "grad_norm": 18.71614200054645, "learning_rate": 2.119634188103056e-05, "loss": 2.5238, "mean_token_accuracy": 0.4068965494632721, "step": 21045 }, { "epoch": 0.021201783568567376, "grad_norm": 18.585694268094016, "learning_rate": 2.120137783776162e-05, "loss": 2.7448, "mean_token_accuracy": 0.36551724672317504, "step": 21050 }, { "epoch": 0.021206819621671546, "grad_norm": 19.608911101528797, "learning_rate": 2.120641379449268e-05, "loss": 2.5549, "mean_token_accuracy": 0.3965517282485962, "step": 21055 }, { "epoch": 0.02121185567477572, "grad_norm": 21.560036124037065, "learning_rate": 2.1211449751223738e-05, "loss": 2.3331, "mean_token_accuracy": 0.44482757449150084, "step": 21060 }, { "epoch": 0.021216891727879893, "grad_norm": 18.11930895897455, "learning_rate": 2.1216485707954798e-05, "loss": 2.4571, "mean_token_accuracy": 0.46551724672317507, "step": 21065 }, { "epoch": 0.021221927780984064, "grad_norm": 25.66411609483518, "learning_rate": 2.122152166468586e-05, "loss": 2.5234, "mean_token_accuracy": 0.39310345649719236, "step": 21070 }, { "epoch": 0.021226963834088237, "grad_norm": 23.356947858412035, "learning_rate": 2.1226557621416916e-05, "loss": 2.1268, "mean_token_accuracy": 0.4620689630508423, "step": 21075 }, { "epoch": 0.02123199988719241, "grad_norm": 22.557399422631665, "learning_rate": 2.1231593578147975e-05, "loss": 2.8912, "mean_token_accuracy": 0.39655172228813174, "step": 21080 }, { "epoch": 0.02123703594029658, "grad_norm": 24.702196168548216, "learning_rate": 2.1236629534879038e-05, "loss": 2.3692, "mean_token_accuracy": 0.4275861978530884, "step": 21085 }, { "epoch": 0.021242071993400755, "grad_norm": 20.286725374452587, "learning_rate": 2.1241665491610097e-05, "loss": 2.4183, "mean_token_accuracy": 0.3931034505367279, "step": 21090 }, { "epoch": 0.02124710804650493, "grad_norm": 25.638408993938015, "learning_rate": 2.1246701448341156e-05, "loss": 2.1182, "mean_token_accuracy": 0.453901994228363, "step": 21095 }, { "epoch": 0.021252144099609103, "grad_norm": 18.671845367902705, "learning_rate": 2.1251737405072216e-05, "loss": 2.1867, "mean_token_accuracy": 0.46551724076271056, "step": 21100 }, { "epoch": 0.021257180152713273, "grad_norm": 20.53286730376482, "learning_rate": 2.1256773361803275e-05, "loss": 2.5607, "mean_token_accuracy": 0.38965516686439516, "step": 21105 }, { "epoch": 0.021262216205817447, "grad_norm": 21.654851421821583, "learning_rate": 2.1261809318534338e-05, "loss": 2.4706, "mean_token_accuracy": 0.4137930989265442, "step": 21110 }, { "epoch": 0.02126725225892162, "grad_norm": 18.0515039288747, "learning_rate": 2.1266845275265397e-05, "loss": 2.2179, "mean_token_accuracy": 0.43103448748588563, "step": 21115 }, { "epoch": 0.02127228831202579, "grad_norm": 23.624520030738847, "learning_rate": 2.1271881231996456e-05, "loss": 2.2583, "mean_token_accuracy": 0.44482759237289426, "step": 21120 }, { "epoch": 0.021277324365129965, "grad_norm": 21.366795476132026, "learning_rate": 2.1276917188727515e-05, "loss": 2.4638, "mean_token_accuracy": 0.4103448331356049, "step": 21125 }, { "epoch": 0.02128236041823414, "grad_norm": 18.204018378083113, "learning_rate": 2.1281953145458575e-05, "loss": 2.1241, "mean_token_accuracy": 0.4344827592372894, "step": 21130 }, { "epoch": 0.021287396471338312, "grad_norm": 24.57298413065489, "learning_rate": 2.1286989102189637e-05, "loss": 2.8041, "mean_token_accuracy": 0.3517241418361664, "step": 21135 }, { "epoch": 0.021292432524442483, "grad_norm": 20.42255866967111, "learning_rate": 2.1292025058920697e-05, "loss": 2.3904, "mean_token_accuracy": 0.41724138259887694, "step": 21140 }, { "epoch": 0.021297468577546656, "grad_norm": 17.821941091188915, "learning_rate": 2.1297061015651752e-05, "loss": 2.0925, "mean_token_accuracy": 0.4603750824928284, "step": 21145 }, { "epoch": 0.02130250463065083, "grad_norm": 22.035364753885666, "learning_rate": 2.1302096972382815e-05, "loss": 2.3861, "mean_token_accuracy": 0.4103448212146759, "step": 21150 }, { "epoch": 0.021307540683755, "grad_norm": 16.70567385951173, "learning_rate": 2.1307132929113874e-05, "loss": 2.2951, "mean_token_accuracy": 0.4137930989265442, "step": 21155 }, { "epoch": 0.021312576736859174, "grad_norm": 17.220590490166508, "learning_rate": 2.1312168885844934e-05, "loss": 2.3865, "mean_token_accuracy": 0.3758620619773865, "step": 21160 }, { "epoch": 0.021317612789963348, "grad_norm": 28.45063276946072, "learning_rate": 2.1317204842575993e-05, "loss": 2.7205, "mean_token_accuracy": 0.3517241388559341, "step": 21165 }, { "epoch": 0.02132264884306752, "grad_norm": 20.96639430569427, "learning_rate": 2.1322240799307052e-05, "loss": 2.5758, "mean_token_accuracy": 0.4000000059604645, "step": 21170 }, { "epoch": 0.021327684896171692, "grad_norm": 20.574555001628905, "learning_rate": 2.1327276756038115e-05, "loss": 2.6416, "mean_token_accuracy": 0.3999999940395355, "step": 21175 }, { "epoch": 0.021332720949275866, "grad_norm": 19.359705768956218, "learning_rate": 2.1332312712769174e-05, "loss": 2.5769, "mean_token_accuracy": 0.4034482777118683, "step": 21180 }, { "epoch": 0.02133775700238004, "grad_norm": 15.733272651482979, "learning_rate": 2.1337348669500233e-05, "loss": 1.912, "mean_token_accuracy": 0.510344821214676, "step": 21185 }, { "epoch": 0.02134279305548421, "grad_norm": 17.0283901025158, "learning_rate": 2.1342384626231292e-05, "loss": 2.4239, "mean_token_accuracy": 0.458620685338974, "step": 21190 }, { "epoch": 0.021347829108588384, "grad_norm": 23.272429140930885, "learning_rate": 2.134742058296235e-05, "loss": 2.6658, "mean_token_accuracy": 0.38275861740112305, "step": 21195 }, { "epoch": 0.021352865161692557, "grad_norm": 24.685033246849063, "learning_rate": 2.135245653969341e-05, "loss": 2.5016, "mean_token_accuracy": 0.40689654350280763, "step": 21200 }, { "epoch": 0.02135790121479673, "grad_norm": 22.868386925848363, "learning_rate": 2.1357492496424474e-05, "loss": 2.3335, "mean_token_accuracy": 0.4551724135875702, "step": 21205 }, { "epoch": 0.0213629372679009, "grad_norm": 20.629060435773535, "learning_rate": 2.136252845315553e-05, "loss": 2.4539, "mean_token_accuracy": 0.41034482717514037, "step": 21210 }, { "epoch": 0.021367973321005075, "grad_norm": 27.954016138822368, "learning_rate": 2.1367564409886592e-05, "loss": 2.7829, "mean_token_accuracy": 0.3517241418361664, "step": 21215 }, { "epoch": 0.02137300937410925, "grad_norm": 17.66840555137636, "learning_rate": 2.137260036661765e-05, "loss": 2.3751, "mean_token_accuracy": 0.4586206912994385, "step": 21220 }, { "epoch": 0.02137804542721342, "grad_norm": 22.69578949512599, "learning_rate": 2.137763632334871e-05, "loss": 2.5368, "mean_token_accuracy": 0.38620689511299133, "step": 21225 }, { "epoch": 0.021383081480317593, "grad_norm": 25.03333138350183, "learning_rate": 2.138267228007977e-05, "loss": 2.6763, "mean_token_accuracy": 0.4034482717514038, "step": 21230 }, { "epoch": 0.021388117533421767, "grad_norm": 22.30316455489453, "learning_rate": 2.138770823681083e-05, "loss": 2.6622, "mean_token_accuracy": 0.39655172228813174, "step": 21235 }, { "epoch": 0.02139315358652594, "grad_norm": 25.50510708600825, "learning_rate": 2.139274419354189e-05, "loss": 2.4519, "mean_token_accuracy": 0.4413793087005615, "step": 21240 }, { "epoch": 0.02139818963963011, "grad_norm": 18.309690727445744, "learning_rate": 2.139778015027295e-05, "loss": 2.4431, "mean_token_accuracy": 0.42068964838981626, "step": 21245 }, { "epoch": 0.021403225692734285, "grad_norm": 18.969194831388506, "learning_rate": 2.140281610700401e-05, "loss": 2.2073, "mean_token_accuracy": 0.45517241656780244, "step": 21250 }, { "epoch": 0.02140826174583846, "grad_norm": 22.96546996371774, "learning_rate": 2.140785206373507e-05, "loss": 2.7717, "mean_token_accuracy": 0.37241379022598264, "step": 21255 }, { "epoch": 0.02141329779894263, "grad_norm": 20.26171254629993, "learning_rate": 2.141288802046613e-05, "loss": 2.606, "mean_token_accuracy": 0.3896551728248596, "step": 21260 }, { "epoch": 0.021418333852046802, "grad_norm": 21.646809421900933, "learning_rate": 2.1417923977197188e-05, "loss": 2.5949, "mean_token_accuracy": 0.36896551251411436, "step": 21265 }, { "epoch": 0.021423369905150976, "grad_norm": 16.72230650044111, "learning_rate": 2.142295993392825e-05, "loss": 2.2471, "mean_token_accuracy": 0.4551724076271057, "step": 21270 }, { "epoch": 0.02142840595825515, "grad_norm": 18.533552086992632, "learning_rate": 2.1427995890659307e-05, "loss": 2.2507, "mean_token_accuracy": 0.4310344815254211, "step": 21275 }, { "epoch": 0.02143344201135932, "grad_norm": 25.12206590002422, "learning_rate": 2.143303184739037e-05, "loss": 2.1477, "mean_token_accuracy": 0.42952207922935487, "step": 21280 }, { "epoch": 0.021438478064463494, "grad_norm": 18.46060055551337, "learning_rate": 2.143806780412143e-05, "loss": 2.2397, "mean_token_accuracy": 0.4551724135875702, "step": 21285 }, { "epoch": 0.021443514117567668, "grad_norm": 21.09666101634836, "learning_rate": 2.1443103760852488e-05, "loss": 2.7549, "mean_token_accuracy": 0.4018148899078369, "step": 21290 }, { "epoch": 0.021448550170671838, "grad_norm": 24.00296093144513, "learning_rate": 2.1448139717583547e-05, "loss": 2.66, "mean_token_accuracy": 0.4172413766384125, "step": 21295 }, { "epoch": 0.021453586223776012, "grad_norm": 30.69521282994135, "learning_rate": 2.1453175674314606e-05, "loss": 2.8866, "mean_token_accuracy": 0.42413793206214906, "step": 21300 }, { "epoch": 0.021458622276880186, "grad_norm": 20.638833099008483, "learning_rate": 2.1458211631045665e-05, "loss": 2.4647, "mean_token_accuracy": 0.3931034505367279, "step": 21305 }, { "epoch": 0.02146365832998436, "grad_norm": 19.200889976637768, "learning_rate": 2.1463247587776728e-05, "loss": 2.473, "mean_token_accuracy": 0.4482758641242981, "step": 21310 }, { "epoch": 0.02146869438308853, "grad_norm": 23.3168330177276, "learning_rate": 2.1468283544507787e-05, "loss": 2.5571, "mean_token_accuracy": 0.4517241418361664, "step": 21315 }, { "epoch": 0.021473730436192703, "grad_norm": 23.29290192156174, "learning_rate": 2.1473319501238847e-05, "loss": 2.4604, "mean_token_accuracy": 0.37241379618644715, "step": 21320 }, { "epoch": 0.021478766489296877, "grad_norm": 23.982081592848296, "learning_rate": 2.1478355457969906e-05, "loss": 2.6965, "mean_token_accuracy": 0.3620689630508423, "step": 21325 }, { "epoch": 0.021483802542401047, "grad_norm": 19.886959459342087, "learning_rate": 2.1483391414700965e-05, "loss": 2.4501, "mean_token_accuracy": 0.3827586233615875, "step": 21330 }, { "epoch": 0.02148883859550522, "grad_norm": 17.80350165783561, "learning_rate": 2.1488427371432028e-05, "loss": 2.2599, "mean_token_accuracy": 0.4517241358757019, "step": 21335 }, { "epoch": 0.021493874648609395, "grad_norm": 58.46189296422362, "learning_rate": 2.1493463328163087e-05, "loss": 2.768, "mean_token_accuracy": 0.38275861740112305, "step": 21340 }, { "epoch": 0.02149891070171357, "grad_norm": 19.649963383936225, "learning_rate": 2.1498499284894143e-05, "loss": 2.6442, "mean_token_accuracy": 0.3931034505367279, "step": 21345 }, { "epoch": 0.02150394675481774, "grad_norm": 18.758889172396653, "learning_rate": 2.1503535241625205e-05, "loss": 2.4415, "mean_token_accuracy": 0.4448275864124298, "step": 21350 }, { "epoch": 0.021508982807921913, "grad_norm": 19.3393762573304, "learning_rate": 2.1508571198356265e-05, "loss": 2.3811, "mean_token_accuracy": 0.4379310369491577, "step": 21355 }, { "epoch": 0.021514018861026087, "grad_norm": 18.917696870709353, "learning_rate": 2.1513607155087327e-05, "loss": 2.187, "mean_token_accuracy": 0.4586206912994385, "step": 21360 }, { "epoch": 0.021519054914130257, "grad_norm": 26.589235099172644, "learning_rate": 2.1518643111818383e-05, "loss": 2.5438, "mean_token_accuracy": 0.4068965554237366, "step": 21365 }, { "epoch": 0.02152409096723443, "grad_norm": 19.88762641238384, "learning_rate": 2.1523679068549442e-05, "loss": 2.5633, "mean_token_accuracy": 0.3758620649576187, "step": 21370 }, { "epoch": 0.021529127020338604, "grad_norm": 16.272772106163, "learning_rate": 2.1528715025280505e-05, "loss": 2.4822, "mean_token_accuracy": 0.43793103098869324, "step": 21375 }, { "epoch": 0.021534163073442778, "grad_norm": 18.894817428127777, "learning_rate": 2.1533750982011564e-05, "loss": 2.739, "mean_token_accuracy": 0.36206896901130675, "step": 21380 }, { "epoch": 0.02153919912654695, "grad_norm": 19.088232725365174, "learning_rate": 2.1538786938742624e-05, "loss": 2.1964, "mean_token_accuracy": 0.4586206912994385, "step": 21385 }, { "epoch": 0.021544235179651122, "grad_norm": 14.86055196125715, "learning_rate": 2.1543822895473683e-05, "loss": 2.1084, "mean_token_accuracy": 0.5125831842422486, "step": 21390 }, { "epoch": 0.021549271232755296, "grad_norm": 21.15171316757635, "learning_rate": 2.1548858852204742e-05, "loss": 2.5044, "mean_token_accuracy": 0.4034482777118683, "step": 21395 }, { "epoch": 0.021554307285859466, "grad_norm": 21.614144527541498, "learning_rate": 2.1553894808935805e-05, "loss": 2.4896, "mean_token_accuracy": 0.41929824352264405, "step": 21400 }, { "epoch": 0.02155934333896364, "grad_norm": 16.39683654684843, "learning_rate": 2.1558930765666864e-05, "loss": 2.4782, "mean_token_accuracy": 0.4137930989265442, "step": 21405 }, { "epoch": 0.021564379392067814, "grad_norm": 20.836267171175415, "learning_rate": 2.156396672239792e-05, "loss": 2.6525, "mean_token_accuracy": 0.4103448152542114, "step": 21410 }, { "epoch": 0.021569415445171988, "grad_norm": 19.327925233600205, "learning_rate": 2.1569002679128983e-05, "loss": 2.4536, "mean_token_accuracy": 0.4413793087005615, "step": 21415 }, { "epoch": 0.021574451498276158, "grad_norm": 31.673396901473147, "learning_rate": 2.1574038635860042e-05, "loss": 2.0451, "mean_token_accuracy": 0.4793103516101837, "step": 21420 }, { "epoch": 0.02157948755138033, "grad_norm": 20.757972626620997, "learning_rate": 2.15790745925911e-05, "loss": 2.8699, "mean_token_accuracy": 0.3482758700847626, "step": 21425 }, { "epoch": 0.021584523604484505, "grad_norm": 23.04080107849021, "learning_rate": 2.158411054932216e-05, "loss": 2.3406, "mean_token_accuracy": 0.3896551728248596, "step": 21430 }, { "epoch": 0.021589559657588676, "grad_norm": 20.102389032289793, "learning_rate": 2.158914650605322e-05, "loss": 2.3703, "mean_token_accuracy": 0.40852994918823243, "step": 21435 }, { "epoch": 0.02159459571069285, "grad_norm": 21.870507736067438, "learning_rate": 2.1594182462784282e-05, "loss": 2.6617, "mean_token_accuracy": 0.4068965494632721, "step": 21440 }, { "epoch": 0.021599631763797023, "grad_norm": 17.929142313440906, "learning_rate": 2.159921841951534e-05, "loss": 2.4824, "mean_token_accuracy": 0.41379310488700866, "step": 21445 }, { "epoch": 0.021604667816901197, "grad_norm": 23.078209549488363, "learning_rate": 2.16042543762464e-05, "loss": 2.4586, "mean_token_accuracy": 0.3965517282485962, "step": 21450 }, { "epoch": 0.021609703870005367, "grad_norm": 23.90567190795958, "learning_rate": 2.160929033297746e-05, "loss": 2.5161, "mean_token_accuracy": 0.3909255862236023, "step": 21455 }, { "epoch": 0.02161473992310954, "grad_norm": 17.45807935710331, "learning_rate": 2.161432628970852e-05, "loss": 2.4131, "mean_token_accuracy": 0.4034482717514038, "step": 21460 }, { "epoch": 0.021619775976213715, "grad_norm": 26.050217461696022, "learning_rate": 2.161936224643958e-05, "loss": 2.7065, "mean_token_accuracy": 0.39310343861579894, "step": 21465 }, { "epoch": 0.021624812029317885, "grad_norm": 23.61665140389976, "learning_rate": 2.162439820317064e-05, "loss": 2.7876, "mean_token_accuracy": 0.3448275804519653, "step": 21470 }, { "epoch": 0.02162984808242206, "grad_norm": 16.055468444101866, "learning_rate": 2.1629434159901697e-05, "loss": 2.4813, "mean_token_accuracy": 0.37779794335365297, "step": 21475 }, { "epoch": 0.021634884135526233, "grad_norm": 25.499886817554344, "learning_rate": 2.163447011663276e-05, "loss": 2.268, "mean_token_accuracy": 0.40344828367233276, "step": 21480 }, { "epoch": 0.021639920188630406, "grad_norm": 28.854832200085404, "learning_rate": 2.163950607336382e-05, "loss": 2.1744, "mean_token_accuracy": 0.4689655065536499, "step": 21485 }, { "epoch": 0.021644956241734577, "grad_norm": 20.437794488963352, "learning_rate": 2.1644542030094878e-05, "loss": 2.6063, "mean_token_accuracy": 0.3999999940395355, "step": 21490 }, { "epoch": 0.02164999229483875, "grad_norm": 24.872013565093347, "learning_rate": 2.164957798682594e-05, "loss": 2.3231, "mean_token_accuracy": 0.4551724076271057, "step": 21495 }, { "epoch": 0.021655028347942924, "grad_norm": 21.057369221037185, "learning_rate": 2.1654613943556997e-05, "loss": 2.6737, "mean_token_accuracy": 0.3620689630508423, "step": 21500 }, { "epoch": 0.021660064401047095, "grad_norm": 21.0599149429598, "learning_rate": 2.1659649900288056e-05, "loss": 2.2349, "mean_token_accuracy": 0.4103448212146759, "step": 21505 }, { "epoch": 0.02166510045415127, "grad_norm": 25.471535098354508, "learning_rate": 2.166468585701912e-05, "loss": 2.5737, "mean_token_accuracy": 0.4034482717514038, "step": 21510 }, { "epoch": 0.021670136507255442, "grad_norm": 23.50442998090923, "learning_rate": 2.1669721813750178e-05, "loss": 2.9357, "mean_token_accuracy": 0.38620689511299133, "step": 21515 }, { "epoch": 0.021675172560359616, "grad_norm": 21.35387634333185, "learning_rate": 2.1674757770481237e-05, "loss": 2.8811, "mean_token_accuracy": 0.37241379618644715, "step": 21520 }, { "epoch": 0.021680208613463786, "grad_norm": 21.789492178161805, "learning_rate": 2.1679793727212296e-05, "loss": 2.4923, "mean_token_accuracy": 0.42068964838981626, "step": 21525 }, { "epoch": 0.02168524466656796, "grad_norm": 14.99269352384579, "learning_rate": 2.1684829683943356e-05, "loss": 2.2103, "mean_token_accuracy": 0.45704780220985414, "step": 21530 }, { "epoch": 0.021690280719672134, "grad_norm": 24.334991206304526, "learning_rate": 2.1689865640674418e-05, "loss": 2.7528, "mean_token_accuracy": 0.403448274731636, "step": 21535 }, { "epoch": 0.021695316772776304, "grad_norm": 25.147343277361273, "learning_rate": 2.1694901597405477e-05, "loss": 2.0583, "mean_token_accuracy": 0.46206897497177124, "step": 21540 }, { "epoch": 0.021700352825880478, "grad_norm": 17.124692390940705, "learning_rate": 2.1699937554136533e-05, "loss": 2.5092, "mean_token_accuracy": 0.4034482777118683, "step": 21545 }, { "epoch": 0.02170538887898465, "grad_norm": 21.562574627880416, "learning_rate": 2.1704973510867596e-05, "loss": 2.2704, "mean_token_accuracy": 0.45517241954803467, "step": 21550 }, { "epoch": 0.021710424932088825, "grad_norm": 18.064784382956333, "learning_rate": 2.1710009467598655e-05, "loss": 2.4777, "mean_token_accuracy": 0.39655172228813174, "step": 21555 }, { "epoch": 0.021715460985192996, "grad_norm": 20.57611940441752, "learning_rate": 2.1715045424329718e-05, "loss": 2.246, "mean_token_accuracy": 0.4379310369491577, "step": 21560 }, { "epoch": 0.02172049703829717, "grad_norm": 21.736817655462637, "learning_rate": 2.1720081381060774e-05, "loss": 2.5078, "mean_token_accuracy": 0.4255898356437683, "step": 21565 }, { "epoch": 0.021725533091401343, "grad_norm": 20.270649883822532, "learning_rate": 2.1725117337791833e-05, "loss": 2.3941, "mean_token_accuracy": 0.43986691236495973, "step": 21570 }, { "epoch": 0.021730569144505513, "grad_norm": 22.29449077866916, "learning_rate": 2.1730153294522896e-05, "loss": 2.4522, "mean_token_accuracy": 0.4172413766384125, "step": 21575 }, { "epoch": 0.021735605197609687, "grad_norm": 18.59065764390342, "learning_rate": 2.1735189251253955e-05, "loss": 2.642, "mean_token_accuracy": 0.37241379618644715, "step": 21580 }, { "epoch": 0.02174064125071386, "grad_norm": 18.105978132724644, "learning_rate": 2.1740225207985014e-05, "loss": 2.4865, "mean_token_accuracy": 0.39655172228813174, "step": 21585 }, { "epoch": 0.021745677303818035, "grad_norm": 19.23912587497405, "learning_rate": 2.1745261164716073e-05, "loss": 2.542, "mean_token_accuracy": 0.40798547863960266, "step": 21590 }, { "epoch": 0.021750713356922205, "grad_norm": 16.97105849821303, "learning_rate": 2.1750297121447133e-05, "loss": 2.3399, "mean_token_accuracy": 0.4379310369491577, "step": 21595 }, { "epoch": 0.02175574941002638, "grad_norm": 20.906553191127436, "learning_rate": 2.1755333078178195e-05, "loss": 1.9815, "mean_token_accuracy": 0.48275861144065857, "step": 21600 }, { "epoch": 0.021760785463130553, "grad_norm": 21.106023720707956, "learning_rate": 2.1760369034909254e-05, "loss": 2.4754, "mean_token_accuracy": 0.42758620977401735, "step": 21605 }, { "epoch": 0.021765821516234723, "grad_norm": 20.51090469243702, "learning_rate": 2.176540499164031e-05, "loss": 2.5098, "mean_token_accuracy": 0.4137930989265442, "step": 21610 }, { "epoch": 0.021770857569338897, "grad_norm": 22.621381846141933, "learning_rate": 2.1770440948371373e-05, "loss": 2.3678, "mean_token_accuracy": 0.43103447556495667, "step": 21615 }, { "epoch": 0.02177589362244307, "grad_norm": 21.716603358908372, "learning_rate": 2.1775476905102432e-05, "loss": 2.5886, "mean_token_accuracy": 0.4068965494632721, "step": 21620 }, { "epoch": 0.021780929675547244, "grad_norm": 18.38476529279915, "learning_rate": 2.178051286183349e-05, "loss": 2.4682, "mean_token_accuracy": 0.42262552976608275, "step": 21625 }, { "epoch": 0.021785965728651414, "grad_norm": 18.11916614840942, "learning_rate": 2.178554881856455e-05, "loss": 2.3106, "mean_token_accuracy": 0.4379310369491577, "step": 21630 }, { "epoch": 0.021791001781755588, "grad_norm": 23.036596259685872, "learning_rate": 2.179058477529561e-05, "loss": 2.7539, "mean_token_accuracy": 0.4103448212146759, "step": 21635 }, { "epoch": 0.021796037834859762, "grad_norm": 19.71209653554668, "learning_rate": 2.1795620732026673e-05, "loss": 2.3661, "mean_token_accuracy": 0.4206896543502808, "step": 21640 }, { "epoch": 0.021801073887963932, "grad_norm": 17.633921261304454, "learning_rate": 2.1800656688757732e-05, "loss": 2.5948, "mean_token_accuracy": 0.40344826877117157, "step": 21645 }, { "epoch": 0.021806109941068106, "grad_norm": 17.009069070450185, "learning_rate": 2.180569264548879e-05, "loss": 2.2365, "mean_token_accuracy": 0.48275862336158754, "step": 21650 }, { "epoch": 0.02181114599417228, "grad_norm": 35.62188496942071, "learning_rate": 2.181072860221985e-05, "loss": 3.0139, "mean_token_accuracy": 0.3655172437429428, "step": 21655 }, { "epoch": 0.021816182047276454, "grad_norm": 18.39800007551049, "learning_rate": 2.181576455895091e-05, "loss": 2.2108, "mean_token_accuracy": 0.41379310488700866, "step": 21660 }, { "epoch": 0.021821218100380624, "grad_norm": 21.737673921005147, "learning_rate": 2.182080051568197e-05, "loss": 2.5352, "mean_token_accuracy": 0.4310344815254211, "step": 21665 }, { "epoch": 0.021826254153484798, "grad_norm": 18.96735408373914, "learning_rate": 2.182583647241303e-05, "loss": 2.3243, "mean_token_accuracy": 0.4310344815254211, "step": 21670 }, { "epoch": 0.02183129020658897, "grad_norm": 15.139247169467529, "learning_rate": 2.183087242914409e-05, "loss": 2.389, "mean_token_accuracy": 0.4034482777118683, "step": 21675 }, { "epoch": 0.02183632625969314, "grad_norm": 19.119105362923744, "learning_rate": 2.183590838587515e-05, "loss": 2.4462, "mean_token_accuracy": 0.40689654350280763, "step": 21680 }, { "epoch": 0.021841362312797315, "grad_norm": 18.930010892932486, "learning_rate": 2.184094434260621e-05, "loss": 2.3072, "mean_token_accuracy": 0.44355716109275817, "step": 21685 }, { "epoch": 0.02184639836590149, "grad_norm": 24.216667079310714, "learning_rate": 2.184598029933727e-05, "loss": 2.6638, "mean_token_accuracy": 0.39782214164733887, "step": 21690 }, { "epoch": 0.021851434419005663, "grad_norm": 21.807882113820927, "learning_rate": 2.185101625606833e-05, "loss": 2.2816, "mean_token_accuracy": 0.4793103516101837, "step": 21695 }, { "epoch": 0.021856470472109833, "grad_norm": 16.88438049742662, "learning_rate": 2.1856052212799387e-05, "loss": 2.4499, "mean_token_accuracy": 0.4241379201412201, "step": 21700 }, { "epoch": 0.021861506525214007, "grad_norm": 45.5940845574123, "learning_rate": 2.186108816953045e-05, "loss": 2.2933, "mean_token_accuracy": 0.41379310488700866, "step": 21705 }, { "epoch": 0.02186654257831818, "grad_norm": 19.19299342144245, "learning_rate": 2.186612412626151e-05, "loss": 2.8039, "mean_token_accuracy": 0.4068965554237366, "step": 21710 }, { "epoch": 0.02187157863142235, "grad_norm": 20.60919521921651, "learning_rate": 2.1871160082992568e-05, "loss": 2.6019, "mean_token_accuracy": 0.41034482717514037, "step": 21715 }, { "epoch": 0.021876614684526525, "grad_norm": 23.285871346211103, "learning_rate": 2.1876196039723627e-05, "loss": 2.6472, "mean_token_accuracy": 0.403448274731636, "step": 21720 }, { "epoch": 0.0218816507376307, "grad_norm": 21.954056115545995, "learning_rate": 2.1881231996454687e-05, "loss": 2.6767, "mean_token_accuracy": 0.3551724076271057, "step": 21725 }, { "epoch": 0.021886686790734872, "grad_norm": 19.39028219645686, "learning_rate": 2.1886267953185746e-05, "loss": 2.3678, "mean_token_accuracy": 0.42758620977401735, "step": 21730 }, { "epoch": 0.021891722843839043, "grad_norm": 16.332245733504926, "learning_rate": 2.189130390991681e-05, "loss": 2.8005, "mean_token_accuracy": 0.358620685338974, "step": 21735 }, { "epoch": 0.021896758896943216, "grad_norm": 19.014127284738546, "learning_rate": 2.1896339866647868e-05, "loss": 2.3323, "mean_token_accuracy": 0.4172413766384125, "step": 21740 }, { "epoch": 0.02190179495004739, "grad_norm": 15.392209986702513, "learning_rate": 2.1901375823378927e-05, "loss": 2.4275, "mean_token_accuracy": 0.441379314661026, "step": 21745 }, { "epoch": 0.02190683100315156, "grad_norm": 26.426989347706606, "learning_rate": 2.1906411780109986e-05, "loss": 2.2238, "mean_token_accuracy": 0.47586206793785096, "step": 21750 }, { "epoch": 0.021911867056255734, "grad_norm": 21.487707141990146, "learning_rate": 2.1911447736841046e-05, "loss": 2.4446, "mean_token_accuracy": 0.4517241358757019, "step": 21755 }, { "epoch": 0.021916903109359908, "grad_norm": 23.425239068699245, "learning_rate": 2.1916483693572108e-05, "loss": 2.5788, "mean_token_accuracy": 0.4413793087005615, "step": 21760 }, { "epoch": 0.021921939162464082, "grad_norm": 21.06322853220178, "learning_rate": 2.1921519650303164e-05, "loss": 2.4168, "mean_token_accuracy": 0.4379310429096222, "step": 21765 }, { "epoch": 0.021926975215568252, "grad_norm": 20.29777047241038, "learning_rate": 2.1926555607034223e-05, "loss": 2.1808, "mean_token_accuracy": 0.4413793087005615, "step": 21770 }, { "epoch": 0.021932011268672426, "grad_norm": 21.90389458575001, "learning_rate": 2.1931591563765286e-05, "loss": 2.6499, "mean_token_accuracy": 0.3827586114406586, "step": 21775 }, { "epoch": 0.0219370473217766, "grad_norm": 17.848850455719244, "learning_rate": 2.1936627520496345e-05, "loss": 2.2759, "mean_token_accuracy": 0.441379314661026, "step": 21780 }, { "epoch": 0.02194208337488077, "grad_norm": 21.07058548206167, "learning_rate": 2.1941663477227405e-05, "loss": 2.3136, "mean_token_accuracy": 0.4620689690113068, "step": 21785 }, { "epoch": 0.021947119427984944, "grad_norm": 14.98205951650851, "learning_rate": 2.1946699433958464e-05, "loss": 2.3405, "mean_token_accuracy": 0.4655172348022461, "step": 21790 }, { "epoch": 0.021952155481089117, "grad_norm": 19.368701853449807, "learning_rate": 2.1951735390689523e-05, "loss": 2.3931, "mean_token_accuracy": 0.4124016880989075, "step": 21795 }, { "epoch": 0.02195719153419329, "grad_norm": 20.403598050296736, "learning_rate": 2.1956771347420586e-05, "loss": 2.5877, "mean_token_accuracy": 0.38965516686439516, "step": 21800 }, { "epoch": 0.02196222758729746, "grad_norm": 15.39313184023891, "learning_rate": 2.1961807304151645e-05, "loss": 2.0945, "mean_token_accuracy": 0.458620685338974, "step": 21805 }, { "epoch": 0.021967263640401635, "grad_norm": 17.43069099682159, "learning_rate": 2.19668432608827e-05, "loss": 2.5244, "mean_token_accuracy": 0.4689655125141144, "step": 21810 }, { "epoch": 0.02197229969350581, "grad_norm": 19.96825753111768, "learning_rate": 2.1971879217613763e-05, "loss": 2.6333, "mean_token_accuracy": 0.41034482717514037, "step": 21815 }, { "epoch": 0.02197733574660998, "grad_norm": 21.828421216227504, "learning_rate": 2.1976915174344823e-05, "loss": 2.456, "mean_token_accuracy": 0.4172413766384125, "step": 21820 }, { "epoch": 0.021982371799714153, "grad_norm": 22.166922489684964, "learning_rate": 2.1981951131075885e-05, "loss": 2.5327, "mean_token_accuracy": 0.41724138259887694, "step": 21825 }, { "epoch": 0.021987407852818327, "grad_norm": 19.418319006373743, "learning_rate": 2.198698708780694e-05, "loss": 2.7421, "mean_token_accuracy": 0.3896551728248596, "step": 21830 }, { "epoch": 0.0219924439059225, "grad_norm": 20.83800054741006, "learning_rate": 2.1992023044538e-05, "loss": 2.4592, "mean_token_accuracy": 0.4068965554237366, "step": 21835 }, { "epoch": 0.02199747995902667, "grad_norm": 17.23561815004508, "learning_rate": 2.1997059001269063e-05, "loss": 2.1408, "mean_token_accuracy": 0.495160311460495, "step": 21840 }, { "epoch": 0.022002516012130845, "grad_norm": 32.72038738751284, "learning_rate": 2.2002094958000122e-05, "loss": 2.3529, "mean_token_accuracy": 0.4620689630508423, "step": 21845 }, { "epoch": 0.02200755206523502, "grad_norm": 22.98757668643551, "learning_rate": 2.200713091473118e-05, "loss": 2.5392, "mean_token_accuracy": 0.45862067937850953, "step": 21850 }, { "epoch": 0.02201258811833919, "grad_norm": 18.93283329650684, "learning_rate": 2.201216687146224e-05, "loss": 2.1277, "mean_token_accuracy": 0.512522679567337, "step": 21855 }, { "epoch": 0.022017624171443363, "grad_norm": 20.262030735855543, "learning_rate": 2.20172028281933e-05, "loss": 3.0397, "mean_token_accuracy": 0.3241379290819168, "step": 21860 }, { "epoch": 0.022022660224547536, "grad_norm": 22.240427103536543, "learning_rate": 2.2022238784924363e-05, "loss": 2.7564, "mean_token_accuracy": 0.37931033968925476, "step": 21865 }, { "epoch": 0.02202769627765171, "grad_norm": 31.935353878221505, "learning_rate": 2.2027274741655422e-05, "loss": 2.6589, "mean_token_accuracy": 0.37931033968925476, "step": 21870 }, { "epoch": 0.02203273233075588, "grad_norm": 18.99255074863125, "learning_rate": 2.203231069838648e-05, "loss": 2.8635, "mean_token_accuracy": 0.3793103516101837, "step": 21875 }, { "epoch": 0.022037768383860054, "grad_norm": 18.670657842813824, "learning_rate": 2.203734665511754e-05, "loss": 2.1107, "mean_token_accuracy": 0.47931034564971925, "step": 21880 }, { "epoch": 0.022042804436964228, "grad_norm": 20.68548688465189, "learning_rate": 2.20423826118486e-05, "loss": 2.2387, "mean_token_accuracy": 0.46551724076271056, "step": 21885 }, { "epoch": 0.022047840490068398, "grad_norm": 21.19548673988787, "learning_rate": 2.204741856857966e-05, "loss": 2.4872, "mean_token_accuracy": 0.3827586233615875, "step": 21890 }, { "epoch": 0.022052876543172572, "grad_norm": 23.818325367342048, "learning_rate": 2.205245452531072e-05, "loss": 2.6599, "mean_token_accuracy": 0.41857229471206664, "step": 21895 }, { "epoch": 0.022057912596276746, "grad_norm": 21.64332862225286, "learning_rate": 2.2057490482041777e-05, "loss": 2.3055, "mean_token_accuracy": 0.44700543880462645, "step": 21900 }, { "epoch": 0.02206294864938092, "grad_norm": 20.0682466915093, "learning_rate": 2.206252643877284e-05, "loss": 2.7357, "mean_token_accuracy": 0.35862069129943847, "step": 21905 }, { "epoch": 0.02206798470248509, "grad_norm": 17.813662132891224, "learning_rate": 2.20675623955039e-05, "loss": 2.3007, "mean_token_accuracy": 0.44482758045196535, "step": 21910 }, { "epoch": 0.022073020755589264, "grad_norm": 24.433699893250825, "learning_rate": 2.207259835223496e-05, "loss": 2.2106, "mean_token_accuracy": 0.47241378426551817, "step": 21915 }, { "epoch": 0.022078056808693437, "grad_norm": 18.850225640797632, "learning_rate": 2.2077634308966018e-05, "loss": 2.5472, "mean_token_accuracy": 0.4413793087005615, "step": 21920 }, { "epoch": 0.022083092861797608, "grad_norm": 21.393883458892418, "learning_rate": 2.2082670265697077e-05, "loss": 2.3689, "mean_token_accuracy": 0.4344827592372894, "step": 21925 }, { "epoch": 0.02208812891490178, "grad_norm": 22.28777989040416, "learning_rate": 2.2087706222428136e-05, "loss": 2.5321, "mean_token_accuracy": 0.41724138259887694, "step": 21930 }, { "epoch": 0.022093164968005955, "grad_norm": 19.131649625652614, "learning_rate": 2.20927421791592e-05, "loss": 2.5263, "mean_token_accuracy": 0.3931034475564957, "step": 21935 }, { "epoch": 0.02209820102111013, "grad_norm": 19.46406322167404, "learning_rate": 2.2097778135890258e-05, "loss": 2.688, "mean_token_accuracy": 0.37241379022598264, "step": 21940 }, { "epoch": 0.0221032370742143, "grad_norm": 18.292065564127814, "learning_rate": 2.2102814092621318e-05, "loss": 2.5956, "mean_token_accuracy": 0.3965517282485962, "step": 21945 }, { "epoch": 0.022108273127318473, "grad_norm": 17.825280635786516, "learning_rate": 2.2107850049352377e-05, "loss": 2.7299, "mean_token_accuracy": 0.38965516686439516, "step": 21950 }, { "epoch": 0.022113309180422647, "grad_norm": 19.878855842706756, "learning_rate": 2.2112886006083436e-05, "loss": 2.6181, "mean_token_accuracy": 0.3965517163276672, "step": 21955 }, { "epoch": 0.022118345233526817, "grad_norm": 22.842665736173938, "learning_rate": 2.21179219628145e-05, "loss": 2.9404, "mean_token_accuracy": 0.37931033968925476, "step": 21960 }, { "epoch": 0.02212338128663099, "grad_norm": 19.60550180726225, "learning_rate": 2.2122957919545555e-05, "loss": 2.2325, "mean_token_accuracy": 0.46896552443504336, "step": 21965 }, { "epoch": 0.022128417339735165, "grad_norm": 16.54542227085258, "learning_rate": 2.2127993876276614e-05, "loss": 2.3343, "mean_token_accuracy": 0.4448275864124298, "step": 21970 }, { "epoch": 0.02213345339283934, "grad_norm": 20.55499149420374, "learning_rate": 2.2133029833007676e-05, "loss": 2.4232, "mean_token_accuracy": 0.44482759237289426, "step": 21975 }, { "epoch": 0.02213848944594351, "grad_norm": 17.144770357985625, "learning_rate": 2.2138065789738736e-05, "loss": 2.1678, "mean_token_accuracy": 0.47586206793785096, "step": 21980 }, { "epoch": 0.022143525499047682, "grad_norm": 24.792037772286424, "learning_rate": 2.2143101746469795e-05, "loss": 2.5414, "mean_token_accuracy": 0.3517241358757019, "step": 21985 }, { "epoch": 0.022148561552151856, "grad_norm": 18.64800959994105, "learning_rate": 2.2148137703200854e-05, "loss": 2.4998, "mean_token_accuracy": 0.3517241358757019, "step": 21990 }, { "epoch": 0.022153597605256026, "grad_norm": 30.721025502671388, "learning_rate": 2.2153173659931913e-05, "loss": 2.1328, "mean_token_accuracy": 0.4620689630508423, "step": 21995 }, { "epoch": 0.0221586336583602, "grad_norm": 18.529978820796558, "learning_rate": 2.2158209616662976e-05, "loss": 2.5182, "mean_token_accuracy": 0.40889291763305663, "step": 22000 }, { "epoch": 0.022163669711464374, "grad_norm": 26.469480126011934, "learning_rate": 2.2163245573394035e-05, "loss": 2.2622, "mean_token_accuracy": 0.41379311084747317, "step": 22005 }, { "epoch": 0.022168705764568548, "grad_norm": 18.16070527807918, "learning_rate": 2.216828153012509e-05, "loss": 2.3764, "mean_token_accuracy": 0.39999999701976774, "step": 22010 }, { "epoch": 0.022173741817672718, "grad_norm": 17.24744470420216, "learning_rate": 2.2173317486856154e-05, "loss": 2.3641, "mean_token_accuracy": 0.4344827651977539, "step": 22015 }, { "epoch": 0.022178777870776892, "grad_norm": 19.488925858277245, "learning_rate": 2.2178353443587213e-05, "loss": 2.1236, "mean_token_accuracy": 0.46958128213882444, "step": 22020 }, { "epoch": 0.022183813923881066, "grad_norm": 27.528580089454017, "learning_rate": 2.2183389400318276e-05, "loss": 2.913, "mean_token_accuracy": 0.34137930870056155, "step": 22025 }, { "epoch": 0.022188849976985236, "grad_norm": 33.62422459603598, "learning_rate": 2.2188425357049335e-05, "loss": 2.5234, "mean_token_accuracy": 0.3827586144208908, "step": 22030 }, { "epoch": 0.02219388603008941, "grad_norm": 19.474926122141394, "learning_rate": 2.219346131378039e-05, "loss": 2.6024, "mean_token_accuracy": 0.42413793206214906, "step": 22035 }, { "epoch": 0.022198922083193583, "grad_norm": 16.585208457436593, "learning_rate": 2.2198497270511454e-05, "loss": 2.6301, "mean_token_accuracy": 0.37586206793785093, "step": 22040 }, { "epoch": 0.022203958136297757, "grad_norm": 18.568035252189002, "learning_rate": 2.2203533227242513e-05, "loss": 2.6529, "mean_token_accuracy": 0.3896551728248596, "step": 22045 }, { "epoch": 0.022208994189401927, "grad_norm": 19.064448471957252, "learning_rate": 2.2208569183973572e-05, "loss": 2.2902, "mean_token_accuracy": 0.4724137902259827, "step": 22050 }, { "epoch": 0.0222140302425061, "grad_norm": 20.336275906284552, "learning_rate": 2.221360514070463e-05, "loss": 2.5596, "mean_token_accuracy": 0.39999998807907106, "step": 22055 }, { "epoch": 0.022219066295610275, "grad_norm": 17.74089781034804, "learning_rate": 2.221864109743569e-05, "loss": 2.3276, "mean_token_accuracy": 0.41034482717514037, "step": 22060 }, { "epoch": 0.022224102348714445, "grad_norm": 28.9707201207465, "learning_rate": 2.2223677054166753e-05, "loss": 2.5623, "mean_token_accuracy": 0.4172413766384125, "step": 22065 }, { "epoch": 0.02222913840181862, "grad_norm": 17.797284401979596, "learning_rate": 2.2228713010897812e-05, "loss": 2.0655, "mean_token_accuracy": 0.5206896543502808, "step": 22070 }, { "epoch": 0.022234174454922793, "grad_norm": 17.04126926265857, "learning_rate": 2.223374896762887e-05, "loss": 2.5295, "mean_token_accuracy": 0.39310344457626345, "step": 22075 }, { "epoch": 0.022239210508026967, "grad_norm": 18.219008304708492, "learning_rate": 2.223878492435993e-05, "loss": 2.0721, "mean_token_accuracy": 0.4931034445762634, "step": 22080 }, { "epoch": 0.022244246561131137, "grad_norm": 19.579876178045556, "learning_rate": 2.224382088109099e-05, "loss": 2.6664, "mean_token_accuracy": 0.39310344457626345, "step": 22085 }, { "epoch": 0.02224928261423531, "grad_norm": 20.727652525353168, "learning_rate": 2.2248856837822053e-05, "loss": 2.6057, "mean_token_accuracy": 0.36896551251411436, "step": 22090 }, { "epoch": 0.022254318667339484, "grad_norm": 25.71178053912386, "learning_rate": 2.2253892794553112e-05, "loss": 2.52, "mean_token_accuracy": 0.41724138259887694, "step": 22095 }, { "epoch": 0.022259354720443655, "grad_norm": 22.19101844185746, "learning_rate": 2.2258928751284168e-05, "loss": 2.5349, "mean_token_accuracy": 0.3999999940395355, "step": 22100 }, { "epoch": 0.02226439077354783, "grad_norm": 17.444978177841975, "learning_rate": 2.226396470801523e-05, "loss": 2.1695, "mean_token_accuracy": 0.4896551728248596, "step": 22105 }, { "epoch": 0.022269426826652002, "grad_norm": 17.525196428243337, "learning_rate": 2.226900066474629e-05, "loss": 2.5023, "mean_token_accuracy": 0.43103447556495667, "step": 22110 }, { "epoch": 0.022274462879756176, "grad_norm": 17.312631765622033, "learning_rate": 2.227403662147735e-05, "loss": 2.518, "mean_token_accuracy": 0.4103448212146759, "step": 22115 }, { "epoch": 0.022279498932860346, "grad_norm": 31.832139802670028, "learning_rate": 2.227907257820841e-05, "loss": 2.6816, "mean_token_accuracy": 0.3793103516101837, "step": 22120 }, { "epoch": 0.02228453498596452, "grad_norm": 19.066747654896485, "learning_rate": 2.2284108534939468e-05, "loss": 2.3297, "mean_token_accuracy": 0.42475369572639465, "step": 22125 }, { "epoch": 0.022289571039068694, "grad_norm": 17.678994499298476, "learning_rate": 2.228914449167053e-05, "loss": 2.4118, "mean_token_accuracy": 0.44482758045196535, "step": 22130 }, { "epoch": 0.022294607092172864, "grad_norm": 22.517709464302218, "learning_rate": 2.229418044840159e-05, "loss": 2.7515, "mean_token_accuracy": 0.41724138259887694, "step": 22135 }, { "epoch": 0.022299643145277038, "grad_norm": 18.862621011620117, "learning_rate": 2.229921640513265e-05, "loss": 2.5284, "mean_token_accuracy": 0.4, "step": 22140 }, { "epoch": 0.02230467919838121, "grad_norm": 28.54303826814318, "learning_rate": 2.2304252361863708e-05, "loss": 2.3845, "mean_token_accuracy": 0.43793103098869324, "step": 22145 }, { "epoch": 0.022309715251485385, "grad_norm": 19.720926920423196, "learning_rate": 2.2309288318594767e-05, "loss": 2.392, "mean_token_accuracy": 0.4206896424293518, "step": 22150 }, { "epoch": 0.022314751304589556, "grad_norm": 21.147741435462077, "learning_rate": 2.2314324275325827e-05, "loss": 2.383, "mean_token_accuracy": 0.41379310488700866, "step": 22155 }, { "epoch": 0.02231978735769373, "grad_norm": 21.84547054587362, "learning_rate": 2.231936023205689e-05, "loss": 2.6847, "mean_token_accuracy": 0.4, "step": 22160 }, { "epoch": 0.022324823410797903, "grad_norm": 28.431524954376894, "learning_rate": 2.2324396188787945e-05, "loss": 2.5632, "mean_token_accuracy": 0.42413793206214906, "step": 22165 }, { "epoch": 0.022329859463902074, "grad_norm": 19.16903860920344, "learning_rate": 2.2329432145519008e-05, "loss": 2.223, "mean_token_accuracy": 0.458620685338974, "step": 22170 }, { "epoch": 0.022334895517006247, "grad_norm": 26.85744625812132, "learning_rate": 2.2334468102250067e-05, "loss": 2.5157, "mean_token_accuracy": 0.3965517282485962, "step": 22175 }, { "epoch": 0.02233993157011042, "grad_norm": 22.947922162874136, "learning_rate": 2.2339504058981126e-05, "loss": 2.7134, "mean_token_accuracy": 0.358620685338974, "step": 22180 }, { "epoch": 0.022344967623214595, "grad_norm": 21.377917623613296, "learning_rate": 2.2344540015712185e-05, "loss": 2.44, "mean_token_accuracy": 0.44827585816383364, "step": 22185 }, { "epoch": 0.022350003676318765, "grad_norm": 17.445912939362415, "learning_rate": 2.2349575972443245e-05, "loss": 2.4056, "mean_token_accuracy": 0.42413792610168455, "step": 22190 }, { "epoch": 0.02235503972942294, "grad_norm": 22.54786475522324, "learning_rate": 2.2354611929174304e-05, "loss": 2.6845, "mean_token_accuracy": 0.36896551549434664, "step": 22195 }, { "epoch": 0.022360075782527113, "grad_norm": 20.031459047298654, "learning_rate": 2.2359647885905367e-05, "loss": 2.3074, "mean_token_accuracy": 0.4568663060665131, "step": 22200 }, { "epoch": 0.022365111835631283, "grad_norm": 23.318258570706305, "learning_rate": 2.2364683842636426e-05, "loss": 2.5512, "mean_token_accuracy": 0.3931034505367279, "step": 22205 }, { "epoch": 0.022370147888735457, "grad_norm": 22.834068307645833, "learning_rate": 2.2369719799367485e-05, "loss": 2.5969, "mean_token_accuracy": 0.41379310488700866, "step": 22210 }, { "epoch": 0.02237518394183963, "grad_norm": 24.388384134336423, "learning_rate": 2.2374755756098544e-05, "loss": 2.1986, "mean_token_accuracy": 0.46551724672317507, "step": 22215 }, { "epoch": 0.022380219994943804, "grad_norm": 23.372507417723465, "learning_rate": 2.2379791712829604e-05, "loss": 2.5898, "mean_token_accuracy": 0.4034482717514038, "step": 22220 }, { "epoch": 0.022385256048047975, "grad_norm": 18.871339052692168, "learning_rate": 2.2384827669560666e-05, "loss": 2.276, "mean_token_accuracy": 0.43793103098869324, "step": 22225 }, { "epoch": 0.02239029210115215, "grad_norm": 26.062997719553426, "learning_rate": 2.2389863626291725e-05, "loss": 2.4886, "mean_token_accuracy": 0.4358741700649261, "step": 22230 }, { "epoch": 0.022395328154256322, "grad_norm": 23.174492575524077, "learning_rate": 2.239489958302278e-05, "loss": 2.6006, "mean_token_accuracy": 0.3931034505367279, "step": 22235 }, { "epoch": 0.022400364207360492, "grad_norm": 18.1040954333043, "learning_rate": 2.2399935539753844e-05, "loss": 2.4222, "mean_token_accuracy": 0.41379311084747317, "step": 22240 }, { "epoch": 0.022405400260464666, "grad_norm": 24.493134539717133, "learning_rate": 2.2404971496484903e-05, "loss": 2.849, "mean_token_accuracy": 0.36896551251411436, "step": 22245 }, { "epoch": 0.02241043631356884, "grad_norm": 19.18978538303809, "learning_rate": 2.2410007453215966e-05, "loss": 2.6983, "mean_token_accuracy": 0.3896551728248596, "step": 22250 }, { "epoch": 0.022415472366673014, "grad_norm": 14.265059198158891, "learning_rate": 2.2415043409947022e-05, "loss": 2.2997, "mean_token_accuracy": 0.48747730255126953, "step": 22255 }, { "epoch": 0.022420508419777184, "grad_norm": 25.427834720968296, "learning_rate": 2.242007936667808e-05, "loss": 2.6528, "mean_token_accuracy": 0.3758620619773865, "step": 22260 }, { "epoch": 0.022425544472881358, "grad_norm": 18.51087331360119, "learning_rate": 2.2425115323409144e-05, "loss": 2.59, "mean_token_accuracy": 0.43103447556495667, "step": 22265 }, { "epoch": 0.02243058052598553, "grad_norm": 21.42302683306011, "learning_rate": 2.2430151280140203e-05, "loss": 2.5512, "mean_token_accuracy": 0.3896551787853241, "step": 22270 }, { "epoch": 0.022435616579089702, "grad_norm": 23.046029473027314, "learning_rate": 2.2435187236871262e-05, "loss": 2.5384, "mean_token_accuracy": 0.36896551847457887, "step": 22275 }, { "epoch": 0.022440652632193876, "grad_norm": 16.057161230476066, "learning_rate": 2.244022319360232e-05, "loss": 2.6856, "mean_token_accuracy": 0.4068965554237366, "step": 22280 }, { "epoch": 0.02244568868529805, "grad_norm": 17.639397575770012, "learning_rate": 2.244525915033338e-05, "loss": 2.5045, "mean_token_accuracy": 0.4034482717514038, "step": 22285 }, { "epoch": 0.022450724738402223, "grad_norm": 17.893885697651772, "learning_rate": 2.2450295107064443e-05, "loss": 2.5162, "mean_token_accuracy": 0.42413792610168455, "step": 22290 }, { "epoch": 0.022455760791506393, "grad_norm": 17.329695781495438, "learning_rate": 2.2455331063795503e-05, "loss": 2.1294, "mean_token_accuracy": 0.4103448331356049, "step": 22295 }, { "epoch": 0.022460796844610567, "grad_norm": 18.504104992864324, "learning_rate": 2.246036702052656e-05, "loss": 2.5708, "mean_token_accuracy": 0.37241379618644715, "step": 22300 }, { "epoch": 0.02246583289771474, "grad_norm": 22.03805586439012, "learning_rate": 2.246540297725762e-05, "loss": 2.3461, "mean_token_accuracy": 0.4482758641242981, "step": 22305 }, { "epoch": 0.02247086895081891, "grad_norm": 21.069023053962944, "learning_rate": 2.247043893398868e-05, "loss": 2.3123, "mean_token_accuracy": 0.4344827592372894, "step": 22310 }, { "epoch": 0.022475905003923085, "grad_norm": 18.7543630496956, "learning_rate": 2.247547489071974e-05, "loss": 2.3285, "mean_token_accuracy": 0.45862069725990295, "step": 22315 }, { "epoch": 0.02248094105702726, "grad_norm": 18.58035747384884, "learning_rate": 2.24805108474508e-05, "loss": 2.5487, "mean_token_accuracy": 0.4068965554237366, "step": 22320 }, { "epoch": 0.022485977110131432, "grad_norm": 29.659721198682824, "learning_rate": 2.2485546804181858e-05, "loss": 2.7291, "mean_token_accuracy": 0.33448275923728943, "step": 22325 }, { "epoch": 0.022491013163235603, "grad_norm": 23.104117481218953, "learning_rate": 2.249058276091292e-05, "loss": 2.4649, "mean_token_accuracy": 0.40562612414360044, "step": 22330 }, { "epoch": 0.022496049216339777, "grad_norm": 17.66567299953306, "learning_rate": 2.249561871764398e-05, "loss": 2.6325, "mean_token_accuracy": 0.3655172407627106, "step": 22335 }, { "epoch": 0.02250108526944395, "grad_norm": 27.871278373827845, "learning_rate": 2.250065467437504e-05, "loss": 2.7913, "mean_token_accuracy": 0.3896551728248596, "step": 22340 }, { "epoch": 0.02250612132254812, "grad_norm": 20.518959052869597, "learning_rate": 2.25056906311061e-05, "loss": 2.4906, "mean_token_accuracy": 0.4068965554237366, "step": 22345 }, { "epoch": 0.022511157375652294, "grad_norm": 23.563047248173834, "learning_rate": 2.2510726587837158e-05, "loss": 2.8235, "mean_token_accuracy": 0.38275861740112305, "step": 22350 }, { "epoch": 0.022516193428756468, "grad_norm": 20.75663922178308, "learning_rate": 2.2515762544568217e-05, "loss": 2.316, "mean_token_accuracy": 0.4517241358757019, "step": 22355 }, { "epoch": 0.022521229481860642, "grad_norm": 17.69785708724455, "learning_rate": 2.252079850129928e-05, "loss": 2.5617, "mean_token_accuracy": 0.3862069010734558, "step": 22360 }, { "epoch": 0.022526265534964812, "grad_norm": 17.627379486652025, "learning_rate": 2.2525834458030335e-05, "loss": 2.2386, "mean_token_accuracy": 0.458620685338974, "step": 22365 }, { "epoch": 0.022531301588068986, "grad_norm": 21.97807014633657, "learning_rate": 2.2530870414761398e-05, "loss": 2.3722, "mean_token_accuracy": 0.42758620977401735, "step": 22370 }, { "epoch": 0.02253633764117316, "grad_norm": 17.838855015088015, "learning_rate": 2.2535906371492457e-05, "loss": 2.3766, "mean_token_accuracy": 0.41724138259887694, "step": 22375 }, { "epoch": 0.02254137369427733, "grad_norm": 21.8998732395183, "learning_rate": 2.2540942328223517e-05, "loss": 2.2293, "mean_token_accuracy": 0.47241379618644713, "step": 22380 }, { "epoch": 0.022546409747381504, "grad_norm": 24.701371474290475, "learning_rate": 2.2545978284954576e-05, "loss": 2.5738, "mean_token_accuracy": 0.4206896543502808, "step": 22385 }, { "epoch": 0.022551445800485678, "grad_norm": 21.6951296392118, "learning_rate": 2.2551014241685635e-05, "loss": 2.4643, "mean_token_accuracy": 0.43103448748588563, "step": 22390 }, { "epoch": 0.02255648185358985, "grad_norm": 21.552926014920672, "learning_rate": 2.2556050198416694e-05, "loss": 2.5057, "mean_token_accuracy": 0.3825166404247284, "step": 22395 }, { "epoch": 0.02256151790669402, "grad_norm": 16.728623960915243, "learning_rate": 2.2561086155147757e-05, "loss": 2.3849, "mean_token_accuracy": 0.4482758641242981, "step": 22400 }, { "epoch": 0.022566553959798195, "grad_norm": 21.15696615803908, "learning_rate": 2.2566122111878816e-05, "loss": 2.4679, "mean_token_accuracy": 0.42758620381355283, "step": 22405 }, { "epoch": 0.02257159001290237, "grad_norm": 22.60642953967318, "learning_rate": 2.2571158068609876e-05, "loss": 2.2346, "mean_token_accuracy": 0.44482759237289426, "step": 22410 }, { "epoch": 0.02257662606600654, "grad_norm": 27.526608502129932, "learning_rate": 2.2576194025340935e-05, "loss": 2.7476, "mean_token_accuracy": 0.3827586233615875, "step": 22415 }, { "epoch": 0.022581662119110713, "grad_norm": 17.84235743698005, "learning_rate": 2.2581229982071994e-05, "loss": 2.2659, "mean_token_accuracy": 0.4724137902259827, "step": 22420 }, { "epoch": 0.022586698172214887, "grad_norm": 20.25861201645504, "learning_rate": 2.2586265938803057e-05, "loss": 2.7866, "mean_token_accuracy": 0.3827586233615875, "step": 22425 }, { "epoch": 0.02259173422531906, "grad_norm": 25.454065220567372, "learning_rate": 2.2591301895534116e-05, "loss": 2.6312, "mean_token_accuracy": 0.38275861740112305, "step": 22430 }, { "epoch": 0.02259677027842323, "grad_norm": 15.768697011891009, "learning_rate": 2.2596337852265172e-05, "loss": 2.5159, "mean_token_accuracy": 0.4344827473163605, "step": 22435 }, { "epoch": 0.022601806331527405, "grad_norm": 26.235903057070672, "learning_rate": 2.2601373808996234e-05, "loss": 2.3687, "mean_token_accuracy": 0.4379310429096222, "step": 22440 }, { "epoch": 0.02260684238463158, "grad_norm": 18.37197651668032, "learning_rate": 2.2606409765727294e-05, "loss": 2.5066, "mean_token_accuracy": 0.4413793087005615, "step": 22445 }, { "epoch": 0.02261187843773575, "grad_norm": 20.495894762654945, "learning_rate": 2.2611445722458356e-05, "loss": 2.5205, "mean_token_accuracy": 0.39655172228813174, "step": 22450 }, { "epoch": 0.022616914490839923, "grad_norm": 20.432753457155183, "learning_rate": 2.2616481679189412e-05, "loss": 2.5514, "mean_token_accuracy": 0.4172413766384125, "step": 22455 }, { "epoch": 0.022621950543944096, "grad_norm": 30.722106421553068, "learning_rate": 2.262151763592047e-05, "loss": 2.8931, "mean_token_accuracy": 0.3517241358757019, "step": 22460 }, { "epoch": 0.02262698659704827, "grad_norm": 22.013400065153043, "learning_rate": 2.2626553592651534e-05, "loss": 2.3811, "mean_token_accuracy": 0.42068964838981626, "step": 22465 }, { "epoch": 0.02263202265015244, "grad_norm": 23.76465805126878, "learning_rate": 2.2631589549382593e-05, "loss": 2.7677, "mean_token_accuracy": 0.3793103456497192, "step": 22470 }, { "epoch": 0.022637058703256614, "grad_norm": 17.779989889542147, "learning_rate": 2.2636625506113653e-05, "loss": 2.2668, "mean_token_accuracy": 0.4517241358757019, "step": 22475 }, { "epoch": 0.022642094756360788, "grad_norm": 21.257585924279987, "learning_rate": 2.2641661462844712e-05, "loss": 2.2506, "mean_token_accuracy": 0.42413792610168455, "step": 22480 }, { "epoch": 0.02264713080946496, "grad_norm": 16.850431182569892, "learning_rate": 2.264669741957577e-05, "loss": 2.2979, "mean_token_accuracy": 0.43793103098869324, "step": 22485 }, { "epoch": 0.022652166862569132, "grad_norm": 24.456366384931854, "learning_rate": 2.2651733376306834e-05, "loss": 2.6736, "mean_token_accuracy": 0.39999999701976774, "step": 22490 }, { "epoch": 0.022657202915673306, "grad_norm": 19.1417247541628, "learning_rate": 2.2656769333037893e-05, "loss": 2.3048, "mean_token_accuracy": 0.4206896543502808, "step": 22495 }, { "epoch": 0.02266223896877748, "grad_norm": 18.84599896684726, "learning_rate": 2.266180528976895e-05, "loss": 2.6017, "mean_token_accuracy": 0.39310345649719236, "step": 22500 }, { "epoch": 0.02266727502188165, "grad_norm": 16.62193286530031, "learning_rate": 2.266684124650001e-05, "loss": 2.0632, "mean_token_accuracy": 0.458620685338974, "step": 22505 }, { "epoch": 0.022672311074985824, "grad_norm": 18.591903937220767, "learning_rate": 2.267187720323107e-05, "loss": 2.5364, "mean_token_accuracy": 0.4, "step": 22510 }, { "epoch": 0.022677347128089997, "grad_norm": 21.217173761893346, "learning_rate": 2.2676913159962133e-05, "loss": 2.8769, "mean_token_accuracy": 0.3999999940395355, "step": 22515 }, { "epoch": 0.022682383181194168, "grad_norm": 17.840197566068447, "learning_rate": 2.268194911669319e-05, "loss": 2.5899, "mean_token_accuracy": 0.3931034505367279, "step": 22520 }, { "epoch": 0.02268741923429834, "grad_norm": 18.780295823148354, "learning_rate": 2.268698507342425e-05, "loss": 2.5879, "mean_token_accuracy": 0.39655172228813174, "step": 22525 }, { "epoch": 0.022692455287402515, "grad_norm": 19.105647606527235, "learning_rate": 2.269202103015531e-05, "loss": 2.0668, "mean_token_accuracy": 0.44482759237289426, "step": 22530 }, { "epoch": 0.02269749134050669, "grad_norm": 20.68306278123323, "learning_rate": 2.269705698688637e-05, "loss": 2.503, "mean_token_accuracy": 0.4103448331356049, "step": 22535 }, { "epoch": 0.02270252739361086, "grad_norm": 24.15491603822209, "learning_rate": 2.270209294361743e-05, "loss": 2.3817, "mean_token_accuracy": 0.4068965554237366, "step": 22540 }, { "epoch": 0.022707563446715033, "grad_norm": 20.827742247720774, "learning_rate": 2.270712890034849e-05, "loss": 2.4991, "mean_token_accuracy": 0.4, "step": 22545 }, { "epoch": 0.022712599499819207, "grad_norm": 21.334346528435063, "learning_rate": 2.2712164857079548e-05, "loss": 2.2004, "mean_token_accuracy": 0.4413793087005615, "step": 22550 }, { "epoch": 0.022717635552923377, "grad_norm": 19.185609288663436, "learning_rate": 2.271720081381061e-05, "loss": 2.175, "mean_token_accuracy": 0.48765880465507505, "step": 22555 }, { "epoch": 0.02272267160602755, "grad_norm": 23.687491018950126, "learning_rate": 2.272223677054167e-05, "loss": 2.3841, "mean_token_accuracy": 0.45862069725990295, "step": 22560 }, { "epoch": 0.022727707659131725, "grad_norm": 16.67345590425749, "learning_rate": 2.272727272727273e-05, "loss": 2.1727, "mean_token_accuracy": 0.4172413766384125, "step": 22565 }, { "epoch": 0.0227327437122359, "grad_norm": 19.26383388557665, "learning_rate": 2.273230868400379e-05, "loss": 2.7115, "mean_token_accuracy": 0.358620685338974, "step": 22570 }, { "epoch": 0.02273777976534007, "grad_norm": 18.937735556298822, "learning_rate": 2.2737344640734848e-05, "loss": 2.0429, "mean_token_accuracy": 0.5044334948062896, "step": 22575 }, { "epoch": 0.022742815818444242, "grad_norm": 19.95636119563846, "learning_rate": 2.2742380597465907e-05, "loss": 2.5669, "mean_token_accuracy": 0.4344827592372894, "step": 22580 }, { "epoch": 0.022747851871548416, "grad_norm": 22.079111238407474, "learning_rate": 2.274741655419697e-05, "loss": 2.2817, "mean_token_accuracy": 0.41379310488700866, "step": 22585 }, { "epoch": 0.022752887924652587, "grad_norm": 20.731706517261895, "learning_rate": 2.2752452510928026e-05, "loss": 2.5628, "mean_token_accuracy": 0.3793103456497192, "step": 22590 }, { "epoch": 0.02275792397775676, "grad_norm": 30.561394062291406, "learning_rate": 2.2757488467659088e-05, "loss": 2.2307, "mean_token_accuracy": 0.41905626058578493, "step": 22595 }, { "epoch": 0.022762960030860934, "grad_norm": 21.528989707860283, "learning_rate": 2.2762524424390147e-05, "loss": 2.7548, "mean_token_accuracy": 0.3793103456497192, "step": 22600 }, { "epoch": 0.022767996083965108, "grad_norm": 15.878088498627157, "learning_rate": 2.2767560381121207e-05, "loss": 2.3395, "mean_token_accuracy": 0.42758620977401735, "step": 22605 }, { "epoch": 0.022773032137069278, "grad_norm": 21.38107964537585, "learning_rate": 2.2772596337852266e-05, "loss": 2.2321, "mean_token_accuracy": 0.45668481588363646, "step": 22610 }, { "epoch": 0.022778068190173452, "grad_norm": 18.638422817724088, "learning_rate": 2.2777632294583325e-05, "loss": 2.419, "mean_token_accuracy": 0.4344827592372894, "step": 22615 }, { "epoch": 0.022783104243277626, "grad_norm": 25.541632104800644, "learning_rate": 2.2782668251314384e-05, "loss": 2.7406, "mean_token_accuracy": 0.3724137842655182, "step": 22620 }, { "epoch": 0.022788140296381796, "grad_norm": 24.870099288549408, "learning_rate": 2.2787704208045447e-05, "loss": 2.4821, "mean_token_accuracy": 0.42758620977401735, "step": 22625 }, { "epoch": 0.02279317634948597, "grad_norm": 28.984746085927338, "learning_rate": 2.2792740164776506e-05, "loss": 2.5235, "mean_token_accuracy": 0.42758620977401735, "step": 22630 }, { "epoch": 0.022798212402590144, "grad_norm": 17.33730303914686, "learning_rate": 2.2797776121507566e-05, "loss": 2.4157, "mean_token_accuracy": 0.4034482717514038, "step": 22635 }, { "epoch": 0.022803248455694317, "grad_norm": 17.429441611093488, "learning_rate": 2.2802812078238625e-05, "loss": 2.6754, "mean_token_accuracy": 0.3517241358757019, "step": 22640 }, { "epoch": 0.022808284508798488, "grad_norm": 20.765204328300264, "learning_rate": 2.2807848034969684e-05, "loss": 2.2778, "mean_token_accuracy": 0.4517241299152374, "step": 22645 }, { "epoch": 0.02281332056190266, "grad_norm": 25.36196991681778, "learning_rate": 2.2812883991700747e-05, "loss": 2.5011, "mean_token_accuracy": 0.4327888786792755, "step": 22650 }, { "epoch": 0.022818356615006835, "grad_norm": 18.79703684930374, "learning_rate": 2.2817919948431803e-05, "loss": 2.3929, "mean_token_accuracy": 0.43103448748588563, "step": 22655 }, { "epoch": 0.022823392668111005, "grad_norm": 17.85611908453297, "learning_rate": 2.2822955905162862e-05, "loss": 2.3343, "mean_token_accuracy": 0.4, "step": 22660 }, { "epoch": 0.02282842872121518, "grad_norm": 21.226299837353054, "learning_rate": 2.2827991861893925e-05, "loss": 2.5153, "mean_token_accuracy": 0.458620685338974, "step": 22665 }, { "epoch": 0.022833464774319353, "grad_norm": 21.64309500389514, "learning_rate": 2.2833027818624984e-05, "loss": 2.5086, "mean_token_accuracy": 0.3862068891525269, "step": 22670 }, { "epoch": 0.022838500827423527, "grad_norm": 21.710984538066924, "learning_rate": 2.2838063775356043e-05, "loss": 2.4433, "mean_token_accuracy": 0.4034482717514038, "step": 22675 }, { "epoch": 0.022843536880527697, "grad_norm": 21.00478712738089, "learning_rate": 2.2843099732087102e-05, "loss": 2.8297, "mean_token_accuracy": 0.3805202662944794, "step": 22680 }, { "epoch": 0.02284857293363187, "grad_norm": 21.870098936627976, "learning_rate": 2.284813568881816e-05, "loss": 2.497, "mean_token_accuracy": 0.4344827651977539, "step": 22685 }, { "epoch": 0.022853608986736045, "grad_norm": 17.213851678606343, "learning_rate": 2.2853171645549224e-05, "loss": 2.2466, "mean_token_accuracy": 0.44137930274009707, "step": 22690 }, { "epoch": 0.022858645039840215, "grad_norm": 17.763106173996384, "learning_rate": 2.2858207602280283e-05, "loss": 2.4115, "mean_token_accuracy": 0.45862067937850953, "step": 22695 }, { "epoch": 0.02286368109294439, "grad_norm": 21.85975830465156, "learning_rate": 2.286324355901134e-05, "loss": 2.4841, "mean_token_accuracy": 0.4344827651977539, "step": 22700 }, { "epoch": 0.022868717146048562, "grad_norm": 18.601093321681876, "learning_rate": 2.2868279515742402e-05, "loss": 2.353, "mean_token_accuracy": 0.41034482419490814, "step": 22705 }, { "epoch": 0.022873753199152736, "grad_norm": 15.097304122293911, "learning_rate": 2.287331547247346e-05, "loss": 1.9461, "mean_token_accuracy": 0.5310344874858857, "step": 22710 }, { "epoch": 0.022878789252256906, "grad_norm": 20.603198245841, "learning_rate": 2.2878351429204524e-05, "loss": 2.5929, "mean_token_accuracy": 0.3896551728248596, "step": 22715 }, { "epoch": 0.02288382530536108, "grad_norm": 21.944195006115965, "learning_rate": 2.288338738593558e-05, "loss": 2.7959, "mean_token_accuracy": 0.3586206942796707, "step": 22720 }, { "epoch": 0.022888861358465254, "grad_norm": 12.962044530940386, "learning_rate": 2.288842334266664e-05, "loss": 2.4928, "mean_token_accuracy": 0.3655172407627106, "step": 22725 }, { "epoch": 0.022893897411569424, "grad_norm": 17.448281235913246, "learning_rate": 2.28934592993977e-05, "loss": 2.666, "mean_token_accuracy": 0.35862069129943847, "step": 22730 }, { "epoch": 0.022898933464673598, "grad_norm": 32.96787197038869, "learning_rate": 2.289849525612876e-05, "loss": 2.3277, "mean_token_accuracy": 0.4517241299152374, "step": 22735 }, { "epoch": 0.022903969517777772, "grad_norm": 21.302498746653814, "learning_rate": 2.290353121285982e-05, "loss": 2.2013, "mean_token_accuracy": 0.4068965494632721, "step": 22740 }, { "epoch": 0.022909005570881946, "grad_norm": 20.115490757384034, "learning_rate": 2.290856716959088e-05, "loss": 2.6838, "mean_token_accuracy": 0.40689654350280763, "step": 22745 }, { "epoch": 0.022914041623986116, "grad_norm": 23.267723147162197, "learning_rate": 2.291360312632194e-05, "loss": 2.5425, "mean_token_accuracy": 0.4710936903953552, "step": 22750 }, { "epoch": 0.02291907767709029, "grad_norm": 18.638277963020272, "learning_rate": 2.2918639083053e-05, "loss": 2.376, "mean_token_accuracy": 0.3931034505367279, "step": 22755 }, { "epoch": 0.022924113730194463, "grad_norm": 18.212097372203548, "learning_rate": 2.292367503978406e-05, "loss": 2.3664, "mean_token_accuracy": 0.4724137902259827, "step": 22760 }, { "epoch": 0.022929149783298634, "grad_norm": 19.321665218909793, "learning_rate": 2.292871099651512e-05, "loss": 2.8056, "mean_token_accuracy": 0.37931033968925476, "step": 22765 }, { "epoch": 0.022934185836402807, "grad_norm": 17.6201713074256, "learning_rate": 2.293374695324618e-05, "loss": 2.5647, "mean_token_accuracy": 0.43448275327682495, "step": 22770 }, { "epoch": 0.02293922188950698, "grad_norm": 17.811160390890254, "learning_rate": 2.2938782909977238e-05, "loss": 2.5508, "mean_token_accuracy": 0.37241379022598264, "step": 22775 }, { "epoch": 0.022944257942611155, "grad_norm": 18.261853885826937, "learning_rate": 2.2943818866708297e-05, "loss": 2.5657, "mean_token_accuracy": 0.3137931048870087, "step": 22780 }, { "epoch": 0.022949293995715325, "grad_norm": 23.7481239411116, "learning_rate": 2.294885482343936e-05, "loss": 2.4587, "mean_token_accuracy": 0.4000000059604645, "step": 22785 }, { "epoch": 0.0229543300488195, "grad_norm": 17.042074337437455, "learning_rate": 2.2953890780170416e-05, "loss": 2.2672, "mean_token_accuracy": 0.4379310369491577, "step": 22790 }, { "epoch": 0.022959366101923673, "grad_norm": 16.890928928924556, "learning_rate": 2.295892673690148e-05, "loss": 2.2507, "mean_token_accuracy": 0.46896551847457885, "step": 22795 }, { "epoch": 0.022964402155027843, "grad_norm": 19.464047203597147, "learning_rate": 2.2963962693632538e-05, "loss": 2.4715, "mean_token_accuracy": 0.41034482717514037, "step": 22800 }, { "epoch": 0.022969438208132017, "grad_norm": 18.366188725603518, "learning_rate": 2.2968998650363597e-05, "loss": 2.1385, "mean_token_accuracy": 0.4620689630508423, "step": 22805 }, { "epoch": 0.02297447426123619, "grad_norm": 23.101494973332073, "learning_rate": 2.2974034607094656e-05, "loss": 2.4148, "mean_token_accuracy": 0.4103448331356049, "step": 22810 }, { "epoch": 0.022979510314340364, "grad_norm": 26.724476072881952, "learning_rate": 2.2979070563825716e-05, "loss": 2.7551, "mean_token_accuracy": 0.3965517163276672, "step": 22815 }, { "epoch": 0.022984546367444535, "grad_norm": 19.558034340626925, "learning_rate": 2.2984106520556775e-05, "loss": 2.2781, "mean_token_accuracy": 0.4666256129741669, "step": 22820 }, { "epoch": 0.02298958242054871, "grad_norm": 18.958228517274435, "learning_rate": 2.2989142477287838e-05, "loss": 2.179, "mean_token_accuracy": 0.44827585816383364, "step": 22825 }, { "epoch": 0.022994618473652882, "grad_norm": 17.580023659461865, "learning_rate": 2.2994178434018897e-05, "loss": 2.4399, "mean_token_accuracy": 0.4348457336425781, "step": 22830 }, { "epoch": 0.022999654526757052, "grad_norm": 18.822075228700367, "learning_rate": 2.2999214390749956e-05, "loss": 2.6342, "mean_token_accuracy": 0.4034482777118683, "step": 22835 }, { "epoch": 0.023004690579861226, "grad_norm": 21.018624034341308, "learning_rate": 2.3004250347481015e-05, "loss": 2.4219, "mean_token_accuracy": 0.4379310369491577, "step": 22840 }, { "epoch": 0.0230097266329654, "grad_norm": 19.91394150717735, "learning_rate": 2.3009286304212075e-05, "loss": 2.3617, "mean_token_accuracy": 0.4689655125141144, "step": 22845 }, { "epoch": 0.023014762686069574, "grad_norm": 22.90880935741304, "learning_rate": 2.3014322260943137e-05, "loss": 2.6454, "mean_token_accuracy": 0.3827586233615875, "step": 22850 }, { "epoch": 0.023019798739173744, "grad_norm": 18.84600555991698, "learning_rate": 2.3019358217674193e-05, "loss": 2.4955, "mean_token_accuracy": 0.3902709364891052, "step": 22855 }, { "epoch": 0.023024834792277918, "grad_norm": 17.557138401404146, "learning_rate": 2.3024394174405252e-05, "loss": 2.5965, "mean_token_accuracy": 0.4, "step": 22860 }, { "epoch": 0.02302987084538209, "grad_norm": 19.117916121017814, "learning_rate": 2.3029430131136315e-05, "loss": 2.1499, "mean_token_accuracy": 0.4655172348022461, "step": 22865 }, { "epoch": 0.023034906898486262, "grad_norm": 20.41709412811478, "learning_rate": 2.3034466087867374e-05, "loss": 2.1779, "mean_token_accuracy": 0.4620689690113068, "step": 22870 }, { "epoch": 0.023039942951590436, "grad_norm": 20.080008126135866, "learning_rate": 2.3039502044598433e-05, "loss": 2.0551, "mean_token_accuracy": 0.47586206793785096, "step": 22875 }, { "epoch": 0.02304497900469461, "grad_norm": 20.352513627776563, "learning_rate": 2.3044538001329493e-05, "loss": 2.4495, "mean_token_accuracy": 0.4103448212146759, "step": 22880 }, { "epoch": 0.023050015057798783, "grad_norm": 24.65168187970714, "learning_rate": 2.3049573958060552e-05, "loss": 2.8885, "mean_token_accuracy": 0.37586207389831544, "step": 22885 }, { "epoch": 0.023055051110902954, "grad_norm": 25.48647988694503, "learning_rate": 2.3054609914791615e-05, "loss": 2.8524, "mean_token_accuracy": 0.3551724135875702, "step": 22890 }, { "epoch": 0.023060087164007127, "grad_norm": 17.646917083114957, "learning_rate": 2.3059645871522674e-05, "loss": 2.4218, "mean_token_accuracy": 0.4379310369491577, "step": 22895 }, { "epoch": 0.0230651232171113, "grad_norm": 23.14327183533927, "learning_rate": 2.3064681828253733e-05, "loss": 2.4162, "mean_token_accuracy": 0.4137930989265442, "step": 22900 }, { "epoch": 0.02307015927021547, "grad_norm": 19.92739523568574, "learning_rate": 2.3069717784984792e-05, "loss": 2.8309, "mean_token_accuracy": 0.39497882723808286, "step": 22905 }, { "epoch": 0.023075195323319645, "grad_norm": 18.376262145963032, "learning_rate": 2.307475374171585e-05, "loss": 2.7732, "mean_token_accuracy": 0.3840290427207947, "step": 22910 }, { "epoch": 0.02308023137642382, "grad_norm": 20.488677917222653, "learning_rate": 2.3079789698446914e-05, "loss": 2.1023, "mean_token_accuracy": 0.4551724135875702, "step": 22915 }, { "epoch": 0.02308526742952799, "grad_norm": 17.03609906560991, "learning_rate": 2.308482565517797e-05, "loss": 2.2248, "mean_token_accuracy": 0.4482758641242981, "step": 22920 }, { "epoch": 0.023090303482632163, "grad_norm": 26.112126585190843, "learning_rate": 2.308986161190903e-05, "loss": 3.0098, "mean_token_accuracy": 0.36896551847457887, "step": 22925 }, { "epoch": 0.023095339535736337, "grad_norm": 18.985530438448993, "learning_rate": 2.3094897568640092e-05, "loss": 2.4916, "mean_token_accuracy": 0.4379310250282288, "step": 22930 }, { "epoch": 0.02310037558884051, "grad_norm": 16.922123042320393, "learning_rate": 2.309993352537115e-05, "loss": 2.5926, "mean_token_accuracy": 0.36896551549434664, "step": 22935 }, { "epoch": 0.02310541164194468, "grad_norm": 16.930864720978843, "learning_rate": 2.3104969482102214e-05, "loss": 2.5479, "mean_token_accuracy": 0.42413792610168455, "step": 22940 }, { "epoch": 0.023110447695048855, "grad_norm": 18.606395868769965, "learning_rate": 2.311000543883327e-05, "loss": 2.2358, "mean_token_accuracy": 0.48160919547080994, "step": 22945 }, { "epoch": 0.02311548374815303, "grad_norm": 21.863191604249504, "learning_rate": 2.311504139556433e-05, "loss": 2.3576, "mean_token_accuracy": 0.4620689690113068, "step": 22950 }, { "epoch": 0.0231205198012572, "grad_norm": 16.662726102692137, "learning_rate": 2.312007735229539e-05, "loss": 2.7683, "mean_token_accuracy": 0.36551723778247835, "step": 22955 }, { "epoch": 0.023125555854361372, "grad_norm": 19.677123315731578, "learning_rate": 2.312511330902645e-05, "loss": 2.3679, "mean_token_accuracy": 0.4137930989265442, "step": 22960 }, { "epoch": 0.023130591907465546, "grad_norm": 20.872678676649432, "learning_rate": 2.313014926575751e-05, "loss": 2.758, "mean_token_accuracy": 0.3793103456497192, "step": 22965 }, { "epoch": 0.02313562796056972, "grad_norm": 16.632263227426613, "learning_rate": 2.313518522248857e-05, "loss": 2.5275, "mean_token_accuracy": 0.41034482717514037, "step": 22970 }, { "epoch": 0.02314066401367389, "grad_norm": 25.55095335528505, "learning_rate": 2.314022117921963e-05, "loss": 2.5002, "mean_token_accuracy": 0.3931034505367279, "step": 22975 }, { "epoch": 0.023145700066778064, "grad_norm": 20.012027080019955, "learning_rate": 2.314525713595069e-05, "loss": 2.4564, "mean_token_accuracy": 0.43103448748588563, "step": 22980 }, { "epoch": 0.023150736119882238, "grad_norm": 17.574902582764828, "learning_rate": 2.315029309268175e-05, "loss": 2.5635, "mean_token_accuracy": 0.47428917288780215, "step": 22985 }, { "epoch": 0.023155772172986408, "grad_norm": 19.712751364062672, "learning_rate": 2.3155329049412806e-05, "loss": 2.3971, "mean_token_accuracy": 0.4172413766384125, "step": 22990 }, { "epoch": 0.023160808226090582, "grad_norm": 16.462634401717388, "learning_rate": 2.316036500614387e-05, "loss": 2.5296, "mean_token_accuracy": 0.44827585816383364, "step": 22995 }, { "epoch": 0.023165844279194756, "grad_norm": 18.108234727285744, "learning_rate": 2.316540096287493e-05, "loss": 2.1544, "mean_token_accuracy": 0.47586206793785096, "step": 23000 }, { "epoch": 0.02317088033229893, "grad_norm": 20.20866342765393, "learning_rate": 2.3170436919605988e-05, "loss": 2.5775, "mean_token_accuracy": 0.3862069010734558, "step": 23005 }, { "epoch": 0.0231759163854031, "grad_norm": 17.034229444595386, "learning_rate": 2.3175472876337047e-05, "loss": 2.318, "mean_token_accuracy": 0.4698275923728943, "step": 23010 }, { "epoch": 0.023180952438507273, "grad_norm": 18.786363107672358, "learning_rate": 2.3180508833068106e-05, "loss": 2.8486, "mean_token_accuracy": 0.3530550479888916, "step": 23015 }, { "epoch": 0.023185988491611447, "grad_norm": 19.18161811533767, "learning_rate": 2.318554478979917e-05, "loss": 2.5546, "mean_token_accuracy": 0.4, "step": 23020 }, { "epoch": 0.023191024544715617, "grad_norm": 21.68999575278217, "learning_rate": 2.3190580746530228e-05, "loss": 2.5458, "mean_token_accuracy": 0.3931034505367279, "step": 23025 }, { "epoch": 0.02319606059781979, "grad_norm": 22.92044649834235, "learning_rate": 2.3195616703261287e-05, "loss": 2.5237, "mean_token_accuracy": 0.4206896543502808, "step": 23030 }, { "epoch": 0.023201096650923965, "grad_norm": 16.142754652578454, "learning_rate": 2.3200652659992346e-05, "loss": 2.4486, "mean_token_accuracy": 0.4068965554237366, "step": 23035 }, { "epoch": 0.02320613270402814, "grad_norm": 21.27352017314491, "learning_rate": 2.3205688616723406e-05, "loss": 2.6783, "mean_token_accuracy": 0.39655172228813174, "step": 23040 }, { "epoch": 0.02321116875713231, "grad_norm": 22.180863995376555, "learning_rate": 2.3210724573454465e-05, "loss": 2.2055, "mean_token_accuracy": 0.5344827532768249, "step": 23045 }, { "epoch": 0.023216204810236483, "grad_norm": 16.206469249136866, "learning_rate": 2.3215760530185528e-05, "loss": 2.443, "mean_token_accuracy": 0.3793103456497192, "step": 23050 }, { "epoch": 0.023221240863340657, "grad_norm": 17.963615643345832, "learning_rate": 2.3220796486916584e-05, "loss": 2.2049, "mean_token_accuracy": 0.5068965494632721, "step": 23055 }, { "epoch": 0.023226276916444827, "grad_norm": 24.290381567609824, "learning_rate": 2.3225832443647646e-05, "loss": 2.6754, "mean_token_accuracy": 0.4258923172950745, "step": 23060 }, { "epoch": 0.023231312969549, "grad_norm": 20.946573083044743, "learning_rate": 2.3230868400378705e-05, "loss": 2.384, "mean_token_accuracy": 0.4344827651977539, "step": 23065 }, { "epoch": 0.023236349022653174, "grad_norm": 18.485546373030807, "learning_rate": 2.3235904357109765e-05, "loss": 2.0698, "mean_token_accuracy": 0.5275862038135528, "step": 23070 }, { "epoch": 0.023241385075757348, "grad_norm": 17.033514005794434, "learning_rate": 2.3240940313840824e-05, "loss": 2.3657, "mean_token_accuracy": 0.4448275864124298, "step": 23075 }, { "epoch": 0.02324642112886152, "grad_norm": 22.632504364202727, "learning_rate": 2.3245976270571883e-05, "loss": 2.5893, "mean_token_accuracy": 0.39999999701976774, "step": 23080 }, { "epoch": 0.023251457181965692, "grad_norm": 19.044371870230986, "learning_rate": 2.3251012227302942e-05, "loss": 2.2975, "mean_token_accuracy": 0.42758620381355283, "step": 23085 }, { "epoch": 0.023256493235069866, "grad_norm": 14.956867521627869, "learning_rate": 2.3256048184034005e-05, "loss": 2.9365, "mean_token_accuracy": 0.3758620619773865, "step": 23090 }, { "epoch": 0.023261529288174036, "grad_norm": 19.66057404064375, "learning_rate": 2.3261084140765064e-05, "loss": 2.7816, "mean_token_accuracy": 0.3620689630508423, "step": 23095 }, { "epoch": 0.02326656534127821, "grad_norm": 14.09174184629034, "learning_rate": 2.3266120097496124e-05, "loss": 2.4464, "mean_token_accuracy": 0.45741077661514284, "step": 23100 }, { "epoch": 0.023271601394382384, "grad_norm": 12.128026023680654, "learning_rate": 2.3271156054227183e-05, "loss": 2.2867, "mean_token_accuracy": 0.412583190202713, "step": 23105 }, { "epoch": 0.023276637447486558, "grad_norm": 19.176754278836363, "learning_rate": 2.3276192010958242e-05, "loss": 2.7116, "mean_token_accuracy": 0.4310344815254211, "step": 23110 }, { "epoch": 0.023281673500590728, "grad_norm": 18.963712150852185, "learning_rate": 2.3281227967689305e-05, "loss": 2.3551, "mean_token_accuracy": 0.43793103098869324, "step": 23115 }, { "epoch": 0.0232867095536949, "grad_norm": 20.51487798917281, "learning_rate": 2.3286263924420364e-05, "loss": 2.3704, "mean_token_accuracy": 0.4103448212146759, "step": 23120 }, { "epoch": 0.023291745606799075, "grad_norm": 21.575264438222536, "learning_rate": 2.329129988115142e-05, "loss": 2.6654, "mean_token_accuracy": 0.3793103456497192, "step": 23125 }, { "epoch": 0.023296781659903246, "grad_norm": 21.305805100371053, "learning_rate": 2.3296335837882482e-05, "loss": 2.7653, "mean_token_accuracy": 0.37931033670902253, "step": 23130 }, { "epoch": 0.02330181771300742, "grad_norm": 18.156066236647195, "learning_rate": 2.3301371794613542e-05, "loss": 2.6291, "mean_token_accuracy": 0.38275861740112305, "step": 23135 }, { "epoch": 0.023306853766111593, "grad_norm": 24.421629261010917, "learning_rate": 2.3306407751344604e-05, "loss": 2.5404, "mean_token_accuracy": 0.41379310488700866, "step": 23140 }, { "epoch": 0.023311889819215767, "grad_norm": 20.71044362521377, "learning_rate": 2.331144370807566e-05, "loss": 2.8435, "mean_token_accuracy": 0.33103448152542114, "step": 23145 }, { "epoch": 0.023316925872319937, "grad_norm": 149.8158185969122, "learning_rate": 2.331647966480672e-05, "loss": 2.4191, "mean_token_accuracy": 0.4137930989265442, "step": 23150 }, { "epoch": 0.02332196192542411, "grad_norm": 17.125177233582733, "learning_rate": 2.3321515621537782e-05, "loss": 2.7084, "mean_token_accuracy": 0.47241378426551817, "step": 23155 }, { "epoch": 0.023326997978528285, "grad_norm": 18.4979615776732, "learning_rate": 2.332655157826884e-05, "loss": 2.5377, "mean_token_accuracy": 0.37586206793785093, "step": 23160 }, { "epoch": 0.023332034031632455, "grad_norm": 22.74049703360252, "learning_rate": 2.33315875349999e-05, "loss": 2.1325, "mean_token_accuracy": 0.4655172348022461, "step": 23165 }, { "epoch": 0.02333707008473663, "grad_norm": 28.40734587243835, "learning_rate": 2.333662349173096e-05, "loss": 2.1769, "mean_token_accuracy": 0.4448275864124298, "step": 23170 }, { "epoch": 0.023342106137840803, "grad_norm": 15.90823415689045, "learning_rate": 2.334165944846202e-05, "loss": 2.2317, "mean_token_accuracy": 0.48275862336158754, "step": 23175 }, { "epoch": 0.023347142190944976, "grad_norm": 20.8508626072081, "learning_rate": 2.3346695405193082e-05, "loss": 2.2442, "mean_token_accuracy": 0.45862069725990295, "step": 23180 }, { "epoch": 0.023352178244049147, "grad_norm": 19.182113655354367, "learning_rate": 2.335173136192414e-05, "loss": 2.6433, "mean_token_accuracy": 0.417241370677948, "step": 23185 }, { "epoch": 0.02335721429715332, "grad_norm": 20.67039475556455, "learning_rate": 2.3356767318655197e-05, "loss": 2.4209, "mean_token_accuracy": 0.42068964838981626, "step": 23190 }, { "epoch": 0.023362250350257494, "grad_norm": 20.7741829319452, "learning_rate": 2.336180327538626e-05, "loss": 2.2663, "mean_token_accuracy": 0.49039408564567566, "step": 23195 }, { "epoch": 0.023367286403361665, "grad_norm": 29.140731300066438, "learning_rate": 2.336683923211732e-05, "loss": 2.3962, "mean_token_accuracy": 0.441379314661026, "step": 23200 }, { "epoch": 0.02337232245646584, "grad_norm": 21.57276173554743, "learning_rate": 2.3371875188848378e-05, "loss": 2.6041, "mean_token_accuracy": 0.4172413766384125, "step": 23205 }, { "epoch": 0.023377358509570012, "grad_norm": 16.823665657734125, "learning_rate": 2.3376911145579437e-05, "loss": 2.4537, "mean_token_accuracy": 0.4, "step": 23210 }, { "epoch": 0.023382394562674186, "grad_norm": 27.237850246821015, "learning_rate": 2.3381947102310497e-05, "loss": 2.461, "mean_token_accuracy": 0.39655171930789945, "step": 23215 }, { "epoch": 0.023387430615778356, "grad_norm": 18.487204871777276, "learning_rate": 2.338698305904156e-05, "loss": 2.4639, "mean_token_accuracy": 0.4137930929660797, "step": 23220 }, { "epoch": 0.02339246666888253, "grad_norm": 17.68062594413758, "learning_rate": 2.339201901577262e-05, "loss": 2.444, "mean_token_accuracy": 0.4310344934463501, "step": 23225 }, { "epoch": 0.023397502721986704, "grad_norm": 19.572802954405955, "learning_rate": 2.3397054972503678e-05, "loss": 2.6091, "mean_token_accuracy": 0.3793103456497192, "step": 23230 }, { "epoch": 0.023402538775090874, "grad_norm": 35.07218573970589, "learning_rate": 2.3402090929234737e-05, "loss": 2.9119, "mean_token_accuracy": 0.3758620709180832, "step": 23235 }, { "epoch": 0.023407574828195048, "grad_norm": 29.112515450855707, "learning_rate": 2.3407126885965796e-05, "loss": 2.5343, "mean_token_accuracy": 0.4200738906860352, "step": 23240 }, { "epoch": 0.02341261088129922, "grad_norm": 21.15423566602211, "learning_rate": 2.3412162842696855e-05, "loss": 2.3895, "mean_token_accuracy": 0.4413793087005615, "step": 23245 }, { "epoch": 0.023417646934403395, "grad_norm": 26.525838790535417, "learning_rate": 2.3417198799427918e-05, "loss": 2.6741, "mean_token_accuracy": 0.38620689511299133, "step": 23250 }, { "epoch": 0.023422682987507566, "grad_norm": 17.24454108983069, "learning_rate": 2.3422234756158974e-05, "loss": 2.4122, "mean_token_accuracy": 0.38620689511299133, "step": 23255 }, { "epoch": 0.02342771904061174, "grad_norm": 16.945555011004476, "learning_rate": 2.3427270712890037e-05, "loss": 2.5, "mean_token_accuracy": 0.38965516686439516, "step": 23260 }, { "epoch": 0.023432755093715913, "grad_norm": 19.777421857287845, "learning_rate": 2.3432306669621096e-05, "loss": 2.3755, "mean_token_accuracy": 0.42758620977401735, "step": 23265 }, { "epoch": 0.023437791146820083, "grad_norm": 12.783141533142857, "learning_rate": 2.3437342626352155e-05, "loss": 2.3924, "mean_token_accuracy": 0.4620689630508423, "step": 23270 }, { "epoch": 0.023442827199924257, "grad_norm": 18.493512293245807, "learning_rate": 2.3442378583083214e-05, "loss": 2.0751, "mean_token_accuracy": 0.48965516686439514, "step": 23275 }, { "epoch": 0.02344786325302843, "grad_norm": 16.915018131501643, "learning_rate": 2.3447414539814274e-05, "loss": 2.4425, "mean_token_accuracy": 0.3981246203184128, "step": 23280 }, { "epoch": 0.023452899306132605, "grad_norm": 22.963126691437093, "learning_rate": 2.3452450496545333e-05, "loss": 2.7325, "mean_token_accuracy": 0.3379310339689255, "step": 23285 }, { "epoch": 0.023457935359236775, "grad_norm": 14.693629660666877, "learning_rate": 2.3457486453276395e-05, "loss": 2.3208, "mean_token_accuracy": 0.4275861978530884, "step": 23290 }, { "epoch": 0.02346297141234095, "grad_norm": 18.11448411514457, "learning_rate": 2.3462522410007455e-05, "loss": 2.7619, "mean_token_accuracy": 0.3551724135875702, "step": 23295 }, { "epoch": 0.023468007465445122, "grad_norm": 26.292095807138796, "learning_rate": 2.3467558366738514e-05, "loss": 2.7848, "mean_token_accuracy": 0.3793103516101837, "step": 23300 }, { "epoch": 0.023473043518549293, "grad_norm": 18.532510370464035, "learning_rate": 2.3472594323469573e-05, "loss": 2.1855, "mean_token_accuracy": 0.47586206793785096, "step": 23305 }, { "epoch": 0.023478079571653467, "grad_norm": 17.28162510098119, "learning_rate": 2.3477630280200633e-05, "loss": 2.5518, "mean_token_accuracy": 0.39999998807907106, "step": 23310 }, { "epoch": 0.02348311562475764, "grad_norm": 19.6675761182803, "learning_rate": 2.3482666236931695e-05, "loss": 2.3339, "mean_token_accuracy": 0.37586206793785093, "step": 23315 }, { "epoch": 0.023488151677861814, "grad_norm": 27.76457846588624, "learning_rate": 2.3487702193662754e-05, "loss": 2.5959, "mean_token_accuracy": 0.40689656138420105, "step": 23320 }, { "epoch": 0.023493187730965984, "grad_norm": 20.64928450570137, "learning_rate": 2.3492738150393814e-05, "loss": 3.0885, "mean_token_accuracy": 0.3172413736581802, "step": 23325 }, { "epoch": 0.023498223784070158, "grad_norm": 18.917560597409945, "learning_rate": 2.3497774107124873e-05, "loss": 2.4874, "mean_token_accuracy": 0.4482758641242981, "step": 23330 }, { "epoch": 0.023503259837174332, "grad_norm": 15.411758240406288, "learning_rate": 2.3502810063855932e-05, "loss": 2.1298, "mean_token_accuracy": 0.4793103516101837, "step": 23335 }, { "epoch": 0.023508295890278502, "grad_norm": 20.015300306624695, "learning_rate": 2.3507846020586995e-05, "loss": 2.3879, "mean_token_accuracy": 0.39310344457626345, "step": 23340 }, { "epoch": 0.023513331943382676, "grad_norm": 17.345719924823456, "learning_rate": 2.351288197731805e-05, "loss": 2.3847, "mean_token_accuracy": 0.4034482717514038, "step": 23345 }, { "epoch": 0.02351836799648685, "grad_norm": 22.258021269623214, "learning_rate": 2.351791793404911e-05, "loss": 2.3043, "mean_token_accuracy": 0.45517241954803467, "step": 23350 }, { "epoch": 0.023523404049591023, "grad_norm": 18.761405749653825, "learning_rate": 2.3522953890780173e-05, "loss": 2.7389, "mean_token_accuracy": 0.42044767141342165, "step": 23355 }, { "epoch": 0.023528440102695194, "grad_norm": 20.208979398983107, "learning_rate": 2.3527989847511232e-05, "loss": 2.2699, "mean_token_accuracy": 0.43793103098869324, "step": 23360 }, { "epoch": 0.023533476155799368, "grad_norm": 19.523103142859696, "learning_rate": 2.353302580424229e-05, "loss": 2.8238, "mean_token_accuracy": 0.35517241060733795, "step": 23365 }, { "epoch": 0.02353851220890354, "grad_norm": 18.31660151996064, "learning_rate": 2.353806176097335e-05, "loss": 2.5006, "mean_token_accuracy": 0.4034482777118683, "step": 23370 }, { "epoch": 0.02354354826200771, "grad_norm": 18.759401601924488, "learning_rate": 2.354309771770441e-05, "loss": 2.5492, "mean_token_accuracy": 0.43103447556495667, "step": 23375 }, { "epoch": 0.023548584315111885, "grad_norm": 16.21704045480718, "learning_rate": 2.3548133674435472e-05, "loss": 2.8948, "mean_token_accuracy": 0.36896551251411436, "step": 23380 }, { "epoch": 0.02355362036821606, "grad_norm": 15.923460784263046, "learning_rate": 2.355316963116653e-05, "loss": 2.6577, "mean_token_accuracy": 0.4137930989265442, "step": 23385 }, { "epoch": 0.023558656421320233, "grad_norm": 18.906590448105323, "learning_rate": 2.3558205587897587e-05, "loss": 2.3943, "mean_token_accuracy": 0.39655172228813174, "step": 23390 }, { "epoch": 0.023563692474424403, "grad_norm": 30.838227828202086, "learning_rate": 2.356324154462865e-05, "loss": 2.3409, "mean_token_accuracy": 0.4551724135875702, "step": 23395 }, { "epoch": 0.023568728527528577, "grad_norm": 20.76235156703052, "learning_rate": 2.356827750135971e-05, "loss": 2.5859, "mean_token_accuracy": 0.3931034505367279, "step": 23400 }, { "epoch": 0.02357376458063275, "grad_norm": 20.699009468689052, "learning_rate": 2.3573313458090772e-05, "loss": 2.5678, "mean_token_accuracy": 0.40344828367233276, "step": 23405 }, { "epoch": 0.02357880063373692, "grad_norm": 17.94740499286189, "learning_rate": 2.3578349414821828e-05, "loss": 2.2906, "mean_token_accuracy": 0.44827587008476255, "step": 23410 }, { "epoch": 0.023583836686841095, "grad_norm": 20.281713312645824, "learning_rate": 2.3583385371552887e-05, "loss": 2.421, "mean_token_accuracy": 0.4379310369491577, "step": 23415 }, { "epoch": 0.02358887273994527, "grad_norm": 18.559842139842036, "learning_rate": 2.358842132828395e-05, "loss": 2.3308, "mean_token_accuracy": 0.47586206197738645, "step": 23420 }, { "epoch": 0.023593908793049442, "grad_norm": 17.881272738383622, "learning_rate": 2.359345728501501e-05, "loss": 2.717, "mean_token_accuracy": 0.3896551728248596, "step": 23425 }, { "epoch": 0.023598944846153613, "grad_norm": 15.785861140237035, "learning_rate": 2.3598493241746068e-05, "loss": 2.5003, "mean_token_accuracy": 0.44482759237289426, "step": 23430 }, { "epoch": 0.023603980899257786, "grad_norm": 23.018795432041433, "learning_rate": 2.3603529198477127e-05, "loss": 2.5212, "mean_token_accuracy": 0.43623715043067934, "step": 23435 }, { "epoch": 0.02360901695236196, "grad_norm": 24.590844691676757, "learning_rate": 2.3608565155208187e-05, "loss": 2.7643, "mean_token_accuracy": 0.39655172228813174, "step": 23440 }, { "epoch": 0.02361405300546613, "grad_norm": 20.011124953553157, "learning_rate": 2.361360111193925e-05, "loss": 2.5008, "mean_token_accuracy": 0.4068965554237366, "step": 23445 }, { "epoch": 0.023619089058570304, "grad_norm": 18.4145459224808, "learning_rate": 2.361863706867031e-05, "loss": 2.5606, "mean_token_accuracy": 0.43448275327682495, "step": 23450 }, { "epoch": 0.023624125111674478, "grad_norm": 23.77908547581733, "learning_rate": 2.3623673025401364e-05, "loss": 2.4494, "mean_token_accuracy": 0.41379310488700866, "step": 23455 }, { "epoch": 0.023629161164778652, "grad_norm": 26.079843414908968, "learning_rate": 2.3628708982132427e-05, "loss": 2.851, "mean_token_accuracy": 0.3793103486299515, "step": 23460 }, { "epoch": 0.023634197217882822, "grad_norm": 22.261213018982513, "learning_rate": 2.3633744938863486e-05, "loss": 2.4946, "mean_token_accuracy": 0.40344826877117157, "step": 23465 }, { "epoch": 0.023639233270986996, "grad_norm": 19.64210856798974, "learning_rate": 2.3638780895594546e-05, "loss": 2.5668, "mean_token_accuracy": 0.36896551847457887, "step": 23470 }, { "epoch": 0.02364426932409117, "grad_norm": 21.561370789108572, "learning_rate": 2.3643816852325608e-05, "loss": 2.4079, "mean_token_accuracy": 0.4017543911933899, "step": 23475 }, { "epoch": 0.02364930537719534, "grad_norm": 16.11440392242275, "learning_rate": 2.3648852809056664e-05, "loss": 2.2252, "mean_token_accuracy": 0.46551724672317507, "step": 23480 }, { "epoch": 0.023654341430299514, "grad_norm": 22.171707410434447, "learning_rate": 2.3653888765787727e-05, "loss": 2.907, "mean_token_accuracy": 0.4, "step": 23485 }, { "epoch": 0.023659377483403687, "grad_norm": 31.560156884595546, "learning_rate": 2.3658924722518786e-05, "loss": 2.4305, "mean_token_accuracy": 0.42413793206214906, "step": 23490 }, { "epoch": 0.02366441353650786, "grad_norm": 23.9305298764277, "learning_rate": 2.3663960679249845e-05, "loss": 2.5919, "mean_token_accuracy": 0.39655172228813174, "step": 23495 }, { "epoch": 0.02366944958961203, "grad_norm": 19.874659995757643, "learning_rate": 2.3668996635980904e-05, "loss": 2.4001, "mean_token_accuracy": 0.3931034505367279, "step": 23500 }, { "epoch": 0.023674485642716205, "grad_norm": 17.544494861672433, "learning_rate": 2.3674032592711964e-05, "loss": 2.3061, "mean_token_accuracy": 0.42758620381355283, "step": 23505 }, { "epoch": 0.02367952169582038, "grad_norm": 17.39787595227969, "learning_rate": 2.3679068549443023e-05, "loss": 2.8009, "mean_token_accuracy": 0.35862069129943847, "step": 23510 }, { "epoch": 0.02368455774892455, "grad_norm": 19.495897384488753, "learning_rate": 2.3684104506174086e-05, "loss": 2.0697, "mean_token_accuracy": 0.453901994228363, "step": 23515 }, { "epoch": 0.023689593802028723, "grad_norm": 17.415919447033218, "learning_rate": 2.3689140462905145e-05, "loss": 2.0513, "mean_token_accuracy": 0.4896551609039307, "step": 23520 }, { "epoch": 0.023694629855132897, "grad_norm": 16.298400575046205, "learning_rate": 2.3694176419636204e-05, "loss": 2.3334, "mean_token_accuracy": 0.4620689630508423, "step": 23525 }, { "epoch": 0.02369966590823707, "grad_norm": 22.940057601258907, "learning_rate": 2.3699212376367263e-05, "loss": 2.5494, "mean_token_accuracy": 0.3931034505367279, "step": 23530 }, { "epoch": 0.02370470196134124, "grad_norm": 22.822351243116703, "learning_rate": 2.3704248333098323e-05, "loss": 3.0025, "mean_token_accuracy": 0.37586207389831544, "step": 23535 }, { "epoch": 0.023709738014445415, "grad_norm": 25.967107128512296, "learning_rate": 2.3709284289829385e-05, "loss": 2.6307, "mean_token_accuracy": 0.3827586203813553, "step": 23540 }, { "epoch": 0.02371477406754959, "grad_norm": 18.29927294579388, "learning_rate": 2.371432024656044e-05, "loss": 2.7508, "mean_token_accuracy": 0.3758620709180832, "step": 23545 }, { "epoch": 0.02371981012065376, "grad_norm": 29.075152195889707, "learning_rate": 2.37193562032915e-05, "loss": 3.0583, "mean_token_accuracy": 0.34482758343219755, "step": 23550 }, { "epoch": 0.023724846173757932, "grad_norm": 17.41635731050841, "learning_rate": 2.3724392160022563e-05, "loss": 2.5049, "mean_token_accuracy": 0.4172413766384125, "step": 23555 }, { "epoch": 0.023729882226862106, "grad_norm": 20.45651449521775, "learning_rate": 2.3729428116753622e-05, "loss": 2.2647, "mean_token_accuracy": 0.4427102208137512, "step": 23560 }, { "epoch": 0.02373491827996628, "grad_norm": 16.140200188330002, "learning_rate": 2.373446407348468e-05, "loss": 2.5712, "mean_token_accuracy": 0.41724138259887694, "step": 23565 }, { "epoch": 0.02373995433307045, "grad_norm": 26.286340727833142, "learning_rate": 2.373950003021574e-05, "loss": 2.3267, "mean_token_accuracy": 0.38275861740112305, "step": 23570 }, { "epoch": 0.023744990386174624, "grad_norm": 19.212919218146784, "learning_rate": 2.37445359869468e-05, "loss": 2.3797, "mean_token_accuracy": 0.4413793087005615, "step": 23575 }, { "epoch": 0.023750026439278798, "grad_norm": 16.64514116275354, "learning_rate": 2.3749571943677863e-05, "loss": 2.3742, "mean_token_accuracy": 0.4586206912994385, "step": 23580 }, { "epoch": 0.023755062492382968, "grad_norm": 18.944658690430504, "learning_rate": 2.3754607900408922e-05, "loss": 2.6041, "mean_token_accuracy": 0.38275861740112305, "step": 23585 }, { "epoch": 0.023760098545487142, "grad_norm": 17.386407272354063, "learning_rate": 2.3759643857139978e-05, "loss": 2.212, "mean_token_accuracy": 0.43793103098869324, "step": 23590 }, { "epoch": 0.023765134598591316, "grad_norm": 17.44829653757147, "learning_rate": 2.376467981387104e-05, "loss": 2.6267, "mean_token_accuracy": 0.4430732011795044, "step": 23595 }, { "epoch": 0.02377017065169549, "grad_norm": 21.47115270928177, "learning_rate": 2.37697157706021e-05, "loss": 2.4859, "mean_token_accuracy": 0.43103448748588563, "step": 23600 }, { "epoch": 0.02377520670479966, "grad_norm": 17.36558017869824, "learning_rate": 2.3774751727333162e-05, "loss": 2.2194, "mean_token_accuracy": 0.4689655125141144, "step": 23605 }, { "epoch": 0.023780242757903833, "grad_norm": 16.234407530308633, "learning_rate": 2.3779787684064218e-05, "loss": 2.6729, "mean_token_accuracy": 0.4, "step": 23610 }, { "epoch": 0.023785278811008007, "grad_norm": 15.885473407388053, "learning_rate": 2.3784823640795277e-05, "loss": 2.205, "mean_token_accuracy": 0.4758620738983154, "step": 23615 }, { "epoch": 0.023790314864112178, "grad_norm": 17.728453772543524, "learning_rate": 2.378985959752634e-05, "loss": 2.7969, "mean_token_accuracy": 0.3655172407627106, "step": 23620 }, { "epoch": 0.02379535091721635, "grad_norm": 18.11546209261276, "learning_rate": 2.37948955542574e-05, "loss": 2.7741, "mean_token_accuracy": 0.3344827562570572, "step": 23625 }, { "epoch": 0.023800386970320525, "grad_norm": 16.705661265304805, "learning_rate": 2.379993151098846e-05, "loss": 2.2082, "mean_token_accuracy": 0.44482758045196535, "step": 23630 }, { "epoch": 0.0238054230234247, "grad_norm": 15.610559454990113, "learning_rate": 2.3804967467719518e-05, "loss": 2.3724, "mean_token_accuracy": 0.4482758641242981, "step": 23635 }, { "epoch": 0.02381045907652887, "grad_norm": 19.851795065165458, "learning_rate": 2.3810003424450577e-05, "loss": 2.5138, "mean_token_accuracy": 0.3965517282485962, "step": 23640 }, { "epoch": 0.023815495129633043, "grad_norm": 18.216179167334186, "learning_rate": 2.381503938118164e-05, "loss": 2.1583, "mean_token_accuracy": 0.4655172348022461, "step": 23645 }, { "epoch": 0.023820531182737217, "grad_norm": 29.54391006734845, "learning_rate": 2.38200753379127e-05, "loss": 2.7016, "mean_token_accuracy": 0.41379310488700866, "step": 23650 }, { "epoch": 0.023825567235841387, "grad_norm": 20.483513820459095, "learning_rate": 2.3825111294643758e-05, "loss": 2.4519, "mean_token_accuracy": 0.37241379022598264, "step": 23655 }, { "epoch": 0.02383060328894556, "grad_norm": 20.545656693137648, "learning_rate": 2.3830147251374817e-05, "loss": 2.5436, "mean_token_accuracy": 0.38620689511299133, "step": 23660 }, { "epoch": 0.023835639342049735, "grad_norm": 23.215230703990457, "learning_rate": 2.3835183208105877e-05, "loss": 2.3415, "mean_token_accuracy": 0.4172413766384125, "step": 23665 }, { "epoch": 0.02384067539515391, "grad_norm": 19.119674326637153, "learning_rate": 2.3840219164836936e-05, "loss": 2.2953, "mean_token_accuracy": 0.42413793206214906, "step": 23670 }, { "epoch": 0.02384571144825808, "grad_norm": 20.949025855348417, "learning_rate": 2.3845255121568e-05, "loss": 2.3744, "mean_token_accuracy": 0.42758620977401735, "step": 23675 }, { "epoch": 0.023850747501362252, "grad_norm": 19.02680851002806, "learning_rate": 2.3850291078299054e-05, "loss": 2.3298, "mean_token_accuracy": 0.458620685338974, "step": 23680 }, { "epoch": 0.023855783554466426, "grad_norm": 20.85833422456774, "learning_rate": 2.3855327035030117e-05, "loss": 2.4811, "mean_token_accuracy": 0.37241379618644715, "step": 23685 }, { "epoch": 0.023860819607570596, "grad_norm": 23.85156260096753, "learning_rate": 2.3860362991761176e-05, "loss": 2.9855, "mean_token_accuracy": 0.2965517193078995, "step": 23690 }, { "epoch": 0.02386585566067477, "grad_norm": 16.55918881731979, "learning_rate": 2.3865398948492236e-05, "loss": 2.6395, "mean_token_accuracy": 0.39310344457626345, "step": 23695 }, { "epoch": 0.023870891713778944, "grad_norm": 51.2283543673357, "learning_rate": 2.3870434905223295e-05, "loss": 2.473, "mean_token_accuracy": 0.4241379380226135, "step": 23700 }, { "epoch": 0.023875927766883118, "grad_norm": 17.387664292208008, "learning_rate": 2.3875470861954354e-05, "loss": 2.3641, "mean_token_accuracy": 0.45517241954803467, "step": 23705 }, { "epoch": 0.023880963819987288, "grad_norm": 20.42409548869287, "learning_rate": 2.3880506818685413e-05, "loss": 2.5717, "mean_token_accuracy": 0.4137930989265442, "step": 23710 }, { "epoch": 0.023885999873091462, "grad_norm": 18.3082402997011, "learning_rate": 2.3885542775416476e-05, "loss": 2.5601, "mean_token_accuracy": 0.4413793206214905, "step": 23715 }, { "epoch": 0.023891035926195636, "grad_norm": 24.44136640189517, "learning_rate": 2.3890578732147535e-05, "loss": 2.3461, "mean_token_accuracy": 0.47931034564971925, "step": 23720 }, { "epoch": 0.023896071979299806, "grad_norm": 17.13760110371456, "learning_rate": 2.3895614688878595e-05, "loss": 2.2449, "mean_token_accuracy": 0.43448275327682495, "step": 23725 }, { "epoch": 0.02390110803240398, "grad_norm": 15.073744295334292, "learning_rate": 2.3900650645609654e-05, "loss": 2.5755, "mean_token_accuracy": 0.4034482717514038, "step": 23730 }, { "epoch": 0.023906144085508153, "grad_norm": 28.7440150885389, "learning_rate": 2.3905686602340713e-05, "loss": 2.6923, "mean_token_accuracy": 0.3827586203813553, "step": 23735 }, { "epoch": 0.023911180138612327, "grad_norm": 20.02097132267851, "learning_rate": 2.3910722559071776e-05, "loss": 2.3861, "mean_token_accuracy": 0.42758620381355283, "step": 23740 }, { "epoch": 0.023916216191716497, "grad_norm": 17.307676050640012, "learning_rate": 2.391575851580283e-05, "loss": 2.2756, "mean_token_accuracy": 0.43103448748588563, "step": 23745 }, { "epoch": 0.02392125224482067, "grad_norm": 19.388220384863942, "learning_rate": 2.3920794472533894e-05, "loss": 2.3781, "mean_token_accuracy": 0.4517241418361664, "step": 23750 }, { "epoch": 0.023926288297924845, "grad_norm": 23.414296383731934, "learning_rate": 2.3925830429264953e-05, "loss": 2.8317, "mean_token_accuracy": 0.39310345351696013, "step": 23755 }, { "epoch": 0.023931324351029015, "grad_norm": 22.130689801476226, "learning_rate": 2.3930866385996013e-05, "loss": 2.7132, "mean_token_accuracy": 0.3551724076271057, "step": 23760 }, { "epoch": 0.02393636040413319, "grad_norm": 30.67360784502667, "learning_rate": 2.3935902342727072e-05, "loss": 2.9197, "mean_token_accuracy": 0.3620689630508423, "step": 23765 }, { "epoch": 0.023941396457237363, "grad_norm": 17.469142261168418, "learning_rate": 2.394093829945813e-05, "loss": 2.4147, "mean_token_accuracy": 0.441379314661026, "step": 23770 }, { "epoch": 0.023946432510341537, "grad_norm": 37.6236875387337, "learning_rate": 2.394597425618919e-05, "loss": 2.3822, "mean_token_accuracy": 0.4466424763202667, "step": 23775 }, { "epoch": 0.023951468563445707, "grad_norm": 21.66049301080567, "learning_rate": 2.3951010212920253e-05, "loss": 2.5332, "mean_token_accuracy": 0.39310344457626345, "step": 23780 }, { "epoch": 0.02395650461654988, "grad_norm": 35.47751975115294, "learning_rate": 2.3956046169651312e-05, "loss": 2.7288, "mean_token_accuracy": 0.3862068891525269, "step": 23785 }, { "epoch": 0.023961540669654054, "grad_norm": 19.618442503573856, "learning_rate": 2.396108212638237e-05, "loss": 2.5422, "mean_token_accuracy": 0.4206896543502808, "step": 23790 }, { "epoch": 0.023966576722758225, "grad_norm": 20.179552832379752, "learning_rate": 2.396611808311343e-05, "loss": 2.2058, "mean_token_accuracy": 0.441379314661026, "step": 23795 }, { "epoch": 0.0239716127758624, "grad_norm": 17.96604188546113, "learning_rate": 2.397115403984449e-05, "loss": 2.4077, "mean_token_accuracy": 0.4344827592372894, "step": 23800 }, { "epoch": 0.023976648828966572, "grad_norm": 20.144492100942074, "learning_rate": 2.3976189996575553e-05, "loss": 2.2976, "mean_token_accuracy": 0.43103448748588563, "step": 23805 }, { "epoch": 0.023981684882070746, "grad_norm": 25.614852115228516, "learning_rate": 2.398122595330661e-05, "loss": 2.4909, "mean_token_accuracy": 0.39310344457626345, "step": 23810 }, { "epoch": 0.023986720935174916, "grad_norm": 18.399126908540016, "learning_rate": 2.3986261910037668e-05, "loss": 2.7282, "mean_token_accuracy": 0.41034482717514037, "step": 23815 }, { "epoch": 0.02399175698827909, "grad_norm": 25.47444317057748, "learning_rate": 2.399129786676873e-05, "loss": 2.513, "mean_token_accuracy": 0.4517241418361664, "step": 23820 }, { "epoch": 0.023996793041383264, "grad_norm": 15.31360084651321, "learning_rate": 2.399633382349979e-05, "loss": 2.1752, "mean_token_accuracy": 0.43793103098869324, "step": 23825 }, { "epoch": 0.024001829094487434, "grad_norm": 22.68012745372398, "learning_rate": 2.400136978023085e-05, "loss": 2.6447, "mean_token_accuracy": 0.42413793206214906, "step": 23830 }, { "epoch": 0.024006865147591608, "grad_norm": 18.168727697916264, "learning_rate": 2.4006405736961908e-05, "loss": 2.4962, "mean_token_accuracy": 0.38275861740112305, "step": 23835 }, { "epoch": 0.02401190120069578, "grad_norm": 16.786267062300013, "learning_rate": 2.4011441693692968e-05, "loss": 2.7204, "mean_token_accuracy": 0.37586207389831544, "step": 23840 }, { "epoch": 0.024016937253799955, "grad_norm": 16.016152976189932, "learning_rate": 2.401647765042403e-05, "loss": 2.375, "mean_token_accuracy": 0.4413793206214905, "step": 23845 }, { "epoch": 0.024021973306904126, "grad_norm": 17.92130516304088, "learning_rate": 2.402151360715509e-05, "loss": 2.4421, "mean_token_accuracy": 0.43448275327682495, "step": 23850 }, { "epoch": 0.0240270093600083, "grad_norm": 16.675625221126964, "learning_rate": 2.402654956388615e-05, "loss": 2.6868, "mean_token_accuracy": 0.3630369007587433, "step": 23855 }, { "epoch": 0.024032045413112473, "grad_norm": 19.952075182854696, "learning_rate": 2.4031585520617208e-05, "loss": 2.3776, "mean_token_accuracy": 0.4068965494632721, "step": 23860 }, { "epoch": 0.024037081466216643, "grad_norm": 22.10554465846689, "learning_rate": 2.4036621477348267e-05, "loss": 2.374, "mean_token_accuracy": 0.45015124082565305, "step": 23865 }, { "epoch": 0.024042117519320817, "grad_norm": 14.676044338683282, "learning_rate": 2.404165743407933e-05, "loss": 2.15, "mean_token_accuracy": 0.5, "step": 23870 }, { "epoch": 0.02404715357242499, "grad_norm": 25.812892098466932, "learning_rate": 2.404669339081039e-05, "loss": 2.4162, "mean_token_accuracy": 0.42068966031074523, "step": 23875 }, { "epoch": 0.024052189625529165, "grad_norm": 23.64032767711632, "learning_rate": 2.4051729347541445e-05, "loss": 2.2248, "mean_token_accuracy": 0.44827585816383364, "step": 23880 }, { "epoch": 0.024057225678633335, "grad_norm": 17.989986680852034, "learning_rate": 2.4056765304272508e-05, "loss": 2.5464, "mean_token_accuracy": 0.3965517163276672, "step": 23885 }, { "epoch": 0.02406226173173751, "grad_norm": 20.044227117834797, "learning_rate": 2.4061801261003567e-05, "loss": 2.2554, "mean_token_accuracy": 0.42413792610168455, "step": 23890 }, { "epoch": 0.024067297784841683, "grad_norm": 20.638153058133938, "learning_rate": 2.4066837217734626e-05, "loss": 2.4875, "mean_token_accuracy": 0.42068966031074523, "step": 23895 }, { "epoch": 0.024072333837945853, "grad_norm": 18.413459950748987, "learning_rate": 2.4071873174465685e-05, "loss": 2.3067, "mean_token_accuracy": 0.4564428210258484, "step": 23900 }, { "epoch": 0.024077369891050027, "grad_norm": 16.828320695018007, "learning_rate": 2.4076909131196745e-05, "loss": 2.3119, "mean_token_accuracy": 0.4206896543502808, "step": 23905 }, { "epoch": 0.0240824059441542, "grad_norm": 21.81673854474231, "learning_rate": 2.4081945087927807e-05, "loss": 2.4467, "mean_token_accuracy": 0.4034482777118683, "step": 23910 }, { "epoch": 0.024087441997258374, "grad_norm": 23.49287935803324, "learning_rate": 2.4086981044658866e-05, "loss": 2.5759, "mean_token_accuracy": 0.3965517282485962, "step": 23915 }, { "epoch": 0.024092478050362545, "grad_norm": 19.30502263654567, "learning_rate": 2.4092017001389926e-05, "loss": 2.7033, "mean_token_accuracy": 0.39310343861579894, "step": 23920 }, { "epoch": 0.02409751410346672, "grad_norm": 17.662606857422873, "learning_rate": 2.4097052958120985e-05, "loss": 2.7485, "mean_token_accuracy": 0.4034482777118683, "step": 23925 }, { "epoch": 0.024102550156570892, "grad_norm": 16.215784273263488, "learning_rate": 2.4102088914852044e-05, "loss": 2.5865, "mean_token_accuracy": 0.4, "step": 23930 }, { "epoch": 0.024107586209675062, "grad_norm": 26.66835260547999, "learning_rate": 2.4107124871583103e-05, "loss": 2.7017, "mean_token_accuracy": 0.42758620381355283, "step": 23935 }, { "epoch": 0.024112622262779236, "grad_norm": 19.913434709524605, "learning_rate": 2.4112160828314166e-05, "loss": 2.4656, "mean_token_accuracy": 0.3999999940395355, "step": 23940 }, { "epoch": 0.02411765831588341, "grad_norm": 21.170122512768614, "learning_rate": 2.4117196785045222e-05, "loss": 2.6768, "mean_token_accuracy": 0.3517241269350052, "step": 23945 }, { "epoch": 0.024122694368987584, "grad_norm": 18.035763102311954, "learning_rate": 2.4122232741776285e-05, "loss": 2.5846, "mean_token_accuracy": 0.42758620977401735, "step": 23950 }, { "epoch": 0.024127730422091754, "grad_norm": 19.4718076259686, "learning_rate": 2.4127268698507344e-05, "loss": 2.322, "mean_token_accuracy": 0.44827585816383364, "step": 23955 }, { "epoch": 0.024132766475195928, "grad_norm": 21.585899262498586, "learning_rate": 2.4132304655238403e-05, "loss": 2.6497, "mean_token_accuracy": 0.43793103098869324, "step": 23960 }, { "epoch": 0.0241378025283001, "grad_norm": 19.850923863495765, "learning_rate": 2.4137340611969462e-05, "loss": 2.6617, "mean_token_accuracy": 0.4075123131275177, "step": 23965 }, { "epoch": 0.024142838581404272, "grad_norm": 22.17783684920762, "learning_rate": 2.414237656870052e-05, "loss": 2.3709, "mean_token_accuracy": 0.4600120961666107, "step": 23970 }, { "epoch": 0.024147874634508446, "grad_norm": 16.894900167050032, "learning_rate": 2.414741252543158e-05, "loss": 2.8468, "mean_token_accuracy": 0.3379310339689255, "step": 23975 }, { "epoch": 0.02415291068761262, "grad_norm": 16.912222409000538, "learning_rate": 2.4152448482162644e-05, "loss": 2.8597, "mean_token_accuracy": 0.3551724076271057, "step": 23980 }, { "epoch": 0.024157946740716793, "grad_norm": 16.193736500925475, "learning_rate": 2.4157484438893703e-05, "loss": 2.3511, "mean_token_accuracy": 0.44482757449150084, "step": 23985 }, { "epoch": 0.024162982793820963, "grad_norm": 21.53111701601264, "learning_rate": 2.4162520395624762e-05, "loss": 2.2601, "mean_token_accuracy": 0.44827585816383364, "step": 23990 }, { "epoch": 0.024168018846925137, "grad_norm": 21.193268423501816, "learning_rate": 2.416755635235582e-05, "loss": 2.8239, "mean_token_accuracy": 0.3551724135875702, "step": 23995 }, { "epoch": 0.02417305490002931, "grad_norm": 20.592701760354394, "learning_rate": 2.417259230908688e-05, "loss": 2.4138, "mean_token_accuracy": 0.37241379618644715, "step": 24000 }, { "epoch": 0.02417809095313348, "grad_norm": 19.921432117200787, "learning_rate": 2.4177628265817943e-05, "loss": 2.4218, "mean_token_accuracy": 0.4034482717514038, "step": 24005 }, { "epoch": 0.024183127006237655, "grad_norm": 21.550864731566712, "learning_rate": 2.4182664222549002e-05, "loss": 2.817, "mean_token_accuracy": 0.4103448331356049, "step": 24010 }, { "epoch": 0.02418816305934183, "grad_norm": 17.768554714060226, "learning_rate": 2.418770017928006e-05, "loss": 2.3806, "mean_token_accuracy": 0.44482759237289426, "step": 24015 }, { "epoch": 0.024193199112446002, "grad_norm": 18.925638938620864, "learning_rate": 2.419273613601112e-05, "loss": 2.7243, "mean_token_accuracy": 0.36896550953388213, "step": 24020 }, { "epoch": 0.024198235165550173, "grad_norm": 24.90100334270838, "learning_rate": 2.419777209274218e-05, "loss": 2.8157, "mean_token_accuracy": 0.4034482777118683, "step": 24025 }, { "epoch": 0.024203271218654347, "grad_norm": 17.928949221693, "learning_rate": 2.4202808049473243e-05, "loss": 2.4075, "mean_token_accuracy": 0.42758620381355283, "step": 24030 }, { "epoch": 0.02420830727175852, "grad_norm": 19.140987729560628, "learning_rate": 2.42078440062043e-05, "loss": 2.5518, "mean_token_accuracy": 0.4034482777118683, "step": 24035 }, { "epoch": 0.02421334332486269, "grad_norm": 20.30586760951174, "learning_rate": 2.4212879962935358e-05, "loss": 2.5527, "mean_token_accuracy": 0.4206896543502808, "step": 24040 }, { "epoch": 0.024218379377966864, "grad_norm": 17.551572643904933, "learning_rate": 2.421791591966642e-05, "loss": 2.455, "mean_token_accuracy": 0.42758620381355283, "step": 24045 }, { "epoch": 0.024223415431071038, "grad_norm": 16.342841664109613, "learning_rate": 2.422295187639748e-05, "loss": 2.3376, "mean_token_accuracy": 0.4275862157344818, "step": 24050 }, { "epoch": 0.024228451484175212, "grad_norm": 17.804604179470658, "learning_rate": 2.422798783312854e-05, "loss": 2.6032, "mean_token_accuracy": 0.4068965494632721, "step": 24055 }, { "epoch": 0.024233487537279382, "grad_norm": 17.592264622386562, "learning_rate": 2.42330237898596e-05, "loss": 2.7319, "mean_token_accuracy": 0.37241379022598264, "step": 24060 }, { "epoch": 0.024238523590383556, "grad_norm": 18.70524892686871, "learning_rate": 2.4238059746590658e-05, "loss": 2.792, "mean_token_accuracy": 0.33103448152542114, "step": 24065 }, { "epoch": 0.02424355964348773, "grad_norm": 18.73021823163781, "learning_rate": 2.424309570332172e-05, "loss": 2.5406, "mean_token_accuracy": 0.38965516686439516, "step": 24070 }, { "epoch": 0.0242485956965919, "grad_norm": 16.912862108497624, "learning_rate": 2.424813166005278e-05, "loss": 2.2733, "mean_token_accuracy": 0.4379310369491577, "step": 24075 }, { "epoch": 0.024253631749696074, "grad_norm": 22.8795921811821, "learning_rate": 2.4253167616783835e-05, "loss": 2.5659, "mean_token_accuracy": 0.4, "step": 24080 }, { "epoch": 0.024258667802800248, "grad_norm": 12.80166004182257, "learning_rate": 2.4258203573514898e-05, "loss": 2.2321, "mean_token_accuracy": 0.458620685338974, "step": 24085 }, { "epoch": 0.02426370385590442, "grad_norm": 22.465262436657135, "learning_rate": 2.4263239530245957e-05, "loss": 2.2853, "mean_token_accuracy": 0.4344827592372894, "step": 24090 }, { "epoch": 0.02426873990900859, "grad_norm": 15.623299848179487, "learning_rate": 2.4268275486977017e-05, "loss": 2.3665, "mean_token_accuracy": 0.43623715043067934, "step": 24095 }, { "epoch": 0.024273775962112765, "grad_norm": 19.558770115813186, "learning_rate": 2.4273311443708076e-05, "loss": 2.2891, "mean_token_accuracy": 0.43103447556495667, "step": 24100 }, { "epoch": 0.02427881201521694, "grad_norm": 19.686222048753695, "learning_rate": 2.4278347400439135e-05, "loss": 2.1801, "mean_token_accuracy": 0.4689655125141144, "step": 24105 }, { "epoch": 0.02428384806832111, "grad_norm": 19.39144627877769, "learning_rate": 2.4283383357170198e-05, "loss": 2.7346, "mean_token_accuracy": 0.3758620619773865, "step": 24110 }, { "epoch": 0.024288884121425283, "grad_norm": 18.151312673542574, "learning_rate": 2.4288419313901257e-05, "loss": 2.4198, "mean_token_accuracy": 0.4379310369491577, "step": 24115 }, { "epoch": 0.024293920174529457, "grad_norm": 19.477249769427054, "learning_rate": 2.4293455270632316e-05, "loss": 2.4622, "mean_token_accuracy": 0.4379310429096222, "step": 24120 }, { "epoch": 0.02429895622763363, "grad_norm": 31.726814248013124, "learning_rate": 2.4298491227363375e-05, "loss": 2.5321, "mean_token_accuracy": 0.41724138259887694, "step": 24125 }, { "epoch": 0.0243039922807378, "grad_norm": 18.966070269860957, "learning_rate": 2.4303527184094435e-05, "loss": 2.758, "mean_token_accuracy": 0.37931033968925476, "step": 24130 }, { "epoch": 0.024309028333841975, "grad_norm": 17.80509596771346, "learning_rate": 2.4308563140825497e-05, "loss": 2.4839, "mean_token_accuracy": 0.4000000059604645, "step": 24135 }, { "epoch": 0.02431406438694615, "grad_norm": 19.937091406991943, "learning_rate": 2.4313599097556557e-05, "loss": 2.703, "mean_token_accuracy": 0.4344827592372894, "step": 24140 }, { "epoch": 0.02431910044005032, "grad_norm": 15.898890802254346, "learning_rate": 2.4318635054287612e-05, "loss": 2.3847, "mean_token_accuracy": 0.42413792908191683, "step": 24145 }, { "epoch": 0.024324136493154493, "grad_norm": 19.43573129709378, "learning_rate": 2.4323671011018675e-05, "loss": 2.5674, "mean_token_accuracy": 0.44137930274009707, "step": 24150 }, { "epoch": 0.024329172546258666, "grad_norm": 20.771969583912146, "learning_rate": 2.4328706967749734e-05, "loss": 2.2578, "mean_token_accuracy": 0.4482758641242981, "step": 24155 }, { "epoch": 0.02433420859936284, "grad_norm": 23.651249681451077, "learning_rate": 2.4333742924480794e-05, "loss": 2.3961, "mean_token_accuracy": 0.41724138259887694, "step": 24160 }, { "epoch": 0.02433924465246701, "grad_norm": 17.032808415753045, "learning_rate": 2.4338778881211853e-05, "loss": 2.236, "mean_token_accuracy": 0.44827585816383364, "step": 24165 }, { "epoch": 0.024344280705571184, "grad_norm": 24.679534951733046, "learning_rate": 2.4343814837942912e-05, "loss": 2.2653, "mean_token_accuracy": 0.42413793206214906, "step": 24170 }, { "epoch": 0.024349316758675358, "grad_norm": 20.729235655324032, "learning_rate": 2.4348850794673975e-05, "loss": 2.855, "mean_token_accuracy": 0.4068965494632721, "step": 24175 }, { "epoch": 0.024354352811779528, "grad_norm": 20.261565463783676, "learning_rate": 2.4353886751405034e-05, "loss": 2.6154, "mean_token_accuracy": 0.4068965554237366, "step": 24180 }, { "epoch": 0.024359388864883702, "grad_norm": 17.39036865028312, "learning_rate": 2.4358922708136093e-05, "loss": 2.2315, "mean_token_accuracy": 0.4551724076271057, "step": 24185 }, { "epoch": 0.024364424917987876, "grad_norm": 44.123394292928026, "learning_rate": 2.4363958664867153e-05, "loss": 2.3042, "mean_token_accuracy": 0.43103448748588563, "step": 24190 }, { "epoch": 0.02436946097109205, "grad_norm": 16.46897818962927, "learning_rate": 2.4368994621598212e-05, "loss": 2.192, "mean_token_accuracy": 0.5158499777317047, "step": 24195 }, { "epoch": 0.02437449702419622, "grad_norm": 28.367890542714264, "learning_rate": 2.437403057832927e-05, "loss": 2.8244, "mean_token_accuracy": 0.3482758551836014, "step": 24200 }, { "epoch": 0.024379533077300394, "grad_norm": 20.49783329111513, "learning_rate": 2.4379066535060334e-05, "loss": 2.336, "mean_token_accuracy": 0.4620689570903778, "step": 24205 }, { "epoch": 0.024384569130404567, "grad_norm": 15.318189432820885, "learning_rate": 2.4384102491791393e-05, "loss": 2.4151, "mean_token_accuracy": 0.43448275327682495, "step": 24210 }, { "epoch": 0.024389605183508738, "grad_norm": 20.647187093968235, "learning_rate": 2.4389138448522452e-05, "loss": 2.4961, "mean_token_accuracy": 0.36206896901130675, "step": 24215 }, { "epoch": 0.02439464123661291, "grad_norm": 17.669452701050684, "learning_rate": 2.439417440525351e-05, "loss": 2.3542, "mean_token_accuracy": 0.4241379380226135, "step": 24220 }, { "epoch": 0.024399677289717085, "grad_norm": 15.753709691855176, "learning_rate": 2.439921036198457e-05, "loss": 2.1501, "mean_token_accuracy": 0.45517241954803467, "step": 24225 }, { "epoch": 0.02440471334282126, "grad_norm": 23.721320606344243, "learning_rate": 2.4404246318715633e-05, "loss": 2.4075, "mean_token_accuracy": 0.4034482777118683, "step": 24230 }, { "epoch": 0.02440974939592543, "grad_norm": 22.503598763814765, "learning_rate": 2.440928227544669e-05, "loss": 2.3722, "mean_token_accuracy": 0.4724137902259827, "step": 24235 }, { "epoch": 0.024414785449029603, "grad_norm": 19.017136080703306, "learning_rate": 2.441431823217775e-05, "loss": 2.7946, "mean_token_accuracy": 0.3068965494632721, "step": 24240 }, { "epoch": 0.024419821502133777, "grad_norm": 23.327749943169568, "learning_rate": 2.441935418890881e-05, "loss": 2.7489, "mean_token_accuracy": 0.37931033968925476, "step": 24245 }, { "epoch": 0.024424857555237947, "grad_norm": 15.832258500108582, "learning_rate": 2.442439014563987e-05, "loss": 2.436, "mean_token_accuracy": 0.44652147889137267, "step": 24250 }, { "epoch": 0.02442989360834212, "grad_norm": 15.748477578508847, "learning_rate": 2.442942610237093e-05, "loss": 2.542, "mean_token_accuracy": 0.4034482777118683, "step": 24255 }, { "epoch": 0.024434929661446295, "grad_norm": 15.790439998427184, "learning_rate": 2.443446205910199e-05, "loss": 2.2461, "mean_token_accuracy": 0.4517241299152374, "step": 24260 }, { "epoch": 0.02443996571455047, "grad_norm": 18.772263555079316, "learning_rate": 2.4439498015833048e-05, "loss": 2.4401, "mean_token_accuracy": 0.44827585816383364, "step": 24265 }, { "epoch": 0.02444500176765464, "grad_norm": 18.544639463434557, "learning_rate": 2.444453397256411e-05, "loss": 2.3275, "mean_token_accuracy": 0.39310344457626345, "step": 24270 }, { "epoch": 0.024450037820758812, "grad_norm": 26.590497230399418, "learning_rate": 2.444956992929517e-05, "loss": 2.5562, "mean_token_accuracy": 0.43448275327682495, "step": 24275 }, { "epoch": 0.024455073873862986, "grad_norm": 12.62897062329328, "learning_rate": 2.4454605886026226e-05, "loss": 2.2338, "mean_token_accuracy": 0.44827585816383364, "step": 24280 }, { "epoch": 0.024460109926967157, "grad_norm": 18.154489320379174, "learning_rate": 2.445964184275729e-05, "loss": 2.3891, "mean_token_accuracy": 0.41379310488700866, "step": 24285 }, { "epoch": 0.02446514598007133, "grad_norm": 20.99429726009109, "learning_rate": 2.4464677799488348e-05, "loss": 2.691, "mean_token_accuracy": 0.41379310488700866, "step": 24290 }, { "epoch": 0.024470182033175504, "grad_norm": 20.71018682888791, "learning_rate": 2.446971375621941e-05, "loss": 2.6495, "mean_token_accuracy": 0.37241379022598264, "step": 24295 }, { "epoch": 0.024475218086279678, "grad_norm": 19.336803756713824, "learning_rate": 2.4474749712950466e-05, "loss": 2.3205, "mean_token_accuracy": 0.44482758045196535, "step": 24300 }, { "epoch": 0.024480254139383848, "grad_norm": 23.159013081144217, "learning_rate": 2.4479785669681525e-05, "loss": 2.7001, "mean_token_accuracy": 0.3758620649576187, "step": 24305 }, { "epoch": 0.024485290192488022, "grad_norm": 20.732985893209918, "learning_rate": 2.4484821626412588e-05, "loss": 2.4586, "mean_token_accuracy": 0.39310344457626345, "step": 24310 }, { "epoch": 0.024490326245592196, "grad_norm": 23.471164030980802, "learning_rate": 2.4489857583143647e-05, "loss": 2.3553, "mean_token_accuracy": 0.3999999940395355, "step": 24315 }, { "epoch": 0.024495362298696366, "grad_norm": 16.595434863592082, "learning_rate": 2.4494893539874707e-05, "loss": 1.9999, "mean_token_accuracy": 0.4896551728248596, "step": 24320 }, { "epoch": 0.02450039835180054, "grad_norm": 18.19664889333985, "learning_rate": 2.4499929496605766e-05, "loss": 2.4495, "mean_token_accuracy": 0.420689657330513, "step": 24325 }, { "epoch": 0.024505434404904713, "grad_norm": 20.45755013844865, "learning_rate": 2.4504965453336825e-05, "loss": 2.6182, "mean_token_accuracy": 0.4137930989265442, "step": 24330 }, { "epoch": 0.024510470458008887, "grad_norm": 20.325710443332817, "learning_rate": 2.4510001410067888e-05, "loss": 2.6119, "mean_token_accuracy": 0.3793103456497192, "step": 24335 }, { "epoch": 0.024515506511113058, "grad_norm": 19.34484418395446, "learning_rate": 2.4515037366798947e-05, "loss": 2.3206, "mean_token_accuracy": 0.4553538978099823, "step": 24340 }, { "epoch": 0.02452054256421723, "grad_norm": 15.669025808095792, "learning_rate": 2.4520073323530003e-05, "loss": 2.2205, "mean_token_accuracy": 0.46896551847457885, "step": 24345 }, { "epoch": 0.024525578617321405, "grad_norm": 13.827188307515428, "learning_rate": 2.4525109280261066e-05, "loss": 2.1297, "mean_token_accuracy": 0.4620689690113068, "step": 24350 }, { "epoch": 0.024530614670425575, "grad_norm": 19.620393159668154, "learning_rate": 2.4530145236992125e-05, "loss": 2.7106, "mean_token_accuracy": 0.41034482717514037, "step": 24355 }, { "epoch": 0.02453565072352975, "grad_norm": 30.779562881948802, "learning_rate": 2.4535181193723184e-05, "loss": 2.4532, "mean_token_accuracy": 0.4344827592372894, "step": 24360 }, { "epoch": 0.024540686776633923, "grad_norm": 18.567332813367724, "learning_rate": 2.4540217150454243e-05, "loss": 2.6432, "mean_token_accuracy": 0.3793103456497192, "step": 24365 }, { "epoch": 0.024545722829738097, "grad_norm": 16.927543303379768, "learning_rate": 2.4545253107185303e-05, "loss": 2.6049, "mean_token_accuracy": 0.41379310488700866, "step": 24370 }, { "epoch": 0.024550758882842267, "grad_norm": 19.021473216580382, "learning_rate": 2.4550289063916365e-05, "loss": 2.2336, "mean_token_accuracy": 0.4344827562570572, "step": 24375 }, { "epoch": 0.02455579493594644, "grad_norm": 19.40924859198866, "learning_rate": 2.4555325020647424e-05, "loss": 2.8492, "mean_token_accuracy": 0.4034482777118683, "step": 24380 }, { "epoch": 0.024560830989050614, "grad_norm": 19.304551417622783, "learning_rate": 2.4560360977378484e-05, "loss": 2.5777, "mean_token_accuracy": 0.37586206793785093, "step": 24385 }, { "epoch": 0.024565867042154785, "grad_norm": 16.389793628574363, "learning_rate": 2.4565396934109543e-05, "loss": 2.4374, "mean_token_accuracy": 0.42758620977401735, "step": 24390 }, { "epoch": 0.02457090309525896, "grad_norm": 19.716214146017972, "learning_rate": 2.4570432890840602e-05, "loss": 2.9267, "mean_token_accuracy": 0.3620689630508423, "step": 24395 }, { "epoch": 0.024575939148363132, "grad_norm": 15.230494681223712, "learning_rate": 2.457546884757166e-05, "loss": 2.2653, "mean_token_accuracy": 0.4206896543502808, "step": 24400 }, { "epoch": 0.024580975201467306, "grad_norm": 22.30345949185059, "learning_rate": 2.4580504804302724e-05, "loss": 2.2565, "mean_token_accuracy": 0.4206896543502808, "step": 24405 }, { "epoch": 0.024586011254571476, "grad_norm": 17.37170640533167, "learning_rate": 2.4585540761033783e-05, "loss": 2.0535, "mean_token_accuracy": 0.4834240794181824, "step": 24410 }, { "epoch": 0.02459104730767565, "grad_norm": 24.82227342826864, "learning_rate": 2.4590576717764843e-05, "loss": 2.9718, "mean_token_accuracy": 0.37586207389831544, "step": 24415 }, { "epoch": 0.024596083360779824, "grad_norm": 17.769985613307394, "learning_rate": 2.4595612674495902e-05, "loss": 2.3936, "mean_token_accuracy": 0.42758620977401735, "step": 24420 }, { "epoch": 0.024601119413883994, "grad_norm": 20.150924313811835, "learning_rate": 2.460064863122696e-05, "loss": 2.4459, "mean_token_accuracy": 0.4396854341030121, "step": 24425 }, { "epoch": 0.024606155466988168, "grad_norm": 21.11160829830446, "learning_rate": 2.4605684587958024e-05, "loss": 2.787, "mean_token_accuracy": 0.36896551847457887, "step": 24430 }, { "epoch": 0.02461119152009234, "grad_norm": 17.163781991658798, "learning_rate": 2.461072054468908e-05, "loss": 2.4174, "mean_token_accuracy": 0.4555958867073059, "step": 24435 }, { "epoch": 0.024616227573196515, "grad_norm": 29.660014470165088, "learning_rate": 2.461575650142014e-05, "loss": 2.6861, "mean_token_accuracy": 0.3655172407627106, "step": 24440 }, { "epoch": 0.024621263626300686, "grad_norm": 17.705153596476247, "learning_rate": 2.46207924581512e-05, "loss": 2.3078, "mean_token_accuracy": 0.4517241418361664, "step": 24445 }, { "epoch": 0.02462629967940486, "grad_norm": 18.729596378986045, "learning_rate": 2.462582841488226e-05, "loss": 2.5169, "mean_token_accuracy": 0.41911675333976744, "step": 24450 }, { "epoch": 0.024631335732509033, "grad_norm": 24.20005484762973, "learning_rate": 2.463086437161332e-05, "loss": 2.4975, "mean_token_accuracy": 0.3965517163276672, "step": 24455 }, { "epoch": 0.024636371785613204, "grad_norm": 18.28864537992677, "learning_rate": 2.463590032834438e-05, "loss": 2.3671, "mean_token_accuracy": 0.4620689630508423, "step": 24460 }, { "epoch": 0.024641407838717377, "grad_norm": 17.10611002962765, "learning_rate": 2.464093628507544e-05, "loss": 2.0954, "mean_token_accuracy": 0.43103447556495667, "step": 24465 }, { "epoch": 0.02464644389182155, "grad_norm": 18.72500807908979, "learning_rate": 2.46459722418065e-05, "loss": 2.2785, "mean_token_accuracy": 0.47241379618644713, "step": 24470 }, { "epoch": 0.024651479944925725, "grad_norm": 20.651166475084864, "learning_rate": 2.465100819853756e-05, "loss": 2.4147, "mean_token_accuracy": 0.3862069010734558, "step": 24475 }, { "epoch": 0.024656515998029895, "grad_norm": 23.84414566074428, "learning_rate": 2.4656044155268616e-05, "loss": 2.5527, "mean_token_accuracy": 0.4310344845056534, "step": 24480 }, { "epoch": 0.02466155205113407, "grad_norm": 19.637821449590685, "learning_rate": 2.466108011199968e-05, "loss": 2.3081, "mean_token_accuracy": 0.48136720061302185, "step": 24485 }, { "epoch": 0.024666588104238243, "grad_norm": 13.251230720661296, "learning_rate": 2.4666116068730738e-05, "loss": 2.3448, "mean_token_accuracy": 0.4413793087005615, "step": 24490 }, { "epoch": 0.024671624157342413, "grad_norm": 19.02134762061157, "learning_rate": 2.46711520254618e-05, "loss": 2.8104, "mean_token_accuracy": 0.37241379022598264, "step": 24495 }, { "epoch": 0.024676660210446587, "grad_norm": 17.52894686642188, "learning_rate": 2.4676187982192857e-05, "loss": 2.3716, "mean_token_accuracy": 0.4275862157344818, "step": 24500 }, { "epoch": 0.02468169626355076, "grad_norm": 15.92385736218499, "learning_rate": 2.4681223938923916e-05, "loss": 2.4998, "mean_token_accuracy": 0.4206896543502808, "step": 24505 }, { "epoch": 0.024686732316654934, "grad_norm": 16.775405667959248, "learning_rate": 2.468625989565498e-05, "loss": 2.4214, "mean_token_accuracy": 0.3862068891525269, "step": 24510 }, { "epoch": 0.024691768369759105, "grad_norm": 15.744594786796725, "learning_rate": 2.4691295852386038e-05, "loss": 2.4147, "mean_token_accuracy": 0.4689655125141144, "step": 24515 }, { "epoch": 0.02469680442286328, "grad_norm": 18.92062508377345, "learning_rate": 2.4696331809117097e-05, "loss": 2.4478, "mean_token_accuracy": 0.4379310250282288, "step": 24520 }, { "epoch": 0.024701840475967452, "grad_norm": 15.49140789864529, "learning_rate": 2.4701367765848156e-05, "loss": 2.332, "mean_token_accuracy": 0.5088929176330567, "step": 24525 }, { "epoch": 0.024706876529071622, "grad_norm": 18.91393168704787, "learning_rate": 2.4706403722579216e-05, "loss": 2.4616, "mean_token_accuracy": 0.3999999940395355, "step": 24530 }, { "epoch": 0.024711912582175796, "grad_norm": 23.20342370745326, "learning_rate": 2.4711439679310278e-05, "loss": 2.5195, "mean_token_accuracy": 0.41379311084747317, "step": 24535 }, { "epoch": 0.02471694863527997, "grad_norm": 14.75347209913705, "learning_rate": 2.4716475636041337e-05, "loss": 2.3001, "mean_token_accuracy": 0.4448275864124298, "step": 24540 }, { "epoch": 0.024721984688384144, "grad_norm": 22.740160004280103, "learning_rate": 2.4721511592772397e-05, "loss": 2.7785, "mean_token_accuracy": 0.41379310488700866, "step": 24545 }, { "epoch": 0.024727020741488314, "grad_norm": 19.953504453862625, "learning_rate": 2.4726547549503456e-05, "loss": 2.169, "mean_token_accuracy": 0.4206896543502808, "step": 24550 }, { "epoch": 0.024732056794592488, "grad_norm": 18.31913564290947, "learning_rate": 2.4731583506234515e-05, "loss": 2.6735, "mean_token_accuracy": 0.3758620619773865, "step": 24555 }, { "epoch": 0.02473709284769666, "grad_norm": 21.626815384586703, "learning_rate": 2.4736619462965578e-05, "loss": 2.2275, "mean_token_accuracy": 0.47586206197738645, "step": 24560 }, { "epoch": 0.024742128900800832, "grad_norm": 16.998328177654432, "learning_rate": 2.4741655419696637e-05, "loss": 2.4554, "mean_token_accuracy": 0.45710828304290774, "step": 24565 }, { "epoch": 0.024747164953905006, "grad_norm": 18.589248889678103, "learning_rate": 2.4746691376427693e-05, "loss": 2.645, "mean_token_accuracy": 0.39310344457626345, "step": 24570 }, { "epoch": 0.02475220100700918, "grad_norm": 16.764084229307176, "learning_rate": 2.4751727333158756e-05, "loss": 2.3329, "mean_token_accuracy": 0.42758620977401735, "step": 24575 }, { "epoch": 0.024757237060113353, "grad_norm": 20.164671405794497, "learning_rate": 2.4756763289889815e-05, "loss": 2.7357, "mean_token_accuracy": 0.3551724135875702, "step": 24580 }, { "epoch": 0.024762273113217523, "grad_norm": 14.868518052332309, "learning_rate": 2.4761799246620874e-05, "loss": 2.5724, "mean_token_accuracy": 0.38620689511299133, "step": 24585 }, { "epoch": 0.024767309166321697, "grad_norm": 13.859672935724632, "learning_rate": 2.4766835203351933e-05, "loss": 2.411, "mean_token_accuracy": 0.4413793087005615, "step": 24590 }, { "epoch": 0.02477234521942587, "grad_norm": 15.927321620647696, "learning_rate": 2.4771871160082993e-05, "loss": 2.609, "mean_token_accuracy": 0.40344828367233276, "step": 24595 }, { "epoch": 0.02477738127253004, "grad_norm": 16.895244227435985, "learning_rate": 2.4776907116814055e-05, "loss": 2.6078, "mean_token_accuracy": 0.3655172407627106, "step": 24600 }, { "epoch": 0.024782417325634215, "grad_norm": 19.04777104774694, "learning_rate": 2.4781943073545115e-05, "loss": 2.379, "mean_token_accuracy": 0.4379310369491577, "step": 24605 }, { "epoch": 0.02478745337873839, "grad_norm": 19.831878238366187, "learning_rate": 2.4786979030276174e-05, "loss": 2.4828, "mean_token_accuracy": 0.40344828367233276, "step": 24610 }, { "epoch": 0.024792489431842563, "grad_norm": 22.728341081182506, "learning_rate": 2.4792014987007233e-05, "loss": 2.7505, "mean_token_accuracy": 0.4068965494632721, "step": 24615 }, { "epoch": 0.024797525484946733, "grad_norm": 16.095819646876706, "learning_rate": 2.4797050943738292e-05, "loss": 2.461, "mean_token_accuracy": 0.4068965494632721, "step": 24620 }, { "epoch": 0.024802561538050907, "grad_norm": 16.79054018959549, "learning_rate": 2.480208690046935e-05, "loss": 2.3348, "mean_token_accuracy": 0.45862067937850953, "step": 24625 }, { "epoch": 0.02480759759115508, "grad_norm": 18.731074311834142, "learning_rate": 2.4807122857200414e-05, "loss": 2.6435, "mean_token_accuracy": 0.4332123279571533, "step": 24630 }, { "epoch": 0.02481263364425925, "grad_norm": 19.504385883659808, "learning_rate": 2.481215881393147e-05, "loss": 2.7015, "mean_token_accuracy": 0.39443435668945315, "step": 24635 }, { "epoch": 0.024817669697363424, "grad_norm": 20.773113840314863, "learning_rate": 2.4817194770662533e-05, "loss": 2.6341, "mean_token_accuracy": 0.3793103337287903, "step": 24640 }, { "epoch": 0.024822705750467598, "grad_norm": 19.93392308403074, "learning_rate": 2.4822230727393592e-05, "loss": 2.2136, "mean_token_accuracy": 0.4448275864124298, "step": 24645 }, { "epoch": 0.024827741803571772, "grad_norm": 20.33925334642438, "learning_rate": 2.482726668412465e-05, "loss": 2.3626, "mean_token_accuracy": 0.43448275327682495, "step": 24650 }, { "epoch": 0.024832777856675942, "grad_norm": 16.790907366349888, "learning_rate": 2.483230264085571e-05, "loss": 2.4879, "mean_token_accuracy": 0.43103447556495667, "step": 24655 }, { "epoch": 0.024837813909780116, "grad_norm": 20.972465004372598, "learning_rate": 2.483733859758677e-05, "loss": 2.6129, "mean_token_accuracy": 0.3896551728248596, "step": 24660 }, { "epoch": 0.02484284996288429, "grad_norm": 14.418340893590205, "learning_rate": 2.484237455431783e-05, "loss": 2.5755, "mean_token_accuracy": 0.39310343861579894, "step": 24665 }, { "epoch": 0.02484788601598846, "grad_norm": 18.5742558577864, "learning_rate": 2.484741051104889e-05, "loss": 2.7675, "mean_token_accuracy": 0.39655172228813174, "step": 24670 }, { "epoch": 0.024852922069092634, "grad_norm": 21.03926611979944, "learning_rate": 2.485244646777995e-05, "loss": 2.6652, "mean_token_accuracy": 0.43793103098869324, "step": 24675 }, { "epoch": 0.024857958122196808, "grad_norm": 29.160457014636684, "learning_rate": 2.485748242451101e-05, "loss": 2.5018, "mean_token_accuracy": 0.36896551847457887, "step": 24680 }, { "epoch": 0.02486299417530098, "grad_norm": 18.981458607799254, "learning_rate": 2.486251838124207e-05, "loss": 2.5516, "mean_token_accuracy": 0.3965517282485962, "step": 24685 }, { "epoch": 0.02486803022840515, "grad_norm": 82.18648647100278, "learning_rate": 2.486755433797313e-05, "loss": 2.5099, "mean_token_accuracy": 0.42438423037528994, "step": 24690 }, { "epoch": 0.024873066281509325, "grad_norm": 21.305590635899502, "learning_rate": 2.487259029470419e-05, "loss": 2.4482, "mean_token_accuracy": 0.36896551251411436, "step": 24695 }, { "epoch": 0.0248781023346135, "grad_norm": 19.534029317380895, "learning_rate": 2.4877626251435247e-05, "loss": 2.3878, "mean_token_accuracy": 0.4034482777118683, "step": 24700 }, { "epoch": 0.02488313838771767, "grad_norm": 15.089009895598371, "learning_rate": 2.4882662208166306e-05, "loss": 2.6547, "mean_token_accuracy": 0.3517241358757019, "step": 24705 }, { "epoch": 0.024888174440821843, "grad_norm": 15.522176264309552, "learning_rate": 2.488769816489737e-05, "loss": 2.2925, "mean_token_accuracy": 0.42413792908191683, "step": 24710 }, { "epoch": 0.024893210493926017, "grad_norm": 24.931839419742985, "learning_rate": 2.4892734121628428e-05, "loss": 2.8113, "mean_token_accuracy": 0.37586206793785093, "step": 24715 }, { "epoch": 0.02489824654703019, "grad_norm": 22.913966597139822, "learning_rate": 2.4897770078359488e-05, "loss": 2.8452, "mean_token_accuracy": 0.4241379380226135, "step": 24720 }, { "epoch": 0.02490328260013436, "grad_norm": 17.916058954501537, "learning_rate": 2.4902806035090547e-05, "loss": 2.5551, "mean_token_accuracy": 0.33793103098869326, "step": 24725 }, { "epoch": 0.024908318653238535, "grad_norm": 18.956282188232244, "learning_rate": 2.4907841991821606e-05, "loss": 2.5347, "mean_token_accuracy": 0.4047791838645935, "step": 24730 }, { "epoch": 0.02491335470634271, "grad_norm": 22.721342847492643, "learning_rate": 2.491287794855267e-05, "loss": 2.49, "mean_token_accuracy": 0.4379310369491577, "step": 24735 }, { "epoch": 0.02491839075944688, "grad_norm": 17.05183906231256, "learning_rate": 2.4917913905283728e-05, "loss": 2.5987, "mean_token_accuracy": 0.39310344457626345, "step": 24740 }, { "epoch": 0.024923426812551053, "grad_norm": 189.33649920766237, "learning_rate": 2.4922949862014787e-05, "loss": 2.8688, "mean_token_accuracy": 0.35862069129943847, "step": 24745 }, { "epoch": 0.024928462865655227, "grad_norm": 17.581526624657652, "learning_rate": 2.4927985818745846e-05, "loss": 2.3052, "mean_token_accuracy": 0.4068965494632721, "step": 24750 }, { "epoch": 0.024933498918759397, "grad_norm": 17.413732844641164, "learning_rate": 2.4933021775476906e-05, "loss": 2.7568, "mean_token_accuracy": 0.34482758641242983, "step": 24755 }, { "epoch": 0.02493853497186357, "grad_norm": 18.466674612906953, "learning_rate": 2.493805773220797e-05, "loss": 2.5837, "mean_token_accuracy": 0.4310344815254211, "step": 24760 }, { "epoch": 0.024943571024967744, "grad_norm": 16.87559628930719, "learning_rate": 2.4943093688939028e-05, "loss": 2.5021, "mean_token_accuracy": 0.4034482717514038, "step": 24765 }, { "epoch": 0.024948607078071918, "grad_norm": 14.209737865021951, "learning_rate": 2.4948129645670083e-05, "loss": 2.3501, "mean_token_accuracy": 0.4000000059604645, "step": 24770 }, { "epoch": 0.02495364313117609, "grad_norm": 20.22143574985584, "learning_rate": 2.4953165602401146e-05, "loss": 2.4344, "mean_token_accuracy": 0.36551724970340727, "step": 24775 }, { "epoch": 0.024958679184280262, "grad_norm": 18.602972689867542, "learning_rate": 2.4958201559132205e-05, "loss": 2.3165, "mean_token_accuracy": 0.41379310488700866, "step": 24780 }, { "epoch": 0.024963715237384436, "grad_norm": 17.35513427665744, "learning_rate": 2.4963237515863265e-05, "loss": 2.6037, "mean_token_accuracy": 0.3827586233615875, "step": 24785 }, { "epoch": 0.024968751290488606, "grad_norm": 21.82579321341961, "learning_rate": 2.4968273472594324e-05, "loss": 2.518, "mean_token_accuracy": 0.3965517282485962, "step": 24790 }, { "epoch": 0.02497378734359278, "grad_norm": 19.367511163002423, "learning_rate": 2.4973309429325383e-05, "loss": 2.7331, "mean_token_accuracy": 0.38965516686439516, "step": 24795 }, { "epoch": 0.024978823396696954, "grad_norm": 18.560369745773816, "learning_rate": 2.4978345386056446e-05, "loss": 2.9231, "mean_token_accuracy": 0.3939503937959671, "step": 24800 }, { "epoch": 0.024983859449801128, "grad_norm": 19.018720791785338, "learning_rate": 2.4983381342787505e-05, "loss": 2.737, "mean_token_accuracy": 0.4137930989265442, "step": 24805 }, { "epoch": 0.024988895502905298, "grad_norm": 16.156242104946447, "learning_rate": 2.4988417299518564e-05, "loss": 2.2594, "mean_token_accuracy": 0.4551724076271057, "step": 24810 }, { "epoch": 0.02499393155600947, "grad_norm": 18.215359716320375, "learning_rate": 2.4993453256249623e-05, "loss": 2.4389, "mean_token_accuracy": 0.41034482717514037, "step": 24815 }, { "epoch": 0.024998967609113645, "grad_norm": 15.965827269457066, "learning_rate": 2.4998489212980683e-05, "loss": 2.1229, "mean_token_accuracy": 0.5241379320621491, "step": 24820 }, { "epoch": 0.025004003662217816, "grad_norm": 18.017658849569003, "learning_rate": 2.5003525169711745e-05, "loss": 2.3537, "mean_token_accuracy": 0.44827587008476255, "step": 24825 }, { "epoch": 0.02500903971532199, "grad_norm": 19.32321763099768, "learning_rate": 2.5008561126442805e-05, "loss": 2.2264, "mean_token_accuracy": 0.42068966031074523, "step": 24830 }, { "epoch": 0.025014075768426163, "grad_norm": 16.79237613166486, "learning_rate": 2.501359708317386e-05, "loss": 1.9761, "mean_token_accuracy": 0.4918935239315033, "step": 24835 }, { "epoch": 0.025019111821530337, "grad_norm": 18.70889420607284, "learning_rate": 2.501863303990492e-05, "loss": 2.5177, "mean_token_accuracy": 0.37586206793785093, "step": 24840 }, { "epoch": 0.025024147874634507, "grad_norm": 19.736300992628863, "learning_rate": 2.502366899663598e-05, "loss": 2.64, "mean_token_accuracy": 0.38620689511299133, "step": 24845 }, { "epoch": 0.02502918392773868, "grad_norm": 25.939978522503115, "learning_rate": 2.5028704953367045e-05, "loss": 2.6035, "mean_token_accuracy": 0.3965517282485962, "step": 24850 }, { "epoch": 0.025034219980842855, "grad_norm": 16.35035068645743, "learning_rate": 2.50337409100981e-05, "loss": 2.733, "mean_token_accuracy": 0.4034482777118683, "step": 24855 }, { "epoch": 0.025039256033947025, "grad_norm": 20.1360113122945, "learning_rate": 2.503877686682916e-05, "loss": 2.5704, "mean_token_accuracy": 0.3655172407627106, "step": 24860 }, { "epoch": 0.0250442920870512, "grad_norm": 14.832555202575852, "learning_rate": 2.504381282356022e-05, "loss": 2.7804, "mean_token_accuracy": 0.36206896007061007, "step": 24865 }, { "epoch": 0.025049328140155373, "grad_norm": 16.66674451613789, "learning_rate": 2.504884878029128e-05, "loss": 2.3054, "mean_token_accuracy": 0.4517241358757019, "step": 24870 }, { "epoch": 0.025054364193259546, "grad_norm": 21.558419006097473, "learning_rate": 2.505388473702234e-05, "loss": 2.197, "mean_token_accuracy": 0.49879008531570435, "step": 24875 }, { "epoch": 0.025059400246363717, "grad_norm": 17.697669562281778, "learning_rate": 2.50589206937534e-05, "loss": 2.8036, "mean_token_accuracy": 0.34137930870056155, "step": 24880 }, { "epoch": 0.02506443629946789, "grad_norm": 19.8202204158267, "learning_rate": 2.506395665048446e-05, "loss": 2.4133, "mean_token_accuracy": 0.36896551251411436, "step": 24885 }, { "epoch": 0.025069472352572064, "grad_norm": 18.872696023625345, "learning_rate": 2.506899260721552e-05, "loss": 2.2376, "mean_token_accuracy": 0.441379314661026, "step": 24890 }, { "epoch": 0.025074508405676234, "grad_norm": 14.623671374832881, "learning_rate": 2.507402856394658e-05, "loss": 2.0833, "mean_token_accuracy": 0.5, "step": 24895 }, { "epoch": 0.025079544458780408, "grad_norm": 19.188064940481706, "learning_rate": 2.507906452067764e-05, "loss": 2.246, "mean_token_accuracy": 0.4275861978530884, "step": 24900 }, { "epoch": 0.025084580511884582, "grad_norm": 18.003936616059182, "learning_rate": 2.50841004774087e-05, "loss": 3.018, "mean_token_accuracy": 0.3620689630508423, "step": 24905 }, { "epoch": 0.025089616564988756, "grad_norm": 21.9859501299574, "learning_rate": 2.508913643413976e-05, "loss": 2.5498, "mean_token_accuracy": 0.4103448212146759, "step": 24910 }, { "epoch": 0.025094652618092926, "grad_norm": 24.648330978277695, "learning_rate": 2.509417239087082e-05, "loss": 2.4007, "mean_token_accuracy": 0.41379311084747317, "step": 24915 }, { "epoch": 0.0250996886711971, "grad_norm": 16.370652129392177, "learning_rate": 2.5099208347601878e-05, "loss": 2.2039, "mean_token_accuracy": 0.48457351326942444, "step": 24920 }, { "epoch": 0.025104724724301274, "grad_norm": 19.077952558668244, "learning_rate": 2.5104244304332937e-05, "loss": 2.4057, "mean_token_accuracy": 0.43272837400436404, "step": 24925 }, { "epoch": 0.025109760777405444, "grad_norm": 17.325037012733645, "learning_rate": 2.5109280261064e-05, "loss": 2.264, "mean_token_accuracy": 0.46031457781791685, "step": 24930 }, { "epoch": 0.025114796830509618, "grad_norm": 20.68964216092335, "learning_rate": 2.511431621779506e-05, "loss": 2.7405, "mean_token_accuracy": 0.3931034505367279, "step": 24935 }, { "epoch": 0.02511983288361379, "grad_norm": 20.040805741705526, "learning_rate": 2.511935217452612e-05, "loss": 2.8667, "mean_token_accuracy": 0.4379310369491577, "step": 24940 }, { "epoch": 0.025124868936717965, "grad_norm": 14.198128612907638, "learning_rate": 2.5124388131257178e-05, "loss": 2.0427, "mean_token_accuracy": 0.47931033968925474, "step": 24945 }, { "epoch": 0.025129904989822135, "grad_norm": 17.409894932930676, "learning_rate": 2.5129424087988233e-05, "loss": 2.6121, "mean_token_accuracy": 0.39655172228813174, "step": 24950 }, { "epoch": 0.02513494104292631, "grad_norm": 19.0159386616897, "learning_rate": 2.51344600447193e-05, "loss": 2.6767, "mean_token_accuracy": 0.3862068891525269, "step": 24955 }, { "epoch": 0.025139977096030483, "grad_norm": 20.89381608612409, "learning_rate": 2.513949600145036e-05, "loss": 2.585, "mean_token_accuracy": 0.394313371181488, "step": 24960 }, { "epoch": 0.025145013149134653, "grad_norm": 16.92376902226771, "learning_rate": 2.5144531958181418e-05, "loss": 2.561, "mean_token_accuracy": 0.40193587839603423, "step": 24965 }, { "epoch": 0.025150049202238827, "grad_norm": 17.116078318407272, "learning_rate": 2.5149567914912474e-05, "loss": 2.2923, "mean_token_accuracy": 0.43303084969520567, "step": 24970 }, { "epoch": 0.025155085255343, "grad_norm": 18.10420010094955, "learning_rate": 2.5154603871643533e-05, "loss": 2.2387, "mean_token_accuracy": 0.41379310488700866, "step": 24975 }, { "epoch": 0.025160121308447175, "grad_norm": 21.624387760201206, "learning_rate": 2.51596398283746e-05, "loss": 2.5693, "mean_token_accuracy": 0.35862069129943847, "step": 24980 }, { "epoch": 0.025165157361551345, "grad_norm": 18.333290732176884, "learning_rate": 2.516467578510566e-05, "loss": 2.3173, "mean_token_accuracy": 0.46896551847457885, "step": 24985 }, { "epoch": 0.02517019341465552, "grad_norm": 18.845008692154, "learning_rate": 2.5169711741836714e-05, "loss": 2.7614, "mean_token_accuracy": 0.3931034505367279, "step": 24990 }, { "epoch": 0.025175229467759692, "grad_norm": 17.255559981267496, "learning_rate": 2.5174747698567774e-05, "loss": 2.335, "mean_token_accuracy": 0.4862069010734558, "step": 24995 }, { "epoch": 0.025180265520863863, "grad_norm": 21.88476049416182, "learning_rate": 2.5179783655298833e-05, "loss": 2.8608, "mean_token_accuracy": 0.358620685338974, "step": 25000 }, { "epoch": 0.025185301573968037, "grad_norm": 17.96187983720199, "learning_rate": 2.51848196120299e-05, "loss": 2.5135, "mean_token_accuracy": 0.39655172228813174, "step": 25005 }, { "epoch": 0.02519033762707221, "grad_norm": 24.270993844206423, "learning_rate": 2.5189855568760955e-05, "loss": 2.5444, "mean_token_accuracy": 0.3827586114406586, "step": 25010 }, { "epoch": 0.025195373680176384, "grad_norm": 15.833730229194975, "learning_rate": 2.5194891525492014e-05, "loss": 2.5186, "mean_token_accuracy": 0.3793103456497192, "step": 25015 }, { "epoch": 0.025200409733280554, "grad_norm": 16.14537585954356, "learning_rate": 2.5199927482223073e-05, "loss": 2.6364, "mean_token_accuracy": 0.3931034505367279, "step": 25020 }, { "epoch": 0.025205445786384728, "grad_norm": 19.35379596063752, "learning_rate": 2.5204963438954132e-05, "loss": 2.5304, "mean_token_accuracy": 0.42758620381355283, "step": 25025 }, { "epoch": 0.025210481839488902, "grad_norm": 20.81768854210628, "learning_rate": 2.5209999395685192e-05, "loss": 2.5129, "mean_token_accuracy": 0.4034482717514038, "step": 25030 }, { "epoch": 0.025215517892593072, "grad_norm": 22.46113023506108, "learning_rate": 2.5215035352416254e-05, "loss": 2.5387, "mean_token_accuracy": 0.41034482717514037, "step": 25035 }, { "epoch": 0.025220553945697246, "grad_norm": 18.54863069543931, "learning_rate": 2.5220071309147314e-05, "loss": 2.5678, "mean_token_accuracy": 0.42758620381355283, "step": 25040 }, { "epoch": 0.02522558999880142, "grad_norm": 19.66333815922003, "learning_rate": 2.5225107265878373e-05, "loss": 2.3509, "mean_token_accuracy": 0.4448275864124298, "step": 25045 }, { "epoch": 0.025230626051905593, "grad_norm": 17.72482811832775, "learning_rate": 2.5230143222609432e-05, "loss": 2.1283, "mean_token_accuracy": 0.5128856599330902, "step": 25050 }, { "epoch": 0.025235662105009764, "grad_norm": 17.272617012682325, "learning_rate": 2.523517917934049e-05, "loss": 2.5181, "mean_token_accuracy": 0.45517241954803467, "step": 25055 }, { "epoch": 0.025240698158113938, "grad_norm": 19.82932438424549, "learning_rate": 2.5240215136071554e-05, "loss": 2.4561, "mean_token_accuracy": 0.44137930274009707, "step": 25060 }, { "epoch": 0.02524573421121811, "grad_norm": 22.77298130370189, "learning_rate": 2.5245251092802613e-05, "loss": 2.4903, "mean_token_accuracy": 0.3931034505367279, "step": 25065 }, { "epoch": 0.02525077026432228, "grad_norm": 21.62257489094834, "learning_rate": 2.5250287049533672e-05, "loss": 2.689, "mean_token_accuracy": 0.3793103456497192, "step": 25070 }, { "epoch": 0.025255806317426455, "grad_norm": 13.863664567921546, "learning_rate": 2.5255323006264732e-05, "loss": 2.403, "mean_token_accuracy": 0.4310344815254211, "step": 25075 }, { "epoch": 0.02526084237053063, "grad_norm": 22.331022361436613, "learning_rate": 2.5260358962995788e-05, "loss": 2.525, "mean_token_accuracy": 0.3840290427207947, "step": 25080 }, { "epoch": 0.025265878423634803, "grad_norm": 16.49316377332385, "learning_rate": 2.5265394919726854e-05, "loss": 2.2914, "mean_token_accuracy": 0.4586206912994385, "step": 25085 }, { "epoch": 0.025270914476738973, "grad_norm": 18.97188090009966, "learning_rate": 2.5270430876457913e-05, "loss": 2.4009, "mean_token_accuracy": 0.42413793206214906, "step": 25090 }, { "epoch": 0.025275950529843147, "grad_norm": 15.620107160335737, "learning_rate": 2.5275466833188972e-05, "loss": 2.4613, "mean_token_accuracy": 0.3965517282485962, "step": 25095 }, { "epoch": 0.02528098658294732, "grad_norm": 16.922721278950874, "learning_rate": 2.528050278992003e-05, "loss": 2.4986, "mean_token_accuracy": 0.4413793087005615, "step": 25100 }, { "epoch": 0.02528602263605149, "grad_norm": 20.00535561966354, "learning_rate": 2.5285538746651087e-05, "loss": 2.7847, "mean_token_accuracy": 0.37241379022598264, "step": 25105 }, { "epoch": 0.025291058689155665, "grad_norm": 17.215061285637095, "learning_rate": 2.5290574703382147e-05, "loss": 2.6579, "mean_token_accuracy": 0.4172413766384125, "step": 25110 }, { "epoch": 0.02529609474225984, "grad_norm": 14.513372330474597, "learning_rate": 2.5295610660113213e-05, "loss": 2.1779, "mean_token_accuracy": 0.4724137902259827, "step": 25115 }, { "epoch": 0.025301130795364012, "grad_norm": 20.863186511496853, "learning_rate": 2.5300646616844272e-05, "loss": 2.4937, "mean_token_accuracy": 0.37586206793785093, "step": 25120 }, { "epoch": 0.025306166848468183, "grad_norm": 21.46342969419995, "learning_rate": 2.5305682573575328e-05, "loss": 2.4036, "mean_token_accuracy": 0.38620689511299133, "step": 25125 }, { "epoch": 0.025311202901572356, "grad_norm": 26.00649860401789, "learning_rate": 2.5310718530306387e-05, "loss": 2.4331, "mean_token_accuracy": 0.3965517163276672, "step": 25130 }, { "epoch": 0.02531623895467653, "grad_norm": 34.40026324279338, "learning_rate": 2.5315754487037446e-05, "loss": 2.8299, "mean_token_accuracy": 0.38965516686439516, "step": 25135 }, { "epoch": 0.0253212750077807, "grad_norm": 18.20071192619595, "learning_rate": 2.5320790443768512e-05, "loss": 2.3868, "mean_token_accuracy": 0.441379314661026, "step": 25140 }, { "epoch": 0.025326311060884874, "grad_norm": 20.190765148183047, "learning_rate": 2.5325826400499568e-05, "loss": 2.7907, "mean_token_accuracy": 0.358620685338974, "step": 25145 }, { "epoch": 0.025331347113989048, "grad_norm": 14.735776789645938, "learning_rate": 2.5330862357230627e-05, "loss": 2.7531, "mean_token_accuracy": 0.3724137932062149, "step": 25150 }, { "epoch": 0.02533638316709322, "grad_norm": 18.06522623033623, "learning_rate": 2.5335898313961687e-05, "loss": 2.4597, "mean_token_accuracy": 0.4413793087005615, "step": 25155 }, { "epoch": 0.025341419220197392, "grad_norm": 21.628463505813496, "learning_rate": 2.5340934270692746e-05, "loss": 2.399, "mean_token_accuracy": 0.45517241954803467, "step": 25160 }, { "epoch": 0.025346455273301566, "grad_norm": 19.23205904667441, "learning_rate": 2.534597022742381e-05, "loss": 2.864, "mean_token_accuracy": 0.37241379618644715, "step": 25165 }, { "epoch": 0.02535149132640574, "grad_norm": 30.278630424450583, "learning_rate": 2.5351006184154868e-05, "loss": 2.422, "mean_token_accuracy": 0.4344827592372894, "step": 25170 }, { "epoch": 0.02535652737950991, "grad_norm": 17.83850074331239, "learning_rate": 2.5356042140885927e-05, "loss": 2.1901, "mean_token_accuracy": 0.4517241358757019, "step": 25175 }, { "epoch": 0.025361563432614084, "grad_norm": 18.48080690979553, "learning_rate": 2.5361078097616986e-05, "loss": 2.6698, "mean_token_accuracy": 0.46249244809150697, "step": 25180 }, { "epoch": 0.025366599485718257, "grad_norm": 32.049637530233355, "learning_rate": 2.5366114054348045e-05, "loss": 2.2209, "mean_token_accuracy": 0.4448275864124298, "step": 25185 }, { "epoch": 0.02537163553882243, "grad_norm": 15.69236784579381, "learning_rate": 2.5371150011079105e-05, "loss": 2.2953, "mean_token_accuracy": 0.45517240166664125, "step": 25190 }, { "epoch": 0.0253766715919266, "grad_norm": 19.148726080851517, "learning_rate": 2.5376185967810167e-05, "loss": 2.6002, "mean_token_accuracy": 0.4258318156003952, "step": 25195 }, { "epoch": 0.025381707645030775, "grad_norm": 21.03997649284189, "learning_rate": 2.5381221924541227e-05, "loss": 2.759, "mean_token_accuracy": 0.36896551549434664, "step": 25200 }, { "epoch": 0.02538674369813495, "grad_norm": 20.317732195565593, "learning_rate": 2.5386257881272286e-05, "loss": 2.8817, "mean_token_accuracy": 0.43611615896224976, "step": 25205 }, { "epoch": 0.02539177975123912, "grad_norm": 106.29538120599565, "learning_rate": 2.5391293838003345e-05, "loss": 2.8172, "mean_token_accuracy": 0.4000000059604645, "step": 25210 }, { "epoch": 0.025396815804343293, "grad_norm": 15.947181681645722, "learning_rate": 2.53963297947344e-05, "loss": 2.6476, "mean_token_accuracy": 0.3896551728248596, "step": 25215 }, { "epoch": 0.025401851857447467, "grad_norm": 17.532040045031415, "learning_rate": 2.5401365751465467e-05, "loss": 2.7778, "mean_token_accuracy": 0.4068965494632721, "step": 25220 }, { "epoch": 0.02540688791055164, "grad_norm": 19.5977381701148, "learning_rate": 2.5406401708196526e-05, "loss": 2.6396, "mean_token_accuracy": 0.3689655244350433, "step": 25225 }, { "epoch": 0.02541192396365581, "grad_norm": 18.143791265038214, "learning_rate": 2.5411437664927586e-05, "loss": 2.4555, "mean_token_accuracy": 0.37864487767219546, "step": 25230 }, { "epoch": 0.025416960016759985, "grad_norm": 17.002685910828774, "learning_rate": 2.541647362165864e-05, "loss": 2.4609, "mean_token_accuracy": 0.414039409160614, "step": 25235 }, { "epoch": 0.02542199606986416, "grad_norm": 13.578408900533748, "learning_rate": 2.54215095783897e-05, "loss": 2.3709, "mean_token_accuracy": 0.42068964838981626, "step": 25240 }, { "epoch": 0.02542703212296833, "grad_norm": 19.86415541795426, "learning_rate": 2.5426545535120767e-05, "loss": 2.1122, "mean_token_accuracy": 0.4586206912994385, "step": 25245 }, { "epoch": 0.025432068176072502, "grad_norm": 23.289069648920503, "learning_rate": 2.5431581491851826e-05, "loss": 2.7462, "mean_token_accuracy": 0.3862068891525269, "step": 25250 }, { "epoch": 0.025437104229176676, "grad_norm": 15.311422446034133, "learning_rate": 2.5436617448582882e-05, "loss": 2.7361, "mean_token_accuracy": 0.3655172407627106, "step": 25255 }, { "epoch": 0.02544214028228085, "grad_norm": 16.608153866502843, "learning_rate": 2.544165340531394e-05, "loss": 2.6996, "mean_token_accuracy": 0.3758620619773865, "step": 25260 }, { "epoch": 0.02544717633538502, "grad_norm": 19.18888651941495, "learning_rate": 2.5446689362045e-05, "loss": 2.6177, "mean_token_accuracy": 0.4034482777118683, "step": 25265 }, { "epoch": 0.025452212388489194, "grad_norm": 15.33509087846629, "learning_rate": 2.545172531877606e-05, "loss": 2.4175, "mean_token_accuracy": 0.40689654350280763, "step": 25270 }, { "epoch": 0.025457248441593368, "grad_norm": 17.36760810208012, "learning_rate": 2.5456761275507122e-05, "loss": 2.4885, "mean_token_accuracy": 0.417241370677948, "step": 25275 }, { "epoch": 0.025462284494697538, "grad_norm": 23.693052056183188, "learning_rate": 2.546179723223818e-05, "loss": 2.6248, "mean_token_accuracy": 0.4344827562570572, "step": 25280 }, { "epoch": 0.025467320547801712, "grad_norm": 19.860335632710257, "learning_rate": 2.546683318896924e-05, "loss": 2.7004, "mean_token_accuracy": 0.3655172407627106, "step": 25285 }, { "epoch": 0.025472356600905886, "grad_norm": 16.646581272070463, "learning_rate": 2.54718691457003e-05, "loss": 2.2726, "mean_token_accuracy": 0.4482758641242981, "step": 25290 }, { "epoch": 0.02547739265401006, "grad_norm": 18.260723463955063, "learning_rate": 2.547690510243136e-05, "loss": 2.6047, "mean_token_accuracy": 0.3862069010734558, "step": 25295 }, { "epoch": 0.02548242870711423, "grad_norm": 21.000602471070593, "learning_rate": 2.5481941059162422e-05, "loss": 2.9808, "mean_token_accuracy": 0.3517241358757019, "step": 25300 }, { "epoch": 0.025487464760218403, "grad_norm": 17.401585788863354, "learning_rate": 2.548697701589348e-05, "loss": 2.3299, "mean_token_accuracy": 0.4551724135875702, "step": 25305 }, { "epoch": 0.025492500813322577, "grad_norm": 18.689687345515637, "learning_rate": 2.549201297262454e-05, "loss": 2.4588, "mean_token_accuracy": 0.39310343861579894, "step": 25310 }, { "epoch": 0.025497536866426748, "grad_norm": 14.606915998653385, "learning_rate": 2.54970489293556e-05, "loss": 2.5731, "mean_token_accuracy": 0.37241379618644715, "step": 25315 }, { "epoch": 0.02550257291953092, "grad_norm": 17.013073598271678, "learning_rate": 2.550208488608666e-05, "loss": 2.259, "mean_token_accuracy": 0.45722927451133727, "step": 25320 }, { "epoch": 0.025507608972635095, "grad_norm": 17.38080225885054, "learning_rate": 2.550712084281772e-05, "loss": 2.2518, "mean_token_accuracy": 0.4068965554237366, "step": 25325 }, { "epoch": 0.02551264502573927, "grad_norm": 17.978972083770167, "learning_rate": 2.551215679954878e-05, "loss": 2.4513, "mean_token_accuracy": 0.38620689511299133, "step": 25330 }, { "epoch": 0.02551768107884344, "grad_norm": 19.774094161197233, "learning_rate": 2.551719275627984e-05, "loss": 2.9378, "mean_token_accuracy": 0.3862069010734558, "step": 25335 }, { "epoch": 0.025522717131947613, "grad_norm": 21.714034481302644, "learning_rate": 2.55222287130109e-05, "loss": 2.5489, "mean_token_accuracy": 0.4117362380027771, "step": 25340 }, { "epoch": 0.025527753185051787, "grad_norm": 23.22243259120189, "learning_rate": 2.552726466974196e-05, "loss": 2.5927, "mean_token_accuracy": 0.4241379201412201, "step": 25345 }, { "epoch": 0.025532789238155957, "grad_norm": 14.92782542071349, "learning_rate": 2.5532300626473014e-05, "loss": 2.5713, "mean_token_accuracy": 0.4034482717514038, "step": 25350 }, { "epoch": 0.02553782529126013, "grad_norm": 17.493311110377483, "learning_rate": 2.553733658320408e-05, "loss": 2.7252, "mean_token_accuracy": 0.3793103337287903, "step": 25355 }, { "epoch": 0.025542861344364304, "grad_norm": 34.95881759924783, "learning_rate": 2.554237253993514e-05, "loss": 2.6831, "mean_token_accuracy": 0.3620689630508423, "step": 25360 }, { "epoch": 0.025547897397468478, "grad_norm": 16.23018990945946, "learning_rate": 2.55474084966662e-05, "loss": 2.6707, "mean_token_accuracy": 0.38965516686439516, "step": 25365 }, { "epoch": 0.02555293345057265, "grad_norm": 16.40583605429416, "learning_rate": 2.5552444453397255e-05, "loss": 2.7782, "mean_token_accuracy": 0.3689655065536499, "step": 25370 }, { "epoch": 0.025557969503676822, "grad_norm": 15.840904171214309, "learning_rate": 2.5557480410128314e-05, "loss": 2.2282, "mean_token_accuracy": 0.4676346004009247, "step": 25375 }, { "epoch": 0.025563005556780996, "grad_norm": 19.312794213749445, "learning_rate": 2.556251636685938e-05, "loss": 2.2131, "mean_token_accuracy": 0.4206896543502808, "step": 25380 }, { "epoch": 0.025568041609885166, "grad_norm": 17.948052173318416, "learning_rate": 2.556755232359044e-05, "loss": 2.3424, "mean_token_accuracy": 0.4620689570903778, "step": 25385 }, { "epoch": 0.02557307766298934, "grad_norm": 18.769787843605116, "learning_rate": 2.5572588280321495e-05, "loss": 2.1896, "mean_token_accuracy": 0.4551724076271057, "step": 25390 }, { "epoch": 0.025578113716093514, "grad_norm": 23.47584088596801, "learning_rate": 2.5577624237052554e-05, "loss": 2.7814, "mean_token_accuracy": 0.3846340000629425, "step": 25395 }, { "epoch": 0.025583149769197688, "grad_norm": 19.429535941346668, "learning_rate": 2.5582660193783614e-05, "loss": 2.6939, "mean_token_accuracy": 0.3896551728248596, "step": 25400 }, { "epoch": 0.025588185822301858, "grad_norm": 18.36807691020928, "learning_rate": 2.558769615051468e-05, "loss": 2.37, "mean_token_accuracy": 0.4517241299152374, "step": 25405 }, { "epoch": 0.02559322187540603, "grad_norm": 19.29360840729329, "learning_rate": 2.5592732107245736e-05, "loss": 2.9643, "mean_token_accuracy": 0.3482758581638336, "step": 25410 }, { "epoch": 0.025598257928510205, "grad_norm": 18.679836721644953, "learning_rate": 2.5597768063976795e-05, "loss": 2.5049, "mean_token_accuracy": 0.39655172228813174, "step": 25415 }, { "epoch": 0.025603293981614376, "grad_norm": 23.384422070075516, "learning_rate": 2.5602804020707854e-05, "loss": 2.7445, "mean_token_accuracy": 0.4344827473163605, "step": 25420 }, { "epoch": 0.02560833003471855, "grad_norm": 19.740392865604928, "learning_rate": 2.5607839977438913e-05, "loss": 2.6483, "mean_token_accuracy": 0.3896551728248596, "step": 25425 }, { "epoch": 0.025613366087822723, "grad_norm": 18.149656931395914, "learning_rate": 2.5612875934169976e-05, "loss": 2.5275, "mean_token_accuracy": 0.38620689511299133, "step": 25430 }, { "epoch": 0.025618402140926897, "grad_norm": 27.272597762683887, "learning_rate": 2.5617911890901035e-05, "loss": 2.333, "mean_token_accuracy": 0.46896552443504336, "step": 25435 }, { "epoch": 0.025623438194031067, "grad_norm": 16.949556414515875, "learning_rate": 2.5622947847632094e-05, "loss": 2.2316, "mean_token_accuracy": 0.41724138259887694, "step": 25440 }, { "epoch": 0.02562847424713524, "grad_norm": 20.52569275798412, "learning_rate": 2.5627983804363154e-05, "loss": 2.6726, "mean_token_accuracy": 0.41530550718307496, "step": 25445 }, { "epoch": 0.025633510300239415, "grad_norm": 24.73756581169964, "learning_rate": 2.5633019761094213e-05, "loss": 2.3707, "mean_token_accuracy": 0.4620689690113068, "step": 25450 }, { "epoch": 0.025638546353343585, "grad_norm": 18.105321636581134, "learning_rate": 2.5638055717825272e-05, "loss": 2.6415, "mean_token_accuracy": 0.4103448331356049, "step": 25455 }, { "epoch": 0.02564358240644776, "grad_norm": 18.883994358153522, "learning_rate": 2.5643091674556335e-05, "loss": 2.5541, "mean_token_accuracy": 0.4068965494632721, "step": 25460 }, { "epoch": 0.025648618459551933, "grad_norm": 14.686502510630897, "learning_rate": 2.5648127631287394e-05, "loss": 2.4379, "mean_token_accuracy": 0.43103447556495667, "step": 25465 }, { "epoch": 0.025653654512656106, "grad_norm": 20.914455703014923, "learning_rate": 2.5653163588018453e-05, "loss": 2.4043, "mean_token_accuracy": 0.43266788125038147, "step": 25470 }, { "epoch": 0.025658690565760277, "grad_norm": 20.138971409900044, "learning_rate": 2.5658199544749513e-05, "loss": 2.8157, "mean_token_accuracy": 0.33793103098869326, "step": 25475 }, { "epoch": 0.02566372661886445, "grad_norm": 16.82444156946405, "learning_rate": 2.5663235501480572e-05, "loss": 2.5046, "mean_token_accuracy": 0.4103448212146759, "step": 25480 }, { "epoch": 0.025668762671968624, "grad_norm": 18.443627836378443, "learning_rate": 2.5668271458211635e-05, "loss": 2.6333, "mean_token_accuracy": 0.4310344815254211, "step": 25485 }, { "epoch": 0.025673798725072795, "grad_norm": 20.709829752009043, "learning_rate": 2.5673307414942694e-05, "loss": 2.6359, "mean_token_accuracy": 0.4344827651977539, "step": 25490 }, { "epoch": 0.02567883477817697, "grad_norm": 18.130729576689276, "learning_rate": 2.5678343371673753e-05, "loss": 2.3244, "mean_token_accuracy": 0.41034482717514037, "step": 25495 }, { "epoch": 0.025683870831281142, "grad_norm": 16.230731785149043, "learning_rate": 2.5683379328404812e-05, "loss": 2.4726, "mean_token_accuracy": 0.4310344815254211, "step": 25500 }, { "epoch": 0.025688906884385316, "grad_norm": 18.20539320947682, "learning_rate": 2.5688415285135868e-05, "loss": 2.5169, "mean_token_accuracy": 0.3862068891525269, "step": 25505 }, { "epoch": 0.025693942937489486, "grad_norm": 16.69858668203235, "learning_rate": 2.5693451241866934e-05, "loss": 2.5614, "mean_token_accuracy": 0.39310344457626345, "step": 25510 }, { "epoch": 0.02569897899059366, "grad_norm": 18.779806064249932, "learning_rate": 2.5698487198597993e-05, "loss": 2.3395, "mean_token_accuracy": 0.42068964838981626, "step": 25515 }, { "epoch": 0.025704015043697834, "grad_norm": 16.09783371069836, "learning_rate": 2.5703523155329053e-05, "loss": 2.465, "mean_token_accuracy": 0.4344827592372894, "step": 25520 }, { "epoch": 0.025709051096802004, "grad_norm": 16.56870002643387, "learning_rate": 2.570855911206011e-05, "loss": 2.0729, "mean_token_accuracy": 0.4918330192565918, "step": 25525 }, { "epoch": 0.025714087149906178, "grad_norm": 16.60627444423652, "learning_rate": 2.5713595068791168e-05, "loss": 2.2981, "mean_token_accuracy": 0.4482758641242981, "step": 25530 }, { "epoch": 0.02571912320301035, "grad_norm": 19.130053709863038, "learning_rate": 2.5718631025522227e-05, "loss": 2.4905, "mean_token_accuracy": 0.4034482777118683, "step": 25535 }, { "epoch": 0.025724159256114525, "grad_norm": 21.129642855953062, "learning_rate": 2.5723666982253293e-05, "loss": 2.2408, "mean_token_accuracy": 0.47931033968925474, "step": 25540 }, { "epoch": 0.025729195309218696, "grad_norm": 16.112794774482463, "learning_rate": 2.572870293898435e-05, "loss": 2.785, "mean_token_accuracy": 0.37931033968925476, "step": 25545 }, { "epoch": 0.02573423136232287, "grad_norm": 23.99444515186133, "learning_rate": 2.5733738895715408e-05, "loss": 2.5049, "mean_token_accuracy": 0.3999999940395355, "step": 25550 }, { "epoch": 0.025739267415427043, "grad_norm": 23.497804700842202, "learning_rate": 2.5738774852446467e-05, "loss": 2.7774, "mean_token_accuracy": 0.42413793206214906, "step": 25555 }, { "epoch": 0.025744303468531213, "grad_norm": 16.100507873258035, "learning_rate": 2.5743810809177527e-05, "loss": 2.2967, "mean_token_accuracy": 0.4586206912994385, "step": 25560 }, { "epoch": 0.025749339521635387, "grad_norm": 17.301694710784243, "learning_rate": 2.574884676590859e-05, "loss": 2.5299, "mean_token_accuracy": 0.4206896543502808, "step": 25565 }, { "epoch": 0.02575437557473956, "grad_norm": 21.98533833404907, "learning_rate": 2.575388272263965e-05, "loss": 2.4249, "mean_token_accuracy": 0.43103447556495667, "step": 25570 }, { "epoch": 0.025759411627843735, "grad_norm": 17.810299201605687, "learning_rate": 2.5758918679370708e-05, "loss": 2.3648, "mean_token_accuracy": 0.4379310250282288, "step": 25575 }, { "epoch": 0.025764447680947905, "grad_norm": 16.090632424096817, "learning_rate": 2.5763954636101767e-05, "loss": 2.5688, "mean_token_accuracy": 0.4073891669511795, "step": 25580 }, { "epoch": 0.02576948373405208, "grad_norm": 14.58884491277401, "learning_rate": 2.5768990592832826e-05, "loss": 2.3695, "mean_token_accuracy": 0.4551724135875702, "step": 25585 }, { "epoch": 0.025774519787156253, "grad_norm": 19.943803833642274, "learning_rate": 2.577402654956389e-05, "loss": 2.6594, "mean_token_accuracy": 0.4206896543502808, "step": 25590 }, { "epoch": 0.025779555840260423, "grad_norm": 14.95317666272273, "learning_rate": 2.5779062506294948e-05, "loss": 2.2516, "mean_token_accuracy": 0.41724138259887694, "step": 25595 }, { "epoch": 0.025784591893364597, "grad_norm": 17.2318539130284, "learning_rate": 2.5784098463026008e-05, "loss": 2.5403, "mean_token_accuracy": 0.4241379201412201, "step": 25600 }, { "epoch": 0.02578962794646877, "grad_norm": 16.951517867367336, "learning_rate": 2.5789134419757067e-05, "loss": 2.6145, "mean_token_accuracy": 0.4068965554237366, "step": 25605 }, { "epoch": 0.025794663999572944, "grad_norm": 17.988879447439896, "learning_rate": 2.5794170376488126e-05, "loss": 2.6668, "mean_token_accuracy": 0.34827586114406583, "step": 25610 }, { "epoch": 0.025799700052677114, "grad_norm": 19.43605848909659, "learning_rate": 2.5799206333219182e-05, "loss": 2.7925, "mean_token_accuracy": 0.3551724135875702, "step": 25615 }, { "epoch": 0.025804736105781288, "grad_norm": 17.25922898543315, "learning_rate": 2.5804242289950248e-05, "loss": 2.4205, "mean_token_accuracy": 0.43793103098869324, "step": 25620 }, { "epoch": 0.025809772158885462, "grad_norm": 17.429022745407767, "learning_rate": 2.5809278246681307e-05, "loss": 2.3654, "mean_token_accuracy": 0.42068964838981626, "step": 25625 }, { "epoch": 0.025814808211989632, "grad_norm": 19.49157222662756, "learning_rate": 2.5814314203412366e-05, "loss": 2.4973, "mean_token_accuracy": 0.43103447556495667, "step": 25630 }, { "epoch": 0.025819844265093806, "grad_norm": 16.19556525097804, "learning_rate": 2.5819350160143426e-05, "loss": 2.4594, "mean_token_accuracy": 0.4655172437429428, "step": 25635 }, { "epoch": 0.02582488031819798, "grad_norm": 15.068054900800872, "learning_rate": 2.582438611687448e-05, "loss": 2.6226, "mean_token_accuracy": 0.41724138259887694, "step": 25640 }, { "epoch": 0.025829916371302154, "grad_norm": 15.870176236691693, "learning_rate": 2.5829422073605548e-05, "loss": 2.7616, "mean_token_accuracy": 0.39716748595237733, "step": 25645 }, { "epoch": 0.025834952424406324, "grad_norm": 15.994175301859162, "learning_rate": 2.5834458030336607e-05, "loss": 2.8843, "mean_token_accuracy": 0.3862068891525269, "step": 25650 }, { "epoch": 0.025839988477510498, "grad_norm": 19.618323688037695, "learning_rate": 2.5839493987067666e-05, "loss": 2.5504, "mean_token_accuracy": 0.3482758581638336, "step": 25655 }, { "epoch": 0.02584502453061467, "grad_norm": 19.23104692090036, "learning_rate": 2.5844529943798722e-05, "loss": 2.2857, "mean_token_accuracy": 0.4620689630508423, "step": 25660 }, { "epoch": 0.02585006058371884, "grad_norm": 21.195469283711727, "learning_rate": 2.584956590052978e-05, "loss": 2.6651, "mean_token_accuracy": 0.41724138259887694, "step": 25665 }, { "epoch": 0.025855096636823015, "grad_norm": 24.52536002967969, "learning_rate": 2.5854601857260847e-05, "loss": 2.475, "mean_token_accuracy": 0.4172413766384125, "step": 25670 }, { "epoch": 0.02586013268992719, "grad_norm": 22.866080866976052, "learning_rate": 2.5859637813991906e-05, "loss": 2.7621, "mean_token_accuracy": 0.38275861740112305, "step": 25675 }, { "epoch": 0.025865168743031363, "grad_norm": 17.68343265998821, "learning_rate": 2.5864673770722962e-05, "loss": 2.8953, "mean_token_accuracy": 0.38620689511299133, "step": 25680 }, { "epoch": 0.025870204796135533, "grad_norm": 15.32400383488988, "learning_rate": 2.586970972745402e-05, "loss": 2.1915, "mean_token_accuracy": 0.4482758641242981, "step": 25685 }, { "epoch": 0.025875240849239707, "grad_norm": 16.87317473751135, "learning_rate": 2.587474568418508e-05, "loss": 2.3227, "mean_token_accuracy": 0.40689654350280763, "step": 25690 }, { "epoch": 0.02588027690234388, "grad_norm": 19.859119583445707, "learning_rate": 2.587978164091614e-05, "loss": 2.693, "mean_token_accuracy": 0.36206896901130675, "step": 25695 }, { "epoch": 0.02588531295544805, "grad_norm": 135.1677720117011, "learning_rate": 2.5884817597647203e-05, "loss": 2.6601, "mean_token_accuracy": 0.39655172228813174, "step": 25700 }, { "epoch": 0.025890349008552225, "grad_norm": 22.023851491174877, "learning_rate": 2.5889853554378262e-05, "loss": 2.3791, "mean_token_accuracy": 0.3965517282485962, "step": 25705 }, { "epoch": 0.0258953850616564, "grad_norm": 15.910325342440771, "learning_rate": 2.589488951110932e-05, "loss": 2.7685, "mean_token_accuracy": 0.3896551728248596, "step": 25710 }, { "epoch": 0.025900421114760572, "grad_norm": 25.30780956932258, "learning_rate": 2.589992546784038e-05, "loss": 2.4237, "mean_token_accuracy": 0.39655172228813174, "step": 25715 }, { "epoch": 0.025905457167864743, "grad_norm": 16.815822799509856, "learning_rate": 2.590496142457144e-05, "loss": 2.5661, "mean_token_accuracy": 0.3931034505367279, "step": 25720 }, { "epoch": 0.025910493220968916, "grad_norm": 16.52923888349581, "learning_rate": 2.5909997381302502e-05, "loss": 2.3364, "mean_token_accuracy": 0.3965517282485962, "step": 25725 }, { "epoch": 0.02591552927407309, "grad_norm": 17.715842359817223, "learning_rate": 2.591503333803356e-05, "loss": 2.3835, "mean_token_accuracy": 0.4206896543502808, "step": 25730 }, { "epoch": 0.02592056532717726, "grad_norm": 21.018188372487078, "learning_rate": 2.592006929476462e-05, "loss": 2.4326, "mean_token_accuracy": 0.44301270246505736, "step": 25735 }, { "epoch": 0.025925601380281434, "grad_norm": 18.27056015979054, "learning_rate": 2.592510525149568e-05, "loss": 2.6487, "mean_token_accuracy": 0.37931033968925476, "step": 25740 }, { "epoch": 0.025930637433385608, "grad_norm": 18.017348857205633, "learning_rate": 2.593014120822674e-05, "loss": 1.9818, "mean_token_accuracy": 0.4724137783050537, "step": 25745 }, { "epoch": 0.025935673486489782, "grad_norm": 18.74564105094958, "learning_rate": 2.5935177164957802e-05, "loss": 2.6819, "mean_token_accuracy": 0.38965516090393065, "step": 25750 }, { "epoch": 0.025940709539593952, "grad_norm": 18.219165566994345, "learning_rate": 2.594021312168886e-05, "loss": 2.2987, "mean_token_accuracy": 0.41034482717514037, "step": 25755 }, { "epoch": 0.025945745592698126, "grad_norm": 17.506243977813853, "learning_rate": 2.594524907841992e-05, "loss": 2.5484, "mean_token_accuracy": 0.44137930274009707, "step": 25760 }, { "epoch": 0.0259507816458023, "grad_norm": 28.640369531836274, "learning_rate": 2.595028503515098e-05, "loss": 2.658, "mean_token_accuracy": 0.42758620381355283, "step": 25765 }, { "epoch": 0.02595581769890647, "grad_norm": 17.766867069825732, "learning_rate": 2.5955320991882036e-05, "loss": 2.8072, "mean_token_accuracy": 0.39310344457626345, "step": 25770 }, { "epoch": 0.025960853752010644, "grad_norm": 18.5640748319897, "learning_rate": 2.5960356948613102e-05, "loss": 2.4961, "mean_token_accuracy": 0.42758620381355283, "step": 25775 }, { "epoch": 0.025965889805114818, "grad_norm": 21.973774046866726, "learning_rate": 2.596539290534416e-05, "loss": 2.0544, "mean_token_accuracy": 0.47586206197738645, "step": 25780 }, { "epoch": 0.02597092585821899, "grad_norm": 22.37003745862752, "learning_rate": 2.597042886207522e-05, "loss": 2.7339, "mean_token_accuracy": 0.38965516686439516, "step": 25785 }, { "epoch": 0.02597596191132316, "grad_norm": 17.99471368359866, "learning_rate": 2.5975464818806276e-05, "loss": 2.4399, "mean_token_accuracy": 0.4256503224372864, "step": 25790 }, { "epoch": 0.025980997964427335, "grad_norm": 20.82192750743505, "learning_rate": 2.5980500775537335e-05, "loss": 2.4286, "mean_token_accuracy": 0.4413793087005615, "step": 25795 }, { "epoch": 0.02598603401753151, "grad_norm": 16.021568612097294, "learning_rate": 2.5985536732268395e-05, "loss": 3.0416, "mean_token_accuracy": 0.3413793116807938, "step": 25800 }, { "epoch": 0.02599107007063568, "grad_norm": 15.812387949790491, "learning_rate": 2.599057268899946e-05, "loss": 2.2043, "mean_token_accuracy": 0.4344827592372894, "step": 25805 }, { "epoch": 0.025996106123739853, "grad_norm": 22.98439497317223, "learning_rate": 2.5995608645730516e-05, "loss": 2.8114, "mean_token_accuracy": 0.3793103456497192, "step": 25810 }, { "epoch": 0.026001142176844027, "grad_norm": 25.495545544050255, "learning_rate": 2.6000644602461576e-05, "loss": 2.7643, "mean_token_accuracy": 0.40816696882247927, "step": 25815 }, { "epoch": 0.0260061782299482, "grad_norm": 23.902481811966652, "learning_rate": 2.6005680559192635e-05, "loss": 2.5083, "mean_token_accuracy": 0.44137930274009707, "step": 25820 }, { "epoch": 0.02601121428305237, "grad_norm": 22.41158258452064, "learning_rate": 2.6010716515923694e-05, "loss": 2.3991, "mean_token_accuracy": 0.4538415014743805, "step": 25825 }, { "epoch": 0.026016250336156545, "grad_norm": 16.385023110520898, "learning_rate": 2.601575247265476e-05, "loss": 2.3529, "mean_token_accuracy": 0.4586206912994385, "step": 25830 }, { "epoch": 0.02602128638926072, "grad_norm": 16.46552764432163, "learning_rate": 2.6020788429385816e-05, "loss": 2.3643, "mean_token_accuracy": 0.39310343861579894, "step": 25835 }, { "epoch": 0.02602632244236489, "grad_norm": 20.10751831243197, "learning_rate": 2.6025824386116875e-05, "loss": 2.3995, "mean_token_accuracy": 0.4344827592372894, "step": 25840 }, { "epoch": 0.026031358495469063, "grad_norm": 18.83278374091365, "learning_rate": 2.6030860342847935e-05, "loss": 2.161, "mean_token_accuracy": 0.4551724076271057, "step": 25845 }, { "epoch": 0.026036394548573236, "grad_norm": 23.83657279959745, "learning_rate": 2.6035896299578994e-05, "loss": 2.7131, "mean_token_accuracy": 0.3310344755649567, "step": 25850 }, { "epoch": 0.02604143060167741, "grad_norm": 20.64653180181237, "learning_rate": 2.6040932256310057e-05, "loss": 2.3485, "mean_token_accuracy": 0.42413792610168455, "step": 25855 }, { "epoch": 0.02604646665478158, "grad_norm": 15.908623717196344, "learning_rate": 2.6045968213041116e-05, "loss": 2.3456, "mean_token_accuracy": 0.41379310190677643, "step": 25860 }, { "epoch": 0.026051502707885754, "grad_norm": 17.77808213405826, "learning_rate": 2.6051004169772175e-05, "loss": 2.5897, "mean_token_accuracy": 0.4000000059604645, "step": 25865 }, { "epoch": 0.026056538760989928, "grad_norm": 16.600821966319305, "learning_rate": 2.6056040126503234e-05, "loss": 2.7458, "mean_token_accuracy": 0.41379310488700866, "step": 25870 }, { "epoch": 0.026061574814094098, "grad_norm": 18.477933064527882, "learning_rate": 2.6061076083234294e-05, "loss": 2.7099, "mean_token_accuracy": 0.3517241358757019, "step": 25875 }, { "epoch": 0.026066610867198272, "grad_norm": 14.989907244546547, "learning_rate": 2.6066112039965353e-05, "loss": 2.5055, "mean_token_accuracy": 0.4482758641242981, "step": 25880 }, { "epoch": 0.026071646920302446, "grad_norm": 19.68610304171674, "learning_rate": 2.6071147996696415e-05, "loss": 2.3742, "mean_token_accuracy": 0.46896551847457885, "step": 25885 }, { "epoch": 0.02607668297340662, "grad_norm": 22.7325742237096, "learning_rate": 2.6076183953427475e-05, "loss": 2.4356, "mean_token_accuracy": 0.39310345649719236, "step": 25890 }, { "epoch": 0.02608171902651079, "grad_norm": 17.478388040797597, "learning_rate": 2.6081219910158534e-05, "loss": 2.6535, "mean_token_accuracy": 0.38620689511299133, "step": 25895 }, { "epoch": 0.026086755079614964, "grad_norm": 17.0547481257583, "learning_rate": 2.6086255866889593e-05, "loss": 2.1989, "mean_token_accuracy": 0.42758620381355283, "step": 25900 }, { "epoch": 0.026091791132719137, "grad_norm": 18.86088943424977, "learning_rate": 2.609129182362065e-05, "loss": 2.65, "mean_token_accuracy": 0.4362371563911438, "step": 25905 }, { "epoch": 0.026096827185823308, "grad_norm": 21.47708979460711, "learning_rate": 2.6096327780351715e-05, "loss": 2.364, "mean_token_accuracy": 0.44137930274009707, "step": 25910 }, { "epoch": 0.02610186323892748, "grad_norm": 16.176665968790154, "learning_rate": 2.6101363737082774e-05, "loss": 2.7433, "mean_token_accuracy": 0.32413792610168457, "step": 25915 }, { "epoch": 0.026106899292031655, "grad_norm": 19.767097960334574, "learning_rate": 2.6106399693813834e-05, "loss": 2.2928, "mean_token_accuracy": 0.4379310429096222, "step": 25920 }, { "epoch": 0.02611193534513583, "grad_norm": 18.958794498654328, "learning_rate": 2.611143565054489e-05, "loss": 2.5264, "mean_token_accuracy": 0.3758620709180832, "step": 25925 }, { "epoch": 0.02611697139824, "grad_norm": 21.31067008455053, "learning_rate": 2.611647160727595e-05, "loss": 2.4152, "mean_token_accuracy": 0.41034482717514037, "step": 25930 }, { "epoch": 0.026122007451344173, "grad_norm": 16.94517816408171, "learning_rate": 2.6121507564007015e-05, "loss": 2.3422, "mean_token_accuracy": 0.4344827651977539, "step": 25935 }, { "epoch": 0.026127043504448347, "grad_norm": 22.283280642839802, "learning_rate": 2.6126543520738074e-05, "loss": 2.3823, "mean_token_accuracy": 0.4296430587768555, "step": 25940 }, { "epoch": 0.026132079557552517, "grad_norm": 14.534419908807775, "learning_rate": 2.613157947746913e-05, "loss": 2.651, "mean_token_accuracy": 0.3896551728248596, "step": 25945 }, { "epoch": 0.02613711561065669, "grad_norm": 15.191204424783667, "learning_rate": 2.613661543420019e-05, "loss": 2.7103, "mean_token_accuracy": 0.42413793206214906, "step": 25950 }, { "epoch": 0.026142151663760865, "grad_norm": 20.072054598243465, "learning_rate": 2.614165139093125e-05, "loss": 2.5941, "mean_token_accuracy": 0.3793103456497192, "step": 25955 }, { "epoch": 0.02614718771686504, "grad_norm": 15.881046811846868, "learning_rate": 2.6146687347662308e-05, "loss": 2.6041, "mean_token_accuracy": 0.41034482717514037, "step": 25960 }, { "epoch": 0.02615222376996921, "grad_norm": 17.55399754053558, "learning_rate": 2.615172330439337e-05, "loss": 2.5773, "mean_token_accuracy": 0.42280701994895936, "step": 25965 }, { "epoch": 0.026157259823073382, "grad_norm": 18.13474837180395, "learning_rate": 2.615675926112443e-05, "loss": 2.4811, "mean_token_accuracy": 0.43103448748588563, "step": 25970 }, { "epoch": 0.026162295876177556, "grad_norm": 20.762393383593924, "learning_rate": 2.616179521785549e-05, "loss": 2.4835, "mean_token_accuracy": 0.4068965494632721, "step": 25975 }, { "epoch": 0.026167331929281726, "grad_norm": 18.646043287955806, "learning_rate": 2.6166831174586548e-05, "loss": 2.221, "mean_token_accuracy": 0.4517241418361664, "step": 25980 }, { "epoch": 0.0261723679823859, "grad_norm": 18.170538491944484, "learning_rate": 2.6171867131317607e-05, "loss": 2.3222, "mean_token_accuracy": 0.4517241418361664, "step": 25985 }, { "epoch": 0.026177404035490074, "grad_norm": 20.235071317859106, "learning_rate": 2.617690308804867e-05, "loss": 2.9746, "mean_token_accuracy": 0.34827586114406583, "step": 25990 }, { "epoch": 0.026182440088594248, "grad_norm": 16.02823025745344, "learning_rate": 2.618193904477973e-05, "loss": 2.1697, "mean_token_accuracy": 0.4862069010734558, "step": 25995 }, { "epoch": 0.026187476141698418, "grad_norm": 28.581867209201228, "learning_rate": 2.618697500151079e-05, "loss": 2.5501, "mean_token_accuracy": 0.3840895354747772, "step": 26000 }, { "epoch": 0.026192512194802592, "grad_norm": 16.35438875023757, "learning_rate": 2.6192010958241848e-05, "loss": 2.5164, "mean_token_accuracy": 0.3999999940395355, "step": 26005 }, { "epoch": 0.026197548247906766, "grad_norm": 18.598280665744497, "learning_rate": 2.6197046914972907e-05, "loss": 2.523, "mean_token_accuracy": 0.38620689511299133, "step": 26010 }, { "epoch": 0.026202584301010936, "grad_norm": 16.148807171906178, "learning_rate": 2.620208287170397e-05, "loss": 2.1861, "mean_token_accuracy": 0.4917120337486267, "step": 26015 }, { "epoch": 0.02620762035411511, "grad_norm": 28.374416166804952, "learning_rate": 2.620711882843503e-05, "loss": 3.1545, "mean_token_accuracy": 0.3482758581638336, "step": 26020 }, { "epoch": 0.026212656407219283, "grad_norm": 16.7052585556346, "learning_rate": 2.6212154785166088e-05, "loss": 2.9811, "mean_token_accuracy": 0.41379310488700866, "step": 26025 }, { "epoch": 0.026217692460323457, "grad_norm": 18.409387823406544, "learning_rate": 2.6217190741897147e-05, "loss": 2.3969, "mean_token_accuracy": 0.42413792610168455, "step": 26030 }, { "epoch": 0.026222728513427628, "grad_norm": 20.075371426043436, "learning_rate": 2.6222226698628207e-05, "loss": 2.5284, "mean_token_accuracy": 0.4068965554237366, "step": 26035 }, { "epoch": 0.0262277645665318, "grad_norm": 15.918577255361972, "learning_rate": 2.6227262655359262e-05, "loss": 2.5395, "mean_token_accuracy": 0.4448275864124298, "step": 26040 }, { "epoch": 0.026232800619635975, "grad_norm": 17.56779060560404, "learning_rate": 2.623229861209033e-05, "loss": 2.4262, "mean_token_accuracy": 0.39655172228813174, "step": 26045 }, { "epoch": 0.026237836672740145, "grad_norm": 15.920391598927626, "learning_rate": 2.6237334568821388e-05, "loss": 2.9236, "mean_token_accuracy": 0.4103448212146759, "step": 26050 }, { "epoch": 0.02624287272584432, "grad_norm": 19.982084165770296, "learning_rate": 2.6242370525552447e-05, "loss": 2.7152, "mean_token_accuracy": 0.4103448331356049, "step": 26055 }, { "epoch": 0.026247908778948493, "grad_norm": 22.713750530514663, "learning_rate": 2.6247406482283503e-05, "loss": 2.8908, "mean_token_accuracy": 0.3620689660310745, "step": 26060 }, { "epoch": 0.026252944832052667, "grad_norm": 23.267324228420723, "learning_rate": 2.6252442439014562e-05, "loss": 2.6547, "mean_token_accuracy": 0.42758620977401735, "step": 26065 }, { "epoch": 0.026257980885156837, "grad_norm": 20.882740320609155, "learning_rate": 2.6257478395745628e-05, "loss": 2.4161, "mean_token_accuracy": 0.4413793087005615, "step": 26070 }, { "epoch": 0.02626301693826101, "grad_norm": 18.498385332204048, "learning_rate": 2.6262514352476687e-05, "loss": 2.3803, "mean_token_accuracy": 0.4241379380226135, "step": 26075 }, { "epoch": 0.026268052991365184, "grad_norm": 16.837928652063248, "learning_rate": 2.6267550309207743e-05, "loss": 2.4041, "mean_token_accuracy": 0.42758620977401735, "step": 26080 }, { "epoch": 0.026273089044469355, "grad_norm": 16.31266645262563, "learning_rate": 2.6272586265938802e-05, "loss": 2.3609, "mean_token_accuracy": 0.3881427705287933, "step": 26085 }, { "epoch": 0.02627812509757353, "grad_norm": 17.449170934730695, "learning_rate": 2.6277622222669862e-05, "loss": 2.5095, "mean_token_accuracy": 0.3758620619773865, "step": 26090 }, { "epoch": 0.026283161150677702, "grad_norm": 18.7924229573458, "learning_rate": 2.6282658179400928e-05, "loss": 2.1091, "mean_token_accuracy": 0.47241378426551817, "step": 26095 }, { "epoch": 0.026288197203781876, "grad_norm": 17.644889229293288, "learning_rate": 2.6287694136131984e-05, "loss": 2.5119, "mean_token_accuracy": 0.41560799181461333, "step": 26100 }, { "epoch": 0.026293233256886046, "grad_norm": 26.663174742177656, "learning_rate": 2.6292730092863043e-05, "loss": 2.5348, "mean_token_accuracy": 0.4050211668014526, "step": 26105 }, { "epoch": 0.02629826930999022, "grad_norm": 21.22836011116904, "learning_rate": 2.6297766049594102e-05, "loss": 2.7621, "mean_token_accuracy": 0.38275861740112305, "step": 26110 }, { "epoch": 0.026303305363094394, "grad_norm": 18.542535876119285, "learning_rate": 2.630280200632516e-05, "loss": 2.4463, "mean_token_accuracy": 0.4517241299152374, "step": 26115 }, { "epoch": 0.026308341416198564, "grad_norm": 21.911508126273496, "learning_rate": 2.630783796305622e-05, "loss": 2.4079, "mean_token_accuracy": 0.3862068891525269, "step": 26120 }, { "epoch": 0.026313377469302738, "grad_norm": 19.651240442874094, "learning_rate": 2.6312873919787283e-05, "loss": 3.0094, "mean_token_accuracy": 0.33793103098869326, "step": 26125 }, { "epoch": 0.02631841352240691, "grad_norm": 17.69593332673549, "learning_rate": 2.6317909876518343e-05, "loss": 2.2797, "mean_token_accuracy": 0.46551724076271056, "step": 26130 }, { "epoch": 0.026323449575511085, "grad_norm": 20.692165945646035, "learning_rate": 2.6322945833249402e-05, "loss": 2.7463, "mean_token_accuracy": 0.41034482717514037, "step": 26135 }, { "epoch": 0.026328485628615256, "grad_norm": 21.706521491961233, "learning_rate": 2.632798178998046e-05, "loss": 2.3286, "mean_token_accuracy": 0.42758620381355283, "step": 26140 }, { "epoch": 0.02633352168171943, "grad_norm": 16.873861176971804, "learning_rate": 2.633301774671152e-05, "loss": 2.2868, "mean_token_accuracy": 0.45862069725990295, "step": 26145 }, { "epoch": 0.026338557734823603, "grad_norm": 14.176027348070487, "learning_rate": 2.6338053703442583e-05, "loss": 2.3125, "mean_token_accuracy": 0.4241379201412201, "step": 26150 }, { "epoch": 0.026343593787927774, "grad_norm": 18.125882997275788, "learning_rate": 2.6343089660173642e-05, "loss": 2.1809, "mean_token_accuracy": 0.4241379380226135, "step": 26155 }, { "epoch": 0.026348629841031947, "grad_norm": 19.870521606188333, "learning_rate": 2.63481256169047e-05, "loss": 2.5662, "mean_token_accuracy": 0.4034482717514038, "step": 26160 }, { "epoch": 0.02635366589413612, "grad_norm": 17.496770658728988, "learning_rate": 2.635316157363576e-05, "loss": 2.3505, "mean_token_accuracy": 0.4359951615333557, "step": 26165 }, { "epoch": 0.026358701947240295, "grad_norm": 19.903553768195984, "learning_rate": 2.635819753036682e-05, "loss": 2.4462, "mean_token_accuracy": 0.39655172228813174, "step": 26170 }, { "epoch": 0.026363738000344465, "grad_norm": 17.738590490295884, "learning_rate": 2.6363233487097883e-05, "loss": 2.6248, "mean_token_accuracy": 0.4384236454963684, "step": 26175 }, { "epoch": 0.02636877405344864, "grad_norm": 19.53928278627887, "learning_rate": 2.6368269443828942e-05, "loss": 3.031, "mean_token_accuracy": 0.36896551847457887, "step": 26180 }, { "epoch": 0.026373810106552813, "grad_norm": 15.730385086977082, "learning_rate": 2.637330540056e-05, "loss": 2.4054, "mean_token_accuracy": 0.3965517282485962, "step": 26185 }, { "epoch": 0.026378846159656983, "grad_norm": 30.52860961192424, "learning_rate": 2.637834135729106e-05, "loss": 2.7037, "mean_token_accuracy": 0.3793103516101837, "step": 26190 }, { "epoch": 0.026383882212761157, "grad_norm": 16.791824651966305, "learning_rate": 2.6383377314022116e-05, "loss": 2.1038, "mean_token_accuracy": 0.45862069725990295, "step": 26195 }, { "epoch": 0.02638891826586533, "grad_norm": 21.73665551890945, "learning_rate": 2.6388413270753182e-05, "loss": 2.5717, "mean_token_accuracy": 0.38620689511299133, "step": 26200 }, { "epoch": 0.026393954318969504, "grad_norm": 20.007691118140873, "learning_rate": 2.639344922748424e-05, "loss": 2.7644, "mean_token_accuracy": 0.38965516686439516, "step": 26205 }, { "epoch": 0.026398990372073675, "grad_norm": 14.65794053329391, "learning_rate": 2.63984851842153e-05, "loss": 2.3932, "mean_token_accuracy": 0.42413793206214906, "step": 26210 }, { "epoch": 0.02640402642517785, "grad_norm": 20.08702757380599, "learning_rate": 2.6403521140946357e-05, "loss": 2.5449, "mean_token_accuracy": 0.4517241299152374, "step": 26215 }, { "epoch": 0.026409062478282022, "grad_norm": 19.761384868979658, "learning_rate": 2.6408557097677416e-05, "loss": 2.3771, "mean_token_accuracy": 0.4413793206214905, "step": 26220 }, { "epoch": 0.026414098531386192, "grad_norm": 17.19068219959304, "learning_rate": 2.6413593054408475e-05, "loss": 2.3559, "mean_token_accuracy": 0.4517241299152374, "step": 26225 }, { "epoch": 0.026419134584490366, "grad_norm": 28.94639118718414, "learning_rate": 2.641862901113954e-05, "loss": 2.3919, "mean_token_accuracy": 0.4517241299152374, "step": 26230 }, { "epoch": 0.02642417063759454, "grad_norm": 18.030231458083296, "learning_rate": 2.6423664967870597e-05, "loss": 2.509, "mean_token_accuracy": 0.36551723778247835, "step": 26235 }, { "epoch": 0.026429206690698714, "grad_norm": 16.074891579148566, "learning_rate": 2.6428700924601656e-05, "loss": 2.3276, "mean_token_accuracy": 0.4103448212146759, "step": 26240 }, { "epoch": 0.026434242743802884, "grad_norm": 18.227665788986528, "learning_rate": 2.6433736881332716e-05, "loss": 2.3504, "mean_token_accuracy": 0.4034482777118683, "step": 26245 }, { "epoch": 0.026439278796907058, "grad_norm": 18.372551153445723, "learning_rate": 2.6438772838063775e-05, "loss": 2.676, "mean_token_accuracy": 0.37931033968925476, "step": 26250 }, { "epoch": 0.02644431485001123, "grad_norm": 20.244290584775747, "learning_rate": 2.6443808794794837e-05, "loss": 2.3995, "mean_token_accuracy": 0.4344827473163605, "step": 26255 }, { "epoch": 0.026449350903115402, "grad_norm": 15.321349976709902, "learning_rate": 2.6448844751525897e-05, "loss": 2.3865, "mean_token_accuracy": 0.458620685338974, "step": 26260 }, { "epoch": 0.026454386956219576, "grad_norm": 14.369231243448612, "learning_rate": 2.6453880708256956e-05, "loss": 2.0356, "mean_token_accuracy": 0.4862068951129913, "step": 26265 }, { "epoch": 0.02645942300932375, "grad_norm": 20.14346947038924, "learning_rate": 2.6458916664988015e-05, "loss": 2.5368, "mean_token_accuracy": 0.4551724076271057, "step": 26270 }, { "epoch": 0.026464459062427923, "grad_norm": 17.42228362121931, "learning_rate": 2.6463952621719074e-05, "loss": 2.4031, "mean_token_accuracy": 0.44482758045196535, "step": 26275 }, { "epoch": 0.026469495115532093, "grad_norm": 17.408870991882434, "learning_rate": 2.6468988578450137e-05, "loss": 2.5286, "mean_token_accuracy": 0.41167573928833007, "step": 26280 }, { "epoch": 0.026474531168636267, "grad_norm": 18.16205445534524, "learning_rate": 2.6474024535181196e-05, "loss": 2.6645, "mean_token_accuracy": 0.38275861740112305, "step": 26285 }, { "epoch": 0.02647956722174044, "grad_norm": 21.07187117745845, "learning_rate": 2.6479060491912256e-05, "loss": 2.7223, "mean_token_accuracy": 0.42413793206214906, "step": 26290 }, { "epoch": 0.02648460327484461, "grad_norm": 22.25249574898106, "learning_rate": 2.6484096448643315e-05, "loss": 2.7617, "mean_token_accuracy": 0.3758620619773865, "step": 26295 }, { "epoch": 0.026489639327948785, "grad_norm": 18.834483008035605, "learning_rate": 2.6489132405374374e-05, "loss": 2.4561, "mean_token_accuracy": 0.4172413766384125, "step": 26300 }, { "epoch": 0.02649467538105296, "grad_norm": 19.206897419078583, "learning_rate": 2.649416836210543e-05, "loss": 2.434, "mean_token_accuracy": 0.41724138259887694, "step": 26305 }, { "epoch": 0.026499711434157133, "grad_norm": 17.033178828163056, "learning_rate": 2.6499204318836496e-05, "loss": 2.4917, "mean_token_accuracy": 0.42413792610168455, "step": 26310 }, { "epoch": 0.026504747487261303, "grad_norm": 16.0311321100682, "learning_rate": 2.6504240275567555e-05, "loss": 2.5647, "mean_token_accuracy": 0.3931034505367279, "step": 26315 }, { "epoch": 0.026509783540365477, "grad_norm": 15.321897638472267, "learning_rate": 2.6509276232298614e-05, "loss": 2.3784, "mean_token_accuracy": 0.41034482717514037, "step": 26320 }, { "epoch": 0.02651481959346965, "grad_norm": 20.68872243861882, "learning_rate": 2.651431218902967e-05, "loss": 2.2419, "mean_token_accuracy": 0.439443439245224, "step": 26325 }, { "epoch": 0.02651985564657382, "grad_norm": 22.635571085642113, "learning_rate": 2.651934814576073e-05, "loss": 2.3832, "mean_token_accuracy": 0.42758620381355283, "step": 26330 }, { "epoch": 0.026524891699677994, "grad_norm": 19.10018053868613, "learning_rate": 2.6524384102491796e-05, "loss": 2.6389, "mean_token_accuracy": 0.41034482717514037, "step": 26335 }, { "epoch": 0.026529927752782168, "grad_norm": 17.310142897443317, "learning_rate": 2.6529420059222855e-05, "loss": 2.7653, "mean_token_accuracy": 0.42068964838981626, "step": 26340 }, { "epoch": 0.026534963805886342, "grad_norm": 18.289235779874982, "learning_rate": 2.653445601595391e-05, "loss": 2.4187, "mean_token_accuracy": 0.4505747139453888, "step": 26345 }, { "epoch": 0.026539999858990512, "grad_norm": 14.72144814066945, "learning_rate": 2.653949197268497e-05, "loss": 1.847, "mean_token_accuracy": 0.5226860225200654, "step": 26350 }, { "epoch": 0.026545035912094686, "grad_norm": 19.291380147515234, "learning_rate": 2.654452792941603e-05, "loss": 2.3638, "mean_token_accuracy": 0.4448275864124298, "step": 26355 }, { "epoch": 0.02655007196519886, "grad_norm": 24.490456362132456, "learning_rate": 2.6549563886147095e-05, "loss": 2.6383, "mean_token_accuracy": 0.41923774480819703, "step": 26360 }, { "epoch": 0.02655510801830303, "grad_norm": 19.106938611537938, "learning_rate": 2.6554599842878155e-05, "loss": 2.5966, "mean_token_accuracy": 0.4103448212146759, "step": 26365 }, { "epoch": 0.026560144071407204, "grad_norm": 23.37373698501716, "learning_rate": 2.655963579960921e-05, "loss": 2.6726, "mean_token_accuracy": 0.41034482717514037, "step": 26370 }, { "epoch": 0.026565180124511378, "grad_norm": 21.236744844716764, "learning_rate": 2.656467175634027e-05, "loss": 2.3783, "mean_token_accuracy": 0.42758620977401735, "step": 26375 }, { "epoch": 0.02657021617761555, "grad_norm": 22.67547365851139, "learning_rate": 2.656970771307133e-05, "loss": 2.7224, "mean_token_accuracy": 0.4137930989265442, "step": 26380 }, { "epoch": 0.02657525223071972, "grad_norm": 18.02583984070576, "learning_rate": 2.6574743669802388e-05, "loss": 2.7375, "mean_token_accuracy": 0.35172413289546967, "step": 26385 }, { "epoch": 0.026580288283823895, "grad_norm": 16.14427873406681, "learning_rate": 2.657977962653345e-05, "loss": 2.6752, "mean_token_accuracy": 0.4000000059604645, "step": 26390 }, { "epoch": 0.02658532433692807, "grad_norm": 25.49435604664068, "learning_rate": 2.658481558326451e-05, "loss": 2.8213, "mean_token_accuracy": 0.38275861740112305, "step": 26395 }, { "epoch": 0.02659036039003224, "grad_norm": 22.73198606305175, "learning_rate": 2.658985153999557e-05, "loss": 2.6673, "mean_token_accuracy": 0.36206896901130675, "step": 26400 }, { "epoch": 0.026595396443136413, "grad_norm": 15.150778368839646, "learning_rate": 2.659488749672663e-05, "loss": 2.7391, "mean_token_accuracy": 0.34482758641242983, "step": 26405 }, { "epoch": 0.026600432496240587, "grad_norm": 16.52730320678833, "learning_rate": 2.6599923453457688e-05, "loss": 2.4525, "mean_token_accuracy": 0.42758620381355283, "step": 26410 }, { "epoch": 0.02660546854934476, "grad_norm": 18.851941243178903, "learning_rate": 2.660495941018875e-05, "loss": 2.5088, "mean_token_accuracy": 0.4469449520111084, "step": 26415 }, { "epoch": 0.02661050460244893, "grad_norm": 19.163000430352216, "learning_rate": 2.660999536691981e-05, "loss": 2.5397, "mean_token_accuracy": 0.3931034475564957, "step": 26420 }, { "epoch": 0.026615540655553105, "grad_norm": 17.515856701055565, "learning_rate": 2.661503132365087e-05, "loss": 2.4471, "mean_token_accuracy": 0.4137930989265442, "step": 26425 }, { "epoch": 0.02662057670865728, "grad_norm": 17.50651473714608, "learning_rate": 2.6620067280381928e-05, "loss": 2.5059, "mean_token_accuracy": 0.43103448748588563, "step": 26430 }, { "epoch": 0.02662561276176145, "grad_norm": 20.836084231608638, "learning_rate": 2.6625103237112987e-05, "loss": 2.6053, "mean_token_accuracy": 0.403448274731636, "step": 26435 }, { "epoch": 0.026630648814865623, "grad_norm": 16.819906694833826, "learning_rate": 2.663013919384405e-05, "loss": 2.2948, "mean_token_accuracy": 0.4433151841163635, "step": 26440 }, { "epoch": 0.026635684867969796, "grad_norm": 20.018234219793513, "learning_rate": 2.663517515057511e-05, "loss": 2.5822, "mean_token_accuracy": 0.36551724672317504, "step": 26445 }, { "epoch": 0.02664072092107397, "grad_norm": 17.43892433515772, "learning_rate": 2.664021110730617e-05, "loss": 2.6166, "mean_token_accuracy": 0.4068965554237366, "step": 26450 }, { "epoch": 0.02664575697417814, "grad_norm": 17.03087560976558, "learning_rate": 2.6645247064037228e-05, "loss": 3.0, "mean_token_accuracy": 0.320689657330513, "step": 26455 }, { "epoch": 0.026650793027282314, "grad_norm": 22.121839046282748, "learning_rate": 2.6650283020768284e-05, "loss": 2.6548, "mean_token_accuracy": 0.36551724672317504, "step": 26460 }, { "epoch": 0.026655829080386488, "grad_norm": 22.480825895544886, "learning_rate": 2.6655318977499343e-05, "loss": 2.7532, "mean_token_accuracy": 0.3965517282485962, "step": 26465 }, { "epoch": 0.02666086513349066, "grad_norm": 14.583735454226035, "learning_rate": 2.666035493423041e-05, "loss": 2.3018, "mean_token_accuracy": 0.4896551728248596, "step": 26470 }, { "epoch": 0.026665901186594832, "grad_norm": 16.278201843025588, "learning_rate": 2.6665390890961468e-05, "loss": 2.5249, "mean_token_accuracy": 0.4379310369491577, "step": 26475 }, { "epoch": 0.026670937239699006, "grad_norm": 19.610670204553035, "learning_rate": 2.6670426847692524e-05, "loss": 2.7158, "mean_token_accuracy": 0.42413793206214906, "step": 26480 }, { "epoch": 0.02667597329280318, "grad_norm": 18.37476725856538, "learning_rate": 2.6675462804423583e-05, "loss": 2.6315, "mean_token_accuracy": 0.41034482717514037, "step": 26485 }, { "epoch": 0.02668100934590735, "grad_norm": 15.527395835135536, "learning_rate": 2.6680498761154643e-05, "loss": 2.5128, "mean_token_accuracy": 0.42413792610168455, "step": 26490 }, { "epoch": 0.026686045399011524, "grad_norm": 18.439513665993097, "learning_rate": 2.668553471788571e-05, "loss": 2.5044, "mean_token_accuracy": 0.41724138259887694, "step": 26495 }, { "epoch": 0.026691081452115697, "grad_norm": 18.166266412915164, "learning_rate": 2.6690570674616765e-05, "loss": 2.4041, "mean_token_accuracy": 0.42068966031074523, "step": 26500 }, { "epoch": 0.026696117505219868, "grad_norm": 43.84216410257988, "learning_rate": 2.6695606631347824e-05, "loss": 3.0035, "mean_token_accuracy": 0.3896551728248596, "step": 26505 }, { "epoch": 0.02670115355832404, "grad_norm": 24.907906952258795, "learning_rate": 2.6700642588078883e-05, "loss": 2.5792, "mean_token_accuracy": 0.38965516686439516, "step": 26510 }, { "epoch": 0.026706189611428215, "grad_norm": 18.975182651807927, "learning_rate": 2.6705678544809942e-05, "loss": 2.2964, "mean_token_accuracy": 0.42413792610168455, "step": 26515 }, { "epoch": 0.02671122566453239, "grad_norm": 26.6230578720545, "learning_rate": 2.6710714501541005e-05, "loss": 2.3411, "mean_token_accuracy": 0.37586206793785093, "step": 26520 }, { "epoch": 0.02671626171763656, "grad_norm": 17.475275833237866, "learning_rate": 2.6715750458272064e-05, "loss": 2.0487, "mean_token_accuracy": 0.5082512438297272, "step": 26525 }, { "epoch": 0.026721297770740733, "grad_norm": 19.749955516066873, "learning_rate": 2.6720786415003123e-05, "loss": 2.5239, "mean_token_accuracy": 0.39310344457626345, "step": 26530 }, { "epoch": 0.026726333823844907, "grad_norm": 18.345891727680215, "learning_rate": 2.6725822371734183e-05, "loss": 2.4944, "mean_token_accuracy": 0.42758620381355283, "step": 26535 }, { "epoch": 0.026731369876949077, "grad_norm": 24.237299235181624, "learning_rate": 2.6730858328465242e-05, "loss": 2.7044, "mean_token_accuracy": 0.3896551728248596, "step": 26540 }, { "epoch": 0.02673640593005325, "grad_norm": 18.50473053043379, "learning_rate": 2.67358942851963e-05, "loss": 2.7367, "mean_token_accuracy": 0.3586206942796707, "step": 26545 }, { "epoch": 0.026741441983157425, "grad_norm": 16.893301722057114, "learning_rate": 2.6740930241927364e-05, "loss": 2.5895, "mean_token_accuracy": 0.39310344457626345, "step": 26550 }, { "epoch": 0.0267464780362616, "grad_norm": 20.952702031989787, "learning_rate": 2.6745966198658423e-05, "loss": 2.4712, "mean_token_accuracy": 0.40689654350280763, "step": 26555 }, { "epoch": 0.02675151408936577, "grad_norm": 20.227259840838073, "learning_rate": 2.6751002155389482e-05, "loss": 2.6414, "mean_token_accuracy": 0.3724137932062149, "step": 26560 }, { "epoch": 0.026756550142469943, "grad_norm": 17.363299492869327, "learning_rate": 2.675603811212054e-05, "loss": 2.5097, "mean_token_accuracy": 0.41379311084747317, "step": 26565 }, { "epoch": 0.026761586195574116, "grad_norm": 22.942609479603245, "learning_rate": 2.67610740688516e-05, "loss": 2.5691, "mean_token_accuracy": 0.37586207389831544, "step": 26570 }, { "epoch": 0.026766622248678287, "grad_norm": 23.705061754197057, "learning_rate": 2.6766110025582663e-05, "loss": 2.3177, "mean_token_accuracy": 0.4551724076271057, "step": 26575 }, { "epoch": 0.02677165830178246, "grad_norm": 16.52475817165424, "learning_rate": 2.6771145982313723e-05, "loss": 2.2935, "mean_token_accuracy": 0.4413793087005615, "step": 26580 }, { "epoch": 0.026776694354886634, "grad_norm": 16.994721702510603, "learning_rate": 2.6776181939044782e-05, "loss": 2.3538, "mean_token_accuracy": 0.44482757449150084, "step": 26585 }, { "epoch": 0.026781730407990804, "grad_norm": 16.93425569342198, "learning_rate": 2.678121789577584e-05, "loss": 3.0897, "mean_token_accuracy": 0.34137930572032926, "step": 26590 }, { "epoch": 0.026786766461094978, "grad_norm": 17.71527059179849, "learning_rate": 2.6786253852506897e-05, "loss": 2.8102, "mean_token_accuracy": 0.3758620709180832, "step": 26595 }, { "epoch": 0.026791802514199152, "grad_norm": 20.41262201795603, "learning_rate": 2.6791289809237963e-05, "loss": 2.331, "mean_token_accuracy": 0.4448275864124298, "step": 26600 }, { "epoch": 0.026796838567303326, "grad_norm": 19.836118096521158, "learning_rate": 2.6796325765969022e-05, "loss": 2.6239, "mean_token_accuracy": 0.4068965494632721, "step": 26605 }, { "epoch": 0.026801874620407496, "grad_norm": 16.632618946257917, "learning_rate": 2.680136172270008e-05, "loss": 2.4838, "mean_token_accuracy": 0.41724138259887694, "step": 26610 }, { "epoch": 0.02680691067351167, "grad_norm": 25.57761902460903, "learning_rate": 2.6806397679431138e-05, "loss": 2.5928, "mean_token_accuracy": 0.39310344457626345, "step": 26615 }, { "epoch": 0.026811946726615844, "grad_norm": 20.335120611582486, "learning_rate": 2.6811433636162197e-05, "loss": 2.7376, "mean_token_accuracy": 0.33448276221752166, "step": 26620 }, { "epoch": 0.026816982779720014, "grad_norm": 17.75998385795157, "learning_rate": 2.6816469592893263e-05, "loss": 2.7259, "mean_token_accuracy": 0.3999999940395355, "step": 26625 }, { "epoch": 0.026822018832824188, "grad_norm": 20.875467040203176, "learning_rate": 2.6821505549624322e-05, "loss": 2.383, "mean_token_accuracy": 0.4103448212146759, "step": 26630 }, { "epoch": 0.02682705488592836, "grad_norm": 16.477109888812123, "learning_rate": 2.6826541506355378e-05, "loss": 2.3424, "mean_token_accuracy": 0.49171202778816225, "step": 26635 }, { "epoch": 0.026832090939032535, "grad_norm": 16.564570244018086, "learning_rate": 2.6831577463086437e-05, "loss": 2.397, "mean_token_accuracy": 0.417241370677948, "step": 26640 }, { "epoch": 0.026837126992136705, "grad_norm": 17.905535676758145, "learning_rate": 2.6836613419817496e-05, "loss": 2.1829, "mean_token_accuracy": 0.4724137902259827, "step": 26645 }, { "epoch": 0.02684216304524088, "grad_norm": 18.582089642733994, "learning_rate": 2.6841649376548556e-05, "loss": 2.5143, "mean_token_accuracy": 0.41034482717514037, "step": 26650 }, { "epoch": 0.026847199098345053, "grad_norm": 16.102334526438064, "learning_rate": 2.6846685333279618e-05, "loss": 2.3065, "mean_token_accuracy": 0.4793103516101837, "step": 26655 }, { "epoch": 0.026852235151449223, "grad_norm": 23.612563020690057, "learning_rate": 2.6851721290010678e-05, "loss": 2.5103, "mean_token_accuracy": 0.3965517282485962, "step": 26660 }, { "epoch": 0.026857271204553397, "grad_norm": 17.294890265387224, "learning_rate": 2.6856757246741737e-05, "loss": 2.3032, "mean_token_accuracy": 0.46896551847457885, "step": 26665 }, { "epoch": 0.02686230725765757, "grad_norm": 16.16031750561536, "learning_rate": 2.6861793203472796e-05, "loss": 2.297, "mean_token_accuracy": 0.42068965137004855, "step": 26670 }, { "epoch": 0.026867343310761745, "grad_norm": 19.91220422084447, "learning_rate": 2.6866829160203855e-05, "loss": 2.5976, "mean_token_accuracy": 0.38620689511299133, "step": 26675 }, { "epoch": 0.026872379363865915, "grad_norm": 15.2960239570482, "learning_rate": 2.6871865116934918e-05, "loss": 1.9801, "mean_token_accuracy": 0.4620689630508423, "step": 26680 }, { "epoch": 0.02687741541697009, "grad_norm": 22.639381307893732, "learning_rate": 2.6876901073665977e-05, "loss": 2.8707, "mean_token_accuracy": 0.37586207389831544, "step": 26685 }, { "epoch": 0.026882451470074262, "grad_norm": 15.410212350156943, "learning_rate": 2.6881937030397036e-05, "loss": 2.385, "mean_token_accuracy": 0.42758620977401735, "step": 26690 }, { "epoch": 0.026887487523178433, "grad_norm": 21.210383733920175, "learning_rate": 2.6886972987128096e-05, "loss": 2.6044, "mean_token_accuracy": 0.38106473088264464, "step": 26695 }, { "epoch": 0.026892523576282606, "grad_norm": 19.01866715972732, "learning_rate": 2.6892008943859155e-05, "loss": 2.8327, "mean_token_accuracy": 0.3620689570903778, "step": 26700 }, { "epoch": 0.02689755962938678, "grad_norm": 15.595556644708015, "learning_rate": 2.6897044900590218e-05, "loss": 2.1775, "mean_token_accuracy": 0.4482758641242981, "step": 26705 }, { "epoch": 0.026902595682490954, "grad_norm": 17.190845267179338, "learning_rate": 2.6902080857321277e-05, "loss": 2.6164, "mean_token_accuracy": 0.41379311084747317, "step": 26710 }, { "epoch": 0.026907631735595124, "grad_norm": 20.613282509870697, "learning_rate": 2.6907116814052336e-05, "loss": 2.6201, "mean_token_accuracy": 0.4, "step": 26715 }, { "epoch": 0.026912667788699298, "grad_norm": 14.536783358514555, "learning_rate": 2.6912152770783395e-05, "loss": 2.4021, "mean_token_accuracy": 0.42413793206214906, "step": 26720 }, { "epoch": 0.026917703841803472, "grad_norm": 17.187835334340203, "learning_rate": 2.6917188727514455e-05, "loss": 2.113, "mean_token_accuracy": 0.4862068951129913, "step": 26725 }, { "epoch": 0.026922739894907642, "grad_norm": 18.74858153677933, "learning_rate": 2.692222468424551e-05, "loss": 2.3955, "mean_token_accuracy": 0.4241379380226135, "step": 26730 }, { "epoch": 0.026927775948011816, "grad_norm": 17.62455447866605, "learning_rate": 2.6927260640976577e-05, "loss": 2.6174, "mean_token_accuracy": 0.37586206793785093, "step": 26735 }, { "epoch": 0.02693281200111599, "grad_norm": 21.0848241618325, "learning_rate": 2.6932296597707636e-05, "loss": 2.6278, "mean_token_accuracy": 0.41034482717514037, "step": 26740 }, { "epoch": 0.026937848054220163, "grad_norm": 15.06289578582256, "learning_rate": 2.6937332554438695e-05, "loss": 2.6431, "mean_token_accuracy": 0.358620685338974, "step": 26745 }, { "epoch": 0.026942884107324334, "grad_norm": 17.424882190669397, "learning_rate": 2.694236851116975e-05, "loss": 2.2415, "mean_token_accuracy": 0.46896551847457885, "step": 26750 }, { "epoch": 0.026947920160428507, "grad_norm": 19.050732013016383, "learning_rate": 2.694740446790081e-05, "loss": 2.7662, "mean_token_accuracy": 0.33793102502822875, "step": 26755 }, { "epoch": 0.02695295621353268, "grad_norm": 16.64238312550753, "learning_rate": 2.6952440424631876e-05, "loss": 2.5786, "mean_token_accuracy": 0.38275861740112305, "step": 26760 }, { "epoch": 0.02695799226663685, "grad_norm": 15.265324960878868, "learning_rate": 2.6957476381362935e-05, "loss": 2.3636, "mean_token_accuracy": 0.3965517282485962, "step": 26765 }, { "epoch": 0.026963028319741025, "grad_norm": 15.327091127577264, "learning_rate": 2.696251233809399e-05, "loss": 2.6416, "mean_token_accuracy": 0.32758620083332063, "step": 26770 }, { "epoch": 0.0269680643728452, "grad_norm": 20.964042168386715, "learning_rate": 2.696754829482505e-05, "loss": 2.741, "mean_token_accuracy": 0.4068965554237366, "step": 26775 }, { "epoch": 0.026973100425949373, "grad_norm": 14.946707768657257, "learning_rate": 2.697258425155611e-05, "loss": 2.3998, "mean_token_accuracy": 0.41034482717514037, "step": 26780 }, { "epoch": 0.026978136479053543, "grad_norm": 18.31421125412061, "learning_rate": 2.6977620208287176e-05, "loss": 2.7007, "mean_token_accuracy": 0.3689655214548111, "step": 26785 }, { "epoch": 0.026983172532157717, "grad_norm": 18.83236959697641, "learning_rate": 2.698265616501823e-05, "loss": 2.8449, "mean_token_accuracy": 0.39310344457626345, "step": 26790 }, { "epoch": 0.02698820858526189, "grad_norm": 15.591172964919453, "learning_rate": 2.698769212174929e-05, "loss": 2.2724, "mean_token_accuracy": 0.4068965494632721, "step": 26795 }, { "epoch": 0.02699324463836606, "grad_norm": 17.825145649700218, "learning_rate": 2.699272807848035e-05, "loss": 2.42, "mean_token_accuracy": 0.43448275327682495, "step": 26800 }, { "epoch": 0.026998280691470235, "grad_norm": 15.925485439034828, "learning_rate": 2.699776403521141e-05, "loss": 2.5881, "mean_token_accuracy": 0.42068964838981626, "step": 26805 }, { "epoch": 0.02700331674457441, "grad_norm": 15.144272722144196, "learning_rate": 2.700279999194247e-05, "loss": 2.3852, "mean_token_accuracy": 0.41724138259887694, "step": 26810 }, { "epoch": 0.027008352797678582, "grad_norm": 20.335665610270276, "learning_rate": 2.700783594867353e-05, "loss": 2.4388, "mean_token_accuracy": 0.4172413766384125, "step": 26815 }, { "epoch": 0.027013388850782753, "grad_norm": 21.056845117806382, "learning_rate": 2.701287190540459e-05, "loss": 2.7865, "mean_token_accuracy": 0.41379310488700866, "step": 26820 }, { "epoch": 0.027018424903886926, "grad_norm": 13.770110047772942, "learning_rate": 2.701790786213565e-05, "loss": 2.5349, "mean_token_accuracy": 0.4103448331356049, "step": 26825 }, { "epoch": 0.0270234609569911, "grad_norm": 18.94634921490606, "learning_rate": 2.702294381886671e-05, "loss": 2.4992, "mean_token_accuracy": 0.358620685338974, "step": 26830 }, { "epoch": 0.02702849701009527, "grad_norm": 17.53954027205738, "learning_rate": 2.702797977559777e-05, "loss": 2.2204, "mean_token_accuracy": 0.4586206912994385, "step": 26835 }, { "epoch": 0.027033533063199444, "grad_norm": 30.561714742724668, "learning_rate": 2.703301573232883e-05, "loss": 2.3019, "mean_token_accuracy": 0.4901477873325348, "step": 26840 }, { "epoch": 0.027038569116303618, "grad_norm": 17.753730779509883, "learning_rate": 2.703805168905989e-05, "loss": 2.6214, "mean_token_accuracy": 0.362068971991539, "step": 26845 }, { "epoch": 0.02704360516940779, "grad_norm": 18.252880811630657, "learning_rate": 2.704308764579095e-05, "loss": 2.5086, "mean_token_accuracy": 0.379310342669487, "step": 26850 }, { "epoch": 0.027048641222511962, "grad_norm": 21.59894897189673, "learning_rate": 2.704812360252201e-05, "loss": 2.3058, "mean_token_accuracy": 0.47132486701011655, "step": 26855 }, { "epoch": 0.027053677275616136, "grad_norm": 18.852684013264735, "learning_rate": 2.7053159559253065e-05, "loss": 2.6412, "mean_token_accuracy": 0.43448275327682495, "step": 26860 }, { "epoch": 0.02705871332872031, "grad_norm": 26.79472507347962, "learning_rate": 2.705819551598413e-05, "loss": 2.1864, "mean_token_accuracy": 0.4482758641242981, "step": 26865 }, { "epoch": 0.02706374938182448, "grad_norm": 17.60896181927883, "learning_rate": 2.706323147271519e-05, "loss": 2.4758, "mean_token_accuracy": 0.3793103516101837, "step": 26870 }, { "epoch": 0.027068785434928654, "grad_norm": 18.82987881197227, "learning_rate": 2.706826742944625e-05, "loss": 2.5264, "mean_token_accuracy": 0.4551724135875702, "step": 26875 }, { "epoch": 0.027073821488032827, "grad_norm": 20.03584561924533, "learning_rate": 2.7073303386177305e-05, "loss": 2.5425, "mean_token_accuracy": 0.37241379618644715, "step": 26880 }, { "epoch": 0.027078857541137, "grad_norm": 18.30948766305271, "learning_rate": 2.7078339342908364e-05, "loss": 2.4934, "mean_token_accuracy": 0.42758620977401735, "step": 26885 }, { "epoch": 0.02708389359424117, "grad_norm": 17.207135611741407, "learning_rate": 2.7083375299639424e-05, "loss": 2.7789, "mean_token_accuracy": 0.3896551787853241, "step": 26890 }, { "epoch": 0.027088929647345345, "grad_norm": 22.960485582479805, "learning_rate": 2.708841125637049e-05, "loss": 2.7601, "mean_token_accuracy": 0.3896551728248596, "step": 26895 }, { "epoch": 0.02709396570044952, "grad_norm": 19.925661290776958, "learning_rate": 2.7093447213101545e-05, "loss": 2.5567, "mean_token_accuracy": 0.3965517282485962, "step": 26900 }, { "epoch": 0.02709900175355369, "grad_norm": 17.056318333424937, "learning_rate": 2.7098483169832605e-05, "loss": 2.1354, "mean_token_accuracy": 0.49999999403953554, "step": 26905 }, { "epoch": 0.027104037806657863, "grad_norm": 20.625864014975154, "learning_rate": 2.7103519126563664e-05, "loss": 2.492, "mean_token_accuracy": 0.441379314661026, "step": 26910 }, { "epoch": 0.027109073859762037, "grad_norm": 36.02802024084089, "learning_rate": 2.7108555083294723e-05, "loss": 2.554, "mean_token_accuracy": 0.3931034505367279, "step": 26915 }, { "epoch": 0.02711410991286621, "grad_norm": 21.865503840078667, "learning_rate": 2.711359104002579e-05, "loss": 2.3369, "mean_token_accuracy": 0.3965517282485962, "step": 26920 }, { "epoch": 0.02711914596597038, "grad_norm": 28.08703386516553, "learning_rate": 2.7118626996756845e-05, "loss": 2.34, "mean_token_accuracy": 0.4482758641242981, "step": 26925 }, { "epoch": 0.027124182019074555, "grad_norm": 16.522715965145718, "learning_rate": 2.7123662953487904e-05, "loss": 2.0986, "mean_token_accuracy": 0.47241379618644713, "step": 26930 }, { "epoch": 0.02712921807217873, "grad_norm": 18.968549413848102, "learning_rate": 2.7128698910218964e-05, "loss": 2.5155, "mean_token_accuracy": 0.4344827592372894, "step": 26935 }, { "epoch": 0.0271342541252829, "grad_norm": 19.340938659631657, "learning_rate": 2.7133734866950023e-05, "loss": 3.1601, "mean_token_accuracy": 0.35862069129943847, "step": 26940 }, { "epoch": 0.027139290178387072, "grad_norm": 17.230546271280193, "learning_rate": 2.7138770823681085e-05, "loss": 2.6367, "mean_token_accuracy": 0.43448275327682495, "step": 26945 }, { "epoch": 0.027144326231491246, "grad_norm": 16.708268477234174, "learning_rate": 2.7143806780412145e-05, "loss": 2.2461, "mean_token_accuracy": 0.4551724135875702, "step": 26950 }, { "epoch": 0.02714936228459542, "grad_norm": 20.716048452102342, "learning_rate": 2.7148842737143204e-05, "loss": 2.8234, "mean_token_accuracy": 0.3931034505367279, "step": 26955 }, { "epoch": 0.02715439833769959, "grad_norm": 18.32760900396895, "learning_rate": 2.7153878693874263e-05, "loss": 2.4838, "mean_token_accuracy": 0.4379310369491577, "step": 26960 }, { "epoch": 0.027159434390803764, "grad_norm": 15.444974810534694, "learning_rate": 2.7158914650605322e-05, "loss": 1.9527, "mean_token_accuracy": 0.5137930989265442, "step": 26965 }, { "epoch": 0.027164470443907938, "grad_norm": 19.93434166031694, "learning_rate": 2.7163950607336382e-05, "loss": 2.7885, "mean_token_accuracy": 0.38620689511299133, "step": 26970 }, { "epoch": 0.027169506497012108, "grad_norm": 20.776767886046805, "learning_rate": 2.7168986564067444e-05, "loss": 2.6308, "mean_token_accuracy": 0.40344826579093934, "step": 26975 }, { "epoch": 0.027174542550116282, "grad_norm": 25.440511474432316, "learning_rate": 2.7174022520798504e-05, "loss": 2.5263, "mean_token_accuracy": 0.4503327190876007, "step": 26980 }, { "epoch": 0.027179578603220456, "grad_norm": 19.37709120574868, "learning_rate": 2.7179058477529563e-05, "loss": 2.3057, "mean_token_accuracy": 0.4793103516101837, "step": 26985 }, { "epoch": 0.02718461465632463, "grad_norm": 23.281731019611883, "learning_rate": 2.7184094434260622e-05, "loss": 2.4307, "mean_token_accuracy": 0.4275861978530884, "step": 26990 }, { "epoch": 0.0271896507094288, "grad_norm": 17.519760365166217, "learning_rate": 2.7189130390991678e-05, "loss": 2.3818, "mean_token_accuracy": 0.4620689690113068, "step": 26995 }, { "epoch": 0.027194686762532973, "grad_norm": 15.647199536126944, "learning_rate": 2.7194166347722744e-05, "loss": 2.4159, "mean_token_accuracy": 0.3810042321681976, "step": 27000 }, { "epoch": 0.027199722815637147, "grad_norm": 16.69858153493782, "learning_rate": 2.7199202304453803e-05, "loss": 2.4466, "mean_token_accuracy": 0.4344827592372894, "step": 27005 }, { "epoch": 0.027204758868741317, "grad_norm": 34.2201166855786, "learning_rate": 2.7204238261184863e-05, "loss": 2.4222, "mean_token_accuracy": 0.43103448748588563, "step": 27010 }, { "epoch": 0.02720979492184549, "grad_norm": 26.316403109142225, "learning_rate": 2.720927421791592e-05, "loss": 2.4732, "mean_token_accuracy": 0.45862067937850953, "step": 27015 }, { "epoch": 0.027214830974949665, "grad_norm": 23.191213483264814, "learning_rate": 2.7214310174646978e-05, "loss": 2.5648, "mean_token_accuracy": 0.39655172228813174, "step": 27020 }, { "epoch": 0.02721986702805384, "grad_norm": 22.964984968292125, "learning_rate": 2.7219346131378044e-05, "loss": 2.6449, "mean_token_accuracy": 0.41034482717514037, "step": 27025 }, { "epoch": 0.02722490308115801, "grad_norm": 22.08642847487544, "learning_rate": 2.7224382088109103e-05, "loss": 2.4799, "mean_token_accuracy": 0.4, "step": 27030 }, { "epoch": 0.027229939134262183, "grad_norm": 22.146748101344265, "learning_rate": 2.722941804484016e-05, "loss": 2.6073, "mean_token_accuracy": 0.41724138259887694, "step": 27035 }, { "epoch": 0.027234975187366357, "grad_norm": 13.863170103345503, "learning_rate": 2.7234454001571218e-05, "loss": 2.1983, "mean_token_accuracy": 0.45172414779663084, "step": 27040 }, { "epoch": 0.027240011240470527, "grad_norm": 16.525576335716433, "learning_rate": 2.7239489958302277e-05, "loss": 2.4636, "mean_token_accuracy": 0.39655172228813174, "step": 27045 }, { "epoch": 0.0272450472935747, "grad_norm": 19.459897906754897, "learning_rate": 2.7244525915033343e-05, "loss": 2.2676, "mean_token_accuracy": 0.4689655005931854, "step": 27050 }, { "epoch": 0.027250083346678874, "grad_norm": 20.05728976113244, "learning_rate": 2.72495618717644e-05, "loss": 2.5845, "mean_token_accuracy": 0.3965517282485962, "step": 27055 }, { "epoch": 0.027255119399783048, "grad_norm": 17.626868207961696, "learning_rate": 2.725459782849546e-05, "loss": 2.3637, "mean_token_accuracy": 0.41379310488700866, "step": 27060 }, { "epoch": 0.02726015545288722, "grad_norm": 15.158798206477504, "learning_rate": 2.7259633785226518e-05, "loss": 2.633, "mean_token_accuracy": 0.37931033968925476, "step": 27065 }, { "epoch": 0.027265191505991392, "grad_norm": 13.481894532543539, "learning_rate": 2.7264669741957577e-05, "loss": 2.4068, "mean_token_accuracy": 0.43103448748588563, "step": 27070 }, { "epoch": 0.027270227559095566, "grad_norm": 16.852618664683018, "learning_rate": 2.7269705698688636e-05, "loss": 2.3698, "mean_token_accuracy": 0.4310344815254211, "step": 27075 }, { "epoch": 0.027275263612199736, "grad_norm": 19.734619599837295, "learning_rate": 2.72747416554197e-05, "loss": 2.2818, "mean_token_accuracy": 0.41379310488700866, "step": 27080 }, { "epoch": 0.02728029966530391, "grad_norm": 20.307936796990443, "learning_rate": 2.7279777612150758e-05, "loss": 2.5906, "mean_token_accuracy": 0.4103448331356049, "step": 27085 }, { "epoch": 0.027285335718408084, "grad_norm": 19.32360831491387, "learning_rate": 2.7284813568881817e-05, "loss": 2.3485, "mean_token_accuracy": 0.41724138259887694, "step": 27090 }, { "epoch": 0.027290371771512258, "grad_norm": 19.115525201502127, "learning_rate": 2.7289849525612877e-05, "loss": 2.3921, "mean_token_accuracy": 0.4482758641242981, "step": 27095 }, { "epoch": 0.027295407824616428, "grad_norm": 15.637345081140717, "learning_rate": 2.7294885482343936e-05, "loss": 2.4251, "mean_token_accuracy": 0.3931034505367279, "step": 27100 }, { "epoch": 0.0273004438777206, "grad_norm": 23.685773578035285, "learning_rate": 2.7299921439075e-05, "loss": 2.4629, "mean_token_accuracy": 0.46896552443504336, "step": 27105 }, { "epoch": 0.027305479930824775, "grad_norm": 15.799165360948571, "learning_rate": 2.7304957395806058e-05, "loss": 2.2292, "mean_token_accuracy": 0.4413793087005615, "step": 27110 }, { "epoch": 0.027310515983928946, "grad_norm": 17.59960935721835, "learning_rate": 2.7309993352537117e-05, "loss": 2.5913, "mean_token_accuracy": 0.36896551847457887, "step": 27115 }, { "epoch": 0.02731555203703312, "grad_norm": 18.7232075113392, "learning_rate": 2.7315029309268176e-05, "loss": 2.4847, "mean_token_accuracy": 0.4413793087005615, "step": 27120 }, { "epoch": 0.027320588090137293, "grad_norm": 17.032580750019978, "learning_rate": 2.7320065265999236e-05, "loss": 2.5046, "mean_token_accuracy": 0.43103447556495667, "step": 27125 }, { "epoch": 0.027325624143241467, "grad_norm": 19.242248713464573, "learning_rate": 2.7325101222730298e-05, "loss": 2.8294, "mean_token_accuracy": 0.358620685338974, "step": 27130 }, { "epoch": 0.027330660196345637, "grad_norm": 16.340068498774713, "learning_rate": 2.7330137179461357e-05, "loss": 2.2902, "mean_token_accuracy": 0.38965516686439516, "step": 27135 }, { "epoch": 0.02733569624944981, "grad_norm": 19.702415815858213, "learning_rate": 2.7335173136192417e-05, "loss": 2.6053, "mean_token_accuracy": 0.3999999940395355, "step": 27140 }, { "epoch": 0.027340732302553985, "grad_norm": 17.912554722972416, "learning_rate": 2.7340209092923476e-05, "loss": 2.7645, "mean_token_accuracy": 0.3275862097740173, "step": 27145 }, { "epoch": 0.027345768355658155, "grad_norm": 18.37105908707584, "learning_rate": 2.7345245049654532e-05, "loss": 2.3614, "mean_token_accuracy": 0.42413792610168455, "step": 27150 }, { "epoch": 0.02735080440876233, "grad_norm": 14.139528130755384, "learning_rate": 2.735028100638559e-05, "loss": 2.4512, "mean_token_accuracy": 0.4413793087005615, "step": 27155 }, { "epoch": 0.027355840461866503, "grad_norm": 14.941170369381537, "learning_rate": 2.7355316963116657e-05, "loss": 2.3365, "mean_token_accuracy": 0.458620685338974, "step": 27160 }, { "epoch": 0.027360876514970676, "grad_norm": 16.22472969922191, "learning_rate": 2.7360352919847716e-05, "loss": 2.7369, "mean_token_accuracy": 0.38275861740112305, "step": 27165 }, { "epoch": 0.027365912568074847, "grad_norm": 20.399271711134233, "learning_rate": 2.7365388876578772e-05, "loss": 2.386, "mean_token_accuracy": 0.4655172288417816, "step": 27170 }, { "epoch": 0.02737094862117902, "grad_norm": 19.162675336913377, "learning_rate": 2.737042483330983e-05, "loss": 2.4567, "mean_token_accuracy": 0.42758620381355283, "step": 27175 }, { "epoch": 0.027375984674283194, "grad_norm": 14.320613954501372, "learning_rate": 2.737546079004089e-05, "loss": 2.2488, "mean_token_accuracy": 0.40689654350280763, "step": 27180 }, { "epoch": 0.027381020727387365, "grad_norm": 18.681276203673328, "learning_rate": 2.7380496746771957e-05, "loss": 2.6047, "mean_token_accuracy": 0.3845735102891922, "step": 27185 }, { "epoch": 0.02738605678049154, "grad_norm": 20.93588520302252, "learning_rate": 2.7385532703503013e-05, "loss": 2.6694, "mean_token_accuracy": 0.4, "step": 27190 }, { "epoch": 0.027391092833595712, "grad_norm": 18.13116529545736, "learning_rate": 2.7390568660234072e-05, "loss": 2.3693, "mean_token_accuracy": 0.43980641961097716, "step": 27195 }, { "epoch": 0.027396128886699886, "grad_norm": 23.654789853833485, "learning_rate": 2.739560461696513e-05, "loss": 2.7075, "mean_token_accuracy": 0.41379310488700866, "step": 27200 }, { "epoch": 0.027401164939804056, "grad_norm": 18.853120486741567, "learning_rate": 2.740064057369619e-05, "loss": 2.3148, "mean_token_accuracy": 0.37586206793785093, "step": 27205 }, { "epoch": 0.02740620099290823, "grad_norm": 16.511041412220326, "learning_rate": 2.7405676530427253e-05, "loss": 2.6131, "mean_token_accuracy": 0.3793103516101837, "step": 27210 }, { "epoch": 0.027411237046012404, "grad_norm": 19.001390329784446, "learning_rate": 2.7410712487158312e-05, "loss": 2.5543, "mean_token_accuracy": 0.3793103337287903, "step": 27215 }, { "epoch": 0.027416273099116574, "grad_norm": 18.111528051606957, "learning_rate": 2.741574844388937e-05, "loss": 2.5501, "mean_token_accuracy": 0.41034482717514037, "step": 27220 }, { "epoch": 0.027421309152220748, "grad_norm": 15.084221409984176, "learning_rate": 2.742078440062043e-05, "loss": 2.3916, "mean_token_accuracy": 0.4068965494632721, "step": 27225 }, { "epoch": 0.02742634520532492, "grad_norm": 16.546786983202054, "learning_rate": 2.742582035735149e-05, "loss": 2.5995, "mean_token_accuracy": 0.41034482419490814, "step": 27230 }, { "epoch": 0.027431381258429095, "grad_norm": 27.30198478534222, "learning_rate": 2.743085631408255e-05, "loss": 2.4059, "mean_token_accuracy": 0.4379310369491577, "step": 27235 }, { "epoch": 0.027436417311533266, "grad_norm": 19.04729231564735, "learning_rate": 2.7435892270813612e-05, "loss": 2.4831, "mean_token_accuracy": 0.39655172228813174, "step": 27240 }, { "epoch": 0.02744145336463744, "grad_norm": 15.459256543940615, "learning_rate": 2.744092822754467e-05, "loss": 2.1928, "mean_token_accuracy": 0.4275862157344818, "step": 27245 }, { "epoch": 0.027446489417741613, "grad_norm": 17.565142652486028, "learning_rate": 2.744596418427573e-05, "loss": 2.3983, "mean_token_accuracy": 0.45517241954803467, "step": 27250 }, { "epoch": 0.027451525470845783, "grad_norm": 16.007976885258323, "learning_rate": 2.745100014100679e-05, "loss": 2.6119, "mean_token_accuracy": 0.4034482717514038, "step": 27255 }, { "epoch": 0.027456561523949957, "grad_norm": 16.704478205800484, "learning_rate": 2.745603609773785e-05, "loss": 2.2223, "mean_token_accuracy": 0.44482758045196535, "step": 27260 }, { "epoch": 0.02746159757705413, "grad_norm": 20.047154352302815, "learning_rate": 2.746107205446891e-05, "loss": 2.2953, "mean_token_accuracy": 0.42857142686843874, "step": 27265 }, { "epoch": 0.027466633630158305, "grad_norm": 17.67405689327055, "learning_rate": 2.746610801119997e-05, "loss": 2.3107, "mean_token_accuracy": 0.4379310369491577, "step": 27270 }, { "epoch": 0.027471669683262475, "grad_norm": 17.399814925804186, "learning_rate": 2.747114396793103e-05, "loss": 2.5877, "mean_token_accuracy": 0.4034482717514038, "step": 27275 }, { "epoch": 0.02747670573636665, "grad_norm": 17.09992509959253, "learning_rate": 2.747617992466209e-05, "loss": 2.8556, "mean_token_accuracy": 0.3344827562570572, "step": 27280 }, { "epoch": 0.027481741789470823, "grad_norm": 16.59739341088239, "learning_rate": 2.7481215881393145e-05, "loss": 2.3149, "mean_token_accuracy": 0.4536600112915039, "step": 27285 }, { "epoch": 0.027486777842574993, "grad_norm": 13.747106996107085, "learning_rate": 2.748625183812421e-05, "loss": 2.673, "mean_token_accuracy": 0.38771929740905764, "step": 27290 }, { "epoch": 0.027491813895679167, "grad_norm": 16.07875722969407, "learning_rate": 2.749128779485527e-05, "loss": 2.4356, "mean_token_accuracy": 0.38620689511299133, "step": 27295 }, { "epoch": 0.02749684994878334, "grad_norm": 16.95683051408008, "learning_rate": 2.749632375158633e-05, "loss": 2.363, "mean_token_accuracy": 0.4586206912994385, "step": 27300 }, { "epoch": 0.027501886001887514, "grad_norm": 16.795844924933615, "learning_rate": 2.7501359708317386e-05, "loss": 2.0186, "mean_token_accuracy": 0.5225650310516358, "step": 27305 }, { "epoch": 0.027506922054991684, "grad_norm": 17.266893980462907, "learning_rate": 2.7506395665048445e-05, "loss": 2.1679, "mean_token_accuracy": 0.4744101643562317, "step": 27310 }, { "epoch": 0.027511958108095858, "grad_norm": 16.273114282901634, "learning_rate": 2.7511431621779504e-05, "loss": 2.319, "mean_token_accuracy": 0.37241379022598264, "step": 27315 }, { "epoch": 0.027516994161200032, "grad_norm": 19.307710076705433, "learning_rate": 2.751646757851057e-05, "loss": 2.5767, "mean_token_accuracy": 0.34137930274009703, "step": 27320 }, { "epoch": 0.027522030214304202, "grad_norm": 30.43067687637569, "learning_rate": 2.7521503535241626e-05, "loss": 2.6254, "mean_token_accuracy": 0.39655172228813174, "step": 27325 }, { "epoch": 0.027527066267408376, "grad_norm": 18.374438326215635, "learning_rate": 2.7526539491972685e-05, "loss": 2.5149, "mean_token_accuracy": 0.39655172228813174, "step": 27330 }, { "epoch": 0.02753210232051255, "grad_norm": 26.129356140041466, "learning_rate": 2.7531575448703744e-05, "loss": 2.8693, "mean_token_accuracy": 0.41034482717514037, "step": 27335 }, { "epoch": 0.027537138373616724, "grad_norm": 15.796704079911935, "learning_rate": 2.7536611405434804e-05, "loss": 2.6027, "mean_token_accuracy": 0.40344828367233276, "step": 27340 }, { "epoch": 0.027542174426720894, "grad_norm": 16.69399783766348, "learning_rate": 2.7541647362165866e-05, "loss": 2.636, "mean_token_accuracy": 0.37241379022598264, "step": 27345 }, { "epoch": 0.027547210479825068, "grad_norm": 14.943979543328604, "learning_rate": 2.7546683318896926e-05, "loss": 2.3698, "mean_token_accuracy": 0.41512401700019835, "step": 27350 }, { "epoch": 0.02755224653292924, "grad_norm": 18.816893755896213, "learning_rate": 2.7551719275627985e-05, "loss": 2.511, "mean_token_accuracy": 0.42068964838981626, "step": 27355 }, { "epoch": 0.02755728258603341, "grad_norm": 17.908242418328797, "learning_rate": 2.7556755232359044e-05, "loss": 2.5565, "mean_token_accuracy": 0.3793103337287903, "step": 27360 }, { "epoch": 0.027562318639137585, "grad_norm": 17.804988205490563, "learning_rate": 2.7561791189090103e-05, "loss": 2.0971, "mean_token_accuracy": 0.4620689630508423, "step": 27365 }, { "epoch": 0.02756735469224176, "grad_norm": 15.87490201353111, "learning_rate": 2.7566827145821166e-05, "loss": 2.3441, "mean_token_accuracy": 0.41379310488700866, "step": 27370 }, { "epoch": 0.027572390745345933, "grad_norm": 21.193956710503894, "learning_rate": 2.7571863102552225e-05, "loss": 2.8235, "mean_token_accuracy": 0.4068965554237366, "step": 27375 }, { "epoch": 0.027577426798450103, "grad_norm": 18.552782019103464, "learning_rate": 2.7576899059283285e-05, "loss": 2.6063, "mean_token_accuracy": 0.41724137365818026, "step": 27380 }, { "epoch": 0.027582462851554277, "grad_norm": 21.421510709269537, "learning_rate": 2.7581935016014344e-05, "loss": 2.0666, "mean_token_accuracy": 0.4620689690113068, "step": 27385 }, { "epoch": 0.02758749890465845, "grad_norm": 17.44875880129707, "learning_rate": 2.7586970972745403e-05, "loss": 2.8058, "mean_token_accuracy": 0.3655172407627106, "step": 27390 }, { "epoch": 0.02759253495776262, "grad_norm": 20.995873321355464, "learning_rate": 2.759200692947646e-05, "loss": 2.3796, "mean_token_accuracy": 0.3965517163276672, "step": 27395 }, { "epoch": 0.027597571010866795, "grad_norm": 13.63328799253174, "learning_rate": 2.7597042886207525e-05, "loss": 2.7358, "mean_token_accuracy": 0.3379310369491577, "step": 27400 }, { "epoch": 0.02760260706397097, "grad_norm": 16.588727252003245, "learning_rate": 2.7602078842938584e-05, "loss": 2.5769, "mean_token_accuracy": 0.4227465093135834, "step": 27405 }, { "epoch": 0.027607643117075142, "grad_norm": 23.59285830835005, "learning_rate": 2.7607114799669643e-05, "loss": 2.2749, "mean_token_accuracy": 0.46551724672317507, "step": 27410 }, { "epoch": 0.027612679170179313, "grad_norm": 20.440765333056376, "learning_rate": 2.76121507564007e-05, "loss": 2.8228, "mean_token_accuracy": 0.36896550953388213, "step": 27415 }, { "epoch": 0.027617715223283486, "grad_norm": 20.91788641667471, "learning_rate": 2.761718671313176e-05, "loss": 2.9673, "mean_token_accuracy": 0.32413792312145234, "step": 27420 }, { "epoch": 0.02762275127638766, "grad_norm": 16.396285605752766, "learning_rate": 2.7622222669862825e-05, "loss": 1.8723, "mean_token_accuracy": 0.5275862038135528, "step": 27425 }, { "epoch": 0.02762778732949183, "grad_norm": 18.880705172944772, "learning_rate": 2.7627258626593884e-05, "loss": 2.7126, "mean_token_accuracy": 0.41034483909606934, "step": 27430 }, { "epoch": 0.027632823382596004, "grad_norm": 16.92064894459665, "learning_rate": 2.763229458332494e-05, "loss": 3.0504, "mean_token_accuracy": 0.33793103992938994, "step": 27435 }, { "epoch": 0.027637859435700178, "grad_norm": 16.878082120130355, "learning_rate": 2.7637330540056e-05, "loss": 2.6974, "mean_token_accuracy": 0.4034482777118683, "step": 27440 }, { "epoch": 0.027642895488804352, "grad_norm": 18.289697322325747, "learning_rate": 2.7642366496787058e-05, "loss": 2.6675, "mean_token_accuracy": 0.37586206793785093, "step": 27445 }, { "epoch": 0.027647931541908522, "grad_norm": 19.102930256434053, "learning_rate": 2.7647402453518124e-05, "loss": 2.6065, "mean_token_accuracy": 0.41034482717514037, "step": 27450 }, { "epoch": 0.027652967595012696, "grad_norm": 15.220962965503691, "learning_rate": 2.7652438410249183e-05, "loss": 2.3591, "mean_token_accuracy": 0.4068965554237366, "step": 27455 }, { "epoch": 0.02765800364811687, "grad_norm": 17.91015905054953, "learning_rate": 2.765747436698024e-05, "loss": 2.5209, "mean_token_accuracy": 0.41034482717514037, "step": 27460 }, { "epoch": 0.02766303970122104, "grad_norm": 18.248586119445473, "learning_rate": 2.76625103237113e-05, "loss": 2.8705, "mean_token_accuracy": 0.41034482717514037, "step": 27465 }, { "epoch": 0.027668075754325214, "grad_norm": 20.875979916301507, "learning_rate": 2.7667546280442358e-05, "loss": 2.6522, "mean_token_accuracy": 0.3965517163276672, "step": 27470 }, { "epoch": 0.027673111807429387, "grad_norm": 17.518254495629723, "learning_rate": 2.7672582237173424e-05, "loss": 2.618, "mean_token_accuracy": 0.4, "step": 27475 }, { "epoch": 0.02767814786053356, "grad_norm": 20.144061773995343, "learning_rate": 2.767761819390448e-05, "loss": 2.7964, "mean_token_accuracy": 0.39655172228813174, "step": 27480 }, { "epoch": 0.02768318391363773, "grad_norm": 17.486141851878344, "learning_rate": 2.768265415063554e-05, "loss": 2.4017, "mean_token_accuracy": 0.4103448331356049, "step": 27485 }, { "epoch": 0.027688219966741905, "grad_norm": 13.308094106974512, "learning_rate": 2.7687690107366598e-05, "loss": 2.3088, "mean_token_accuracy": 0.4363581359386444, "step": 27490 }, { "epoch": 0.02769325601984608, "grad_norm": 18.10915018593621, "learning_rate": 2.7692726064097657e-05, "loss": 2.3441, "mean_token_accuracy": 0.39310344457626345, "step": 27495 }, { "epoch": 0.02769829207295025, "grad_norm": 16.233617345222303, "learning_rate": 2.7697762020828717e-05, "loss": 3.003, "mean_token_accuracy": 0.38965516686439516, "step": 27500 }, { "epoch": 0.027703328126054423, "grad_norm": 19.210116288880382, "learning_rate": 2.770279797755978e-05, "loss": 2.6668, "mean_token_accuracy": 0.36551723480224607, "step": 27505 }, { "epoch": 0.027708364179158597, "grad_norm": 18.12197878734856, "learning_rate": 2.770783393429084e-05, "loss": 2.4597, "mean_token_accuracy": 0.41724138259887694, "step": 27510 }, { "epoch": 0.02771340023226277, "grad_norm": 13.862625792092306, "learning_rate": 2.7712869891021898e-05, "loss": 2.6043, "mean_token_accuracy": 0.4, "step": 27515 }, { "epoch": 0.02771843628536694, "grad_norm": 18.153046794711525, "learning_rate": 2.7717905847752957e-05, "loss": 2.2161, "mean_token_accuracy": 0.44996975660324096, "step": 27520 }, { "epoch": 0.027723472338471115, "grad_norm": 18.425282122017222, "learning_rate": 2.7722941804484016e-05, "loss": 2.5786, "mean_token_accuracy": 0.3620689630508423, "step": 27525 }, { "epoch": 0.02772850839157529, "grad_norm": 14.436900841502986, "learning_rate": 2.772797776121508e-05, "loss": 2.6201, "mean_token_accuracy": 0.4034482717514038, "step": 27530 }, { "epoch": 0.02773354444467946, "grad_norm": 20.345230198578577, "learning_rate": 2.7733013717946138e-05, "loss": 2.2719, "mean_token_accuracy": 0.4137930929660797, "step": 27535 }, { "epoch": 0.027738580497783633, "grad_norm": 16.619536035295685, "learning_rate": 2.7738049674677198e-05, "loss": 2.8645, "mean_token_accuracy": 0.37586206793785093, "step": 27540 }, { "epoch": 0.027743616550887806, "grad_norm": 17.963895363822328, "learning_rate": 2.7743085631408257e-05, "loss": 2.2363, "mean_token_accuracy": 0.4194192349910736, "step": 27545 }, { "epoch": 0.02774865260399198, "grad_norm": 15.686307235911388, "learning_rate": 2.7748121588139313e-05, "loss": 2.4027, "mean_token_accuracy": 0.4517241418361664, "step": 27550 }, { "epoch": 0.02775368865709615, "grad_norm": 15.522724948962093, "learning_rate": 2.775315754487038e-05, "loss": 2.4636, "mean_token_accuracy": 0.42758620381355283, "step": 27555 }, { "epoch": 0.027758724710200324, "grad_norm": 16.891427149175534, "learning_rate": 2.7758193501601438e-05, "loss": 2.3729, "mean_token_accuracy": 0.44827587008476255, "step": 27560 }, { "epoch": 0.027763760763304498, "grad_norm": 15.43213113576706, "learning_rate": 2.7763229458332497e-05, "loss": 2.3027, "mean_token_accuracy": 0.43968542814254763, "step": 27565 }, { "epoch": 0.027768796816408668, "grad_norm": 20.47605482607052, "learning_rate": 2.7768265415063553e-05, "loss": 2.5564, "mean_token_accuracy": 0.3862069010734558, "step": 27570 }, { "epoch": 0.027773832869512842, "grad_norm": 16.04500676165232, "learning_rate": 2.7773301371794612e-05, "loss": 2.4943, "mean_token_accuracy": 0.41724138259887694, "step": 27575 }, { "epoch": 0.027778868922617016, "grad_norm": 19.208416667036083, "learning_rate": 2.777833732852567e-05, "loss": 2.4975, "mean_token_accuracy": 0.42758620381355283, "step": 27580 }, { "epoch": 0.02778390497572119, "grad_norm": 15.891822868563045, "learning_rate": 2.7783373285256738e-05, "loss": 2.4579, "mean_token_accuracy": 0.4275862157344818, "step": 27585 }, { "epoch": 0.02778894102882536, "grad_norm": 22.853696288490546, "learning_rate": 2.7788409241987793e-05, "loss": 2.7407, "mean_token_accuracy": 0.35862069129943847, "step": 27590 }, { "epoch": 0.027793977081929534, "grad_norm": 15.34662176986484, "learning_rate": 2.7793445198718853e-05, "loss": 2.5677, "mean_token_accuracy": 0.46293104290962217, "step": 27595 }, { "epoch": 0.027799013135033707, "grad_norm": 31.111726459828645, "learning_rate": 2.7798481155449912e-05, "loss": 2.5319, "mean_token_accuracy": 0.4206896543502808, "step": 27600 }, { "epoch": 0.027804049188137878, "grad_norm": 19.55455983453019, "learning_rate": 2.780351711218097e-05, "loss": 2.447, "mean_token_accuracy": 0.4086509346961975, "step": 27605 }, { "epoch": 0.02780908524124205, "grad_norm": 25.970737043944847, "learning_rate": 2.7808553068912034e-05, "loss": 2.8492, "mean_token_accuracy": 0.3551724135875702, "step": 27610 }, { "epoch": 0.027814121294346225, "grad_norm": 15.495540722126387, "learning_rate": 2.7813589025643093e-05, "loss": 2.4784, "mean_token_accuracy": 0.41379311084747317, "step": 27615 }, { "epoch": 0.0278191573474504, "grad_norm": 20.555353907013142, "learning_rate": 2.7818624982374152e-05, "loss": 2.2526, "mean_token_accuracy": 0.4517241418361664, "step": 27620 }, { "epoch": 0.02782419340055457, "grad_norm": 15.137720618809404, "learning_rate": 2.782366093910521e-05, "loss": 2.4995, "mean_token_accuracy": 0.41034482717514037, "step": 27625 }, { "epoch": 0.027829229453658743, "grad_norm": 22.703246825761113, "learning_rate": 2.782869689583627e-05, "loss": 2.618, "mean_token_accuracy": 0.42068964838981626, "step": 27630 }, { "epoch": 0.027834265506762917, "grad_norm": 14.773112132307784, "learning_rate": 2.7833732852567334e-05, "loss": 2.7074, "mean_token_accuracy": 0.36896551847457887, "step": 27635 }, { "epoch": 0.027839301559867087, "grad_norm": 15.426159256590825, "learning_rate": 2.7838768809298393e-05, "loss": 2.4968, "mean_token_accuracy": 0.4862069010734558, "step": 27640 }, { "epoch": 0.02784433761297126, "grad_norm": 25.194763368924228, "learning_rate": 2.7843804766029452e-05, "loss": 2.497, "mean_token_accuracy": 0.4379310369491577, "step": 27645 }, { "epoch": 0.027849373666075435, "grad_norm": 14.246080655706088, "learning_rate": 2.784884072276051e-05, "loss": 2.8493, "mean_token_accuracy": 0.3862068891525269, "step": 27650 }, { "epoch": 0.02785440971917961, "grad_norm": 18.474287584549504, "learning_rate": 2.785387667949157e-05, "loss": 2.4052, "mean_token_accuracy": 0.4, "step": 27655 }, { "epoch": 0.02785944577228378, "grad_norm": 16.71409043233609, "learning_rate": 2.785891263622263e-05, "loss": 2.8911, "mean_token_accuracy": 0.4344827651977539, "step": 27660 }, { "epoch": 0.027864481825387952, "grad_norm": 20.090150364568835, "learning_rate": 2.7863948592953692e-05, "loss": 2.5883, "mean_token_accuracy": 0.41536598801612856, "step": 27665 }, { "epoch": 0.027869517878492126, "grad_norm": 16.882301367990504, "learning_rate": 2.786898454968475e-05, "loss": 2.4642, "mean_token_accuracy": 0.4, "step": 27670 }, { "epoch": 0.027874553931596296, "grad_norm": 15.74960710091616, "learning_rate": 2.787402050641581e-05, "loss": 2.4615, "mean_token_accuracy": 0.3931034505367279, "step": 27675 }, { "epoch": 0.02787958998470047, "grad_norm": 18.67177149607671, "learning_rate": 2.787905646314687e-05, "loss": 2.2864, "mean_token_accuracy": 0.4482758641242981, "step": 27680 }, { "epoch": 0.027884626037804644, "grad_norm": 19.05106213101106, "learning_rate": 2.7884092419877926e-05, "loss": 2.4978, "mean_token_accuracy": 0.3482758641242981, "step": 27685 }, { "epoch": 0.027889662090908818, "grad_norm": 21.507768234466003, "learning_rate": 2.7889128376608992e-05, "loss": 2.6816, "mean_token_accuracy": 0.3827586233615875, "step": 27690 }, { "epoch": 0.027894698144012988, "grad_norm": 15.344116964323065, "learning_rate": 2.789416433334005e-05, "loss": 2.0571, "mean_token_accuracy": 0.4886267364025116, "step": 27695 }, { "epoch": 0.027899734197117162, "grad_norm": 19.620085004026407, "learning_rate": 2.789920029007111e-05, "loss": 2.4958, "mean_token_accuracy": 0.3586206823587418, "step": 27700 }, { "epoch": 0.027904770250221336, "grad_norm": 14.550015380850073, "learning_rate": 2.7904236246802166e-05, "loss": 2.3624, "mean_token_accuracy": 0.4310344815254211, "step": 27705 }, { "epoch": 0.027909806303325506, "grad_norm": 17.46027914323568, "learning_rate": 2.7909272203533226e-05, "loss": 2.2816, "mean_token_accuracy": 0.42758620381355283, "step": 27710 }, { "epoch": 0.02791484235642968, "grad_norm": 20.0772271958674, "learning_rate": 2.7914308160264292e-05, "loss": 2.5303, "mean_token_accuracy": 0.43103447556495667, "step": 27715 }, { "epoch": 0.027919878409533853, "grad_norm": 17.585069722639467, "learning_rate": 2.791934411699535e-05, "loss": 2.5077, "mean_token_accuracy": 0.4034482717514038, "step": 27720 }, { "epoch": 0.027924914462638027, "grad_norm": 23.138571693140538, "learning_rate": 2.7924380073726407e-05, "loss": 2.471, "mean_token_accuracy": 0.42552934885025023, "step": 27725 }, { "epoch": 0.027929950515742197, "grad_norm": 17.546299031505505, "learning_rate": 2.7929416030457466e-05, "loss": 2.2751, "mean_token_accuracy": 0.46551724672317507, "step": 27730 }, { "epoch": 0.02793498656884637, "grad_norm": 15.976702165631282, "learning_rate": 2.7934451987188525e-05, "loss": 2.4993, "mean_token_accuracy": 0.4204476773738861, "step": 27735 }, { "epoch": 0.027940022621950545, "grad_norm": 18.01271129798969, "learning_rate": 2.7939487943919585e-05, "loss": 2.5559, "mean_token_accuracy": 0.3793103456497192, "step": 27740 }, { "epoch": 0.027945058675054715, "grad_norm": 14.834110003270903, "learning_rate": 2.7944523900650647e-05, "loss": 2.5397, "mean_token_accuracy": 0.44827587008476255, "step": 27745 }, { "epoch": 0.02795009472815889, "grad_norm": 17.94608178767141, "learning_rate": 2.7949559857381706e-05, "loss": 2.2028, "mean_token_accuracy": 0.47241378426551817, "step": 27750 }, { "epoch": 0.027955130781263063, "grad_norm": 18.804822890663335, "learning_rate": 2.7954595814112766e-05, "loss": 2.4007, "mean_token_accuracy": 0.4310344815254211, "step": 27755 }, { "epoch": 0.027960166834367237, "grad_norm": 16.39515656859393, "learning_rate": 2.7959631770843825e-05, "loss": 2.5348, "mean_token_accuracy": 0.3896551728248596, "step": 27760 }, { "epoch": 0.027965202887471407, "grad_norm": 28.065565299964494, "learning_rate": 2.7964667727574884e-05, "loss": 2.968, "mean_token_accuracy": 0.3655172407627106, "step": 27765 }, { "epoch": 0.02797023894057558, "grad_norm": 17.65372005465926, "learning_rate": 2.7969703684305947e-05, "loss": 2.7629, "mean_token_accuracy": 0.36896551847457887, "step": 27770 }, { "epoch": 0.027975274993679754, "grad_norm": 21.72268211220837, "learning_rate": 2.7974739641037006e-05, "loss": 2.2851, "mean_token_accuracy": 0.41379310488700866, "step": 27775 }, { "epoch": 0.027980311046783925, "grad_norm": 19.954780967081327, "learning_rate": 2.7979775597768065e-05, "loss": 2.4921, "mean_token_accuracy": 0.41724138259887694, "step": 27780 }, { "epoch": 0.0279853470998881, "grad_norm": 14.575487488474167, "learning_rate": 2.7984811554499125e-05, "loss": 2.3811, "mean_token_accuracy": 0.4034482777118683, "step": 27785 }, { "epoch": 0.027990383152992272, "grad_norm": 18.321923707770228, "learning_rate": 2.7989847511230184e-05, "loss": 2.6048, "mean_token_accuracy": 0.39310344457626345, "step": 27790 }, { "epoch": 0.027995419206096446, "grad_norm": 17.302357428247525, "learning_rate": 2.7994883467961247e-05, "loss": 2.226, "mean_token_accuracy": 0.4052631616592407, "step": 27795 }, { "epoch": 0.028000455259200616, "grad_norm": 16.25255576134554, "learning_rate": 2.7999919424692306e-05, "loss": 2.5809, "mean_token_accuracy": 0.4068965494632721, "step": 27800 }, { "epoch": 0.02800549131230479, "grad_norm": 19.39570295181595, "learning_rate": 2.8004955381423365e-05, "loss": 2.6778, "mean_token_accuracy": 0.3827586233615875, "step": 27805 }, { "epoch": 0.028010527365408964, "grad_norm": 16.37041238981368, "learning_rate": 2.8009991338154424e-05, "loss": 2.4955, "mean_token_accuracy": 0.4034482777118683, "step": 27810 }, { "epoch": 0.028015563418513134, "grad_norm": 89.30411450683553, "learning_rate": 2.8015027294885484e-05, "loss": 2.5292, "mean_token_accuracy": 0.43177340626716615, "step": 27815 }, { "epoch": 0.028020599471617308, "grad_norm": 21.527221565624284, "learning_rate": 2.8020063251616546e-05, "loss": 3.0379, "mean_token_accuracy": 0.3620689630508423, "step": 27820 }, { "epoch": 0.02802563552472148, "grad_norm": 12.733251531637329, "learning_rate": 2.8025099208347605e-05, "loss": 2.4715, "mean_token_accuracy": 0.4333938300609589, "step": 27825 }, { "epoch": 0.028030671577825655, "grad_norm": 16.430781808910176, "learning_rate": 2.8030135165078665e-05, "loss": 2.1636, "mean_token_accuracy": 0.4310344815254211, "step": 27830 }, { "epoch": 0.028035707630929826, "grad_norm": 15.884619870136369, "learning_rate": 2.8035171121809724e-05, "loss": 2.6577, "mean_token_accuracy": 0.39310344457626345, "step": 27835 }, { "epoch": 0.028040743684034, "grad_norm": 18.34668658156639, "learning_rate": 2.804020707854078e-05, "loss": 2.206, "mean_token_accuracy": 0.48511797189712524, "step": 27840 }, { "epoch": 0.028045779737138173, "grad_norm": 18.610599551933596, "learning_rate": 2.804524303527184e-05, "loss": 2.3253, "mean_token_accuracy": 0.4482758641242981, "step": 27845 }, { "epoch": 0.028050815790242344, "grad_norm": 16.743486299083173, "learning_rate": 2.8050278992002905e-05, "loss": 2.5761, "mean_token_accuracy": 0.4103448331356049, "step": 27850 }, { "epoch": 0.028055851843346517, "grad_norm": 18.150593909167224, "learning_rate": 2.8055314948733964e-05, "loss": 2.42, "mean_token_accuracy": 0.4448275864124298, "step": 27855 }, { "epoch": 0.02806088789645069, "grad_norm": 14.519229200680817, "learning_rate": 2.806035090546502e-05, "loss": 2.1876, "mean_token_accuracy": 0.47428917288780215, "step": 27860 }, { "epoch": 0.028065923949554865, "grad_norm": 19.056812244405776, "learning_rate": 2.806538686219608e-05, "loss": 2.1612, "mean_token_accuracy": 0.4379310369491577, "step": 27865 }, { "epoch": 0.028070960002659035, "grad_norm": 24.010320458535293, "learning_rate": 2.807042281892714e-05, "loss": 2.2851, "mean_token_accuracy": 0.3793103516101837, "step": 27870 }, { "epoch": 0.02807599605576321, "grad_norm": 21.060408677319874, "learning_rate": 2.8075458775658205e-05, "loss": 2.6736, "mean_token_accuracy": 0.3655172407627106, "step": 27875 }, { "epoch": 0.028081032108867383, "grad_norm": 22.289764026945186, "learning_rate": 2.808049473238926e-05, "loss": 1.9696, "mean_token_accuracy": 0.4551724255084991, "step": 27880 }, { "epoch": 0.028086068161971553, "grad_norm": 19.05755364109824, "learning_rate": 2.808553068912032e-05, "loss": 2.6949, "mean_token_accuracy": 0.4068965494632721, "step": 27885 }, { "epoch": 0.028091104215075727, "grad_norm": 15.319318103975831, "learning_rate": 2.809056664585138e-05, "loss": 2.8223, "mean_token_accuracy": 0.3551724135875702, "step": 27890 }, { "epoch": 0.0280961402681799, "grad_norm": 15.738098348745073, "learning_rate": 2.809560260258244e-05, "loss": 2.4248, "mean_token_accuracy": 0.41379310488700866, "step": 27895 }, { "epoch": 0.028101176321284074, "grad_norm": 15.332045566305235, "learning_rate": 2.81006385593135e-05, "loss": 2.7991, "mean_token_accuracy": 0.3793103456497192, "step": 27900 }, { "epoch": 0.028106212374388245, "grad_norm": 21.789178010342635, "learning_rate": 2.810567451604456e-05, "loss": 2.3271, "mean_token_accuracy": 0.43448275327682495, "step": 27905 }, { "epoch": 0.02811124842749242, "grad_norm": 19.778918854279127, "learning_rate": 2.811071047277562e-05, "loss": 2.5712, "mean_token_accuracy": 0.4103448331356049, "step": 27910 }, { "epoch": 0.028116284480596592, "grad_norm": 20.28958993987868, "learning_rate": 2.811574642950668e-05, "loss": 2.7128, "mean_token_accuracy": 0.3448275804519653, "step": 27915 }, { "epoch": 0.028121320533700762, "grad_norm": 18.103177879446424, "learning_rate": 2.8120782386237738e-05, "loss": 2.4406, "mean_token_accuracy": 0.4310344815254211, "step": 27920 }, { "epoch": 0.028126356586804936, "grad_norm": 14.469902032746289, "learning_rate": 2.8125818342968797e-05, "loss": 2.5093, "mean_token_accuracy": 0.3965517163276672, "step": 27925 }, { "epoch": 0.02813139263990911, "grad_norm": 20.27840409738902, "learning_rate": 2.813085429969986e-05, "loss": 2.418, "mean_token_accuracy": 0.41379310488700866, "step": 27930 }, { "epoch": 0.028136428693013284, "grad_norm": 15.40105445725144, "learning_rate": 2.813589025643092e-05, "loss": 2.2511, "mean_token_accuracy": 0.3999999940395355, "step": 27935 }, { "epoch": 0.028141464746117454, "grad_norm": 18.36958005639998, "learning_rate": 2.814092621316198e-05, "loss": 2.5261, "mean_token_accuracy": 0.4275861978530884, "step": 27940 }, { "epoch": 0.028146500799221628, "grad_norm": 12.681719289106361, "learning_rate": 2.8145962169893038e-05, "loss": 2.5071, "mean_token_accuracy": 0.3862069010734558, "step": 27945 }, { "epoch": 0.0281515368523258, "grad_norm": 14.039456793214061, "learning_rate": 2.8150998126624094e-05, "loss": 2.4002, "mean_token_accuracy": 0.45862069725990295, "step": 27950 }, { "epoch": 0.028156572905429972, "grad_norm": 28.882402929753226, "learning_rate": 2.815603408335516e-05, "loss": 2.4402, "mean_token_accuracy": 0.46551724672317507, "step": 27955 }, { "epoch": 0.028161608958534146, "grad_norm": 18.353488520838976, "learning_rate": 2.816107004008622e-05, "loss": 2.3858, "mean_token_accuracy": 0.44827585816383364, "step": 27960 }, { "epoch": 0.02816664501163832, "grad_norm": 16.682382985152902, "learning_rate": 2.8166105996817278e-05, "loss": 2.2709, "mean_token_accuracy": 0.44482759237289426, "step": 27965 }, { "epoch": 0.028171681064742493, "grad_norm": 15.009710129735078, "learning_rate": 2.8171141953548334e-05, "loss": 2.4922, "mean_token_accuracy": 0.35862069129943847, "step": 27970 }, { "epoch": 0.028176717117846663, "grad_norm": 17.95951643713577, "learning_rate": 2.8176177910279393e-05, "loss": 2.4578, "mean_token_accuracy": 0.3919540226459503, "step": 27975 }, { "epoch": 0.028181753170950837, "grad_norm": 18.327300901465197, "learning_rate": 2.818121386701046e-05, "loss": 2.5142, "mean_token_accuracy": 0.42758620977401735, "step": 27980 }, { "epoch": 0.02818678922405501, "grad_norm": 22.54786310399626, "learning_rate": 2.818624982374152e-05, "loss": 2.2914, "mean_token_accuracy": 0.42413792610168455, "step": 27985 }, { "epoch": 0.02819182527715918, "grad_norm": 19.95293805187181, "learning_rate": 2.8191285780472578e-05, "loss": 2.6907, "mean_token_accuracy": 0.36660616993904116, "step": 27990 }, { "epoch": 0.028196861330263355, "grad_norm": 19.0165389938536, "learning_rate": 2.8196321737203634e-05, "loss": 2.5573, "mean_token_accuracy": 0.36551724672317504, "step": 27995 }, { "epoch": 0.02820189738336753, "grad_norm": 15.219067319872153, "learning_rate": 2.8201357693934693e-05, "loss": 2.422, "mean_token_accuracy": 0.42928009033203124, "step": 28000 }, { "epoch": 0.028206933436471703, "grad_norm": 17.632750790599026, "learning_rate": 2.8206393650665752e-05, "loss": 2.4535, "mean_token_accuracy": 0.42758620381355283, "step": 28005 }, { "epoch": 0.028211969489575873, "grad_norm": 17.614657994248734, "learning_rate": 2.8211429607396818e-05, "loss": 2.6753, "mean_token_accuracy": 0.3965517282485962, "step": 28010 }, { "epoch": 0.028217005542680047, "grad_norm": 16.826485508318573, "learning_rate": 2.8216465564127874e-05, "loss": 2.7549, "mean_token_accuracy": 0.41379311084747317, "step": 28015 }, { "epoch": 0.02822204159578422, "grad_norm": 13.056252781451715, "learning_rate": 2.8221501520858933e-05, "loss": 2.3114, "mean_token_accuracy": 0.43103448748588563, "step": 28020 }, { "epoch": 0.02822707764888839, "grad_norm": 55.24421861421408, "learning_rate": 2.8226537477589993e-05, "loss": 2.7715, "mean_token_accuracy": 0.37241379618644715, "step": 28025 }, { "epoch": 0.028232113701992564, "grad_norm": 17.498126633231717, "learning_rate": 2.8231573434321052e-05, "loss": 2.5553, "mean_token_accuracy": 0.441379314661026, "step": 28030 }, { "epoch": 0.028237149755096738, "grad_norm": 15.421492339228518, "learning_rate": 2.8236609391052114e-05, "loss": 2.4536, "mean_token_accuracy": 0.4034482717514038, "step": 28035 }, { "epoch": 0.028242185808200912, "grad_norm": 19.344334330590048, "learning_rate": 2.8241645347783174e-05, "loss": 2.7456, "mean_token_accuracy": 0.36551724672317504, "step": 28040 }, { "epoch": 0.028247221861305082, "grad_norm": 19.202701143764326, "learning_rate": 2.8246681304514233e-05, "loss": 2.7319, "mean_token_accuracy": 0.3862069010734558, "step": 28045 }, { "epoch": 0.028252257914409256, "grad_norm": 17.71799314885439, "learning_rate": 2.8251717261245292e-05, "loss": 2.6246, "mean_token_accuracy": 0.3551724076271057, "step": 28050 }, { "epoch": 0.02825729396751343, "grad_norm": 23.302681906181128, "learning_rate": 2.825675321797635e-05, "loss": 2.4886, "mean_token_accuracy": 0.48275861144065857, "step": 28055 }, { "epoch": 0.0282623300206176, "grad_norm": 16.5962225829614, "learning_rate": 2.8261789174707414e-05, "loss": 2.2526, "mean_token_accuracy": 0.43974592089653014, "step": 28060 }, { "epoch": 0.028267366073721774, "grad_norm": 17.581021132250964, "learning_rate": 2.8266825131438473e-05, "loss": 2.3007, "mean_token_accuracy": 0.42758620381355283, "step": 28065 }, { "epoch": 0.028272402126825948, "grad_norm": 14.75800334350753, "learning_rate": 2.8271861088169533e-05, "loss": 2.4331, "mean_token_accuracy": 0.3999999940395355, "step": 28070 }, { "epoch": 0.02827743817993012, "grad_norm": 17.60937388031897, "learning_rate": 2.8276897044900592e-05, "loss": 2.313, "mean_token_accuracy": 0.4344827473163605, "step": 28075 }, { "epoch": 0.02828247423303429, "grad_norm": 16.604390218683527, "learning_rate": 2.828193300163165e-05, "loss": 2.7307, "mean_token_accuracy": 0.33448275923728943, "step": 28080 }, { "epoch": 0.028287510286138465, "grad_norm": 17.94464888724479, "learning_rate": 2.8286968958362707e-05, "loss": 2.8678, "mean_token_accuracy": 0.34137930274009703, "step": 28085 }, { "epoch": 0.02829254633924264, "grad_norm": 19.651100900398013, "learning_rate": 2.8292004915093773e-05, "loss": 2.2848, "mean_token_accuracy": 0.44137930274009707, "step": 28090 }, { "epoch": 0.02829758239234681, "grad_norm": 15.295795302792653, "learning_rate": 2.8297040871824832e-05, "loss": 2.4788, "mean_token_accuracy": 0.44137930274009707, "step": 28095 }, { "epoch": 0.028302618445450983, "grad_norm": 20.682741585919363, "learning_rate": 2.830207682855589e-05, "loss": 2.7506, "mean_token_accuracy": 0.3655172437429428, "step": 28100 }, { "epoch": 0.028307654498555157, "grad_norm": 16.04253339594385, "learning_rate": 2.8307112785286947e-05, "loss": 2.3808, "mean_token_accuracy": 0.4344827592372894, "step": 28105 }, { "epoch": 0.02831269055165933, "grad_norm": 23.49564760061073, "learning_rate": 2.8312148742018007e-05, "loss": 2.7526, "mean_token_accuracy": 0.3586206823587418, "step": 28110 }, { "epoch": 0.0283177266047635, "grad_norm": 16.536225308379713, "learning_rate": 2.8317184698749073e-05, "loss": 2.2987, "mean_token_accuracy": 0.4379310369491577, "step": 28115 }, { "epoch": 0.028322762657867675, "grad_norm": 18.153213257442527, "learning_rate": 2.8322220655480132e-05, "loss": 2.512, "mean_token_accuracy": 0.39310344457626345, "step": 28120 }, { "epoch": 0.02832779871097185, "grad_norm": 16.60307269598842, "learning_rate": 2.8327256612211188e-05, "loss": 2.6854, "mean_token_accuracy": 0.358620685338974, "step": 28125 }, { "epoch": 0.02833283476407602, "grad_norm": 32.37664369579404, "learning_rate": 2.8332292568942247e-05, "loss": 2.5425, "mean_token_accuracy": 0.4, "step": 28130 }, { "epoch": 0.028337870817180193, "grad_norm": 17.726107727848664, "learning_rate": 2.8337328525673306e-05, "loss": 2.7216, "mean_token_accuracy": 0.37931033968925476, "step": 28135 }, { "epoch": 0.028342906870284366, "grad_norm": 18.92129243591575, "learning_rate": 2.8342364482404372e-05, "loss": 2.9826, "mean_token_accuracy": 0.36896551847457887, "step": 28140 }, { "epoch": 0.02834794292338854, "grad_norm": 21.34004038817204, "learning_rate": 2.8347400439135428e-05, "loss": 2.425, "mean_token_accuracy": 0.375862056016922, "step": 28145 }, { "epoch": 0.02835297897649271, "grad_norm": 73.87280373537293, "learning_rate": 2.8352436395866487e-05, "loss": 2.5554, "mean_token_accuracy": 0.41724138259887694, "step": 28150 }, { "epoch": 0.028358015029596884, "grad_norm": 22.390770447538266, "learning_rate": 2.8357472352597547e-05, "loss": 2.6512, "mean_token_accuracy": 0.43599515557289126, "step": 28155 }, { "epoch": 0.028363051082701058, "grad_norm": 33.1309942449407, "learning_rate": 2.8362508309328606e-05, "loss": 2.4506, "mean_token_accuracy": 0.39655172228813174, "step": 28160 }, { "epoch": 0.02836808713580523, "grad_norm": 22.962184957668015, "learning_rate": 2.8367544266059665e-05, "loss": 2.7354, "mean_token_accuracy": 0.38275861740112305, "step": 28165 }, { "epoch": 0.028373123188909402, "grad_norm": 19.250004778554494, "learning_rate": 2.8372580222790728e-05, "loss": 2.5642, "mean_token_accuracy": 0.4531760334968567, "step": 28170 }, { "epoch": 0.028378159242013576, "grad_norm": 15.826561314830004, "learning_rate": 2.8377616179521787e-05, "loss": 2.2403, "mean_token_accuracy": 0.46896551847457885, "step": 28175 }, { "epoch": 0.02838319529511775, "grad_norm": 21.04112169100893, "learning_rate": 2.8382652136252846e-05, "loss": 2.7231, "mean_token_accuracy": 0.4052631616592407, "step": 28180 }, { "epoch": 0.02838823134822192, "grad_norm": 25.12171502325933, "learning_rate": 2.8387688092983906e-05, "loss": 2.8073, "mean_token_accuracy": 0.3638838529586792, "step": 28185 }, { "epoch": 0.028393267401326094, "grad_norm": 18.11717143269485, "learning_rate": 2.8392724049714965e-05, "loss": 2.6618, "mean_token_accuracy": 0.42068966329097746, "step": 28190 }, { "epoch": 0.028398303454430267, "grad_norm": 16.415557152505404, "learning_rate": 2.8397760006446027e-05, "loss": 2.498, "mean_token_accuracy": 0.4190562665462494, "step": 28195 }, { "epoch": 0.028403339507534438, "grad_norm": 19.576931906091925, "learning_rate": 2.8402795963177087e-05, "loss": 2.4795, "mean_token_accuracy": 0.4310344696044922, "step": 28200 }, { "epoch": 0.02840837556063861, "grad_norm": 17.940512496304905, "learning_rate": 2.8407831919908146e-05, "loss": 2.4904, "mean_token_accuracy": 0.38620689511299133, "step": 28205 }, { "epoch": 0.028413411613742785, "grad_norm": 17.566175018559427, "learning_rate": 2.8412867876639205e-05, "loss": 2.1915, "mean_token_accuracy": 0.413793095946312, "step": 28210 }, { "epoch": 0.02841844766684696, "grad_norm": 21.345729586068174, "learning_rate": 2.8417903833370264e-05, "loss": 2.8239, "mean_token_accuracy": 0.3896551728248596, "step": 28215 }, { "epoch": 0.02842348371995113, "grad_norm": 19.826280017647704, "learning_rate": 2.8422939790101327e-05, "loss": 2.41, "mean_token_accuracy": 0.44137930274009707, "step": 28220 }, { "epoch": 0.028428519773055303, "grad_norm": 14.174347006435658, "learning_rate": 2.8427975746832386e-05, "loss": 2.6227, "mean_token_accuracy": 0.4620689630508423, "step": 28225 }, { "epoch": 0.028433555826159477, "grad_norm": 19.85448790785396, "learning_rate": 2.8433011703563446e-05, "loss": 2.6685, "mean_token_accuracy": 0.37241379618644715, "step": 28230 }, { "epoch": 0.028438591879263647, "grad_norm": 23.46289325800877, "learning_rate": 2.8438047660294505e-05, "loss": 2.5836, "mean_token_accuracy": 0.4009679317474365, "step": 28235 }, { "epoch": 0.02844362793236782, "grad_norm": 16.521132576753825, "learning_rate": 2.844308361702556e-05, "loss": 2.5734, "mean_token_accuracy": 0.44482759237289426, "step": 28240 }, { "epoch": 0.028448663985471995, "grad_norm": 14.793356163608912, "learning_rate": 2.8448119573756627e-05, "loss": 2.5237, "mean_token_accuracy": 0.4034482777118683, "step": 28245 }, { "epoch": 0.02845370003857617, "grad_norm": 16.02951691183068, "learning_rate": 2.8453155530487686e-05, "loss": 2.6474, "mean_token_accuracy": 0.39655172228813174, "step": 28250 }, { "epoch": 0.02845873609168034, "grad_norm": 19.58246946050734, "learning_rate": 2.8458191487218745e-05, "loss": 2.8468, "mean_token_accuracy": 0.3620689630508423, "step": 28255 }, { "epoch": 0.028463772144784513, "grad_norm": 31.83850974740709, "learning_rate": 2.84632274439498e-05, "loss": 2.6814, "mean_token_accuracy": 0.39310345649719236, "step": 28260 }, { "epoch": 0.028468808197888686, "grad_norm": 17.952002282014504, "learning_rate": 2.846826340068086e-05, "loss": 2.7324, "mean_token_accuracy": 0.3758620619773865, "step": 28265 }, { "epoch": 0.028473844250992857, "grad_norm": 20.80711445114084, "learning_rate": 2.847329935741192e-05, "loss": 2.6828, "mean_token_accuracy": 0.34137930870056155, "step": 28270 }, { "epoch": 0.02847888030409703, "grad_norm": 15.608876145680531, "learning_rate": 2.8478335314142986e-05, "loss": 2.5087, "mean_token_accuracy": 0.42413793206214906, "step": 28275 }, { "epoch": 0.028483916357201204, "grad_norm": 22.808256318855943, "learning_rate": 2.848337127087404e-05, "loss": 2.3822, "mean_token_accuracy": 0.4068965554237366, "step": 28280 }, { "epoch": 0.028488952410305378, "grad_norm": 20.00187164588378, "learning_rate": 2.84884072276051e-05, "loss": 2.253, "mean_token_accuracy": 0.48481547832489014, "step": 28285 }, { "epoch": 0.028493988463409548, "grad_norm": 18.70652730109442, "learning_rate": 2.849344318433616e-05, "loss": 2.7495, "mean_token_accuracy": 0.38965516686439516, "step": 28290 }, { "epoch": 0.028499024516513722, "grad_norm": 14.957659782138862, "learning_rate": 2.849847914106722e-05, "loss": 2.7427, "mean_token_accuracy": 0.39122806787490844, "step": 28295 }, { "epoch": 0.028504060569617896, "grad_norm": 17.513004807296323, "learning_rate": 2.8503515097798282e-05, "loss": 2.5535, "mean_token_accuracy": 0.4034482717514038, "step": 28300 }, { "epoch": 0.028509096622722066, "grad_norm": 14.519661865270336, "learning_rate": 2.850855105452934e-05, "loss": 2.4891, "mean_token_accuracy": 0.43448275327682495, "step": 28305 }, { "epoch": 0.02851413267582624, "grad_norm": 17.13897125372733, "learning_rate": 2.85135870112604e-05, "loss": 2.8747, "mean_token_accuracy": 0.3724137842655182, "step": 28310 }, { "epoch": 0.028519168728930414, "grad_norm": 15.334421870405613, "learning_rate": 2.851862296799146e-05, "loss": 2.4958, "mean_token_accuracy": 0.3724137932062149, "step": 28315 }, { "epoch": 0.028524204782034587, "grad_norm": 20.10438363764687, "learning_rate": 2.852365892472252e-05, "loss": 2.4017, "mean_token_accuracy": 0.42758620977401735, "step": 28320 }, { "epoch": 0.028529240835138758, "grad_norm": 17.371840478624957, "learning_rate": 2.852869488145358e-05, "loss": 2.326, "mean_token_accuracy": 0.4137930989265442, "step": 28325 }, { "epoch": 0.02853427688824293, "grad_norm": 15.87289770440186, "learning_rate": 2.853373083818464e-05, "loss": 2.2709, "mean_token_accuracy": 0.46896551847457885, "step": 28330 }, { "epoch": 0.028539312941347105, "grad_norm": 17.704485349921804, "learning_rate": 2.85387667949157e-05, "loss": 2.6764, "mean_token_accuracy": 0.4034482717514038, "step": 28335 }, { "epoch": 0.028544348994451275, "grad_norm": 15.67704496678658, "learning_rate": 2.854380275164676e-05, "loss": 2.5248, "mean_token_accuracy": 0.3862068891525269, "step": 28340 }, { "epoch": 0.02854938504755545, "grad_norm": 18.674184103464494, "learning_rate": 2.854883870837782e-05, "loss": 2.7058, "mean_token_accuracy": 0.38777979016304015, "step": 28345 }, { "epoch": 0.028554421100659623, "grad_norm": 21.621585331533026, "learning_rate": 2.8553874665108878e-05, "loss": 2.6666, "mean_token_accuracy": 0.3793103456497192, "step": 28350 }, { "epoch": 0.028559457153763797, "grad_norm": 17.56552274378061, "learning_rate": 2.855891062183994e-05, "loss": 2.5288, "mean_token_accuracy": 0.3827586233615875, "step": 28355 }, { "epoch": 0.028564493206867967, "grad_norm": 20.934192337728412, "learning_rate": 2.8563946578571e-05, "loss": 2.3287, "mean_token_accuracy": 0.4655172288417816, "step": 28360 }, { "epoch": 0.02856952925997214, "grad_norm": 16.736453287038298, "learning_rate": 2.856898253530206e-05, "loss": 2.5984, "mean_token_accuracy": 0.4034482717514038, "step": 28365 }, { "epoch": 0.028574565313076315, "grad_norm": 16.660382262154176, "learning_rate": 2.8574018492033118e-05, "loss": 2.5409, "mean_token_accuracy": 0.43103447556495667, "step": 28370 }, { "epoch": 0.028579601366180485, "grad_norm": 17.354954695654357, "learning_rate": 2.8579054448764174e-05, "loss": 2.2757, "mean_token_accuracy": 0.4676950931549072, "step": 28375 }, { "epoch": 0.02858463741928466, "grad_norm": 18.932211981169004, "learning_rate": 2.858409040549524e-05, "loss": 2.4844, "mean_token_accuracy": 0.4137930989265442, "step": 28380 }, { "epoch": 0.028589673472388832, "grad_norm": 18.1070588675393, "learning_rate": 2.85891263622263e-05, "loss": 2.2866, "mean_token_accuracy": 0.4068965494632721, "step": 28385 }, { "epoch": 0.028594709525493006, "grad_norm": 16.001697254622623, "learning_rate": 2.859416231895736e-05, "loss": 2.6019, "mean_token_accuracy": 0.3793103456497192, "step": 28390 }, { "epoch": 0.028599745578597176, "grad_norm": 19.35876023913447, "learning_rate": 2.8599198275688414e-05, "loss": 2.3085, "mean_token_accuracy": 0.4344827592372894, "step": 28395 }, { "epoch": 0.02860478163170135, "grad_norm": 13.972991550530294, "learning_rate": 2.8604234232419474e-05, "loss": 2.5052, "mean_token_accuracy": 0.42413793206214906, "step": 28400 }, { "epoch": 0.028609817684805524, "grad_norm": 16.29793093475021, "learning_rate": 2.860927018915054e-05, "loss": 2.6207, "mean_token_accuracy": 0.3620689630508423, "step": 28405 }, { "epoch": 0.028614853737909694, "grad_norm": 15.134446247525407, "learning_rate": 2.86143061458816e-05, "loss": 2.5534, "mean_token_accuracy": 0.3551724135875702, "step": 28410 }, { "epoch": 0.028619889791013868, "grad_norm": 19.129487196087364, "learning_rate": 2.8619342102612655e-05, "loss": 2.3528, "mean_token_accuracy": 0.41584996581077577, "step": 28415 }, { "epoch": 0.028624925844118042, "grad_norm": 17.069704069467814, "learning_rate": 2.8624378059343714e-05, "loss": 2.39, "mean_token_accuracy": 0.4413793087005615, "step": 28420 }, { "epoch": 0.028629961897222212, "grad_norm": 14.42712897580744, "learning_rate": 2.8629414016074773e-05, "loss": 2.4717, "mean_token_accuracy": 0.3827586203813553, "step": 28425 }, { "epoch": 0.028634997950326386, "grad_norm": 14.38898296988094, "learning_rate": 2.8634449972805833e-05, "loss": 2.4298, "mean_token_accuracy": 0.42758620381355283, "step": 28430 }, { "epoch": 0.02864003400343056, "grad_norm": 19.914229506996257, "learning_rate": 2.8639485929536895e-05, "loss": 2.5071, "mean_token_accuracy": 0.4189957737922668, "step": 28435 }, { "epoch": 0.028645070056534733, "grad_norm": 22.120000341294624, "learning_rate": 2.8644521886267955e-05, "loss": 2.7276, "mean_token_accuracy": 0.37241379022598264, "step": 28440 }, { "epoch": 0.028650106109638904, "grad_norm": 15.6329795924144, "learning_rate": 2.8649557842999014e-05, "loss": 2.655, "mean_token_accuracy": 0.3827586203813553, "step": 28445 }, { "epoch": 0.028655142162743077, "grad_norm": 19.58222722848476, "learning_rate": 2.8654593799730073e-05, "loss": 2.6931, "mean_token_accuracy": 0.37586206793785093, "step": 28450 }, { "epoch": 0.02866017821584725, "grad_norm": 17.56165568743544, "learning_rate": 2.8659629756461132e-05, "loss": 2.5882, "mean_token_accuracy": 0.3793103516101837, "step": 28455 }, { "epoch": 0.02866521426895142, "grad_norm": 166.10776884085166, "learning_rate": 2.8664665713192195e-05, "loss": 2.3056, "mean_token_accuracy": 0.44482758045196535, "step": 28460 }, { "epoch": 0.028670250322055595, "grad_norm": 17.200862684573128, "learning_rate": 2.8669701669923254e-05, "loss": 2.3147, "mean_token_accuracy": 0.4034482777118683, "step": 28465 }, { "epoch": 0.02867528637515977, "grad_norm": 21.81120326717603, "learning_rate": 2.8674737626654313e-05, "loss": 2.724, "mean_token_accuracy": 0.4241379380226135, "step": 28470 }, { "epoch": 0.028680322428263943, "grad_norm": 15.79321637757581, "learning_rate": 2.8679773583385373e-05, "loss": 2.4602, "mean_token_accuracy": 0.3999999940395355, "step": 28475 }, { "epoch": 0.028685358481368113, "grad_norm": 15.595180280947343, "learning_rate": 2.8684809540116432e-05, "loss": 2.2768, "mean_token_accuracy": 0.39310343861579894, "step": 28480 }, { "epoch": 0.028690394534472287, "grad_norm": 15.710697209575326, "learning_rate": 2.8689845496847495e-05, "loss": 2.37, "mean_token_accuracy": 0.4640653431415558, "step": 28485 }, { "epoch": 0.02869543058757646, "grad_norm": 17.296339684920692, "learning_rate": 2.8694881453578554e-05, "loss": 2.7449, "mean_token_accuracy": 0.4206896543502808, "step": 28490 }, { "epoch": 0.02870046664068063, "grad_norm": 21.010737274898965, "learning_rate": 2.8699917410309613e-05, "loss": 2.6273, "mean_token_accuracy": 0.3950393170118332, "step": 28495 }, { "epoch": 0.028705502693784805, "grad_norm": 15.902920062875422, "learning_rate": 2.8704953367040672e-05, "loss": 2.4669, "mean_token_accuracy": 0.4172413766384125, "step": 28500 }, { "epoch": 0.02871053874688898, "grad_norm": 17.076341940325076, "learning_rate": 2.8709989323771728e-05, "loss": 2.6852, "mean_token_accuracy": 0.3862069010734558, "step": 28505 }, { "epoch": 0.028715574799993152, "grad_norm": 16.98709935218074, "learning_rate": 2.8715025280502787e-05, "loss": 2.3382, "mean_token_accuracy": 0.4310344815254211, "step": 28510 }, { "epoch": 0.028720610853097323, "grad_norm": 17.330316083954713, "learning_rate": 2.8720061237233854e-05, "loss": 2.3106, "mean_token_accuracy": 0.39655172228813174, "step": 28515 }, { "epoch": 0.028725646906201496, "grad_norm": 25.490686676913427, "learning_rate": 2.8725097193964913e-05, "loss": 2.805, "mean_token_accuracy": 0.37241379022598264, "step": 28520 }, { "epoch": 0.02873068295930567, "grad_norm": 18.231296155299, "learning_rate": 2.8730133150695972e-05, "loss": 2.4691, "mean_token_accuracy": 0.4360556542873383, "step": 28525 }, { "epoch": 0.02873571901240984, "grad_norm": 16.587977870382247, "learning_rate": 2.8735169107427028e-05, "loss": 2.5418, "mean_token_accuracy": 0.4413793087005615, "step": 28530 }, { "epoch": 0.028740755065514014, "grad_norm": 14.216201101577282, "learning_rate": 2.8740205064158087e-05, "loss": 2.6134, "mean_token_accuracy": 0.41724138259887694, "step": 28535 }, { "epoch": 0.028745791118618188, "grad_norm": 17.768584360627216, "learning_rate": 2.8745241020889153e-05, "loss": 2.675, "mean_token_accuracy": 0.4172413766384125, "step": 28540 }, { "epoch": 0.02875082717172236, "grad_norm": 16.925126993391544, "learning_rate": 2.8750276977620212e-05, "loss": 2.5142, "mean_token_accuracy": 0.44482758045196535, "step": 28545 }, { "epoch": 0.028755863224826532, "grad_norm": 18.095037049482617, "learning_rate": 2.8755312934351268e-05, "loss": 2.5346, "mean_token_accuracy": 0.3774349570274353, "step": 28550 }, { "epoch": 0.028760899277930706, "grad_norm": 15.974696553493052, "learning_rate": 2.8760348891082328e-05, "loss": 2.0573, "mean_token_accuracy": 0.47773745059967043, "step": 28555 }, { "epoch": 0.02876593533103488, "grad_norm": 17.24971840188284, "learning_rate": 2.8765384847813387e-05, "loss": 2.677, "mean_token_accuracy": 0.4172413766384125, "step": 28560 }, { "epoch": 0.02877097138413905, "grad_norm": 13.377900067735093, "learning_rate": 2.8770420804544453e-05, "loss": 2.2563, "mean_token_accuracy": 0.43793103098869324, "step": 28565 }, { "epoch": 0.028776007437243224, "grad_norm": 14.791403159375339, "learning_rate": 2.877545676127551e-05, "loss": 2.2453, "mean_token_accuracy": 0.4428917050361633, "step": 28570 }, { "epoch": 0.028781043490347397, "grad_norm": 16.030834517123232, "learning_rate": 2.8780492718006568e-05, "loss": 2.3416, "mean_token_accuracy": 0.4413793087005615, "step": 28575 }, { "epoch": 0.02878607954345157, "grad_norm": 16.36994147550745, "learning_rate": 2.8785528674737627e-05, "loss": 2.2641, "mean_token_accuracy": 0.4517241358757019, "step": 28580 }, { "epoch": 0.02879111559655574, "grad_norm": 18.650187751198107, "learning_rate": 2.8790564631468686e-05, "loss": 2.7386, "mean_token_accuracy": 0.3793103337287903, "step": 28585 }, { "epoch": 0.028796151649659915, "grad_norm": 15.249022585260006, "learning_rate": 2.8795600588199746e-05, "loss": 2.4477, "mean_token_accuracy": 0.4529340624809265, "step": 28590 }, { "epoch": 0.02880118770276409, "grad_norm": 18.02934128318509, "learning_rate": 2.880063654493081e-05, "loss": 2.6862, "mean_token_accuracy": 0.39310344457626345, "step": 28595 }, { "epoch": 0.02880622375586826, "grad_norm": 15.501591268719338, "learning_rate": 2.8805672501661868e-05, "loss": 2.5998, "mean_token_accuracy": 0.3448275804519653, "step": 28600 }, { "epoch": 0.028811259808972433, "grad_norm": 15.241972571276792, "learning_rate": 2.8810708458392927e-05, "loss": 2.2108, "mean_token_accuracy": 0.44827585816383364, "step": 28605 }, { "epoch": 0.028816295862076607, "grad_norm": 14.42015317227423, "learning_rate": 2.8815744415123986e-05, "loss": 2.7649, "mean_token_accuracy": 0.41905625760555265, "step": 28610 }, { "epoch": 0.02882133191518078, "grad_norm": 18.06061595423494, "learning_rate": 2.8820780371855045e-05, "loss": 2.9039, "mean_token_accuracy": 0.41724138259887694, "step": 28615 }, { "epoch": 0.02882636796828495, "grad_norm": 18.53433787871925, "learning_rate": 2.8825816328586108e-05, "loss": 2.5882, "mean_token_accuracy": 0.4275862157344818, "step": 28620 }, { "epoch": 0.028831404021389125, "grad_norm": 19.77284074438423, "learning_rate": 2.8830852285317167e-05, "loss": 2.3672, "mean_token_accuracy": 0.4689655125141144, "step": 28625 }, { "epoch": 0.0288364400744933, "grad_norm": 28.34621196503054, "learning_rate": 2.8835888242048226e-05, "loss": 2.5127, "mean_token_accuracy": 0.3931034505367279, "step": 28630 }, { "epoch": 0.02884147612759747, "grad_norm": 16.09174284888342, "learning_rate": 2.8840924198779286e-05, "loss": 2.3801, "mean_token_accuracy": 0.43793103098869324, "step": 28635 }, { "epoch": 0.028846512180701642, "grad_norm": 16.721714547091548, "learning_rate": 2.884596015551034e-05, "loss": 2.6349, "mean_token_accuracy": 0.4206896543502808, "step": 28640 }, { "epoch": 0.028851548233805816, "grad_norm": 13.898473792738638, "learning_rate": 2.8850996112241408e-05, "loss": 2.4432, "mean_token_accuracy": 0.4482758641242981, "step": 28645 }, { "epoch": 0.02885658428690999, "grad_norm": 15.364852462422112, "learning_rate": 2.8856032068972467e-05, "loss": 2.5988, "mean_token_accuracy": 0.4034482717514038, "step": 28650 }, { "epoch": 0.02886162034001416, "grad_norm": 17.972904343985753, "learning_rate": 2.8861068025703526e-05, "loss": 2.6786, "mean_token_accuracy": 0.41379310488700866, "step": 28655 }, { "epoch": 0.028866656393118334, "grad_norm": 21.127516747859747, "learning_rate": 2.8866103982434582e-05, "loss": 2.5323, "mean_token_accuracy": 0.38620689809322356, "step": 28660 }, { "epoch": 0.028871692446222508, "grad_norm": 18.755520314369743, "learning_rate": 2.887113993916564e-05, "loss": 2.9684, "mean_token_accuracy": 0.34137930870056155, "step": 28665 }, { "epoch": 0.028876728499326678, "grad_norm": 14.214699062595612, "learning_rate": 2.8876175895896707e-05, "loss": 2.6565, "mean_token_accuracy": 0.38965516686439516, "step": 28670 }, { "epoch": 0.028881764552430852, "grad_norm": 31.295169110630717, "learning_rate": 2.8881211852627767e-05, "loss": 2.5725, "mean_token_accuracy": 0.42413793206214906, "step": 28675 }, { "epoch": 0.028886800605535026, "grad_norm": 22.841088830484146, "learning_rate": 2.8886247809358822e-05, "loss": 2.6562, "mean_token_accuracy": 0.42413793206214906, "step": 28680 }, { "epoch": 0.0288918366586392, "grad_norm": 15.120564371399617, "learning_rate": 2.889128376608988e-05, "loss": 2.4734, "mean_token_accuracy": 0.4122201979160309, "step": 28685 }, { "epoch": 0.02889687271174337, "grad_norm": 30.18559371388071, "learning_rate": 2.889631972282094e-05, "loss": 2.3805, "mean_token_accuracy": 0.43103447556495667, "step": 28690 }, { "epoch": 0.028901908764847543, "grad_norm": 18.26290462016411, "learning_rate": 2.8901355679552e-05, "loss": 2.5333, "mean_token_accuracy": 0.4103448212146759, "step": 28695 }, { "epoch": 0.028906944817951717, "grad_norm": 16.55591481046095, "learning_rate": 2.8906391636283063e-05, "loss": 2.8416, "mean_token_accuracy": 0.36206896007061007, "step": 28700 }, { "epoch": 0.028911980871055887, "grad_norm": 14.484541858229901, "learning_rate": 2.8911427593014122e-05, "loss": 2.373, "mean_token_accuracy": 0.4379310369491577, "step": 28705 }, { "epoch": 0.02891701692416006, "grad_norm": 19.282777625526734, "learning_rate": 2.891646354974518e-05, "loss": 2.6113, "mean_token_accuracy": 0.41034482717514037, "step": 28710 }, { "epoch": 0.028922052977264235, "grad_norm": 22.624382275761743, "learning_rate": 2.892149950647624e-05, "loss": 2.6291, "mean_token_accuracy": 0.3965517282485962, "step": 28715 }, { "epoch": 0.02892708903036841, "grad_norm": 21.532411632617524, "learning_rate": 2.89265354632073e-05, "loss": 2.4124, "mean_token_accuracy": 0.39655172526836396, "step": 28720 }, { "epoch": 0.02893212508347258, "grad_norm": 19.394091941253663, "learning_rate": 2.8931571419938362e-05, "loss": 2.2191, "mean_token_accuracy": 0.4413793087005615, "step": 28725 }, { "epoch": 0.028937161136576753, "grad_norm": 24.423443576163727, "learning_rate": 2.8936607376669422e-05, "loss": 2.5435, "mean_token_accuracy": 0.4172413766384125, "step": 28730 }, { "epoch": 0.028942197189680927, "grad_norm": 18.0448105306421, "learning_rate": 2.894164333340048e-05, "loss": 2.784, "mean_token_accuracy": 0.39310344457626345, "step": 28735 }, { "epoch": 0.028947233242785097, "grad_norm": 24.74294927013064, "learning_rate": 2.894667929013154e-05, "loss": 2.4028, "mean_token_accuracy": 0.41724138259887694, "step": 28740 }, { "epoch": 0.02895226929588927, "grad_norm": 26.055191371340445, "learning_rate": 2.89517152468626e-05, "loss": 2.5702, "mean_token_accuracy": 0.4275861978530884, "step": 28745 }, { "epoch": 0.028957305348993444, "grad_norm": 16.961313041193723, "learning_rate": 2.8956751203593662e-05, "loss": 2.0385, "mean_token_accuracy": 0.46551724076271056, "step": 28750 }, { "epoch": 0.028962341402097618, "grad_norm": 17.143891513820765, "learning_rate": 2.896178716032472e-05, "loss": 2.6204, "mean_token_accuracy": 0.4206896543502808, "step": 28755 }, { "epoch": 0.02896737745520179, "grad_norm": 17.042355568511894, "learning_rate": 2.896682311705578e-05, "loss": 2.4401, "mean_token_accuracy": 0.4310344815254211, "step": 28760 }, { "epoch": 0.028972413508305962, "grad_norm": 13.904077424609985, "learning_rate": 2.897185907378684e-05, "loss": 2.6169, "mean_token_accuracy": 0.37586207389831544, "step": 28765 }, { "epoch": 0.028977449561410136, "grad_norm": 16.84133339059328, "learning_rate": 2.89768950305179e-05, "loss": 2.3678, "mean_token_accuracy": 0.47586206793785096, "step": 28770 }, { "epoch": 0.028982485614514306, "grad_norm": 15.053974153693597, "learning_rate": 2.8981930987248955e-05, "loss": 2.7146, "mean_token_accuracy": 0.3655172407627106, "step": 28775 }, { "epoch": 0.02898752166761848, "grad_norm": 16.970639635903364, "learning_rate": 2.898696694398002e-05, "loss": 2.6197, "mean_token_accuracy": 0.4259528160095215, "step": 28780 }, { "epoch": 0.028992557720722654, "grad_norm": 16.986679847070718, "learning_rate": 2.899200290071108e-05, "loss": 2.6679, "mean_token_accuracy": 0.3379310369491577, "step": 28785 }, { "epoch": 0.028997593773826828, "grad_norm": 18.635741052732673, "learning_rate": 2.899703885744214e-05, "loss": 2.212, "mean_token_accuracy": 0.4603750824928284, "step": 28790 }, { "epoch": 0.029002629826930998, "grad_norm": 16.213551756939896, "learning_rate": 2.9002074814173195e-05, "loss": 2.3797, "mean_token_accuracy": 0.43793103098869324, "step": 28795 }, { "epoch": 0.02900766588003517, "grad_norm": 23.6826600500554, "learning_rate": 2.9007110770904255e-05, "loss": 2.7053, "mean_token_accuracy": 0.3413793116807938, "step": 28800 }, { "epoch": 0.029012701933139345, "grad_norm": 20.756111455150805, "learning_rate": 2.901214672763532e-05, "loss": 2.4377, "mean_token_accuracy": 0.37931033968925476, "step": 28805 }, { "epoch": 0.029017737986243516, "grad_norm": 19.017768632761857, "learning_rate": 2.901718268436638e-05, "loss": 2.4619, "mean_token_accuracy": 0.40562612414360044, "step": 28810 }, { "epoch": 0.02902277403934769, "grad_norm": 18.4681412814779, "learning_rate": 2.9022218641097436e-05, "loss": 2.4969, "mean_token_accuracy": 0.37586206793785093, "step": 28815 }, { "epoch": 0.029027810092451863, "grad_norm": 17.08970036111446, "learning_rate": 2.9027254597828495e-05, "loss": 2.3785, "mean_token_accuracy": 0.4, "step": 28820 }, { "epoch": 0.029032846145556037, "grad_norm": 18.67574865526711, "learning_rate": 2.9032290554559554e-05, "loss": 2.778, "mean_token_accuracy": 0.382758629322052, "step": 28825 }, { "epoch": 0.029037882198660207, "grad_norm": 16.415356320505825, "learning_rate": 2.903732651129062e-05, "loss": 2.2058, "mean_token_accuracy": 0.4620689630508423, "step": 28830 }, { "epoch": 0.02904291825176438, "grad_norm": 16.92230200822066, "learning_rate": 2.9042362468021676e-05, "loss": 2.448, "mean_token_accuracy": 0.45760738253593447, "step": 28835 }, { "epoch": 0.029047954304868555, "grad_norm": 16.387023010354024, "learning_rate": 2.9047398424752735e-05, "loss": 2.5928, "mean_token_accuracy": 0.38275861740112305, "step": 28840 }, { "epoch": 0.029052990357972725, "grad_norm": 14.967049143037833, "learning_rate": 2.9052434381483795e-05, "loss": 2.2491, "mean_token_accuracy": 0.43103447556495667, "step": 28845 }, { "epoch": 0.0290580264110769, "grad_norm": 26.815592130974913, "learning_rate": 2.9057470338214854e-05, "loss": 2.6212, "mean_token_accuracy": 0.40000000298023225, "step": 28850 }, { "epoch": 0.029063062464181073, "grad_norm": 20.062050470017812, "learning_rate": 2.9062506294945913e-05, "loss": 2.7217, "mean_token_accuracy": 0.37241379022598264, "step": 28855 }, { "epoch": 0.029068098517285246, "grad_norm": 19.379101099018673, "learning_rate": 2.9067542251676976e-05, "loss": 2.65, "mean_token_accuracy": 0.3896551728248596, "step": 28860 }, { "epoch": 0.029073134570389417, "grad_norm": 14.259386012303532, "learning_rate": 2.9072578208408035e-05, "loss": 2.4506, "mean_token_accuracy": 0.42413792610168455, "step": 28865 }, { "epoch": 0.02907817062349359, "grad_norm": 17.373325793435555, "learning_rate": 2.9077614165139094e-05, "loss": 2.4955, "mean_token_accuracy": 0.4206896543502808, "step": 28870 }, { "epoch": 0.029083206676597764, "grad_norm": 18.379079909453107, "learning_rate": 2.9082650121870154e-05, "loss": 2.7414, "mean_token_accuracy": 0.3655172407627106, "step": 28875 }, { "epoch": 0.029088242729701935, "grad_norm": 23.65953967382418, "learning_rate": 2.9087686078601213e-05, "loss": 2.5753, "mean_token_accuracy": 0.42758620381355283, "step": 28880 }, { "epoch": 0.02909327878280611, "grad_norm": 14.21683210635724, "learning_rate": 2.9092722035332275e-05, "loss": 2.4781, "mean_token_accuracy": 0.4206896543502808, "step": 28885 }, { "epoch": 0.029098314835910282, "grad_norm": 16.709303586132034, "learning_rate": 2.9097757992063335e-05, "loss": 2.1534, "mean_token_accuracy": 0.5024803340435028, "step": 28890 }, { "epoch": 0.029103350889014456, "grad_norm": 16.554841471233352, "learning_rate": 2.9102793948794394e-05, "loss": 2.2126, "mean_token_accuracy": 0.47108287215232847, "step": 28895 }, { "epoch": 0.029108386942118626, "grad_norm": 15.408381946064868, "learning_rate": 2.9107829905525453e-05, "loss": 2.7137, "mean_token_accuracy": 0.38620689511299133, "step": 28900 }, { "epoch": 0.0291134229952228, "grad_norm": 11.88918490778324, "learning_rate": 2.9112865862256513e-05, "loss": 2.2805, "mean_token_accuracy": 0.47453114986419676, "step": 28905 }, { "epoch": 0.029118459048326974, "grad_norm": 17.851370060465552, "learning_rate": 2.9117901818987575e-05, "loss": 2.4468, "mean_token_accuracy": 0.3896551728248596, "step": 28910 }, { "epoch": 0.029123495101431144, "grad_norm": 16.879994540888287, "learning_rate": 2.9122937775718634e-05, "loss": 2.3116, "mean_token_accuracy": 0.4379310369491577, "step": 28915 }, { "epoch": 0.029128531154535318, "grad_norm": 15.534236473320558, "learning_rate": 2.9127973732449694e-05, "loss": 2.3864, "mean_token_accuracy": 0.394252872467041, "step": 28920 }, { "epoch": 0.02913356720763949, "grad_norm": 18.047662964835066, "learning_rate": 2.9133009689180753e-05, "loss": 2.7292, "mean_token_accuracy": 0.3896551728248596, "step": 28925 }, { "epoch": 0.029138603260743665, "grad_norm": 13.768485046362416, "learning_rate": 2.913804564591181e-05, "loss": 2.5179, "mean_token_accuracy": 0.3793103456497192, "step": 28930 }, { "epoch": 0.029143639313847836, "grad_norm": 16.093592172210936, "learning_rate": 2.9143081602642868e-05, "loss": 2.7077, "mean_token_accuracy": 0.4034482777118683, "step": 28935 }, { "epoch": 0.02914867536695201, "grad_norm": 17.392814111433935, "learning_rate": 2.9148117559373934e-05, "loss": 2.5771, "mean_token_accuracy": 0.4137930929660797, "step": 28940 }, { "epoch": 0.029153711420056183, "grad_norm": 16.997913626391654, "learning_rate": 2.9153153516104993e-05, "loss": 2.4807, "mean_token_accuracy": 0.441379314661026, "step": 28945 }, { "epoch": 0.029158747473160353, "grad_norm": 23.048566877888664, "learning_rate": 2.915818947283605e-05, "loss": 2.4722, "mean_token_accuracy": 0.3655172407627106, "step": 28950 }, { "epoch": 0.029163783526264527, "grad_norm": 11.786499720776606, "learning_rate": 2.916322542956711e-05, "loss": 2.1988, "mean_token_accuracy": 0.44827585816383364, "step": 28955 }, { "epoch": 0.0291688195793687, "grad_norm": 18.48451621817639, "learning_rate": 2.9168261386298168e-05, "loss": 2.2716, "mean_token_accuracy": 0.45862067937850953, "step": 28960 }, { "epoch": 0.029173855632472875, "grad_norm": 18.298629578798383, "learning_rate": 2.9173297343029234e-05, "loss": 2.2899, "mean_token_accuracy": 0.4793103337287903, "step": 28965 }, { "epoch": 0.029178891685577045, "grad_norm": 20.290639960735724, "learning_rate": 2.917833329976029e-05, "loss": 2.6811, "mean_token_accuracy": 0.38620689511299133, "step": 28970 }, { "epoch": 0.02918392773868122, "grad_norm": 18.446891217979125, "learning_rate": 2.918336925649135e-05, "loss": 2.6811, "mean_token_accuracy": 0.3931034505367279, "step": 28975 }, { "epoch": 0.029188963791785393, "grad_norm": 17.24943587046956, "learning_rate": 2.9188405213222408e-05, "loss": 2.3526, "mean_token_accuracy": 0.4551724076271057, "step": 28980 }, { "epoch": 0.029193999844889563, "grad_norm": 22.330405333989457, "learning_rate": 2.9193441169953467e-05, "loss": 2.7507, "mean_token_accuracy": 0.33793103098869326, "step": 28985 }, { "epoch": 0.029199035897993737, "grad_norm": 26.77777667475145, "learning_rate": 2.919847712668453e-05, "loss": 2.5966, "mean_token_accuracy": 0.3896551728248596, "step": 28990 }, { "epoch": 0.02920407195109791, "grad_norm": 16.09287025862488, "learning_rate": 2.920351308341559e-05, "loss": 2.3633, "mean_token_accuracy": 0.4586206912994385, "step": 28995 }, { "epoch": 0.029209108004202084, "grad_norm": 20.712085479079104, "learning_rate": 2.920854904014665e-05, "loss": 2.4607, "mean_token_accuracy": 0.4689655125141144, "step": 29000 }, { "epoch": 0.029214144057306254, "grad_norm": 16.497744888524167, "learning_rate": 2.9213584996877708e-05, "loss": 2.772, "mean_token_accuracy": 0.38620689511299133, "step": 29005 }, { "epoch": 0.029219180110410428, "grad_norm": 16.601714947164467, "learning_rate": 2.9218620953608767e-05, "loss": 2.6891, "mean_token_accuracy": 0.36896551847457887, "step": 29010 }, { "epoch": 0.029224216163514602, "grad_norm": 18.263833642592957, "learning_rate": 2.9223656910339826e-05, "loss": 2.8077, "mean_token_accuracy": 0.3551724135875702, "step": 29015 }, { "epoch": 0.029229252216618772, "grad_norm": 18.047539245433132, "learning_rate": 2.922869286707089e-05, "loss": 2.6885, "mean_token_accuracy": 0.40895341634750365, "step": 29020 }, { "epoch": 0.029234288269722946, "grad_norm": 17.035566428510997, "learning_rate": 2.9233728823801948e-05, "loss": 2.69, "mean_token_accuracy": 0.3896551728248596, "step": 29025 }, { "epoch": 0.02923932432282712, "grad_norm": 19.086760974320768, "learning_rate": 2.9238764780533007e-05, "loss": 2.594, "mean_token_accuracy": 0.4068965554237366, "step": 29030 }, { "epoch": 0.029244360375931294, "grad_norm": 15.488313772590844, "learning_rate": 2.9243800737264067e-05, "loss": 2.9088, "mean_token_accuracy": 0.3620689660310745, "step": 29035 }, { "epoch": 0.029249396429035464, "grad_norm": 28.282500763414347, "learning_rate": 2.9248836693995122e-05, "loss": 2.4808, "mean_token_accuracy": 0.34482758641242983, "step": 29040 }, { "epoch": 0.029254432482139638, "grad_norm": 17.186948400173062, "learning_rate": 2.925387265072619e-05, "loss": 2.5365, "mean_token_accuracy": 0.4258318305015564, "step": 29045 }, { "epoch": 0.02925946853524381, "grad_norm": 22.317920546196827, "learning_rate": 2.9258908607457248e-05, "loss": 2.8038, "mean_token_accuracy": 0.37241379022598264, "step": 29050 }, { "epoch": 0.02926450458834798, "grad_norm": 15.088772550319392, "learning_rate": 2.9263944564188307e-05, "loss": 2.8417, "mean_token_accuracy": 0.36896551847457887, "step": 29055 }, { "epoch": 0.029269540641452155, "grad_norm": 24.365864876780236, "learning_rate": 2.9268980520919366e-05, "loss": 2.6709, "mean_token_accuracy": 0.459359610080719, "step": 29060 }, { "epoch": 0.02927457669455633, "grad_norm": 21.00513092648901, "learning_rate": 2.9274016477650422e-05, "loss": 2.5766, "mean_token_accuracy": 0.39310344457626345, "step": 29065 }, { "epoch": 0.029279612747660503, "grad_norm": 19.77508562302065, "learning_rate": 2.9279052434381488e-05, "loss": 2.7295, "mean_token_accuracy": 0.3771929800510406, "step": 29070 }, { "epoch": 0.029284648800764673, "grad_norm": 39.450216037499416, "learning_rate": 2.9284088391112547e-05, "loss": 2.6759, "mean_token_accuracy": 0.37586206793785093, "step": 29075 }, { "epoch": 0.029289684853868847, "grad_norm": 19.885862307445414, "learning_rate": 2.9289124347843607e-05, "loss": 2.7889, "mean_token_accuracy": 0.3758620619773865, "step": 29080 }, { "epoch": 0.02929472090697302, "grad_norm": 18.964273210774564, "learning_rate": 2.9294160304574663e-05, "loss": 2.607, "mean_token_accuracy": 0.3931034475564957, "step": 29085 }, { "epoch": 0.02929975696007719, "grad_norm": 17.559174560588463, "learning_rate": 2.9299196261305722e-05, "loss": 2.5961, "mean_token_accuracy": 0.4119177222251892, "step": 29090 }, { "epoch": 0.029304793013181365, "grad_norm": 17.372796279028535, "learning_rate": 2.9304232218036788e-05, "loss": 2.7834, "mean_token_accuracy": 0.39655172228813174, "step": 29095 }, { "epoch": 0.02930982906628554, "grad_norm": 17.50850945505296, "learning_rate": 2.9309268174767847e-05, "loss": 2.8309, "mean_token_accuracy": 0.39310344457626345, "step": 29100 }, { "epoch": 0.029314865119389712, "grad_norm": 17.050620886152583, "learning_rate": 2.9314304131498903e-05, "loss": 2.4639, "mean_token_accuracy": 0.4379310369491577, "step": 29105 }, { "epoch": 0.029319901172493883, "grad_norm": 17.908420992085034, "learning_rate": 2.9319340088229962e-05, "loss": 2.5188, "mean_token_accuracy": 0.37931033968925476, "step": 29110 }, { "epoch": 0.029324937225598056, "grad_norm": 20.677634081857832, "learning_rate": 2.932437604496102e-05, "loss": 2.8095, "mean_token_accuracy": 0.34137930870056155, "step": 29115 }, { "epoch": 0.02932997327870223, "grad_norm": 19.35289143344176, "learning_rate": 2.932941200169208e-05, "loss": 2.6752, "mean_token_accuracy": 0.3620689630508423, "step": 29120 }, { "epoch": 0.0293350093318064, "grad_norm": 16.88107204987131, "learning_rate": 2.9334447958423143e-05, "loss": 2.4056, "mean_token_accuracy": 0.41379310488700866, "step": 29125 }, { "epoch": 0.029340045384910574, "grad_norm": 22.884628993761428, "learning_rate": 2.9339483915154203e-05, "loss": 2.2999, "mean_token_accuracy": 0.45862067937850953, "step": 29130 }, { "epoch": 0.029345081438014748, "grad_norm": 16.77593779405751, "learning_rate": 2.9344519871885262e-05, "loss": 2.2376, "mean_token_accuracy": 0.4379310369491577, "step": 29135 }, { "epoch": 0.029350117491118922, "grad_norm": 18.29855260680542, "learning_rate": 2.934955582861632e-05, "loss": 2.4546, "mean_token_accuracy": 0.4379310369491577, "step": 29140 }, { "epoch": 0.029355153544223092, "grad_norm": 13.106796495422785, "learning_rate": 2.935459178534738e-05, "loss": 2.3484, "mean_token_accuracy": 0.4601935803890228, "step": 29145 }, { "epoch": 0.029360189597327266, "grad_norm": 19.101003876005805, "learning_rate": 2.9359627742078443e-05, "loss": 2.6957, "mean_token_accuracy": 0.3448275804519653, "step": 29150 }, { "epoch": 0.02936522565043144, "grad_norm": 13.709347748046984, "learning_rate": 2.9364663698809502e-05, "loss": 2.7858, "mean_token_accuracy": 0.37241379022598264, "step": 29155 }, { "epoch": 0.02937026170353561, "grad_norm": 17.87970586706311, "learning_rate": 2.936969965554056e-05, "loss": 2.685, "mean_token_accuracy": 0.3620689630508423, "step": 29160 }, { "epoch": 0.029375297756639784, "grad_norm": 18.524163065875594, "learning_rate": 2.937473561227162e-05, "loss": 3.0222, "mean_token_accuracy": 0.3241379290819168, "step": 29165 }, { "epoch": 0.029380333809743957, "grad_norm": 13.921594957708502, "learning_rate": 2.937977156900268e-05, "loss": 2.6907, "mean_token_accuracy": 0.41034482717514037, "step": 29170 }, { "epoch": 0.02938536986284813, "grad_norm": 15.55554656741362, "learning_rate": 2.9384807525733743e-05, "loss": 2.2311, "mean_token_accuracy": 0.3931034505367279, "step": 29175 }, { "epoch": 0.0293904059159523, "grad_norm": 17.354504545455352, "learning_rate": 2.9389843482464802e-05, "loss": 2.568, "mean_token_accuracy": 0.4, "step": 29180 }, { "epoch": 0.029395441969056475, "grad_norm": 15.895604495469875, "learning_rate": 2.939487943919586e-05, "loss": 2.3817, "mean_token_accuracy": 0.42758620381355283, "step": 29185 }, { "epoch": 0.02940047802216065, "grad_norm": 17.95642008554389, "learning_rate": 2.939991539592692e-05, "loss": 2.4346, "mean_token_accuracy": 0.4241379380226135, "step": 29190 }, { "epoch": 0.02940551407526482, "grad_norm": 14.331357044379631, "learning_rate": 2.9404951352657976e-05, "loss": 2.4156, "mean_token_accuracy": 0.42413793206214906, "step": 29195 }, { "epoch": 0.029410550128368993, "grad_norm": 19.07327428354615, "learning_rate": 2.9409987309389036e-05, "loss": 2.6629, "mean_token_accuracy": 0.3482758551836014, "step": 29200 }, { "epoch": 0.029415586181473167, "grad_norm": 15.908626053084024, "learning_rate": 2.94150232661201e-05, "loss": 2.4519, "mean_token_accuracy": 0.4350272178649902, "step": 29205 }, { "epoch": 0.02942062223457734, "grad_norm": 15.616566926145273, "learning_rate": 2.942005922285116e-05, "loss": 2.7775, "mean_token_accuracy": 0.3620689660310745, "step": 29210 }, { "epoch": 0.02942565828768151, "grad_norm": 16.518380686719727, "learning_rate": 2.9425095179582217e-05, "loss": 2.5235, "mean_token_accuracy": 0.39310344457626345, "step": 29215 }, { "epoch": 0.029430694340785685, "grad_norm": 27.027106638737187, "learning_rate": 2.9430131136313276e-05, "loss": 2.2819, "mean_token_accuracy": 0.42068966031074523, "step": 29220 }, { "epoch": 0.02943573039388986, "grad_norm": 14.52561221937901, "learning_rate": 2.9435167093044335e-05, "loss": 2.1586, "mean_token_accuracy": 0.4724137902259827, "step": 29225 }, { "epoch": 0.02944076644699403, "grad_norm": 15.013909963715054, "learning_rate": 2.94402030497754e-05, "loss": 2.5409, "mean_token_accuracy": 0.41034482717514037, "step": 29230 }, { "epoch": 0.029445802500098203, "grad_norm": 17.17768342793776, "learning_rate": 2.9445239006506457e-05, "loss": 2.3619, "mean_token_accuracy": 0.4517241418361664, "step": 29235 }, { "epoch": 0.029450838553202376, "grad_norm": 20.2352931071833, "learning_rate": 2.9450274963237516e-05, "loss": 2.4683, "mean_token_accuracy": 0.39655172228813174, "step": 29240 }, { "epoch": 0.02945587460630655, "grad_norm": 26.396247906740395, "learning_rate": 2.9455310919968576e-05, "loss": 2.7483, "mean_token_accuracy": 0.36007259488105775, "step": 29245 }, { "epoch": 0.02946091065941072, "grad_norm": 16.257561841829858, "learning_rate": 2.9460346876699635e-05, "loss": 2.7521, "mean_token_accuracy": 0.37241379618644715, "step": 29250 }, { "epoch": 0.029465946712514894, "grad_norm": 16.139308365458582, "learning_rate": 2.9465382833430697e-05, "loss": 2.6898, "mean_token_accuracy": 0.3896551728248596, "step": 29255 }, { "epoch": 0.029470982765619068, "grad_norm": 21.036289619873603, "learning_rate": 2.9470418790161757e-05, "loss": 2.8732, "mean_token_accuracy": 0.37241379022598264, "step": 29260 }, { "epoch": 0.029476018818723238, "grad_norm": 16.502844787291153, "learning_rate": 2.9475454746892816e-05, "loss": 2.2526, "mean_token_accuracy": 0.46551724076271056, "step": 29265 }, { "epoch": 0.029481054871827412, "grad_norm": 18.26457489152445, "learning_rate": 2.9480490703623875e-05, "loss": 2.5965, "mean_token_accuracy": 0.3793103456497192, "step": 29270 }, { "epoch": 0.029486090924931586, "grad_norm": 14.872907791582454, "learning_rate": 2.9485526660354934e-05, "loss": 2.3999, "mean_token_accuracy": 0.45517241954803467, "step": 29275 }, { "epoch": 0.02949112697803576, "grad_norm": 19.141729233509697, "learning_rate": 2.9490562617085994e-05, "loss": 2.258, "mean_token_accuracy": 0.4448275864124298, "step": 29280 }, { "epoch": 0.02949616303113993, "grad_norm": 18.814094296693867, "learning_rate": 2.9495598573817056e-05, "loss": 2.5004, "mean_token_accuracy": 0.47376847863197324, "step": 29285 }, { "epoch": 0.029501199084244104, "grad_norm": 19.141163139076475, "learning_rate": 2.9500634530548116e-05, "loss": 2.4433, "mean_token_accuracy": 0.39310344457626345, "step": 29290 }, { "epoch": 0.029506235137348277, "grad_norm": 13.91851606547807, "learning_rate": 2.9505670487279175e-05, "loss": 2.3206, "mean_token_accuracy": 0.4379310250282288, "step": 29295 }, { "epoch": 0.029511271190452448, "grad_norm": 23.90322714996998, "learning_rate": 2.9510706444010234e-05, "loss": 2.7151, "mean_token_accuracy": 0.37241379022598264, "step": 29300 }, { "epoch": 0.02951630724355662, "grad_norm": 24.02342827222452, "learning_rate": 2.9515742400741293e-05, "loss": 2.6927, "mean_token_accuracy": 0.3827586233615875, "step": 29305 }, { "epoch": 0.029521343296660795, "grad_norm": 24.847120259165067, "learning_rate": 2.9520778357472356e-05, "loss": 2.2897, "mean_token_accuracy": 0.4310344815254211, "step": 29310 }, { "epoch": 0.02952637934976497, "grad_norm": 16.41031167466528, "learning_rate": 2.9525814314203415e-05, "loss": 2.1434, "mean_token_accuracy": 0.46896551847457885, "step": 29315 }, { "epoch": 0.02953141540286914, "grad_norm": 30.884428525054425, "learning_rate": 2.9530850270934475e-05, "loss": 2.5676, "mean_token_accuracy": 0.44137930274009707, "step": 29320 }, { "epoch": 0.029536451455973313, "grad_norm": 18.8049378381986, "learning_rate": 2.9535886227665534e-05, "loss": 2.5201, "mean_token_accuracy": 0.4103448212146759, "step": 29325 }, { "epoch": 0.029541487509077487, "grad_norm": 27.83217431733001, "learning_rate": 2.954092218439659e-05, "loss": 2.907, "mean_token_accuracy": 0.3551724076271057, "step": 29330 }, { "epoch": 0.029546523562181657, "grad_norm": 17.355528083677683, "learning_rate": 2.9545958141127656e-05, "loss": 2.6722, "mean_token_accuracy": 0.3896551728248596, "step": 29335 }, { "epoch": 0.02955155961528583, "grad_norm": 13.243755961804263, "learning_rate": 2.9550994097858715e-05, "loss": 2.3976, "mean_token_accuracy": 0.42758620977401735, "step": 29340 }, { "epoch": 0.029556595668390005, "grad_norm": 17.048561242119124, "learning_rate": 2.9556030054589774e-05, "loss": 2.4364, "mean_token_accuracy": 0.41034482717514037, "step": 29345 }, { "epoch": 0.02956163172149418, "grad_norm": 15.943285690252303, "learning_rate": 2.956106601132083e-05, "loss": 2.437, "mean_token_accuracy": 0.42758620381355283, "step": 29350 }, { "epoch": 0.02956666777459835, "grad_norm": 30.236719435221882, "learning_rate": 2.956610196805189e-05, "loss": 2.5479, "mean_token_accuracy": 0.43103448748588563, "step": 29355 }, { "epoch": 0.029571703827702522, "grad_norm": 26.234716721973356, "learning_rate": 2.957113792478295e-05, "loss": 2.4102, "mean_token_accuracy": 0.41379310488700866, "step": 29360 }, { "epoch": 0.029576739880806696, "grad_norm": 16.875756528583572, "learning_rate": 2.9576173881514015e-05, "loss": 2.778, "mean_token_accuracy": 0.3862068891525269, "step": 29365 }, { "epoch": 0.029581775933910866, "grad_norm": 16.65316127944145, "learning_rate": 2.958120983824507e-05, "loss": 2.8712, "mean_token_accuracy": 0.36206896901130675, "step": 29370 }, { "epoch": 0.02958681198701504, "grad_norm": 15.998510219023641, "learning_rate": 2.958624579497613e-05, "loss": 2.6592, "mean_token_accuracy": 0.41657592058181764, "step": 29375 }, { "epoch": 0.029591848040119214, "grad_norm": 25.264633458656107, "learning_rate": 2.959128175170719e-05, "loss": 2.6608, "mean_token_accuracy": 0.3827586233615875, "step": 29380 }, { "epoch": 0.029596884093223388, "grad_norm": 17.114755030341012, "learning_rate": 2.9596317708438248e-05, "loss": 2.5136, "mean_token_accuracy": 0.4137930989265442, "step": 29385 }, { "epoch": 0.029601920146327558, "grad_norm": 16.08532132579987, "learning_rate": 2.960135366516931e-05, "loss": 2.1647, "mean_token_accuracy": 0.4413793087005615, "step": 29390 }, { "epoch": 0.029606956199431732, "grad_norm": 17.46570987124122, "learning_rate": 2.960638962190037e-05, "loss": 2.8, "mean_token_accuracy": 0.3206896513700485, "step": 29395 }, { "epoch": 0.029611992252535906, "grad_norm": 16.963443391288347, "learning_rate": 2.961142557863143e-05, "loss": 2.499, "mean_token_accuracy": 0.3999999940395355, "step": 29400 }, { "epoch": 0.029617028305640076, "grad_norm": 16.212345392825643, "learning_rate": 2.961646153536249e-05, "loss": 2.3559, "mean_token_accuracy": 0.3551724195480347, "step": 29405 }, { "epoch": 0.02962206435874425, "grad_norm": 27.524889522533908, "learning_rate": 2.9621497492093548e-05, "loss": 2.563, "mean_token_accuracy": 0.4137930989265442, "step": 29410 }, { "epoch": 0.029627100411848423, "grad_norm": 20.045406644533195, "learning_rate": 2.962653344882461e-05, "loss": 2.3554, "mean_token_accuracy": 0.4034482777118683, "step": 29415 }, { "epoch": 0.029632136464952597, "grad_norm": 19.817631140431637, "learning_rate": 2.963156940555567e-05, "loss": 2.4249, "mean_token_accuracy": 0.42068966031074523, "step": 29420 }, { "epoch": 0.029637172518056767, "grad_norm": 18.037053610332176, "learning_rate": 2.963660536228673e-05, "loss": 2.4095, "mean_token_accuracy": 0.43908045887947084, "step": 29425 }, { "epoch": 0.02964220857116094, "grad_norm": 17.383614056859916, "learning_rate": 2.9641641319017788e-05, "loss": 2.8091, "mean_token_accuracy": 0.3793103456497192, "step": 29430 }, { "epoch": 0.029647244624265115, "grad_norm": 15.99798770863236, "learning_rate": 2.9646677275748848e-05, "loss": 2.8413, "mean_token_accuracy": 0.3586206942796707, "step": 29435 }, { "epoch": 0.029652280677369285, "grad_norm": 18.543480893328507, "learning_rate": 2.9651713232479907e-05, "loss": 2.8001, "mean_token_accuracy": 0.3793103456497192, "step": 29440 }, { "epoch": 0.02965731673047346, "grad_norm": 19.222634969412997, "learning_rate": 2.965674918921097e-05, "loss": 2.5166, "mean_token_accuracy": 0.3862069010734558, "step": 29445 }, { "epoch": 0.029662352783577633, "grad_norm": 17.909249140601847, "learning_rate": 2.966178514594203e-05, "loss": 3.5245, "mean_token_accuracy": 0.279310342669487, "step": 29450 }, { "epoch": 0.029667388836681807, "grad_norm": 15.887584492170978, "learning_rate": 2.9666821102673088e-05, "loss": 2.9935, "mean_token_accuracy": 0.37241379022598264, "step": 29455 }, { "epoch": 0.029672424889785977, "grad_norm": 17.092160629461706, "learning_rate": 2.9671857059404147e-05, "loss": 2.7508, "mean_token_accuracy": 0.35517241060733795, "step": 29460 }, { "epoch": 0.02967746094289015, "grad_norm": 20.808594190083024, "learning_rate": 2.9676893016135203e-05, "loss": 2.4414, "mean_token_accuracy": 0.4401088833808899, "step": 29465 }, { "epoch": 0.029682496995994324, "grad_norm": 16.718310803358953, "learning_rate": 2.968192897286627e-05, "loss": 2.5105, "mean_token_accuracy": 0.39310344457626345, "step": 29470 }, { "epoch": 0.029687533049098495, "grad_norm": 15.667714017122984, "learning_rate": 2.968696492959733e-05, "loss": 2.4654, "mean_token_accuracy": 0.4034482717514038, "step": 29475 }, { "epoch": 0.02969256910220267, "grad_norm": 23.58282286190455, "learning_rate": 2.9692000886328388e-05, "loss": 3.0139, "mean_token_accuracy": 0.3310344755649567, "step": 29480 }, { "epoch": 0.029697605155306842, "grad_norm": 16.473412285425365, "learning_rate": 2.9697036843059443e-05, "loss": 2.6174, "mean_token_accuracy": 0.4206896543502808, "step": 29485 }, { "epoch": 0.029702641208411016, "grad_norm": 12.754355032883918, "learning_rate": 2.9702072799790503e-05, "loss": 2.2989, "mean_token_accuracy": 0.4379310250282288, "step": 29490 }, { "epoch": 0.029707677261515186, "grad_norm": 17.311328127135, "learning_rate": 2.970710875652157e-05, "loss": 2.7898, "mean_token_accuracy": 0.3448275774717331, "step": 29495 }, { "epoch": 0.02971271331461936, "grad_norm": 19.907535033061926, "learning_rate": 2.9712144713252628e-05, "loss": 2.8658, "mean_token_accuracy": 0.3517241358757019, "step": 29500 }, { "epoch": 0.029717749367723534, "grad_norm": 21.820613398937944, "learning_rate": 2.9717180669983684e-05, "loss": 2.5412, "mean_token_accuracy": 0.4068965494632721, "step": 29505 }, { "epoch": 0.029722785420827704, "grad_norm": 19.355267630451035, "learning_rate": 2.9722216626714743e-05, "loss": 2.442, "mean_token_accuracy": 0.4206896543502808, "step": 29510 }, { "epoch": 0.029727821473931878, "grad_norm": 16.268539391091032, "learning_rate": 2.9727252583445802e-05, "loss": 2.9293, "mean_token_accuracy": 0.3551724135875702, "step": 29515 }, { "epoch": 0.02973285752703605, "grad_norm": 18.52605119064113, "learning_rate": 2.973228854017687e-05, "loss": 2.7025, "mean_token_accuracy": 0.42487684488296507, "step": 29520 }, { "epoch": 0.029737893580140225, "grad_norm": 20.360358045699712, "learning_rate": 2.9737324496907924e-05, "loss": 2.3381, "mean_token_accuracy": 0.4034482717514038, "step": 29525 }, { "epoch": 0.029742929633244396, "grad_norm": 16.45291028427196, "learning_rate": 2.9742360453638983e-05, "loss": 2.881, "mean_token_accuracy": 0.3172413736581802, "step": 29530 }, { "epoch": 0.02974796568634857, "grad_norm": 18.656983633356006, "learning_rate": 2.9747396410370043e-05, "loss": 2.5963, "mean_token_accuracy": 0.3827586233615875, "step": 29535 }, { "epoch": 0.029753001739452743, "grad_norm": 22.289078598745878, "learning_rate": 2.9752432367101102e-05, "loss": 2.616, "mean_token_accuracy": 0.39655172228813174, "step": 29540 }, { "epoch": 0.029758037792556914, "grad_norm": 14.051042662660073, "learning_rate": 2.975746832383216e-05, "loss": 2.524, "mean_token_accuracy": 0.4068965494632721, "step": 29545 }, { "epoch": 0.029763073845661087, "grad_norm": 15.518998769213356, "learning_rate": 2.9762504280563224e-05, "loss": 2.5539, "mean_token_accuracy": 0.43103448748588563, "step": 29550 }, { "epoch": 0.02976810989876526, "grad_norm": 14.865696794876525, "learning_rate": 2.9767540237294283e-05, "loss": 2.7553, "mean_token_accuracy": 0.38941318094730376, "step": 29555 }, { "epoch": 0.029773145951869435, "grad_norm": 16.67474846412874, "learning_rate": 2.9772576194025342e-05, "loss": 2.6738, "mean_token_accuracy": 0.38620689511299133, "step": 29560 }, { "epoch": 0.029778182004973605, "grad_norm": 16.879283617126898, "learning_rate": 2.97776121507564e-05, "loss": 2.3034, "mean_token_accuracy": 0.4379310429096222, "step": 29565 }, { "epoch": 0.02978321805807778, "grad_norm": 17.18701211359169, "learning_rate": 2.978264810748746e-05, "loss": 2.4396, "mean_token_accuracy": 0.4172413766384125, "step": 29570 }, { "epoch": 0.029788254111181953, "grad_norm": 19.66418818138895, "learning_rate": 2.9787684064218524e-05, "loss": 2.3211, "mean_token_accuracy": 0.42413792610168455, "step": 29575 }, { "epoch": 0.029793290164286123, "grad_norm": 19.071364303775507, "learning_rate": 2.9792720020949583e-05, "loss": 2.3197, "mean_token_accuracy": 0.3931034505367279, "step": 29580 }, { "epoch": 0.029798326217390297, "grad_norm": 20.04572257127486, "learning_rate": 2.9797755977680642e-05, "loss": 2.2869, "mean_token_accuracy": 0.4482758641242981, "step": 29585 }, { "epoch": 0.02980336227049447, "grad_norm": 17.401040157300297, "learning_rate": 2.98027919344117e-05, "loss": 2.8732, "mean_token_accuracy": 0.38965516686439516, "step": 29590 }, { "epoch": 0.029808398323598644, "grad_norm": 19.239256825418053, "learning_rate": 2.980782789114276e-05, "loss": 2.6744, "mean_token_accuracy": 0.3999999940395355, "step": 29595 }, { "epoch": 0.029813434376702815, "grad_norm": 15.614640271657727, "learning_rate": 2.9812863847873823e-05, "loss": 2.7346, "mean_token_accuracy": 0.37241379618644715, "step": 29600 }, { "epoch": 0.02981847042980699, "grad_norm": 19.78024501329053, "learning_rate": 2.9817899804604882e-05, "loss": 2.5891, "mean_token_accuracy": 0.3742286801338196, "step": 29605 }, { "epoch": 0.029823506482911162, "grad_norm": 18.198249063445328, "learning_rate": 2.9822935761335942e-05, "loss": 2.2257, "mean_token_accuracy": 0.4448275864124298, "step": 29610 }, { "epoch": 0.029828542536015332, "grad_norm": 18.022695123390264, "learning_rate": 2.9827971718067e-05, "loss": 2.2505, "mean_token_accuracy": 0.4413793087005615, "step": 29615 }, { "epoch": 0.029833578589119506, "grad_norm": 19.066042279073216, "learning_rate": 2.9833007674798057e-05, "loss": 2.8002, "mean_token_accuracy": 0.31724137663841245, "step": 29620 }, { "epoch": 0.02983861464222368, "grad_norm": 17.61657298487773, "learning_rate": 2.9838043631529116e-05, "loss": 2.6507, "mean_token_accuracy": 0.39310344457626345, "step": 29625 }, { "epoch": 0.029843650695327854, "grad_norm": 19.032397408034644, "learning_rate": 2.9843079588260182e-05, "loss": 1.9455, "mean_token_accuracy": 0.5228675067424774, "step": 29630 }, { "epoch": 0.029848686748432024, "grad_norm": 16.61916737191513, "learning_rate": 2.984811554499124e-05, "loss": 2.3691, "mean_token_accuracy": 0.41379310488700866, "step": 29635 }, { "epoch": 0.029853722801536198, "grad_norm": 14.088103173301802, "learning_rate": 2.9853151501722297e-05, "loss": 2.3612, "mean_token_accuracy": 0.4137930989265442, "step": 29640 }, { "epoch": 0.02985875885464037, "grad_norm": 17.12606198410017, "learning_rate": 2.9858187458453356e-05, "loss": 2.6647, "mean_token_accuracy": 0.37380519807338713, "step": 29645 }, { "epoch": 0.029863794907744542, "grad_norm": 16.250140036134383, "learning_rate": 2.9863223415184416e-05, "loss": 2.1722, "mean_token_accuracy": 0.4674531161785126, "step": 29650 }, { "epoch": 0.029868830960848716, "grad_norm": 21.7110570919871, "learning_rate": 2.9868259371915482e-05, "loss": 3.1896, "mean_token_accuracy": 0.3551724135875702, "step": 29655 }, { "epoch": 0.02987386701395289, "grad_norm": 16.9838557980717, "learning_rate": 2.9873295328646538e-05, "loss": 2.3552, "mean_token_accuracy": 0.41034482717514037, "step": 29660 }, { "epoch": 0.029878903067057063, "grad_norm": 16.700492292832386, "learning_rate": 2.9878331285377597e-05, "loss": 2.8717, "mean_token_accuracy": 0.36551723480224607, "step": 29665 }, { "epoch": 0.029883939120161233, "grad_norm": 17.83220247346024, "learning_rate": 2.9883367242108656e-05, "loss": 2.737, "mean_token_accuracy": 0.4103448212146759, "step": 29670 }, { "epoch": 0.029888975173265407, "grad_norm": 16.586435854450247, "learning_rate": 2.9888403198839715e-05, "loss": 2.5934, "mean_token_accuracy": 0.47586206793785096, "step": 29675 }, { "epoch": 0.02989401122636958, "grad_norm": 19.154691351933703, "learning_rate": 2.9893439155570778e-05, "loss": 2.497, "mean_token_accuracy": 0.4206896543502808, "step": 29680 }, { "epoch": 0.02989904727947375, "grad_norm": 14.061548529639017, "learning_rate": 2.9898475112301837e-05, "loss": 2.7822, "mean_token_accuracy": 0.3430127054452896, "step": 29685 }, { "epoch": 0.029904083332577925, "grad_norm": 18.64674808087456, "learning_rate": 2.9903511069032897e-05, "loss": 2.4677, "mean_token_accuracy": 0.4344827592372894, "step": 29690 }, { "epoch": 0.0299091193856821, "grad_norm": 15.245902590548736, "learning_rate": 2.9908547025763956e-05, "loss": 2.482, "mean_token_accuracy": 0.4517241418361664, "step": 29695 }, { "epoch": 0.029914155438786273, "grad_norm": 16.128631436923406, "learning_rate": 2.9913582982495015e-05, "loss": 2.3459, "mean_token_accuracy": 0.4586206912994385, "step": 29700 }, { "epoch": 0.029919191491890443, "grad_norm": 21.582082133515264, "learning_rate": 2.9918618939226074e-05, "loss": 2.4995, "mean_token_accuracy": 0.43793103098869324, "step": 29705 }, { "epoch": 0.029924227544994617, "grad_norm": 15.086930408636556, "learning_rate": 2.9923654895957137e-05, "loss": 2.3631, "mean_token_accuracy": 0.4517241418361664, "step": 29710 }, { "epoch": 0.02992926359809879, "grad_norm": 16.38705304501124, "learning_rate": 2.9928690852688196e-05, "loss": 2.5819, "mean_token_accuracy": 0.4103448152542114, "step": 29715 }, { "epoch": 0.02993429965120296, "grad_norm": 17.330242075186415, "learning_rate": 2.9933726809419255e-05, "loss": 2.4724, "mean_token_accuracy": 0.42413792610168455, "step": 29720 }, { "epoch": 0.029939335704307134, "grad_norm": 18.19344825965578, "learning_rate": 2.9938762766150315e-05, "loss": 2.3184, "mean_token_accuracy": 0.4551724135875702, "step": 29725 }, { "epoch": 0.029944371757411308, "grad_norm": 14.256496596262643, "learning_rate": 2.994379872288137e-05, "loss": 2.3052, "mean_token_accuracy": 0.46739262342453003, "step": 29730 }, { "epoch": 0.029949407810515482, "grad_norm": 19.86412220122882, "learning_rate": 2.9948834679612437e-05, "loss": 2.2721, "mean_token_accuracy": 0.3965517282485962, "step": 29735 }, { "epoch": 0.029954443863619652, "grad_norm": 15.73026555272646, "learning_rate": 2.9953870636343496e-05, "loss": 2.1618, "mean_token_accuracy": 0.4413793087005615, "step": 29740 }, { "epoch": 0.029959479916723826, "grad_norm": 18.771153913685314, "learning_rate": 2.9958906593074555e-05, "loss": 2.6681, "mean_token_accuracy": 0.4034482717514038, "step": 29745 }, { "epoch": 0.029964515969828, "grad_norm": 14.665095972754285, "learning_rate": 2.996394254980561e-05, "loss": 2.4392, "mean_token_accuracy": 0.4194797396659851, "step": 29750 }, { "epoch": 0.02996955202293217, "grad_norm": 18.62029184840673, "learning_rate": 2.996897850653667e-05, "loss": 2.6521, "mean_token_accuracy": 0.4034482717514038, "step": 29755 }, { "epoch": 0.029974588076036344, "grad_norm": 15.597494328345055, "learning_rate": 2.9974014463267736e-05, "loss": 2.4606, "mean_token_accuracy": 0.3620689630508423, "step": 29760 }, { "epoch": 0.029979624129140518, "grad_norm": 17.466229073595606, "learning_rate": 2.9979050419998795e-05, "loss": 2.5123, "mean_token_accuracy": 0.4206896543502808, "step": 29765 }, { "epoch": 0.02998466018224469, "grad_norm": 15.80912650425274, "learning_rate": 2.998408637672985e-05, "loss": 2.4978, "mean_token_accuracy": 0.42758620977401735, "step": 29770 }, { "epoch": 0.02998969623534886, "grad_norm": 17.016601170452937, "learning_rate": 2.998912233346091e-05, "loss": 2.2564, "mean_token_accuracy": 0.44482759237289426, "step": 29775 }, { "epoch": 0.029994732288453035, "grad_norm": 15.918434279894857, "learning_rate": 2.999415829019197e-05, "loss": 2.5996, "mean_token_accuracy": 0.36896551251411436, "step": 29780 }, { "epoch": 0.02999976834155721, "grad_norm": 17.94975754946306, "learning_rate": 2.999919424692303e-05, "loss": 2.6092, "mean_token_accuracy": 0.37586206793785093, "step": 29785 }, { "epoch": 0.03000480439466138, "grad_norm": 19.641000207729128, "learning_rate": 3.0004230203654092e-05, "loss": 2.5008, "mean_token_accuracy": 0.3915305435657501, "step": 29790 }, { "epoch": 0.030009840447765553, "grad_norm": 14.956762460606019, "learning_rate": 3.000926616038515e-05, "loss": 2.5518, "mean_token_accuracy": 0.40344828069210054, "step": 29795 }, { "epoch": 0.030014876500869727, "grad_norm": 19.319891287401614, "learning_rate": 3.001430211711621e-05, "loss": 2.5014, "mean_token_accuracy": 0.4068965554237366, "step": 29800 }, { "epoch": 0.0300199125539739, "grad_norm": 17.179243454324823, "learning_rate": 3.001933807384727e-05, "loss": 2.3106, "mean_token_accuracy": 0.40852994918823243, "step": 29805 }, { "epoch": 0.03002494860707807, "grad_norm": 18.142029027307785, "learning_rate": 3.002437403057833e-05, "loss": 2.9119, "mean_token_accuracy": 0.3103448271751404, "step": 29810 }, { "epoch": 0.030029984660182245, "grad_norm": 17.313526864754152, "learning_rate": 3.002940998730939e-05, "loss": 2.8225, "mean_token_accuracy": 0.38965516686439516, "step": 29815 }, { "epoch": 0.03003502071328642, "grad_norm": 20.309705975662062, "learning_rate": 3.003444594404045e-05, "loss": 2.5784, "mean_token_accuracy": 0.40000000298023225, "step": 29820 }, { "epoch": 0.03004005676639059, "grad_norm": 17.34009037719211, "learning_rate": 3.003948190077151e-05, "loss": 2.3544, "mean_token_accuracy": 0.4517241358757019, "step": 29825 }, { "epoch": 0.030045092819494763, "grad_norm": 16.174106377404456, "learning_rate": 3.004451785750257e-05, "loss": 2.3002, "mean_token_accuracy": 0.4344827592372894, "step": 29830 }, { "epoch": 0.030050128872598936, "grad_norm": 15.652330903316003, "learning_rate": 3.004955381423363e-05, "loss": 2.7103, "mean_token_accuracy": 0.37586206793785093, "step": 29835 }, { "epoch": 0.03005516492570311, "grad_norm": 16.259342048104294, "learning_rate": 3.005458977096469e-05, "loss": 2.1563, "mean_token_accuracy": 0.4359346628189087, "step": 29840 }, { "epoch": 0.03006020097880728, "grad_norm": 11.828177056235981, "learning_rate": 3.005962572769575e-05, "loss": 1.9828, "mean_token_accuracy": 0.517241370677948, "step": 29845 }, { "epoch": 0.030065237031911454, "grad_norm": 14.157524797239157, "learning_rate": 3.006466168442681e-05, "loss": 2.2234, "mean_token_accuracy": 0.441379314661026, "step": 29850 }, { "epoch": 0.030070273085015628, "grad_norm": 15.347764881080051, "learning_rate": 3.006969764115787e-05, "loss": 2.4233, "mean_token_accuracy": 0.4068965494632721, "step": 29855 }, { "epoch": 0.0300753091381198, "grad_norm": 15.062032454511112, "learning_rate": 3.0074733597888928e-05, "loss": 2.5, "mean_token_accuracy": 0.4284482717514038, "step": 29860 }, { "epoch": 0.030080345191223972, "grad_norm": 17.840264001558253, "learning_rate": 3.007976955461999e-05, "loss": 2.6764, "mean_token_accuracy": 0.3931034505367279, "step": 29865 }, { "epoch": 0.030085381244328146, "grad_norm": 16.351636256178526, "learning_rate": 3.008480551135105e-05, "loss": 2.5083, "mean_token_accuracy": 0.4034482777118683, "step": 29870 }, { "epoch": 0.03009041729743232, "grad_norm": 15.206703634685407, "learning_rate": 3.008984146808211e-05, "loss": 2.0618, "mean_token_accuracy": 0.5034482777118683, "step": 29875 }, { "epoch": 0.03009545335053649, "grad_norm": 18.248234717958244, "learning_rate": 3.009487742481317e-05, "loss": 2.2608, "mean_token_accuracy": 0.41834974884986875, "step": 29880 }, { "epoch": 0.030100489403640664, "grad_norm": 17.786917365373313, "learning_rate": 3.0099913381544224e-05, "loss": 2.4322, "mean_token_accuracy": 0.38620689511299133, "step": 29885 }, { "epoch": 0.030105525456744837, "grad_norm": 18.0308470227239, "learning_rate": 3.0104949338275284e-05, "loss": 2.8619, "mean_token_accuracy": 0.37241379022598264, "step": 29890 }, { "epoch": 0.030110561509849008, "grad_norm": 17.387733731842374, "learning_rate": 3.010998529500635e-05, "loss": 2.5135, "mean_token_accuracy": 0.4159104645252228, "step": 29895 }, { "epoch": 0.03011559756295318, "grad_norm": 16.76268381363948, "learning_rate": 3.011502125173741e-05, "loss": 2.2325, "mean_token_accuracy": 0.4551724076271057, "step": 29900 }, { "epoch": 0.030120633616057355, "grad_norm": 16.49212219374332, "learning_rate": 3.0120057208468465e-05, "loss": 1.9437, "mean_token_accuracy": 0.493103438615799, "step": 29905 }, { "epoch": 0.03012566966916153, "grad_norm": 16.963242910904718, "learning_rate": 3.0125093165199524e-05, "loss": 2.4113, "mean_token_accuracy": 0.43448275327682495, "step": 29910 }, { "epoch": 0.0301307057222657, "grad_norm": 16.3011736103217, "learning_rate": 3.0130129121930583e-05, "loss": 2.4062, "mean_token_accuracy": 0.4620689690113068, "step": 29915 }, { "epoch": 0.030135741775369873, "grad_norm": 15.521354774392584, "learning_rate": 3.013516507866165e-05, "loss": 2.919, "mean_token_accuracy": 0.38275861740112305, "step": 29920 }, { "epoch": 0.030140777828474047, "grad_norm": 14.40645687979916, "learning_rate": 3.0140201035392705e-05, "loss": 2.5124, "mean_token_accuracy": 0.42758620977401735, "step": 29925 }, { "epoch": 0.030145813881578217, "grad_norm": 17.518773214470855, "learning_rate": 3.0145236992123764e-05, "loss": 2.563, "mean_token_accuracy": 0.4344827651977539, "step": 29930 }, { "epoch": 0.03015084993468239, "grad_norm": 20.584196523026073, "learning_rate": 3.0150272948854824e-05, "loss": 2.6491, "mean_token_accuracy": 0.4157289803028107, "step": 29935 }, { "epoch": 0.030155885987786565, "grad_norm": 27.52294978301983, "learning_rate": 3.0155308905585883e-05, "loss": 2.7911, "mean_token_accuracy": 0.37931033968925476, "step": 29940 }, { "epoch": 0.03016092204089074, "grad_norm": 17.442600571078536, "learning_rate": 3.0160344862316946e-05, "loss": 2.8233, "mean_token_accuracy": 0.35862069129943847, "step": 29945 }, { "epoch": 0.03016595809399491, "grad_norm": 14.196915889916598, "learning_rate": 3.0165380819048005e-05, "loss": 2.4175, "mean_token_accuracy": 0.3931034505367279, "step": 29950 }, { "epoch": 0.030170994147099083, "grad_norm": 14.369678741541954, "learning_rate": 3.0170416775779064e-05, "loss": 2.7933, "mean_token_accuracy": 0.4000000059604645, "step": 29955 }, { "epoch": 0.030176030200203256, "grad_norm": 16.642794135234677, "learning_rate": 3.0175452732510123e-05, "loss": 2.5102, "mean_token_accuracy": 0.39655172228813174, "step": 29960 }, { "epoch": 0.030181066253307427, "grad_norm": 14.283556090151132, "learning_rate": 3.0180488689241183e-05, "loss": 2.5282, "mean_token_accuracy": 0.43103447556495667, "step": 29965 }, { "epoch": 0.0301861023064116, "grad_norm": 15.677405826508265, "learning_rate": 3.0185524645972242e-05, "loss": 2.3808, "mean_token_accuracy": 0.42068964838981626, "step": 29970 }, { "epoch": 0.030191138359515774, "grad_norm": 15.490275639668234, "learning_rate": 3.0190560602703304e-05, "loss": 2.5581, "mean_token_accuracy": 0.3827586114406586, "step": 29975 }, { "epoch": 0.030196174412619948, "grad_norm": 16.035419940742443, "learning_rate": 3.0195596559434364e-05, "loss": 2.5398, "mean_token_accuracy": 0.3965517163276672, "step": 29980 }, { "epoch": 0.030201210465724118, "grad_norm": 19.11193119487132, "learning_rate": 3.0200632516165423e-05, "loss": 2.9306, "mean_token_accuracy": 0.3241379201412201, "step": 29985 }, { "epoch": 0.030206246518828292, "grad_norm": 15.232559623867571, "learning_rate": 3.0205668472896482e-05, "loss": 2.4816, "mean_token_accuracy": 0.4000000059604645, "step": 29990 }, { "epoch": 0.030211282571932466, "grad_norm": 16.198606256037586, "learning_rate": 3.021070442962754e-05, "loss": 2.4395, "mean_token_accuracy": 0.4310344815254211, "step": 29995 }, { "epoch": 0.030216318625036636, "grad_norm": 13.775571798437669, "learning_rate": 3.0215740386358604e-05, "loss": 2.6011, "mean_token_accuracy": 0.4034482777118683, "step": 30000 }, { "epoch": 0.03022135467814081, "grad_norm": 15.47556768242462, "learning_rate": 3.0220776343089663e-05, "loss": 2.3931, "mean_token_accuracy": 0.4017543911933899, "step": 30005 }, { "epoch": 0.030226390731244984, "grad_norm": 15.610516771316016, "learning_rate": 3.0225812299820723e-05, "loss": 2.4527, "mean_token_accuracy": 0.4344827592372894, "step": 30010 }, { "epoch": 0.030231426784349157, "grad_norm": 16.534416231849484, "learning_rate": 3.0230848256551782e-05, "loss": 2.6057, "mean_token_accuracy": 0.36896551251411436, "step": 30015 }, { "epoch": 0.030236462837453328, "grad_norm": 17.74059914239265, "learning_rate": 3.0235884213282838e-05, "loss": 2.5507, "mean_token_accuracy": 0.41034482717514037, "step": 30020 }, { "epoch": 0.0302414988905575, "grad_norm": 14.169490247079645, "learning_rate": 3.0240920170013904e-05, "loss": 2.5737, "mean_token_accuracy": 0.42068964838981626, "step": 30025 }, { "epoch": 0.030246534943661675, "grad_norm": 19.59932686888348, "learning_rate": 3.0245956126744963e-05, "loss": 2.8079, "mean_token_accuracy": 0.37931033968925476, "step": 30030 }, { "epoch": 0.030251570996765845, "grad_norm": 18.494276347302904, "learning_rate": 3.0250992083476022e-05, "loss": 2.6121, "mean_token_accuracy": 0.36206896901130675, "step": 30035 }, { "epoch": 0.03025660704987002, "grad_norm": 13.708368034542126, "learning_rate": 3.0256028040207078e-05, "loss": 2.6655, "mean_token_accuracy": 0.41724138259887694, "step": 30040 }, { "epoch": 0.030261643102974193, "grad_norm": 15.560306497404676, "learning_rate": 3.0261063996938137e-05, "loss": 2.393, "mean_token_accuracy": 0.42758620381355283, "step": 30045 }, { "epoch": 0.030266679156078367, "grad_norm": 19.125402155374278, "learning_rate": 3.0266099953669197e-05, "loss": 2.5258, "mean_token_accuracy": 0.42068966031074523, "step": 30050 }, { "epoch": 0.030271715209182537, "grad_norm": 17.04897351989556, "learning_rate": 3.0271135910400263e-05, "loss": 2.3408, "mean_token_accuracy": 0.4586206912994385, "step": 30055 }, { "epoch": 0.03027675126228671, "grad_norm": 16.310581901041303, "learning_rate": 3.027617186713132e-05, "loss": 2.7625, "mean_token_accuracy": 0.34482758939266206, "step": 30060 }, { "epoch": 0.030281787315390885, "grad_norm": 17.67124169396695, "learning_rate": 3.0281207823862378e-05, "loss": 2.8017, "mean_token_accuracy": 0.3517241358757019, "step": 30065 }, { "epoch": 0.030286823368495055, "grad_norm": 14.35494483560042, "learning_rate": 3.0286243780593437e-05, "loss": 1.952, "mean_token_accuracy": 0.5288177132606506, "step": 30070 }, { "epoch": 0.03029185942159923, "grad_norm": 13.329866981371957, "learning_rate": 3.0291279737324496e-05, "loss": 2.0153, "mean_token_accuracy": 0.5803387761116028, "step": 30075 }, { "epoch": 0.030296895474703402, "grad_norm": 16.091686005906237, "learning_rate": 3.029631569405556e-05, "loss": 2.758, "mean_token_accuracy": 0.39310345649719236, "step": 30080 }, { "epoch": 0.030301931527807576, "grad_norm": 16.04852341788052, "learning_rate": 3.0301351650786618e-05, "loss": 2.4701, "mean_token_accuracy": 0.4034482777118683, "step": 30085 }, { "epoch": 0.030306967580911746, "grad_norm": 24.079155564758615, "learning_rate": 3.0306387607517677e-05, "loss": 2.6988, "mean_token_accuracy": 0.35862069129943847, "step": 30090 }, { "epoch": 0.03031200363401592, "grad_norm": 15.189021805844565, "learning_rate": 3.0311423564248737e-05, "loss": 2.9328, "mean_token_accuracy": 0.3517241358757019, "step": 30095 }, { "epoch": 0.030317039687120094, "grad_norm": 17.214922066922952, "learning_rate": 3.0316459520979796e-05, "loss": 2.1698, "mean_token_accuracy": 0.4620689630508423, "step": 30100 }, { "epoch": 0.030322075740224264, "grad_norm": 13.732801799539205, "learning_rate": 3.032149547771086e-05, "loss": 2.3977, "mean_token_accuracy": 0.4068965554237366, "step": 30105 }, { "epoch": 0.030327111793328438, "grad_norm": 26.395799560108106, "learning_rate": 3.0326531434441918e-05, "loss": 2.4416, "mean_token_accuracy": 0.4448275864124298, "step": 30110 }, { "epoch": 0.030332147846432612, "grad_norm": 18.3000158028024, "learning_rate": 3.0331567391172977e-05, "loss": 2.6657, "mean_token_accuracy": 0.358620685338974, "step": 30115 }, { "epoch": 0.030337183899536786, "grad_norm": 19.960077024058872, "learning_rate": 3.0336603347904036e-05, "loss": 2.9329, "mean_token_accuracy": 0.36551724672317504, "step": 30120 }, { "epoch": 0.030342219952640956, "grad_norm": 15.363901624499073, "learning_rate": 3.0341639304635096e-05, "loss": 2.2601, "mean_token_accuracy": 0.43793103098869324, "step": 30125 }, { "epoch": 0.03034725600574513, "grad_norm": 38.208009406439, "learning_rate": 3.034667526136615e-05, "loss": 2.8535, "mean_token_accuracy": 0.38620689511299133, "step": 30130 }, { "epoch": 0.030352292058849303, "grad_norm": 17.565701736988956, "learning_rate": 3.0351711218097217e-05, "loss": 2.7111, "mean_token_accuracy": 0.4068965554237366, "step": 30135 }, { "epoch": 0.030357328111953474, "grad_norm": 15.456861743132727, "learning_rate": 3.0356747174828277e-05, "loss": 2.438, "mean_token_accuracy": 0.4551724135875702, "step": 30140 }, { "epoch": 0.030362364165057647, "grad_norm": 14.078903006187124, "learning_rate": 3.0361783131559336e-05, "loss": 2.3945, "mean_token_accuracy": 0.43448275327682495, "step": 30145 }, { "epoch": 0.03036740021816182, "grad_norm": 15.9738717494348, "learning_rate": 3.0366819088290395e-05, "loss": 2.6611, "mean_token_accuracy": 0.3896551728248596, "step": 30150 }, { "epoch": 0.030372436271265995, "grad_norm": 18.7889276627096, "learning_rate": 3.037185504502145e-05, "loss": 2.6185, "mean_token_accuracy": 0.3931034505367279, "step": 30155 }, { "epoch": 0.030377472324370165, "grad_norm": 16.74827760524287, "learning_rate": 3.0376891001752517e-05, "loss": 2.6533, "mean_token_accuracy": 0.3793103456497192, "step": 30160 }, { "epoch": 0.03038250837747434, "grad_norm": 15.512556359765467, "learning_rate": 3.0381926958483576e-05, "loss": 2.3699, "mean_token_accuracy": 0.41379311084747317, "step": 30165 }, { "epoch": 0.030387544430578513, "grad_norm": 15.730724119177593, "learning_rate": 3.0386962915214636e-05, "loss": 2.4307, "mean_token_accuracy": 0.4344827592372894, "step": 30170 }, { "epoch": 0.030392580483682683, "grad_norm": 16.212270092864646, "learning_rate": 3.039199887194569e-05, "loss": 2.3668, "mean_token_accuracy": 0.4206896543502808, "step": 30175 }, { "epoch": 0.030397616536786857, "grad_norm": 17.519953752298726, "learning_rate": 3.039703482867675e-05, "loss": 2.3741, "mean_token_accuracy": 0.41379311084747317, "step": 30180 }, { "epoch": 0.03040265258989103, "grad_norm": 29.649742037479474, "learning_rate": 3.0402070785407817e-05, "loss": 2.609, "mean_token_accuracy": 0.42068966031074523, "step": 30185 }, { "epoch": 0.030407688642995204, "grad_norm": 19.58096962696459, "learning_rate": 3.0407106742138876e-05, "loss": 2.7538, "mean_token_accuracy": 0.3620689660310745, "step": 30190 }, { "epoch": 0.030412724696099375, "grad_norm": 13.510709216510898, "learning_rate": 3.0412142698869932e-05, "loss": 2.1908, "mean_token_accuracy": 0.4676346004009247, "step": 30195 }, { "epoch": 0.03041776074920355, "grad_norm": 16.04623414307215, "learning_rate": 3.041717865560099e-05, "loss": 2.7714, "mean_token_accuracy": 0.36206896901130675, "step": 30200 }, { "epoch": 0.030422796802307722, "grad_norm": 22.567171505752484, "learning_rate": 3.042221461233205e-05, "loss": 2.3081, "mean_token_accuracy": 0.4401088893413544, "step": 30205 }, { "epoch": 0.030427832855411893, "grad_norm": 15.953017020592727, "learning_rate": 3.042725056906311e-05, "loss": 2.594, "mean_token_accuracy": 0.3724137842655182, "step": 30210 }, { "epoch": 0.030432868908516066, "grad_norm": 14.50691462715041, "learning_rate": 3.0432286525794172e-05, "loss": 2.4375, "mean_token_accuracy": 0.4, "step": 30215 }, { "epoch": 0.03043790496162024, "grad_norm": 22.839055709872014, "learning_rate": 3.043732248252523e-05, "loss": 2.4893, "mean_token_accuracy": 0.4379310369491577, "step": 30220 }, { "epoch": 0.03044294101472441, "grad_norm": 17.269660513922705, "learning_rate": 3.044235843925629e-05, "loss": 2.4767, "mean_token_accuracy": 0.37241379022598264, "step": 30225 }, { "epoch": 0.030447977067828584, "grad_norm": 15.95771989079032, "learning_rate": 3.044739439598735e-05, "loss": 2.5376, "mean_token_accuracy": 0.41034482717514037, "step": 30230 }, { "epoch": 0.030453013120932758, "grad_norm": 15.243007704133149, "learning_rate": 3.045243035271841e-05, "loss": 2.4553, "mean_token_accuracy": 0.4172413766384125, "step": 30235 }, { "epoch": 0.03045804917403693, "grad_norm": 17.981460307537475, "learning_rate": 3.0457466309449472e-05, "loss": 2.5478, "mean_token_accuracy": 0.3689655065536499, "step": 30240 }, { "epoch": 0.030463085227141102, "grad_norm": 14.888134765541642, "learning_rate": 3.046250226618053e-05, "loss": 2.3795, "mean_token_accuracy": 0.42758620977401735, "step": 30245 }, { "epoch": 0.030468121280245276, "grad_norm": 12.79576078181957, "learning_rate": 3.046753822291159e-05, "loss": 2.654, "mean_token_accuracy": 0.38275861740112305, "step": 30250 }, { "epoch": 0.03047315733334945, "grad_norm": 15.769688200831252, "learning_rate": 3.047257417964265e-05, "loss": 2.1658, "mean_token_accuracy": 0.4586206912994385, "step": 30255 }, { "epoch": 0.03047819338645362, "grad_norm": 19.879233265645915, "learning_rate": 3.047761013637371e-05, "loss": 2.4157, "mean_token_accuracy": 0.47761645913124084, "step": 30260 }, { "epoch": 0.030483229439557794, "grad_norm": 19.374864888797102, "learning_rate": 3.048264609310477e-05, "loss": 2.3592, "mean_token_accuracy": 0.4137930989265442, "step": 30265 }, { "epoch": 0.030488265492661967, "grad_norm": 18.35155632261849, "learning_rate": 3.048768204983583e-05, "loss": 2.5171, "mean_token_accuracy": 0.37586206793785093, "step": 30270 }, { "epoch": 0.03049330154576614, "grad_norm": 13.661640151285418, "learning_rate": 3.049271800656689e-05, "loss": 2.59, "mean_token_accuracy": 0.4172413766384125, "step": 30275 }, { "epoch": 0.03049833759887031, "grad_norm": 16.919438216718486, "learning_rate": 3.049775396329795e-05, "loss": 2.6538, "mean_token_accuracy": 0.35172413289546967, "step": 30280 }, { "epoch": 0.030503373651974485, "grad_norm": 17.411347244492763, "learning_rate": 3.0502789920029005e-05, "loss": 2.4037, "mean_token_accuracy": 0.39655172228813174, "step": 30285 }, { "epoch": 0.03050840970507866, "grad_norm": 15.887142689578116, "learning_rate": 3.050782587676007e-05, "loss": 2.8974, "mean_token_accuracy": 0.3551724135875702, "step": 30290 }, { "epoch": 0.03051344575818283, "grad_norm": 14.175207824430862, "learning_rate": 3.051286183349113e-05, "loss": 2.5189, "mean_token_accuracy": 0.43103448748588563, "step": 30295 }, { "epoch": 0.030518481811287003, "grad_norm": 16.714323267250684, "learning_rate": 3.051789779022219e-05, "loss": 2.8999, "mean_token_accuracy": 0.35862069129943847, "step": 30300 }, { "epoch": 0.030523517864391177, "grad_norm": 15.660011179155827, "learning_rate": 3.052293374695325e-05, "loss": 2.509, "mean_token_accuracy": 0.3862068891525269, "step": 30305 }, { "epoch": 0.03052855391749535, "grad_norm": 15.026243148831282, "learning_rate": 3.052796970368431e-05, "loss": 2.6013, "mean_token_accuracy": 0.38965516686439516, "step": 30310 }, { "epoch": 0.03053358997059952, "grad_norm": 15.477787044518426, "learning_rate": 3.053300566041537e-05, "loss": 2.7042, "mean_token_accuracy": 0.3999999940395355, "step": 30315 }, { "epoch": 0.030538626023703695, "grad_norm": 12.81301766343412, "learning_rate": 3.053804161714643e-05, "loss": 2.2586, "mean_token_accuracy": 0.4413793087005615, "step": 30320 }, { "epoch": 0.03054366207680787, "grad_norm": 16.990597685467748, "learning_rate": 3.0543077573877486e-05, "loss": 2.6749, "mean_token_accuracy": 0.38620689511299133, "step": 30325 }, { "epoch": 0.03054869812991204, "grad_norm": 21.61798670004041, "learning_rate": 3.0548113530608545e-05, "loss": 2.4997, "mean_token_accuracy": 0.42758620381355283, "step": 30330 }, { "epoch": 0.030553734183016212, "grad_norm": 14.460003232537366, "learning_rate": 3.0553149487339605e-05, "loss": 2.4109, "mean_token_accuracy": 0.3999999940395355, "step": 30335 }, { "epoch": 0.030558770236120386, "grad_norm": 16.160264494543526, "learning_rate": 3.0558185444070664e-05, "loss": 2.6603, "mean_token_accuracy": 0.4, "step": 30340 }, { "epoch": 0.03056380628922456, "grad_norm": 13.953997206180517, "learning_rate": 3.056322140080173e-05, "loss": 2.8845, "mean_token_accuracy": 0.3379310369491577, "step": 30345 }, { "epoch": 0.03056884234232873, "grad_norm": 16.906129798351298, "learning_rate": 3.056825735753279e-05, "loss": 2.5456, "mean_token_accuracy": 0.4068965554237366, "step": 30350 }, { "epoch": 0.030573878395432904, "grad_norm": 22.367706679008347, "learning_rate": 3.057329331426385e-05, "loss": 3.0359, "mean_token_accuracy": 0.34137930870056155, "step": 30355 }, { "epoch": 0.030578914448537078, "grad_norm": 16.374189070091447, "learning_rate": 3.057832927099491e-05, "loss": 2.2252, "mean_token_accuracy": 0.44482759237289426, "step": 30360 }, { "epoch": 0.030583950501641248, "grad_norm": 16.1528402456324, "learning_rate": 3.058336522772596e-05, "loss": 3.1152, "mean_token_accuracy": 0.38275861740112305, "step": 30365 }, { "epoch": 0.030588986554745422, "grad_norm": 18.890461062556426, "learning_rate": 3.0588401184457026e-05, "loss": 2.5253, "mean_token_accuracy": 0.4034482777118683, "step": 30370 }, { "epoch": 0.030594022607849596, "grad_norm": 14.805205464983276, "learning_rate": 3.0593437141188085e-05, "loss": 2.4996, "mean_token_accuracy": 0.4517241299152374, "step": 30375 }, { "epoch": 0.03059905866095377, "grad_norm": 22.203441800411206, "learning_rate": 3.0598473097919145e-05, "loss": 2.7833, "mean_token_accuracy": 0.3517241388559341, "step": 30380 }, { "epoch": 0.03060409471405794, "grad_norm": 17.692845895326847, "learning_rate": 3.0603509054650204e-05, "loss": 2.5453, "mean_token_accuracy": 0.3551724135875702, "step": 30385 }, { "epoch": 0.030609130767162113, "grad_norm": 14.856504019239393, "learning_rate": 3.060854501138126e-05, "loss": 2.5186, "mean_token_accuracy": 0.41379310488700866, "step": 30390 }, { "epoch": 0.030614166820266287, "grad_norm": 16.471938316044767, "learning_rate": 3.061358096811232e-05, "loss": 2.4007, "mean_token_accuracy": 0.42413792610168455, "step": 30395 }, { "epoch": 0.030619202873370457, "grad_norm": 19.997662772004308, "learning_rate": 3.061861692484339e-05, "loss": 2.6454, "mean_token_accuracy": 0.39655172228813174, "step": 30400 }, { "epoch": 0.03062423892647463, "grad_norm": 15.758670815001214, "learning_rate": 3.062365288157444e-05, "loss": 2.6411, "mean_token_accuracy": 0.36013308763504026, "step": 30405 }, { "epoch": 0.030629274979578805, "grad_norm": 16.511323955616472, "learning_rate": 3.06286888383055e-05, "loss": 2.5415, "mean_token_accuracy": 0.3793103456497192, "step": 30410 }, { "epoch": 0.03063431103268298, "grad_norm": 20.626581644952743, "learning_rate": 3.063372479503656e-05, "loss": 2.6776, "mean_token_accuracy": 0.36896551847457887, "step": 30415 }, { "epoch": 0.03063934708578715, "grad_norm": 12.598339861279566, "learning_rate": 3.063876075176762e-05, "loss": 2.2539, "mean_token_accuracy": 0.4448275864124298, "step": 30420 }, { "epoch": 0.030644383138891323, "grad_norm": 14.918041179807698, "learning_rate": 3.0643796708498685e-05, "loss": 2.2334, "mean_token_accuracy": 0.44325469732284545, "step": 30425 }, { "epoch": 0.030649419191995497, "grad_norm": 17.671264615316996, "learning_rate": 3.0648832665229744e-05, "loss": 2.5784, "mean_token_accuracy": 0.42758620381355283, "step": 30430 }, { "epoch": 0.030654455245099667, "grad_norm": 18.986090409974594, "learning_rate": 3.06538686219608e-05, "loss": 2.6954, "mean_token_accuracy": 0.37586206793785093, "step": 30435 }, { "epoch": 0.03065949129820384, "grad_norm": 17.397741861583288, "learning_rate": 3.065890457869186e-05, "loss": 2.7507, "mean_token_accuracy": 0.34137930274009703, "step": 30440 }, { "epoch": 0.030664527351308014, "grad_norm": 24.779265675093676, "learning_rate": 3.066394053542292e-05, "loss": 2.7733, "mean_token_accuracy": 0.36896551251411436, "step": 30445 }, { "epoch": 0.030669563404412188, "grad_norm": 16.746640332292042, "learning_rate": 3.066897649215398e-05, "loss": 2.6052, "mean_token_accuracy": 0.42758620381355283, "step": 30450 }, { "epoch": 0.03067459945751636, "grad_norm": 15.739215833573613, "learning_rate": 3.067401244888504e-05, "loss": 2.5325, "mean_token_accuracy": 0.38965516686439516, "step": 30455 }, { "epoch": 0.030679635510620532, "grad_norm": 20.508931044024585, "learning_rate": 3.06790484056161e-05, "loss": 2.3893, "mean_token_accuracy": 0.4275861978530884, "step": 30460 }, { "epoch": 0.030684671563724706, "grad_norm": 15.626797405280513, "learning_rate": 3.068408436234716e-05, "loss": 2.35, "mean_token_accuracy": 0.4379310369491577, "step": 30465 }, { "epoch": 0.030689707616828876, "grad_norm": 14.080461642415779, "learning_rate": 3.068912031907822e-05, "loss": 2.3851, "mean_token_accuracy": 0.44016939401626587, "step": 30470 }, { "epoch": 0.03069474366993305, "grad_norm": 16.52376324748906, "learning_rate": 3.069415627580928e-05, "loss": 2.4846, "mean_token_accuracy": 0.46721113920211793, "step": 30475 }, { "epoch": 0.030699779723037224, "grad_norm": 12.847063002731312, "learning_rate": 3.069919223254034e-05, "loss": 2.4385, "mean_token_accuracy": 0.4310344815254211, "step": 30480 }, { "epoch": 0.030704815776141398, "grad_norm": 16.868078186118225, "learning_rate": 3.07042281892714e-05, "loss": 2.5037, "mean_token_accuracy": 0.3896551728248596, "step": 30485 }, { "epoch": 0.030709851829245568, "grad_norm": 19.802893405374796, "learning_rate": 3.070926414600246e-05, "loss": 2.2445, "mean_token_accuracy": 0.43448275327682495, "step": 30490 }, { "epoch": 0.03071488788234974, "grad_norm": 19.087306139130025, "learning_rate": 3.0714300102733514e-05, "loss": 3.2061, "mean_token_accuracy": 0.3551724135875702, "step": 30495 }, { "epoch": 0.030719923935453915, "grad_norm": 18.61903663997251, "learning_rate": 3.0719336059464573e-05, "loss": 2.5648, "mean_token_accuracy": 0.4206896543502808, "step": 30500 }, { "epoch": 0.030724959988558086, "grad_norm": 16.494561291905015, "learning_rate": 3.072437201619564e-05, "loss": 2.445, "mean_token_accuracy": 0.4310344934463501, "step": 30505 }, { "epoch": 0.03072999604166226, "grad_norm": 22.52249822536078, "learning_rate": 3.07294079729267e-05, "loss": 2.8711, "mean_token_accuracy": 0.4034482777118683, "step": 30510 }, { "epoch": 0.030735032094766433, "grad_norm": 26.119276238155425, "learning_rate": 3.073444392965776e-05, "loss": 2.5825, "mean_token_accuracy": 0.4068965494632721, "step": 30515 }, { "epoch": 0.030740068147870607, "grad_norm": 20.99555841081645, "learning_rate": 3.073947988638882e-05, "loss": 2.5414, "mean_token_accuracy": 0.41379310488700866, "step": 30520 }, { "epoch": 0.030745104200974777, "grad_norm": 18.099366968560343, "learning_rate": 3.0744515843119876e-05, "loss": 2.4635, "mean_token_accuracy": 0.42413792610168455, "step": 30525 }, { "epoch": 0.03075014025407895, "grad_norm": 16.669622341003723, "learning_rate": 3.074955179985094e-05, "loss": 2.3943, "mean_token_accuracy": 0.4655172526836395, "step": 30530 }, { "epoch": 0.030755176307183125, "grad_norm": 16.040129049030103, "learning_rate": 3.0754587756582e-05, "loss": 2.4163, "mean_token_accuracy": 0.45862069725990295, "step": 30535 }, { "epoch": 0.030760212360287295, "grad_norm": 19.749330170349932, "learning_rate": 3.0759623713313054e-05, "loss": 2.7484, "mean_token_accuracy": 0.36896551847457887, "step": 30540 }, { "epoch": 0.03076524841339147, "grad_norm": 17.715912903773642, "learning_rate": 3.0764659670044113e-05, "loss": 2.925, "mean_token_accuracy": 0.36551723480224607, "step": 30545 }, { "epoch": 0.030770284466495643, "grad_norm": 22.346242077426773, "learning_rate": 3.076969562677517e-05, "loss": 2.5009, "mean_token_accuracy": 0.40490018129348754, "step": 30550 }, { "epoch": 0.030775320519599816, "grad_norm": 14.61701843403556, "learning_rate": 3.077473158350623e-05, "loss": 2.2661, "mean_token_accuracy": 0.46896551847457885, "step": 30555 }, { "epoch": 0.030780356572703987, "grad_norm": 20.428436460178094, "learning_rate": 3.07797675402373e-05, "loss": 2.7157, "mean_token_accuracy": 0.3999999940395355, "step": 30560 }, { "epoch": 0.03078539262580816, "grad_norm": 13.330522254559284, "learning_rate": 3.078480349696836e-05, "loss": 2.601, "mean_token_accuracy": 0.4034482717514038, "step": 30565 }, { "epoch": 0.030790428678912334, "grad_norm": 15.67972487976431, "learning_rate": 3.0789839453699417e-05, "loss": 2.4561, "mean_token_accuracy": 0.4, "step": 30570 }, { "epoch": 0.030795464732016505, "grad_norm": 17.251556472966243, "learning_rate": 3.0794875410430476e-05, "loss": 2.8083, "mean_token_accuracy": 0.40689654350280763, "step": 30575 }, { "epoch": 0.03080050078512068, "grad_norm": 16.10684482953834, "learning_rate": 3.0799911367161535e-05, "loss": 2.4988, "mean_token_accuracy": 0.38620689511299133, "step": 30580 }, { "epoch": 0.030805536838224852, "grad_norm": 13.562900055103084, "learning_rate": 3.0804947323892594e-05, "loss": 2.3046, "mean_token_accuracy": 0.4517241358757019, "step": 30585 }, { "epoch": 0.030810572891329026, "grad_norm": 20.721104845626176, "learning_rate": 3.0809983280623654e-05, "loss": 2.7369, "mean_token_accuracy": 0.401935875415802, "step": 30590 }, { "epoch": 0.030815608944433196, "grad_norm": 17.996233499089797, "learning_rate": 3.081501923735471e-05, "loss": 2.9257, "mean_token_accuracy": 0.3703569293022156, "step": 30595 }, { "epoch": 0.03082064499753737, "grad_norm": 16.630923579277347, "learning_rate": 3.082005519408577e-05, "loss": 2.5564, "mean_token_accuracy": 0.4344827651977539, "step": 30600 }, { "epoch": 0.030825681050641544, "grad_norm": 19.03321234938062, "learning_rate": 3.082509115081683e-05, "loss": 2.6629, "mean_token_accuracy": 0.36551723480224607, "step": 30605 }, { "epoch": 0.030830717103745714, "grad_norm": 21.08716015102735, "learning_rate": 3.08301271075479e-05, "loss": 2.6313, "mean_token_accuracy": 0.37241379618644715, "step": 30610 }, { "epoch": 0.030835753156849888, "grad_norm": 15.28496689810781, "learning_rate": 3.0835163064278957e-05, "loss": 2.4446, "mean_token_accuracy": 0.42413793206214906, "step": 30615 }, { "epoch": 0.03084078920995406, "grad_norm": 15.761055604278027, "learning_rate": 3.0840199021010016e-05, "loss": 2.5301, "mean_token_accuracy": 0.4068965494632721, "step": 30620 }, { "epoch": 0.030845825263058235, "grad_norm": 15.616708975901105, "learning_rate": 3.0845234977741075e-05, "loss": 2.5107, "mean_token_accuracy": 0.4, "step": 30625 }, { "epoch": 0.030850861316162406, "grad_norm": 16.55868516000282, "learning_rate": 3.085027093447213e-05, "loss": 2.2814, "mean_token_accuracy": 0.4344827651977539, "step": 30630 }, { "epoch": 0.03085589736926658, "grad_norm": 14.52843696094866, "learning_rate": 3.085530689120319e-05, "loss": 2.3017, "mean_token_accuracy": 0.4482758641242981, "step": 30635 }, { "epoch": 0.030860933422370753, "grad_norm": 16.934633652256025, "learning_rate": 3.086034284793425e-05, "loss": 2.4367, "mean_token_accuracy": 0.44482759237289426, "step": 30640 }, { "epoch": 0.030865969475474923, "grad_norm": 22.466024002782913, "learning_rate": 3.086537880466531e-05, "loss": 2.6612, "mean_token_accuracy": 0.38965516686439516, "step": 30645 }, { "epoch": 0.030871005528579097, "grad_norm": 17.8474206771584, "learning_rate": 3.087041476139637e-05, "loss": 2.4694, "mean_token_accuracy": 0.4413793087005615, "step": 30650 }, { "epoch": 0.03087604158168327, "grad_norm": 15.716022966809552, "learning_rate": 3.087545071812743e-05, "loss": 2.6973, "mean_token_accuracy": 0.4068965554237366, "step": 30655 }, { "epoch": 0.030881077634787445, "grad_norm": 14.92452062710475, "learning_rate": 3.088048667485849e-05, "loss": 2.0822, "mean_token_accuracy": 0.506896561384201, "step": 30660 }, { "epoch": 0.030886113687891615, "grad_norm": 15.40357572251711, "learning_rate": 3.0885522631589556e-05, "loss": 2.3737, "mean_token_accuracy": 0.4068965554237366, "step": 30665 }, { "epoch": 0.03089114974099579, "grad_norm": 19.24497440397035, "learning_rate": 3.089055858832061e-05, "loss": 2.6726, "mean_token_accuracy": 0.3655172407627106, "step": 30670 }, { "epoch": 0.030896185794099962, "grad_norm": 12.67523779854411, "learning_rate": 3.089559454505167e-05, "loss": 2.5733, "mean_token_accuracy": 0.3965517282485962, "step": 30675 }, { "epoch": 0.030901221847204133, "grad_norm": 16.52186462888829, "learning_rate": 3.090063050178273e-05, "loss": 2.4245, "mean_token_accuracy": 0.3862069010734558, "step": 30680 }, { "epoch": 0.030906257900308307, "grad_norm": 15.10428144502331, "learning_rate": 3.0905666458513786e-05, "loss": 2.2427, "mean_token_accuracy": 0.42413793206214906, "step": 30685 }, { "epoch": 0.03091129395341248, "grad_norm": 14.810758725723025, "learning_rate": 3.091070241524485e-05, "loss": 2.1321, "mean_token_accuracy": 0.4689655303955078, "step": 30690 }, { "epoch": 0.030916330006516654, "grad_norm": 26.21326831613668, "learning_rate": 3.091573837197591e-05, "loss": 2.4742, "mean_token_accuracy": 0.4551724135875702, "step": 30695 }, { "epoch": 0.030921366059620824, "grad_norm": 15.819044115318434, "learning_rate": 3.092077432870697e-05, "loss": 2.4561, "mean_token_accuracy": 0.4034482717514038, "step": 30700 }, { "epoch": 0.030926402112724998, "grad_norm": 15.909486073072909, "learning_rate": 3.092581028543803e-05, "loss": 2.5721, "mean_token_accuracy": 0.37053840756416323, "step": 30705 }, { "epoch": 0.030931438165829172, "grad_norm": 15.93527555515284, "learning_rate": 3.093084624216909e-05, "loss": 2.9305, "mean_token_accuracy": 0.39122807383537295, "step": 30710 }, { "epoch": 0.030936474218933342, "grad_norm": 15.497636299812536, "learning_rate": 3.093588219890015e-05, "loss": 2.3775, "mean_token_accuracy": 0.4068965494632721, "step": 30715 }, { "epoch": 0.030941510272037516, "grad_norm": 23.425999012698764, "learning_rate": 3.094091815563121e-05, "loss": 2.5795, "mean_token_accuracy": 0.3862069010734558, "step": 30720 }, { "epoch": 0.03094654632514169, "grad_norm": 13.218972220664844, "learning_rate": 3.094595411236227e-05, "loss": 2.4076, "mean_token_accuracy": 0.4137930989265442, "step": 30725 }, { "epoch": 0.030951582378245864, "grad_norm": 15.712632097284391, "learning_rate": 3.0950990069093326e-05, "loss": 2.3937, "mean_token_accuracy": 0.4034482717514038, "step": 30730 }, { "epoch": 0.030956618431350034, "grad_norm": 14.998592414504799, "learning_rate": 3.0956026025824385e-05, "loss": 2.7747, "mean_token_accuracy": 0.4206896543502808, "step": 30735 }, { "epoch": 0.030961654484454208, "grad_norm": 15.694805499787284, "learning_rate": 3.0961061982555445e-05, "loss": 2.2225, "mean_token_accuracy": 0.45728976726531984, "step": 30740 }, { "epoch": 0.03096669053755838, "grad_norm": 21.20947911959607, "learning_rate": 3.096609793928651e-05, "loss": 2.7668, "mean_token_accuracy": 0.3620689630508423, "step": 30745 }, { "epoch": 0.03097172659066255, "grad_norm": 15.294432570524169, "learning_rate": 3.097113389601757e-05, "loss": 2.216, "mean_token_accuracy": 0.4689655125141144, "step": 30750 }, { "epoch": 0.030976762643766725, "grad_norm": 19.416203681667856, "learning_rate": 3.097616985274863e-05, "loss": 2.7584, "mean_token_accuracy": 0.37241379022598264, "step": 30755 }, { "epoch": 0.0309817986968709, "grad_norm": 21.548508300542327, "learning_rate": 3.098120580947969e-05, "loss": 2.6926, "mean_token_accuracy": 0.38275861740112305, "step": 30760 }, { "epoch": 0.030986834749975073, "grad_norm": 19.669080029901618, "learning_rate": 3.098624176621074e-05, "loss": 2.7421, "mean_token_accuracy": 0.358620685338974, "step": 30765 }, { "epoch": 0.030991870803079243, "grad_norm": 20.706613489710804, "learning_rate": 3.099127772294181e-05, "loss": 2.4286, "mean_token_accuracy": 0.44301270246505736, "step": 30770 }, { "epoch": 0.030996906856183417, "grad_norm": 20.85591533429189, "learning_rate": 3.0996313679672866e-05, "loss": 2.8085, "mean_token_accuracy": 0.37931033968925476, "step": 30775 }, { "epoch": 0.03100194290928759, "grad_norm": 21.240582132057945, "learning_rate": 3.1001349636403925e-05, "loss": 2.3658, "mean_token_accuracy": 0.4482758641242981, "step": 30780 }, { "epoch": 0.03100697896239176, "grad_norm": 15.994801369832867, "learning_rate": 3.1006385593134985e-05, "loss": 2.4593, "mean_token_accuracy": 0.4551724076271057, "step": 30785 }, { "epoch": 0.031012015015495935, "grad_norm": 18.667222463450234, "learning_rate": 3.1011421549866044e-05, "loss": 2.2907, "mean_token_accuracy": 0.4294615924358368, "step": 30790 }, { "epoch": 0.03101705106860011, "grad_norm": 16.324995406302243, "learning_rate": 3.101645750659711e-05, "loss": 2.6139, "mean_token_accuracy": 0.43284936547279357, "step": 30795 }, { "epoch": 0.031022087121704282, "grad_norm": 15.471705160922829, "learning_rate": 3.102149346332817e-05, "loss": 2.37, "mean_token_accuracy": 0.4517241299152374, "step": 30800 }, { "epoch": 0.031027123174808453, "grad_norm": 17.685871338778593, "learning_rate": 3.102652942005922e-05, "loss": 2.6891, "mean_token_accuracy": 0.4068965494632721, "step": 30805 }, { "epoch": 0.031032159227912626, "grad_norm": 15.22067551990615, "learning_rate": 3.103156537679028e-05, "loss": 2.3012, "mean_token_accuracy": 0.41724138259887694, "step": 30810 }, { "epoch": 0.0310371952810168, "grad_norm": 19.65213364514607, "learning_rate": 3.103660133352134e-05, "loss": 2.3155, "mean_token_accuracy": 0.4724137902259827, "step": 30815 }, { "epoch": 0.03104223133412097, "grad_norm": 18.252813093205624, "learning_rate": 3.10416372902524e-05, "loss": 2.4949, "mean_token_accuracy": 0.42758620977401735, "step": 30820 }, { "epoch": 0.031047267387225144, "grad_norm": 15.127077616590098, "learning_rate": 3.1046673246983466e-05, "loss": 2.4722, "mean_token_accuracy": 0.4, "step": 30825 }, { "epoch": 0.031052303440329318, "grad_norm": 15.269135106129957, "learning_rate": 3.1051709203714525e-05, "loss": 2.4892, "mean_token_accuracy": 0.3793103516101837, "step": 30830 }, { "epoch": 0.031057339493433492, "grad_norm": 16.326919423890555, "learning_rate": 3.1056745160445584e-05, "loss": 2.883, "mean_token_accuracy": 0.3876587986946106, "step": 30835 }, { "epoch": 0.031062375546537662, "grad_norm": 16.85191112486692, "learning_rate": 3.106178111717664e-05, "loss": 2.8691, "mean_token_accuracy": 0.33793103992938994, "step": 30840 }, { "epoch": 0.031067411599641836, "grad_norm": 14.433164269210906, "learning_rate": 3.10668170739077e-05, "loss": 2.5017, "mean_token_accuracy": 0.4310344815254211, "step": 30845 }, { "epoch": 0.03107244765274601, "grad_norm": 17.38308372483792, "learning_rate": 3.107185303063876e-05, "loss": 2.6591, "mean_token_accuracy": 0.4172413766384125, "step": 30850 }, { "epoch": 0.03107748370585018, "grad_norm": 16.79876793440166, "learning_rate": 3.107688898736982e-05, "loss": 2.664, "mean_token_accuracy": 0.4172413766384125, "step": 30855 }, { "epoch": 0.031082519758954354, "grad_norm": 14.014276177627822, "learning_rate": 3.108192494410088e-05, "loss": 2.388, "mean_token_accuracy": 0.44593595862388613, "step": 30860 }, { "epoch": 0.031087555812058527, "grad_norm": 16.89202230198056, "learning_rate": 3.108696090083194e-05, "loss": 2.6615, "mean_token_accuracy": 0.3862069070339203, "step": 30865 }, { "epoch": 0.0310925918651627, "grad_norm": 19.650701121889007, "learning_rate": 3.1091996857563e-05, "loss": 2.8034, "mean_token_accuracy": 0.36896551847457887, "step": 30870 }, { "epoch": 0.03109762791826687, "grad_norm": 11.19242580243403, "learning_rate": 3.1097032814294065e-05, "loss": 2.3659, "mean_token_accuracy": 0.4344827592372894, "step": 30875 }, { "epoch": 0.031102663971371045, "grad_norm": 14.241357551160785, "learning_rate": 3.1102068771025124e-05, "loss": 2.3956, "mean_token_accuracy": 0.39310344457626345, "step": 30880 }, { "epoch": 0.03110770002447522, "grad_norm": 13.88578435636804, "learning_rate": 3.110710472775618e-05, "loss": 2.3374, "mean_token_accuracy": 0.4448275864124298, "step": 30885 }, { "epoch": 0.03111273607757939, "grad_norm": 17.2863321973129, "learning_rate": 3.111214068448724e-05, "loss": 2.5481, "mean_token_accuracy": 0.42413793206214906, "step": 30890 }, { "epoch": 0.031117772130683563, "grad_norm": 15.754254942066275, "learning_rate": 3.11171766412183e-05, "loss": 2.7277, "mean_token_accuracy": 0.3620689630508423, "step": 30895 }, { "epoch": 0.031122808183787737, "grad_norm": 45.51791045772726, "learning_rate": 3.1122212597949354e-05, "loss": 2.8824, "mean_token_accuracy": 0.3517241388559341, "step": 30900 }, { "epoch": 0.03112784423689191, "grad_norm": 17.994659165999792, "learning_rate": 3.112724855468042e-05, "loss": 2.4755, "mean_token_accuracy": 0.42758620381355283, "step": 30905 }, { "epoch": 0.03113288028999608, "grad_norm": 14.807042651527443, "learning_rate": 3.113228451141148e-05, "loss": 2.6859, "mean_token_accuracy": 0.36551723480224607, "step": 30910 }, { "epoch": 0.031137916343100255, "grad_norm": 22.77945547112795, "learning_rate": 3.113732046814254e-05, "loss": 2.7268, "mean_token_accuracy": 0.4, "step": 30915 }, { "epoch": 0.03114295239620443, "grad_norm": 22.04762658525619, "learning_rate": 3.11423564248736e-05, "loss": 2.5514, "mean_token_accuracy": 0.4103448301553726, "step": 30920 }, { "epoch": 0.0311479884493086, "grad_norm": 16.988566393812373, "learning_rate": 3.114739238160466e-05, "loss": 2.6242, "mean_token_accuracy": 0.4123412013053894, "step": 30925 }, { "epoch": 0.031153024502412772, "grad_norm": 16.71988532837217, "learning_rate": 3.1152428338335723e-05, "loss": 2.2341, "mean_token_accuracy": 0.4517241358757019, "step": 30930 }, { "epoch": 0.031158060555516946, "grad_norm": 12.978140756963043, "learning_rate": 3.115746429506678e-05, "loss": 2.2264, "mean_token_accuracy": 0.42413793206214906, "step": 30935 }, { "epoch": 0.03116309660862112, "grad_norm": 14.899213536918277, "learning_rate": 3.1162500251797835e-05, "loss": 2.2155, "mean_token_accuracy": 0.42758620381355283, "step": 30940 }, { "epoch": 0.03116813266172529, "grad_norm": 19.800373930749814, "learning_rate": 3.1167536208528894e-05, "loss": 2.6368, "mean_token_accuracy": 0.42413793206214906, "step": 30945 }, { "epoch": 0.031173168714829464, "grad_norm": 16.298288805042787, "learning_rate": 3.1172572165259954e-05, "loss": 2.5378, "mean_token_accuracy": 0.38965516686439516, "step": 30950 }, { "epoch": 0.031178204767933638, "grad_norm": 16.993684525467266, "learning_rate": 3.117760812199102e-05, "loss": 2.3674, "mean_token_accuracy": 0.43448275327682495, "step": 30955 }, { "epoch": 0.031183240821037808, "grad_norm": 15.295544513170974, "learning_rate": 3.118264407872208e-05, "loss": 2.7592, "mean_token_accuracy": 0.39655172228813174, "step": 30960 }, { "epoch": 0.031188276874141982, "grad_norm": 15.873606489573843, "learning_rate": 3.118768003545314e-05, "loss": 3.0178, "mean_token_accuracy": 0.4034482717514038, "step": 30965 }, { "epoch": 0.031193312927246156, "grad_norm": 15.663324762626273, "learning_rate": 3.11927159921842e-05, "loss": 2.4642, "mean_token_accuracy": 0.4034482717514038, "step": 30970 }, { "epoch": 0.03119834898035033, "grad_norm": 17.476814496946023, "learning_rate": 3.119775194891526e-05, "loss": 2.5677, "mean_token_accuracy": 0.4034482777118683, "step": 30975 }, { "epoch": 0.0312033850334545, "grad_norm": 18.808492746578334, "learning_rate": 3.1202787905646316e-05, "loss": 2.5759, "mean_token_accuracy": 0.38620689511299133, "step": 30980 }, { "epoch": 0.031208421086558674, "grad_norm": 39.588672677885555, "learning_rate": 3.1207823862377375e-05, "loss": 2.6563, "mean_token_accuracy": 0.4172413796186447, "step": 30985 }, { "epoch": 0.031213457139662847, "grad_norm": 14.129543206075445, "learning_rate": 3.1212859819108434e-05, "loss": 2.5498, "mean_token_accuracy": 0.40344826579093934, "step": 30990 }, { "epoch": 0.031218493192767018, "grad_norm": 15.76946471854939, "learning_rate": 3.1217895775839494e-05, "loss": 2.2457, "mean_token_accuracy": 0.39655172228813174, "step": 30995 }, { "epoch": 0.03122352924587119, "grad_norm": 15.306633393329385, "learning_rate": 3.122293173257055e-05, "loss": 2.3494, "mean_token_accuracy": 0.43103447556495667, "step": 31000 }, { "epoch": 0.031228565298975365, "grad_norm": 16.836498400929468, "learning_rate": 3.122796768930161e-05, "loss": 2.716, "mean_token_accuracy": 0.36896551847457887, "step": 31005 }, { "epoch": 0.03123360135207954, "grad_norm": 20.554276331077507, "learning_rate": 3.123300364603268e-05, "loss": 2.9228, "mean_token_accuracy": 0.317241370677948, "step": 31010 }, { "epoch": 0.03123863740518371, "grad_norm": 18.308975362752747, "learning_rate": 3.123803960276374e-05, "loss": 2.4142, "mean_token_accuracy": 0.43103447556495667, "step": 31015 }, { "epoch": 0.031243673458287883, "grad_norm": 20.393708621662288, "learning_rate": 3.12430755594948e-05, "loss": 2.6508, "mean_token_accuracy": 0.3793103486299515, "step": 31020 }, { "epoch": 0.031248709511392057, "grad_norm": 18.394551536127647, "learning_rate": 3.1248111516225856e-05, "loss": 2.0444, "mean_token_accuracy": 0.5206896603107453, "step": 31025 }, { "epoch": 0.03125374556449623, "grad_norm": 14.2412215479628, "learning_rate": 3.125314747295691e-05, "loss": 2.3363, "mean_token_accuracy": 0.458620685338974, "step": 31030 }, { "epoch": 0.0312587816176004, "grad_norm": 33.722193451709956, "learning_rate": 3.1258183429687974e-05, "loss": 2.4875, "mean_token_accuracy": 0.3620689630508423, "step": 31035 }, { "epoch": 0.031263817670704575, "grad_norm": 19.909251842160195, "learning_rate": 3.1263219386419034e-05, "loss": 2.4466, "mean_token_accuracy": 0.4517241299152374, "step": 31040 }, { "epoch": 0.03126885372380875, "grad_norm": 16.9322599502945, "learning_rate": 3.126825534315009e-05, "loss": 2.7081, "mean_token_accuracy": 0.4448275864124298, "step": 31045 }, { "epoch": 0.03127388977691292, "grad_norm": 16.894011442097224, "learning_rate": 3.127329129988115e-05, "loss": 2.614, "mean_token_accuracy": 0.4068965494632721, "step": 31050 }, { "epoch": 0.031278925830017096, "grad_norm": 18.85198696811844, "learning_rate": 3.127832725661221e-05, "loss": 2.5432, "mean_token_accuracy": 0.43793103098869324, "step": 31055 }, { "epoch": 0.03128396188312126, "grad_norm": 15.34474144552616, "learning_rate": 3.128336321334327e-05, "loss": 2.5084, "mean_token_accuracy": 0.43103447556495667, "step": 31060 }, { "epoch": 0.031288997936225436, "grad_norm": 14.720609481918503, "learning_rate": 3.128839917007434e-05, "loss": 2.5241, "mean_token_accuracy": 0.4034482717514038, "step": 31065 }, { "epoch": 0.03129403398932961, "grad_norm": 15.367324807748583, "learning_rate": 3.1293435126805396e-05, "loss": 2.5049, "mean_token_accuracy": 0.38275861740112305, "step": 31070 }, { "epoch": 0.031299070042433784, "grad_norm": 14.322543384691949, "learning_rate": 3.129847108353645e-05, "loss": 2.7715, "mean_token_accuracy": 0.37241379618644715, "step": 31075 }, { "epoch": 0.03130410609553796, "grad_norm": 15.767998939054454, "learning_rate": 3.130350704026751e-05, "loss": 2.5103, "mean_token_accuracy": 0.42758620381355283, "step": 31080 }, { "epoch": 0.03130914214864213, "grad_norm": 15.160467813036657, "learning_rate": 3.130854299699857e-05, "loss": 2.499, "mean_token_accuracy": 0.4310344934463501, "step": 31085 }, { "epoch": 0.031314178201746305, "grad_norm": 21.80624404172611, "learning_rate": 3.131357895372963e-05, "loss": 2.5723, "mean_token_accuracy": 0.3827586233615875, "step": 31090 }, { "epoch": 0.03131921425485047, "grad_norm": 16.066686064186534, "learning_rate": 3.131861491046069e-05, "loss": 2.3287, "mean_token_accuracy": 0.47586206793785096, "step": 31095 }, { "epoch": 0.031324250307954646, "grad_norm": 19.242333632357337, "learning_rate": 3.132365086719175e-05, "loss": 2.5894, "mean_token_accuracy": 0.38965516686439516, "step": 31100 }, { "epoch": 0.03132928636105882, "grad_norm": 16.70900888829499, "learning_rate": 3.132868682392281e-05, "loss": 2.7564, "mean_token_accuracy": 0.36206896901130675, "step": 31105 }, { "epoch": 0.03133432241416299, "grad_norm": 16.046108305789808, "learning_rate": 3.133372278065387e-05, "loss": 2.3339, "mean_token_accuracy": 0.4068965494632721, "step": 31110 }, { "epoch": 0.03133935846726717, "grad_norm": 14.866702335624908, "learning_rate": 3.133875873738493e-05, "loss": 2.5386, "mean_token_accuracy": 0.4118572294712067, "step": 31115 }, { "epoch": 0.03134439452037134, "grad_norm": 14.580116086686637, "learning_rate": 3.134379469411599e-05, "loss": 2.301, "mean_token_accuracy": 0.4413793087005615, "step": 31120 }, { "epoch": 0.031349430573475515, "grad_norm": 20.774442837483413, "learning_rate": 3.134883065084705e-05, "loss": 3.0282, "mean_token_accuracy": 0.3482758581638336, "step": 31125 }, { "epoch": 0.03135446662657968, "grad_norm": 21.004998362000944, "learning_rate": 3.135386660757811e-05, "loss": 2.4649, "mean_token_accuracy": 0.42413793206214906, "step": 31130 }, { "epoch": 0.031359502679683855, "grad_norm": 17.592016799611127, "learning_rate": 3.1358902564309166e-05, "loss": 2.4704, "mean_token_accuracy": 0.3827586114406586, "step": 31135 }, { "epoch": 0.03136453873278803, "grad_norm": 15.960928630974283, "learning_rate": 3.136393852104023e-05, "loss": 2.3782, "mean_token_accuracy": 0.4570477962493896, "step": 31140 }, { "epoch": 0.0313695747858922, "grad_norm": 29.694737979869522, "learning_rate": 3.136897447777129e-05, "loss": 2.3423, "mean_token_accuracy": 0.44827587008476255, "step": 31145 }, { "epoch": 0.03137461083899638, "grad_norm": 14.352684240678432, "learning_rate": 3.137401043450235e-05, "loss": 2.3205, "mean_token_accuracy": 0.4344827592372894, "step": 31150 }, { "epoch": 0.03137964689210055, "grad_norm": 16.571319270966846, "learning_rate": 3.137904639123341e-05, "loss": 3.2064, "mean_token_accuracy": 0.3448275804519653, "step": 31155 }, { "epoch": 0.03138468294520472, "grad_norm": 15.92645559737672, "learning_rate": 3.138408234796447e-05, "loss": 2.7204, "mean_token_accuracy": 0.39655173420906065, "step": 31160 }, { "epoch": 0.03138971899830889, "grad_norm": 12.657540460225267, "learning_rate": 3.138911830469552e-05, "loss": 2.4906, "mean_token_accuracy": 0.39655172228813174, "step": 31165 }, { "epoch": 0.031394755051413065, "grad_norm": 19.18366152158412, "learning_rate": 3.139415426142659e-05, "loss": 2.4713, "mean_token_accuracy": 0.40496068000793456, "step": 31170 }, { "epoch": 0.03139979110451724, "grad_norm": 15.155839399607364, "learning_rate": 3.139919021815765e-05, "loss": 2.7806, "mean_token_accuracy": 0.3689655065536499, "step": 31175 }, { "epoch": 0.03140482715762141, "grad_norm": 14.776247811640282, "learning_rate": 3.1404226174888706e-05, "loss": 2.6423, "mean_token_accuracy": 0.3965517163276672, "step": 31180 }, { "epoch": 0.031409863210725586, "grad_norm": 16.007125162849835, "learning_rate": 3.1409262131619766e-05, "loss": 2.4992, "mean_token_accuracy": 0.42413793206214906, "step": 31185 }, { "epoch": 0.03141489926382976, "grad_norm": 16.847697647164324, "learning_rate": 3.1414298088350825e-05, "loss": 2.6709, "mean_token_accuracy": 0.3896551728248596, "step": 31190 }, { "epoch": 0.03141993531693393, "grad_norm": 16.133831648128208, "learning_rate": 3.141933404508189e-05, "loss": 2.4347, "mean_token_accuracy": 0.4034482717514038, "step": 31195 }, { "epoch": 0.0314249713700381, "grad_norm": 14.727566765328183, "learning_rate": 3.142437000181295e-05, "loss": 2.9068, "mean_token_accuracy": 0.3551724195480347, "step": 31200 }, { "epoch": 0.031430007423142274, "grad_norm": 19.345363979708925, "learning_rate": 3.1429405958544e-05, "loss": 2.6444, "mean_token_accuracy": 0.4333938241004944, "step": 31205 }, { "epoch": 0.03143504347624645, "grad_norm": 12.921972872845366, "learning_rate": 3.143444191527506e-05, "loss": 2.3661, "mean_token_accuracy": 0.44482759237289426, "step": 31210 }, { "epoch": 0.03144007952935062, "grad_norm": 15.707671440103574, "learning_rate": 3.143947787200612e-05, "loss": 2.9799, "mean_token_accuracy": 0.3674531102180481, "step": 31215 }, { "epoch": 0.031445115582454795, "grad_norm": 14.18825812575258, "learning_rate": 3.144451382873719e-05, "loss": 2.8726, "mean_token_accuracy": 0.417241370677948, "step": 31220 }, { "epoch": 0.03145015163555897, "grad_norm": 15.098022736601756, "learning_rate": 3.1449549785468246e-05, "loss": 2.3315, "mean_token_accuracy": 0.4068965494632721, "step": 31225 }, { "epoch": 0.031455187688663136, "grad_norm": 16.275008678071586, "learning_rate": 3.1454585742199306e-05, "loss": 2.496, "mean_token_accuracy": 0.4137930989265442, "step": 31230 }, { "epoch": 0.03146022374176731, "grad_norm": 16.37831633858391, "learning_rate": 3.1459621698930365e-05, "loss": 2.7835, "mean_token_accuracy": 0.38620689511299133, "step": 31235 }, { "epoch": 0.031465259794871484, "grad_norm": 14.443258137319788, "learning_rate": 3.1464657655661424e-05, "loss": 2.2921, "mean_token_accuracy": 0.4379310369491577, "step": 31240 }, { "epoch": 0.03147029584797566, "grad_norm": 14.057277904370002, "learning_rate": 3.1469693612392483e-05, "loss": 2.4279, "mean_token_accuracy": 0.4034482717514038, "step": 31245 }, { "epoch": 0.03147533190107983, "grad_norm": 18.51838431656866, "learning_rate": 3.147472956912354e-05, "loss": 2.4886, "mean_token_accuracy": 0.38620689511299133, "step": 31250 }, { "epoch": 0.031480367954184005, "grad_norm": 18.216705430004907, "learning_rate": 3.14797655258546e-05, "loss": 2.338, "mean_token_accuracy": 0.4, "step": 31255 }, { "epoch": 0.03148540400728818, "grad_norm": 15.968213553105224, "learning_rate": 3.148480148258566e-05, "loss": 2.5599, "mean_token_accuracy": 0.42413792610168455, "step": 31260 }, { "epoch": 0.031490440060392345, "grad_norm": 17.942252847867046, "learning_rate": 3.148983743931672e-05, "loss": 2.5514, "mean_token_accuracy": 0.41209922432899476, "step": 31265 }, { "epoch": 0.03149547611349652, "grad_norm": 16.879011174205086, "learning_rate": 3.149487339604778e-05, "loss": 2.1798, "mean_token_accuracy": 0.4392014563083649, "step": 31270 }, { "epoch": 0.03150051216660069, "grad_norm": 15.90482147184509, "learning_rate": 3.1499909352778846e-05, "loss": 2.7663, "mean_token_accuracy": 0.3827586233615875, "step": 31275 }, { "epoch": 0.03150554821970487, "grad_norm": 17.7462794780518, "learning_rate": 3.1504945309509905e-05, "loss": 2.3981, "mean_token_accuracy": 0.46569873094558717, "step": 31280 }, { "epoch": 0.03151058427280904, "grad_norm": 14.320367617752172, "learning_rate": 3.1509981266240964e-05, "loss": 2.521, "mean_token_accuracy": 0.4068965494632721, "step": 31285 }, { "epoch": 0.031515620325913214, "grad_norm": 18.396955654277825, "learning_rate": 3.1515017222972023e-05, "loss": 2.8082, "mean_token_accuracy": 0.4, "step": 31290 }, { "epoch": 0.03152065637901739, "grad_norm": 21.158830596439245, "learning_rate": 3.152005317970308e-05, "loss": 2.4941, "mean_token_accuracy": 0.38620689511299133, "step": 31295 }, { "epoch": 0.031525692432121555, "grad_norm": 19.77063815032728, "learning_rate": 3.152508913643414e-05, "loss": 2.6731, "mean_token_accuracy": 0.38620689809322356, "step": 31300 }, { "epoch": 0.03153072848522573, "grad_norm": 12.14615999767482, "learning_rate": 3.15301250931652e-05, "loss": 2.1237, "mean_token_accuracy": 0.4913793087005615, "step": 31305 }, { "epoch": 0.0315357645383299, "grad_norm": 11.554243415446317, "learning_rate": 3.153516104989626e-05, "loss": 2.3632, "mean_token_accuracy": 0.3931034475564957, "step": 31310 }, { "epoch": 0.031540800591434076, "grad_norm": 14.158067402151664, "learning_rate": 3.154019700662732e-05, "loss": 2.7279, "mean_token_accuracy": 0.4103448331356049, "step": 31315 }, { "epoch": 0.03154583664453825, "grad_norm": 14.262203914962535, "learning_rate": 3.154523296335838e-05, "loss": 2.2979, "mean_token_accuracy": 0.39310344457626345, "step": 31320 }, { "epoch": 0.031550872697642424, "grad_norm": 17.321576642526523, "learning_rate": 3.155026892008944e-05, "loss": 2.6142, "mean_token_accuracy": 0.3896551728248596, "step": 31325 }, { "epoch": 0.0315559087507466, "grad_norm": 15.306670965275304, "learning_rate": 3.1555304876820504e-05, "loss": 2.5929, "mean_token_accuracy": 0.387477308511734, "step": 31330 }, { "epoch": 0.031560944803850764, "grad_norm": 15.413337737206035, "learning_rate": 3.1560340833551564e-05, "loss": 2.3474, "mean_token_accuracy": 0.43647913336753846, "step": 31335 }, { "epoch": 0.03156598085695494, "grad_norm": 22.273249633799253, "learning_rate": 3.1565376790282616e-05, "loss": 2.6444, "mean_token_accuracy": 0.3931034475564957, "step": 31340 }, { "epoch": 0.03157101691005911, "grad_norm": 17.494267500914628, "learning_rate": 3.1570412747013675e-05, "loss": 2.5777, "mean_token_accuracy": 0.40344826579093934, "step": 31345 }, { "epoch": 0.031576052963163286, "grad_norm": 14.95898898528262, "learning_rate": 3.1575448703744735e-05, "loss": 2.6368, "mean_token_accuracy": 0.39655172228813174, "step": 31350 }, { "epoch": 0.03158108901626746, "grad_norm": 14.731848845532488, "learning_rate": 3.15804846604758e-05, "loss": 2.6586, "mean_token_accuracy": 0.3931034475564957, "step": 31355 }, { "epoch": 0.03158612506937163, "grad_norm": 14.863021990681805, "learning_rate": 3.158552061720686e-05, "loss": 2.6173, "mean_token_accuracy": 0.35862069129943847, "step": 31360 }, { "epoch": 0.03159116112247581, "grad_norm": 26.053148183950647, "learning_rate": 3.159055657393792e-05, "loss": 2.4453, "mean_token_accuracy": 0.4034482777118683, "step": 31365 }, { "epoch": 0.031596197175579974, "grad_norm": 14.160352987372496, "learning_rate": 3.159559253066898e-05, "loss": 2.9098, "mean_token_accuracy": 0.32758620083332063, "step": 31370 }, { "epoch": 0.03160123322868415, "grad_norm": 17.335591222852372, "learning_rate": 3.160062848740004e-05, "loss": 2.4536, "mean_token_accuracy": 0.4413793087005615, "step": 31375 }, { "epoch": 0.03160626928178832, "grad_norm": 16.9423694416526, "learning_rate": 3.16056644441311e-05, "loss": 2.6466, "mean_token_accuracy": 0.3793103456497192, "step": 31380 }, { "epoch": 0.031611305334892495, "grad_norm": 13.019845002611305, "learning_rate": 3.1610700400862156e-05, "loss": 2.6357, "mean_token_accuracy": 0.3482758581638336, "step": 31385 }, { "epoch": 0.03161634138799667, "grad_norm": 16.793091355606048, "learning_rate": 3.1615736357593215e-05, "loss": 2.2026, "mean_token_accuracy": 0.4172413766384125, "step": 31390 }, { "epoch": 0.03162137744110084, "grad_norm": 13.42637592314996, "learning_rate": 3.1620772314324275e-05, "loss": 2.3526, "mean_token_accuracy": 0.42413793206214906, "step": 31395 }, { "epoch": 0.031626413494205016, "grad_norm": 15.418392819195669, "learning_rate": 3.1625808271055334e-05, "loss": 2.499, "mean_token_accuracy": 0.441379314661026, "step": 31400 }, { "epoch": 0.03163144954730918, "grad_norm": 13.323560131499793, "learning_rate": 3.163084422778639e-05, "loss": 2.2009, "mean_token_accuracy": 0.4379310369491577, "step": 31405 }, { "epoch": 0.03163648560041336, "grad_norm": 17.575433368499517, "learning_rate": 3.163588018451746e-05, "loss": 2.5852, "mean_token_accuracy": 0.38965516090393065, "step": 31410 }, { "epoch": 0.03164152165351753, "grad_norm": 17.747357953292266, "learning_rate": 3.164091614124852e-05, "loss": 2.3726, "mean_token_accuracy": 0.42068964838981626, "step": 31415 }, { "epoch": 0.031646557706621704, "grad_norm": 15.643955787364895, "learning_rate": 3.164595209797958e-05, "loss": 2.5645, "mean_token_accuracy": 0.4172413766384125, "step": 31420 }, { "epoch": 0.03165159375972588, "grad_norm": 11.410841995251184, "learning_rate": 3.165098805471064e-05, "loss": 1.9701, "mean_token_accuracy": 0.5379310250282288, "step": 31425 }, { "epoch": 0.03165662981283005, "grad_norm": 15.15772048623036, "learning_rate": 3.1656024011441696e-05, "loss": 2.3888, "mean_token_accuracy": 0.435632187128067, "step": 31430 }, { "epoch": 0.031661665865934226, "grad_norm": 15.717699656471156, "learning_rate": 3.1661059968172755e-05, "loss": 2.2099, "mean_token_accuracy": 0.44827585816383364, "step": 31435 }, { "epoch": 0.03166670191903839, "grad_norm": 17.289716454953915, "learning_rate": 3.1666095924903815e-05, "loss": 2.6555, "mean_token_accuracy": 0.4049606680870056, "step": 31440 }, { "epoch": 0.031671737972142566, "grad_norm": 18.346335970045768, "learning_rate": 3.1671131881634874e-05, "loss": 2.9139, "mean_token_accuracy": 0.35862069129943847, "step": 31445 }, { "epoch": 0.03167677402524674, "grad_norm": 12.666470571953404, "learning_rate": 3.167616783836593e-05, "loss": 2.4022, "mean_token_accuracy": 0.40689654350280763, "step": 31450 }, { "epoch": 0.031681810078350914, "grad_norm": 14.422252076922584, "learning_rate": 3.168120379509699e-05, "loss": 2.5108, "mean_token_accuracy": 0.44827585816383364, "step": 31455 }, { "epoch": 0.03168684613145509, "grad_norm": 15.004807021625458, "learning_rate": 3.168623975182806e-05, "loss": 2.8335, "mean_token_accuracy": 0.3620689630508423, "step": 31460 }, { "epoch": 0.03169188218455926, "grad_norm": 15.744403779421775, "learning_rate": 3.169127570855912e-05, "loss": 2.3454, "mean_token_accuracy": 0.4517241358757019, "step": 31465 }, { "epoch": 0.031696918237663435, "grad_norm": 15.401118211554786, "learning_rate": 3.169631166529018e-05, "loss": 2.4705, "mean_token_accuracy": 0.3448275804519653, "step": 31470 }, { "epoch": 0.0317019542907676, "grad_norm": 15.825140683763951, "learning_rate": 3.170134762202123e-05, "loss": 2.536, "mean_token_accuracy": 0.3827586233615875, "step": 31475 }, { "epoch": 0.031706990343871776, "grad_norm": 19.175481911711067, "learning_rate": 3.170638357875229e-05, "loss": 2.9579, "mean_token_accuracy": 0.358620685338974, "step": 31480 }, { "epoch": 0.03171202639697595, "grad_norm": 17.952788758661267, "learning_rate": 3.171141953548335e-05, "loss": 2.7074, "mean_token_accuracy": 0.4413793087005615, "step": 31485 }, { "epoch": 0.03171706245008012, "grad_norm": 18.461355586947736, "learning_rate": 3.1716455492214414e-05, "loss": 2.8004, "mean_token_accuracy": 0.37586207687854767, "step": 31490 }, { "epoch": 0.0317220985031843, "grad_norm": 16.67761105066735, "learning_rate": 3.172149144894547e-05, "loss": 2.4502, "mean_token_accuracy": 0.4068965494632721, "step": 31495 }, { "epoch": 0.03172713455628847, "grad_norm": 14.592085228725374, "learning_rate": 3.172652740567653e-05, "loss": 2.1967, "mean_token_accuracy": 0.45172414779663084, "step": 31500 }, { "epoch": 0.031732170609392645, "grad_norm": 15.874502254152878, "learning_rate": 3.173156336240759e-05, "loss": 2.4523, "mean_token_accuracy": 0.4310344845056534, "step": 31505 }, { "epoch": 0.03173720666249681, "grad_norm": 17.221500566629857, "learning_rate": 3.173659931913865e-05, "loss": 2.4348, "mean_token_accuracy": 0.38620689511299133, "step": 31510 }, { "epoch": 0.031742242715600985, "grad_norm": 18.42981261195008, "learning_rate": 3.174163527586971e-05, "loss": 2.4767, "mean_token_accuracy": 0.3999999940395355, "step": 31515 }, { "epoch": 0.03174727876870516, "grad_norm": 18.877477348149434, "learning_rate": 3.174667123260077e-05, "loss": 2.1275, "mean_token_accuracy": 0.47084089517593386, "step": 31520 }, { "epoch": 0.03175231482180933, "grad_norm": 11.620321892250821, "learning_rate": 3.175170718933183e-05, "loss": 2.2201, "mean_token_accuracy": 0.46551724076271056, "step": 31525 }, { "epoch": 0.031757350874913506, "grad_norm": 13.101133111318582, "learning_rate": 3.175674314606289e-05, "loss": 2.5496, "mean_token_accuracy": 0.358620685338974, "step": 31530 }, { "epoch": 0.03176238692801768, "grad_norm": 14.062639592531823, "learning_rate": 3.176177910279395e-05, "loss": 2.4424, "mean_token_accuracy": 0.4068965554237366, "step": 31535 }, { "epoch": 0.031767422981121854, "grad_norm": 15.321546480004097, "learning_rate": 3.176681505952501e-05, "loss": 2.286, "mean_token_accuracy": 0.458620685338974, "step": 31540 }, { "epoch": 0.03177245903422602, "grad_norm": 21.721346330798248, "learning_rate": 3.177185101625607e-05, "loss": 2.5814, "mean_token_accuracy": 0.3655172288417816, "step": 31545 }, { "epoch": 0.031777495087330195, "grad_norm": 18.35941125727484, "learning_rate": 3.177688697298713e-05, "loss": 2.5259, "mean_token_accuracy": 0.3965517282485962, "step": 31550 }, { "epoch": 0.03178253114043437, "grad_norm": 20.086716875417277, "learning_rate": 3.178192292971819e-05, "loss": 2.7044, "mean_token_accuracy": 0.39655172228813174, "step": 31555 }, { "epoch": 0.03178756719353854, "grad_norm": 18.096239198650704, "learning_rate": 3.178695888644925e-05, "loss": 2.515, "mean_token_accuracy": 0.37586206793785093, "step": 31560 }, { "epoch": 0.031792603246642716, "grad_norm": 15.904593621180277, "learning_rate": 3.179199484318031e-05, "loss": 2.5057, "mean_token_accuracy": 0.4103448331356049, "step": 31565 }, { "epoch": 0.03179763929974689, "grad_norm": 13.609211704508473, "learning_rate": 3.179703079991137e-05, "loss": 2.3665, "mean_token_accuracy": 0.4551724135875702, "step": 31570 }, { "epoch": 0.03180267535285106, "grad_norm": 15.502936242992288, "learning_rate": 3.180206675664243e-05, "loss": 2.7459, "mean_token_accuracy": 0.39310344457626345, "step": 31575 }, { "epoch": 0.03180771140595523, "grad_norm": 16.840585964848913, "learning_rate": 3.180710271337349e-05, "loss": 2.4033, "mean_token_accuracy": 0.42413793206214906, "step": 31580 }, { "epoch": 0.031812747459059404, "grad_norm": 15.703083750291317, "learning_rate": 3.1812138670104547e-05, "loss": 2.503, "mean_token_accuracy": 0.42413792610168455, "step": 31585 }, { "epoch": 0.03181778351216358, "grad_norm": 19.00823114562094, "learning_rate": 3.1817174626835606e-05, "loss": 2.6304, "mean_token_accuracy": 0.42758620977401735, "step": 31590 }, { "epoch": 0.03182281956526775, "grad_norm": 14.276788930685148, "learning_rate": 3.182221058356667e-05, "loss": 2.3928, "mean_token_accuracy": 0.36896551251411436, "step": 31595 }, { "epoch": 0.031827855618371925, "grad_norm": 19.897165155676845, "learning_rate": 3.182724654029773e-05, "loss": 2.855, "mean_token_accuracy": 0.39655172228813174, "step": 31600 }, { "epoch": 0.0318328916714761, "grad_norm": 14.947447954346066, "learning_rate": 3.183228249702879e-05, "loss": 2.7942, "mean_token_accuracy": 0.3620689630508423, "step": 31605 }, { "epoch": 0.03183792772458027, "grad_norm": 14.223649027208067, "learning_rate": 3.183731845375984e-05, "loss": 2.4967, "mean_token_accuracy": 0.4310344815254211, "step": 31610 }, { "epoch": 0.03184296377768444, "grad_norm": 15.142297670074997, "learning_rate": 3.18423544104909e-05, "loss": 2.6728, "mean_token_accuracy": 0.4, "step": 31615 }, { "epoch": 0.03184799983078861, "grad_norm": 13.37620587207293, "learning_rate": 3.184739036722197e-05, "loss": 2.2329, "mean_token_accuracy": 0.4172413766384125, "step": 31620 }, { "epoch": 0.03185303588389279, "grad_norm": 14.637774616054092, "learning_rate": 3.185242632395303e-05, "loss": 2.5065, "mean_token_accuracy": 0.39655172228813174, "step": 31625 }, { "epoch": 0.03185807193699696, "grad_norm": 14.375421932224224, "learning_rate": 3.1857462280684087e-05, "loss": 2.5404, "mean_token_accuracy": 0.3758620709180832, "step": 31630 }, { "epoch": 0.031863107990101135, "grad_norm": 14.83339160218976, "learning_rate": 3.1862498237415146e-05, "loss": 2.6449, "mean_token_accuracy": 0.34827585220336915, "step": 31635 }, { "epoch": 0.03186814404320531, "grad_norm": 14.669425163894635, "learning_rate": 3.1867534194146205e-05, "loss": 2.5196, "mean_token_accuracy": 0.3896551728248596, "step": 31640 }, { "epoch": 0.03187318009630948, "grad_norm": 15.785978085563007, "learning_rate": 3.187257015087727e-05, "loss": 2.3561, "mean_token_accuracy": 0.4465517282485962, "step": 31645 }, { "epoch": 0.03187821614941365, "grad_norm": 14.455878305703173, "learning_rate": 3.1877606107608324e-05, "loss": 2.5137, "mean_token_accuracy": 0.4103448331356049, "step": 31650 }, { "epoch": 0.03188325220251782, "grad_norm": 13.736572803826203, "learning_rate": 3.188264206433938e-05, "loss": 2.509, "mean_token_accuracy": 0.37931033968925476, "step": 31655 }, { "epoch": 0.031888288255622, "grad_norm": 22.019080362294957, "learning_rate": 3.188767802107044e-05, "loss": 2.2683, "mean_token_accuracy": 0.4379310369491577, "step": 31660 }, { "epoch": 0.03189332430872617, "grad_norm": 14.246769762409034, "learning_rate": 3.18927139778015e-05, "loss": 2.7988, "mean_token_accuracy": 0.34482758641242983, "step": 31665 }, { "epoch": 0.031898360361830344, "grad_norm": 15.584046437064652, "learning_rate": 3.189774993453256e-05, "loss": 2.5134, "mean_token_accuracy": 0.4034482717514038, "step": 31670 }, { "epoch": 0.03190339641493452, "grad_norm": 14.410254084746354, "learning_rate": 3.190278589126363e-05, "loss": 2.5094, "mean_token_accuracy": 0.41379310488700866, "step": 31675 }, { "epoch": 0.03190843246803869, "grad_norm": 15.974886832874885, "learning_rate": 3.1907821847994686e-05, "loss": 2.5336, "mean_token_accuracy": 0.3896551698446274, "step": 31680 }, { "epoch": 0.03191346852114286, "grad_norm": 14.018783291786033, "learning_rate": 3.1912857804725745e-05, "loss": 2.7433, "mean_token_accuracy": 0.4103448212146759, "step": 31685 }, { "epoch": 0.03191850457424703, "grad_norm": 15.887350090765855, "learning_rate": 3.1917893761456804e-05, "loss": 2.221, "mean_token_accuracy": 0.44827585816383364, "step": 31690 }, { "epoch": 0.031923540627351206, "grad_norm": 19.077985577253013, "learning_rate": 3.1922929718187864e-05, "loss": 2.3239, "mean_token_accuracy": 0.458620685338974, "step": 31695 }, { "epoch": 0.03192857668045538, "grad_norm": 14.531030389339577, "learning_rate": 3.192796567491892e-05, "loss": 2.3882, "mean_token_accuracy": 0.42413792610168455, "step": 31700 }, { "epoch": 0.031933612733559553, "grad_norm": 15.137880868002352, "learning_rate": 3.193300163164998e-05, "loss": 2.3285, "mean_token_accuracy": 0.4517241418361664, "step": 31705 }, { "epoch": 0.03193864878666373, "grad_norm": 19.463340667724417, "learning_rate": 3.193803758838104e-05, "loss": 2.5424, "mean_token_accuracy": 0.44482758045196535, "step": 31710 }, { "epoch": 0.0319436848397679, "grad_norm": 16.49929795130981, "learning_rate": 3.19430735451121e-05, "loss": 2.759, "mean_token_accuracy": 0.38620689511299133, "step": 31715 }, { "epoch": 0.03194872089287207, "grad_norm": 16.736216086243157, "learning_rate": 3.194810950184316e-05, "loss": 2.6337, "mean_token_accuracy": 0.4275861978530884, "step": 31720 }, { "epoch": 0.03195375694597624, "grad_norm": 16.23006486115833, "learning_rate": 3.1953145458574226e-05, "loss": 2.6085, "mean_token_accuracy": 0.3965517282485962, "step": 31725 }, { "epoch": 0.031958792999080415, "grad_norm": 13.194143443043792, "learning_rate": 3.1958181415305285e-05, "loss": 2.3266, "mean_token_accuracy": 0.4413793087005615, "step": 31730 }, { "epoch": 0.03196382905218459, "grad_norm": 15.630480005445824, "learning_rate": 3.1963217372036344e-05, "loss": 2.8838, "mean_token_accuracy": 0.39655172228813174, "step": 31735 }, { "epoch": 0.03196886510528876, "grad_norm": 14.79549729913046, "learning_rate": 3.19682533287674e-05, "loss": 2.6803, "mean_token_accuracy": 0.42758620977401735, "step": 31740 }, { "epoch": 0.03197390115839294, "grad_norm": 22.373040694030642, "learning_rate": 3.1973289285498456e-05, "loss": 2.5735, "mean_token_accuracy": 0.41179673075675965, "step": 31745 }, { "epoch": 0.03197893721149711, "grad_norm": 17.390565570140257, "learning_rate": 3.1978325242229515e-05, "loss": 2.7497, "mean_token_accuracy": 0.4034482717514038, "step": 31750 }, { "epoch": 0.03198397326460128, "grad_norm": 19.896043024010247, "learning_rate": 3.198336119896058e-05, "loss": 3.0829, "mean_token_accuracy": 0.3137931048870087, "step": 31755 }, { "epoch": 0.03198900931770545, "grad_norm": 14.38814132300305, "learning_rate": 3.198839715569164e-05, "loss": 2.4846, "mean_token_accuracy": 0.4172413766384125, "step": 31760 }, { "epoch": 0.031994045370809625, "grad_norm": 15.663242208109411, "learning_rate": 3.19934331124227e-05, "loss": 2.8776, "mean_token_accuracy": 0.384633994102478, "step": 31765 }, { "epoch": 0.0319990814239138, "grad_norm": 19.162530296818833, "learning_rate": 3.199846906915376e-05, "loss": 2.8799, "mean_token_accuracy": 0.358620685338974, "step": 31770 }, { "epoch": 0.03200411747701797, "grad_norm": 11.452596099938221, "learning_rate": 3.200350502588482e-05, "loss": 2.7131, "mean_token_accuracy": 0.4137930929660797, "step": 31775 }, { "epoch": 0.032009153530122146, "grad_norm": 15.490050237737035, "learning_rate": 3.200854098261588e-05, "loss": 2.3456, "mean_token_accuracy": 0.4206896543502808, "step": 31780 }, { "epoch": 0.03201418958322632, "grad_norm": 14.314799150622523, "learning_rate": 3.201357693934694e-05, "loss": 2.4438, "mean_token_accuracy": 0.42758620381355283, "step": 31785 }, { "epoch": 0.03201922563633049, "grad_norm": 16.9258713728606, "learning_rate": 3.2018612896077996e-05, "loss": 2.6915, "mean_token_accuracy": 0.3827586203813553, "step": 31790 }, { "epoch": 0.03202426168943466, "grad_norm": 13.844012758967448, "learning_rate": 3.2023648852809055e-05, "loss": 2.83, "mean_token_accuracy": 0.42413792610168455, "step": 31795 }, { "epoch": 0.032029297742538834, "grad_norm": 13.988678618649534, "learning_rate": 3.2028684809540115e-05, "loss": 2.2807, "mean_token_accuracy": 0.45172414779663084, "step": 31800 }, { "epoch": 0.03203433379564301, "grad_norm": 16.432931553909285, "learning_rate": 3.203372076627118e-05, "loss": 2.4547, "mean_token_accuracy": 0.3896551728248596, "step": 31805 }, { "epoch": 0.03203936984874718, "grad_norm": 17.372332683020776, "learning_rate": 3.203875672300224e-05, "loss": 2.5484, "mean_token_accuracy": 0.3931034505367279, "step": 31810 }, { "epoch": 0.032044405901851356, "grad_norm": 19.722913862720034, "learning_rate": 3.20437926797333e-05, "loss": 2.5644, "mean_token_accuracy": 0.4034482777118683, "step": 31815 }, { "epoch": 0.03204944195495553, "grad_norm": 14.232767844795314, "learning_rate": 3.204882863646436e-05, "loss": 2.5352, "mean_token_accuracy": 0.37586206793785093, "step": 31820 }, { "epoch": 0.032054478008059696, "grad_norm": 14.664770014833527, "learning_rate": 3.205386459319542e-05, "loss": 2.6077, "mean_token_accuracy": 0.41724138557910917, "step": 31825 }, { "epoch": 0.03205951406116387, "grad_norm": 14.629830340814618, "learning_rate": 3.205890054992648e-05, "loss": 2.2064, "mean_token_accuracy": 0.47586206793785096, "step": 31830 }, { "epoch": 0.032064550114268044, "grad_norm": 15.71497217950326, "learning_rate": 3.2063936506657536e-05, "loss": 2.5723, "mean_token_accuracy": 0.4310344815254211, "step": 31835 }, { "epoch": 0.03206958616737222, "grad_norm": 16.89048236517421, "learning_rate": 3.2068972463388596e-05, "loss": 2.6875, "mean_token_accuracy": 0.3827586233615875, "step": 31840 }, { "epoch": 0.03207462222047639, "grad_norm": 13.939032393676971, "learning_rate": 3.2074008420119655e-05, "loss": 2.5423, "mean_token_accuracy": 0.46055657267570493, "step": 31845 }, { "epoch": 0.032079658273580565, "grad_norm": 15.22873089984502, "learning_rate": 3.2079044376850714e-05, "loss": 2.6193, "mean_token_accuracy": 0.42413792610168455, "step": 31850 }, { "epoch": 0.03208469432668474, "grad_norm": 18.6062703212802, "learning_rate": 3.208408033358177e-05, "loss": 2.8232, "mean_token_accuracy": 0.36896551847457887, "step": 31855 }, { "epoch": 0.032089730379788906, "grad_norm": 16.753382946106342, "learning_rate": 3.208911629031284e-05, "loss": 2.4378, "mean_token_accuracy": 0.43448275327682495, "step": 31860 }, { "epoch": 0.03209476643289308, "grad_norm": 22.995077819149248, "learning_rate": 3.20941522470439e-05, "loss": 2.4631, "mean_token_accuracy": 0.4551724135875702, "step": 31865 }, { "epoch": 0.03209980248599725, "grad_norm": 13.258081784776488, "learning_rate": 3.209918820377496e-05, "loss": 2.257, "mean_token_accuracy": 0.4934664249420166, "step": 31870 }, { "epoch": 0.03210483853910143, "grad_norm": 18.049275558511553, "learning_rate": 3.210422416050601e-05, "loss": 2.46, "mean_token_accuracy": 0.4344827592372894, "step": 31875 }, { "epoch": 0.0321098745922056, "grad_norm": 16.098708798849017, "learning_rate": 3.210926011723707e-05, "loss": 2.0541, "mean_token_accuracy": 0.4862068951129913, "step": 31880 }, { "epoch": 0.032114910645309774, "grad_norm": 12.529038684230542, "learning_rate": 3.2114296073968136e-05, "loss": 2.7787, "mean_token_accuracy": 0.39310344457626345, "step": 31885 }, { "epoch": 0.03211994669841395, "grad_norm": 15.89469111712078, "learning_rate": 3.2119332030699195e-05, "loss": 2.5359, "mean_token_accuracy": 0.42758620977401735, "step": 31890 }, { "epoch": 0.032124982751518115, "grad_norm": 14.871866979347494, "learning_rate": 3.2124367987430254e-05, "loss": 2.4535, "mean_token_accuracy": 0.40689656138420105, "step": 31895 }, { "epoch": 0.03213001880462229, "grad_norm": 18.132815438206723, "learning_rate": 3.212940394416131e-05, "loss": 2.4753, "mean_token_accuracy": 0.41034482717514037, "step": 31900 }, { "epoch": 0.03213505485772646, "grad_norm": 13.057225950108466, "learning_rate": 3.213443990089237e-05, "loss": 2.1939, "mean_token_accuracy": 0.4068965494632721, "step": 31905 }, { "epoch": 0.032140090910830636, "grad_norm": 13.792825233556757, "learning_rate": 3.213947585762344e-05, "loss": 2.6195, "mean_token_accuracy": 0.4034482777118683, "step": 31910 }, { "epoch": 0.03214512696393481, "grad_norm": 14.798467684527143, "learning_rate": 3.214451181435449e-05, "loss": 2.3901, "mean_token_accuracy": 0.4572292804718018, "step": 31915 }, { "epoch": 0.032150163017038984, "grad_norm": 13.284380487643643, "learning_rate": 3.214954777108555e-05, "loss": 2.439, "mean_token_accuracy": 0.4034482777118683, "step": 31920 }, { "epoch": 0.03215519907014316, "grad_norm": 14.765989799890471, "learning_rate": 3.215458372781661e-05, "loss": 1.994, "mean_token_accuracy": 0.49879008531570435, "step": 31925 }, { "epoch": 0.032160235123247324, "grad_norm": 18.4559098476638, "learning_rate": 3.215961968454767e-05, "loss": 2.5886, "mean_token_accuracy": 0.4330913484096527, "step": 31930 }, { "epoch": 0.0321652711763515, "grad_norm": 17.271322567993348, "learning_rate": 3.216465564127873e-05, "loss": 2.6926, "mean_token_accuracy": 0.38275861740112305, "step": 31935 }, { "epoch": 0.03217030722945567, "grad_norm": 15.531095598974309, "learning_rate": 3.2169691598009794e-05, "loss": 2.8378, "mean_token_accuracy": 0.3655172407627106, "step": 31940 }, { "epoch": 0.032175343282559846, "grad_norm": 21.826110288110947, "learning_rate": 3.217472755474085e-05, "loss": 2.794, "mean_token_accuracy": 0.39310344457626345, "step": 31945 }, { "epoch": 0.03218037933566402, "grad_norm": 13.483256730885234, "learning_rate": 3.217976351147191e-05, "loss": 2.5576, "mean_token_accuracy": 0.4068965554237366, "step": 31950 }, { "epoch": 0.03218541538876819, "grad_norm": 25.399759107685377, "learning_rate": 3.218479946820297e-05, "loss": 2.7941, "mean_token_accuracy": 0.3655172407627106, "step": 31955 }, { "epoch": 0.03219045144187237, "grad_norm": 13.795769592448956, "learning_rate": 3.218983542493403e-05, "loss": 2.2029, "mean_token_accuracy": 0.46896551847457885, "step": 31960 }, { "epoch": 0.032195487494976534, "grad_norm": 19.687534207059002, "learning_rate": 3.219487138166509e-05, "loss": 2.7932, "mean_token_accuracy": 0.38124621510505674, "step": 31965 }, { "epoch": 0.03220052354808071, "grad_norm": 15.66597812115187, "learning_rate": 3.219990733839615e-05, "loss": 2.1681, "mean_token_accuracy": 0.41379310488700866, "step": 31970 }, { "epoch": 0.03220555960118488, "grad_norm": 18.582798690754903, "learning_rate": 3.220494329512721e-05, "loss": 2.505, "mean_token_accuracy": 0.40344828367233276, "step": 31975 }, { "epoch": 0.032210595654289055, "grad_norm": 20.892540358918726, "learning_rate": 3.220997925185827e-05, "loss": 2.7942, "mean_token_accuracy": 0.358620685338974, "step": 31980 }, { "epoch": 0.03221563170739323, "grad_norm": 21.311820183596392, "learning_rate": 3.221501520858933e-05, "loss": 2.8461, "mean_token_accuracy": 0.3482758641242981, "step": 31985 }, { "epoch": 0.0322206677604974, "grad_norm": 13.959394583223167, "learning_rate": 3.2220051165320393e-05, "loss": 2.5304, "mean_token_accuracy": 0.37931033968925476, "step": 31990 }, { "epoch": 0.032225703813601576, "grad_norm": 17.68524256841339, "learning_rate": 3.222508712205145e-05, "loss": 2.4262, "mean_token_accuracy": 0.45172413885593415, "step": 31995 }, { "epoch": 0.03223073986670574, "grad_norm": 11.963026965281514, "learning_rate": 3.223012307878251e-05, "loss": 2.522, "mean_token_accuracy": 0.4229885071516037, "step": 32000 }, { "epoch": 0.03223577591980992, "grad_norm": 20.74772290321624, "learning_rate": 3.223515903551357e-05, "loss": 2.2171, "mean_token_accuracy": 0.45172412395477296, "step": 32005 }, { "epoch": 0.03224081197291409, "grad_norm": 25.81333277563238, "learning_rate": 3.2240194992244624e-05, "loss": 2.4971, "mean_token_accuracy": 0.43448275327682495, "step": 32010 }, { "epoch": 0.032245848026018264, "grad_norm": 16.399910027816013, "learning_rate": 3.224523094897568e-05, "loss": 2.6021, "mean_token_accuracy": 0.41034482717514037, "step": 32015 }, { "epoch": 0.03225088407912244, "grad_norm": 15.444067457505872, "learning_rate": 3.225026690570675e-05, "loss": 2.328, "mean_token_accuracy": 0.4344827592372894, "step": 32020 }, { "epoch": 0.03225592013222661, "grad_norm": 18.380717556984646, "learning_rate": 3.225530286243781e-05, "loss": 2.6579, "mean_token_accuracy": 0.4068965494632721, "step": 32025 }, { "epoch": 0.032260956185330786, "grad_norm": 21.335418193543553, "learning_rate": 3.226033881916887e-05, "loss": 2.582, "mean_token_accuracy": 0.4034482777118683, "step": 32030 }, { "epoch": 0.03226599223843495, "grad_norm": 16.14789670198836, "learning_rate": 3.226537477589993e-05, "loss": 2.5123, "mean_token_accuracy": 0.39310344457626345, "step": 32035 }, { "epoch": 0.032271028291539126, "grad_norm": 37.10484275750181, "learning_rate": 3.2270410732630986e-05, "loss": 2.4986, "mean_token_accuracy": 0.37586206793785093, "step": 32040 }, { "epoch": 0.0322760643446433, "grad_norm": 16.336733506172358, "learning_rate": 3.227544668936205e-05, "loss": 2.2392, "mean_token_accuracy": 0.4551724135875702, "step": 32045 }, { "epoch": 0.032281100397747474, "grad_norm": 13.222422764888664, "learning_rate": 3.2280482646093104e-05, "loss": 2.1154, "mean_token_accuracy": 0.4896551728248596, "step": 32050 }, { "epoch": 0.03228613645085165, "grad_norm": 16.184272726538154, "learning_rate": 3.2285518602824164e-05, "loss": 2.6389, "mean_token_accuracy": 0.4120992124080658, "step": 32055 }, { "epoch": 0.03229117250395582, "grad_norm": 19.032141198336728, "learning_rate": 3.229055455955522e-05, "loss": 2.1948, "mean_token_accuracy": 0.4620689690113068, "step": 32060 }, { "epoch": 0.032296208557059995, "grad_norm": 17.6952403882455, "learning_rate": 3.229559051628628e-05, "loss": 2.4429, "mean_token_accuracy": 0.4150030255317688, "step": 32065 }, { "epoch": 0.03230124461016416, "grad_norm": 14.975568016200587, "learning_rate": 3.230062647301735e-05, "loss": 3.0166, "mean_token_accuracy": 0.35862069129943847, "step": 32070 }, { "epoch": 0.032306280663268336, "grad_norm": 16.535068595972923, "learning_rate": 3.230566242974841e-05, "loss": 2.4399, "mean_token_accuracy": 0.38620689511299133, "step": 32075 }, { "epoch": 0.03231131671637251, "grad_norm": 18.23646602773212, "learning_rate": 3.231069838647947e-05, "loss": 2.3771, "mean_token_accuracy": 0.4172413766384125, "step": 32080 }, { "epoch": 0.03231635276947668, "grad_norm": 17.42718513526834, "learning_rate": 3.2315734343210526e-05, "loss": 2.4916, "mean_token_accuracy": 0.4360556542873383, "step": 32085 }, { "epoch": 0.03232138882258086, "grad_norm": 19.450275304985905, "learning_rate": 3.2320770299941585e-05, "loss": 2.9534, "mean_token_accuracy": 0.36896551847457887, "step": 32090 }, { "epoch": 0.03232642487568503, "grad_norm": 16.82510128371573, "learning_rate": 3.2325806256672645e-05, "loss": 2.487, "mean_token_accuracy": 0.44996975660324096, "step": 32095 }, { "epoch": 0.032331460928789205, "grad_norm": 13.330910089092427, "learning_rate": 3.2330842213403704e-05, "loss": 2.5524, "mean_token_accuracy": 0.39745916724205016, "step": 32100 }, { "epoch": 0.03233649698189337, "grad_norm": 12.860601540779621, "learning_rate": 3.233587817013476e-05, "loss": 2.3268, "mean_token_accuracy": 0.4344827651977539, "step": 32105 }, { "epoch": 0.032341533034997545, "grad_norm": 18.282960698250932, "learning_rate": 3.234091412686582e-05, "loss": 2.3494, "mean_token_accuracy": 0.458620685338974, "step": 32110 }, { "epoch": 0.03234656908810172, "grad_norm": 14.623584881450494, "learning_rate": 3.234595008359688e-05, "loss": 2.6351, "mean_token_accuracy": 0.38275861740112305, "step": 32115 }, { "epoch": 0.03235160514120589, "grad_norm": 15.024159822342462, "learning_rate": 3.235098604032794e-05, "loss": 2.5269, "mean_token_accuracy": 0.4225045382976532, "step": 32120 }, { "epoch": 0.032356641194310067, "grad_norm": 19.099184236849922, "learning_rate": 3.235602199705901e-05, "loss": 2.4594, "mean_token_accuracy": 0.46551724076271056, "step": 32125 }, { "epoch": 0.03236167724741424, "grad_norm": 16.44982788636979, "learning_rate": 3.2361057953790066e-05, "loss": 2.4191, "mean_token_accuracy": 0.4448275864124298, "step": 32130 }, { "epoch": 0.032366713300518414, "grad_norm": 13.983052278791792, "learning_rate": 3.2366093910521125e-05, "loss": 2.3441, "mean_token_accuracy": 0.43103447556495667, "step": 32135 }, { "epoch": 0.03237174935362258, "grad_norm": 18.17881352541018, "learning_rate": 3.2371129867252185e-05, "loss": 2.346, "mean_token_accuracy": 0.4310344815254211, "step": 32140 }, { "epoch": 0.032376785406726755, "grad_norm": 13.80726677187621, "learning_rate": 3.237616582398324e-05, "loss": 2.2867, "mean_token_accuracy": 0.42413792610168455, "step": 32145 }, { "epoch": 0.03238182145983093, "grad_norm": 16.28163112531554, "learning_rate": 3.23812017807143e-05, "loss": 2.2719, "mean_token_accuracy": 0.4137930989265442, "step": 32150 }, { "epoch": 0.0323868575129351, "grad_norm": 15.098626086410823, "learning_rate": 3.238623773744536e-05, "loss": 2.95, "mean_token_accuracy": 0.35517241060733795, "step": 32155 }, { "epoch": 0.032391893566039276, "grad_norm": 15.224541699012704, "learning_rate": 3.239127369417642e-05, "loss": 2.5146, "mean_token_accuracy": 0.3862068891525269, "step": 32160 }, { "epoch": 0.03239692961914345, "grad_norm": 24.51257894992692, "learning_rate": 3.239630965090748e-05, "loss": 2.6108, "mean_token_accuracy": 0.3586206793785095, "step": 32165 }, { "epoch": 0.032401965672247623, "grad_norm": 16.041702719256328, "learning_rate": 3.240134560763854e-05, "loss": 2.1935, "mean_token_accuracy": 0.4379310369491577, "step": 32170 }, { "epoch": 0.03240700172535179, "grad_norm": 20.03427273189437, "learning_rate": 3.24063815643696e-05, "loss": 2.8543, "mean_token_accuracy": 0.3999999940395355, "step": 32175 }, { "epoch": 0.032412037778455964, "grad_norm": 15.025860946444702, "learning_rate": 3.2411417521100665e-05, "loss": 2.6747, "mean_token_accuracy": 0.3896551728248596, "step": 32180 }, { "epoch": 0.03241707383156014, "grad_norm": 15.18273402469302, "learning_rate": 3.241645347783172e-05, "loss": 2.3062, "mean_token_accuracy": 0.4517241358757019, "step": 32185 }, { "epoch": 0.03242210988466431, "grad_norm": 18.556809033548994, "learning_rate": 3.242148943456278e-05, "loss": 2.4114, "mean_token_accuracy": 0.42068966031074523, "step": 32190 }, { "epoch": 0.032427145937768485, "grad_norm": 16.060983362481547, "learning_rate": 3.2426525391293836e-05, "loss": 2.814, "mean_token_accuracy": 0.35862069129943847, "step": 32195 }, { "epoch": 0.03243218199087266, "grad_norm": 14.05237509166311, "learning_rate": 3.2431561348024896e-05, "loss": 2.6087, "mean_token_accuracy": 0.38275861740112305, "step": 32200 }, { "epoch": 0.03243721804397683, "grad_norm": 17.0173683494379, "learning_rate": 3.243659730475596e-05, "loss": 2.4193, "mean_token_accuracy": 0.4218995749950409, "step": 32205 }, { "epoch": 0.032442254097081, "grad_norm": 14.105437716019017, "learning_rate": 3.244163326148702e-05, "loss": 2.6755, "mean_token_accuracy": 0.3965517163276672, "step": 32210 }, { "epoch": 0.032447290150185173, "grad_norm": 18.915968337623788, "learning_rate": 3.244666921821808e-05, "loss": 2.5324, "mean_token_accuracy": 0.3965517282485962, "step": 32215 }, { "epoch": 0.03245232620328935, "grad_norm": 18.336440009248875, "learning_rate": 3.245170517494914e-05, "loss": 2.4277, "mean_token_accuracy": 0.39655172228813174, "step": 32220 }, { "epoch": 0.03245736225639352, "grad_norm": 18.859294052830872, "learning_rate": 3.24567411316802e-05, "loss": 2.4793, "mean_token_accuracy": 0.41209921836853025, "step": 32225 }, { "epoch": 0.032462398309497695, "grad_norm": 13.70911000016588, "learning_rate": 3.246177708841126e-05, "loss": 2.7154, "mean_token_accuracy": 0.38620689809322356, "step": 32230 }, { "epoch": 0.03246743436260187, "grad_norm": 14.393306776236178, "learning_rate": 3.246681304514232e-05, "loss": 2.7389, "mean_token_accuracy": 0.37586207389831544, "step": 32235 }, { "epoch": 0.03247247041570604, "grad_norm": 16.65650115500263, "learning_rate": 3.2471849001873376e-05, "loss": 1.9689, "mean_token_accuracy": 0.44827585816383364, "step": 32240 }, { "epoch": 0.03247750646881021, "grad_norm": 13.093253149439937, "learning_rate": 3.2476884958604436e-05, "loss": 2.0934, "mean_token_accuracy": 0.43448275327682495, "step": 32245 }, { "epoch": 0.03248254252191438, "grad_norm": 13.270009278754165, "learning_rate": 3.2481920915335495e-05, "loss": 2.5177, "mean_token_accuracy": 0.4310344815254211, "step": 32250 }, { "epoch": 0.03248757857501856, "grad_norm": 14.8297682942415, "learning_rate": 3.2486956872066554e-05, "loss": 2.6759, "mean_token_accuracy": 0.3413793116807938, "step": 32255 }, { "epoch": 0.03249261462812273, "grad_norm": 15.850042286632341, "learning_rate": 3.249199282879762e-05, "loss": 2.5311, "mean_token_accuracy": 0.34137930870056155, "step": 32260 }, { "epoch": 0.032497650681226904, "grad_norm": 17.355580273969014, "learning_rate": 3.249702878552868e-05, "loss": 2.5011, "mean_token_accuracy": 0.3793103456497192, "step": 32265 }, { "epoch": 0.03250268673433108, "grad_norm": 18.597345245564842, "learning_rate": 3.250206474225974e-05, "loss": 2.606, "mean_token_accuracy": 0.38965516686439516, "step": 32270 }, { "epoch": 0.03250772278743525, "grad_norm": 17.691182532611357, "learning_rate": 3.250710069899079e-05, "loss": 2.3904, "mean_token_accuracy": 0.42238354682922363, "step": 32275 }, { "epoch": 0.03251275884053942, "grad_norm": 12.973132273489238, "learning_rate": 3.251213665572185e-05, "loss": 2.4805, "mean_token_accuracy": 0.41379311084747317, "step": 32280 }, { "epoch": 0.03251779489364359, "grad_norm": 15.411380593076348, "learning_rate": 3.2517172612452916e-05, "loss": 2.7634, "mean_token_accuracy": 0.3811857283115387, "step": 32285 }, { "epoch": 0.032522830946747766, "grad_norm": 15.80851553983375, "learning_rate": 3.2522208569183976e-05, "loss": 2.5959, "mean_token_accuracy": 0.36896551251411436, "step": 32290 }, { "epoch": 0.03252786699985194, "grad_norm": 13.904617417854315, "learning_rate": 3.2527244525915035e-05, "loss": 2.5049, "mean_token_accuracy": 0.4310344934463501, "step": 32295 }, { "epoch": 0.032532903052956114, "grad_norm": 18.111453857054155, "learning_rate": 3.2532280482646094e-05, "loss": 2.7707, "mean_token_accuracy": 0.3793103516101837, "step": 32300 }, { "epoch": 0.03253793910606029, "grad_norm": 15.259650204224474, "learning_rate": 3.2537316439377153e-05, "loss": 2.1062, "mean_token_accuracy": 0.42758620381355283, "step": 32305 }, { "epoch": 0.03254297515916446, "grad_norm": 17.14264131869594, "learning_rate": 3.254235239610822e-05, "loss": 2.4472, "mean_token_accuracy": 0.4137930989265442, "step": 32310 }, { "epoch": 0.03254801121226863, "grad_norm": 18.216093845166327, "learning_rate": 3.254738835283927e-05, "loss": 3.3084, "mean_token_accuracy": 0.3034482687711716, "step": 32315 }, { "epoch": 0.0325530472653728, "grad_norm": 16.833267191726566, "learning_rate": 3.255242430957033e-05, "loss": 2.7356, "mean_token_accuracy": 0.3827586233615875, "step": 32320 }, { "epoch": 0.032558083318476976, "grad_norm": 14.263316660709878, "learning_rate": 3.255746026630139e-05, "loss": 2.5684, "mean_token_accuracy": 0.40689654350280763, "step": 32325 }, { "epoch": 0.03256311937158115, "grad_norm": 21.129096249725375, "learning_rate": 3.256249622303245e-05, "loss": 2.6917, "mean_token_accuracy": 0.3793103456497192, "step": 32330 }, { "epoch": 0.03256815542468532, "grad_norm": 15.109700293836818, "learning_rate": 3.2567532179763516e-05, "loss": 2.7367, "mean_token_accuracy": 0.38275861740112305, "step": 32335 }, { "epoch": 0.0325731914777895, "grad_norm": 14.672514085772532, "learning_rate": 3.2572568136494575e-05, "loss": 2.501, "mean_token_accuracy": 0.42068966031074523, "step": 32340 }, { "epoch": 0.03257822753089367, "grad_norm": 21.21441372986871, "learning_rate": 3.2577604093225634e-05, "loss": 2.4776, "mean_token_accuracy": 0.42068964838981626, "step": 32345 }, { "epoch": 0.03258326358399784, "grad_norm": 19.521571024778307, "learning_rate": 3.2582640049956694e-05, "loss": 2.8759, "mean_token_accuracy": 0.38620689511299133, "step": 32350 }, { "epoch": 0.03258829963710201, "grad_norm": 12.866670673767898, "learning_rate": 3.258767600668775e-05, "loss": 2.4569, "mean_token_accuracy": 0.4034482777118683, "step": 32355 }, { "epoch": 0.032593335690206185, "grad_norm": 15.294640161388223, "learning_rate": 3.259271196341881e-05, "loss": 2.4256, "mean_token_accuracy": 0.4275861978530884, "step": 32360 }, { "epoch": 0.03259837174331036, "grad_norm": 22.458370346777123, "learning_rate": 3.259774792014987e-05, "loss": 2.4107, "mean_token_accuracy": 0.47032020688056947, "step": 32365 }, { "epoch": 0.03260340779641453, "grad_norm": 26.28077278790031, "learning_rate": 3.260278387688093e-05, "loss": 2.7524, "mean_token_accuracy": 0.37241379022598264, "step": 32370 }, { "epoch": 0.032608443849518706, "grad_norm": 14.988661724543077, "learning_rate": 3.260781983361199e-05, "loss": 2.6765, "mean_token_accuracy": 0.37241379022598264, "step": 32375 }, { "epoch": 0.03261347990262288, "grad_norm": 16.100163483926046, "learning_rate": 3.261285579034305e-05, "loss": 2.4524, "mean_token_accuracy": 0.44482759237289426, "step": 32380 }, { "epoch": 0.03261851595572705, "grad_norm": 16.267974310613923, "learning_rate": 3.261789174707411e-05, "loss": 2.3633, "mean_token_accuracy": 0.4344827592372894, "step": 32385 }, { "epoch": 0.03262355200883122, "grad_norm": 13.636739659224656, "learning_rate": 3.2622927703805174e-05, "loss": 2.4255, "mean_token_accuracy": 0.43103448748588563, "step": 32390 }, { "epoch": 0.032628588061935394, "grad_norm": 19.241362527388315, "learning_rate": 3.2627963660536234e-05, "loss": 2.713, "mean_token_accuracy": 0.3655172407627106, "step": 32395 }, { "epoch": 0.03263362411503957, "grad_norm": 17.101459549411683, "learning_rate": 3.263299961726729e-05, "loss": 2.5745, "mean_token_accuracy": 0.4413793087005615, "step": 32400 }, { "epoch": 0.03263866016814374, "grad_norm": 14.632414485445233, "learning_rate": 3.263803557399835e-05, "loss": 2.6531, "mean_token_accuracy": 0.441379314661026, "step": 32405 }, { "epoch": 0.032643696221247916, "grad_norm": 18.3375729386485, "learning_rate": 3.2643071530729405e-05, "loss": 2.975, "mean_token_accuracy": 0.3655172407627106, "step": 32410 }, { "epoch": 0.03264873227435209, "grad_norm": 25.145133926330377, "learning_rate": 3.264810748746047e-05, "loss": 2.6496, "mean_token_accuracy": 0.37241379618644715, "step": 32415 }, { "epoch": 0.032653768327456256, "grad_norm": 17.51596270172547, "learning_rate": 3.265314344419153e-05, "loss": 2.6977, "mean_token_accuracy": 0.36896551847457887, "step": 32420 }, { "epoch": 0.03265880438056043, "grad_norm": 17.64171167935097, "learning_rate": 3.265817940092259e-05, "loss": 2.874, "mean_token_accuracy": 0.3655172407627106, "step": 32425 }, { "epoch": 0.032663840433664604, "grad_norm": 13.934183998688624, "learning_rate": 3.266321535765365e-05, "loss": 2.2642, "mean_token_accuracy": 0.4586206912994385, "step": 32430 }, { "epoch": 0.03266887648676878, "grad_norm": 15.839976746346874, "learning_rate": 3.266825131438471e-05, "loss": 2.8586, "mean_token_accuracy": 0.36702964305877683, "step": 32435 }, { "epoch": 0.03267391253987295, "grad_norm": 16.75131639610263, "learning_rate": 3.267328727111577e-05, "loss": 3.1976, "mean_token_accuracy": 0.3827586233615875, "step": 32440 }, { "epoch": 0.032678948592977125, "grad_norm": 18.332316803193702, "learning_rate": 3.267832322784683e-05, "loss": 2.754, "mean_token_accuracy": 0.38275861740112305, "step": 32445 }, { "epoch": 0.0326839846460813, "grad_norm": 15.568692537268564, "learning_rate": 3.2683359184577885e-05, "loss": 2.6681, "mean_token_accuracy": 0.3931034505367279, "step": 32450 }, { "epoch": 0.032689020699185466, "grad_norm": 12.886240656636897, "learning_rate": 3.2688395141308945e-05, "loss": 2.4385, "mean_token_accuracy": 0.4103448331356049, "step": 32455 }, { "epoch": 0.03269405675228964, "grad_norm": 17.45917550119998, "learning_rate": 3.2693431098040004e-05, "loss": 2.4444, "mean_token_accuracy": 0.3551724016666412, "step": 32460 }, { "epoch": 0.03269909280539381, "grad_norm": 18.194960230112926, "learning_rate": 3.269846705477106e-05, "loss": 2.8905, "mean_token_accuracy": 0.3413793116807938, "step": 32465 }, { "epoch": 0.03270412885849799, "grad_norm": 20.256597680089506, "learning_rate": 3.270350301150213e-05, "loss": 2.5095, "mean_token_accuracy": 0.4172413766384125, "step": 32470 }, { "epoch": 0.03270916491160216, "grad_norm": 13.679478173965265, "learning_rate": 3.270853896823319e-05, "loss": 2.1934, "mean_token_accuracy": 0.4551724076271057, "step": 32475 }, { "epoch": 0.032714200964706334, "grad_norm": 15.38817868053556, "learning_rate": 3.271357492496425e-05, "loss": 2.4598, "mean_token_accuracy": 0.45359952449798585, "step": 32480 }, { "epoch": 0.03271923701781051, "grad_norm": 13.33233748028832, "learning_rate": 3.271861088169531e-05, "loss": 2.0339, "mean_token_accuracy": 0.48275862336158754, "step": 32485 }, { "epoch": 0.032724273070914675, "grad_norm": 17.634742522008263, "learning_rate": 3.2723646838426366e-05, "loss": 2.0843, "mean_token_accuracy": 0.4624384164810181, "step": 32490 }, { "epoch": 0.03272930912401885, "grad_norm": 16.972196529768752, "learning_rate": 3.2728682795157425e-05, "loss": 2.0786, "mean_token_accuracy": 0.4745311617851257, "step": 32495 }, { "epoch": 0.03273434517712302, "grad_norm": 15.63336540800433, "learning_rate": 3.2733718751888485e-05, "loss": 2.5229, "mean_token_accuracy": 0.36206896901130675, "step": 32500 }, { "epoch": 0.032739381230227196, "grad_norm": 15.744294802959585, "learning_rate": 3.2738754708619544e-05, "loss": 2.48, "mean_token_accuracy": 0.37586207687854767, "step": 32505 }, { "epoch": 0.03274441728333137, "grad_norm": 15.706699492731067, "learning_rate": 3.27437906653506e-05, "loss": 2.6452, "mean_token_accuracy": 0.3827586114406586, "step": 32510 }, { "epoch": 0.032749453336435544, "grad_norm": 14.62583286960497, "learning_rate": 3.274882662208166e-05, "loss": 2.5581, "mean_token_accuracy": 0.42413792610168455, "step": 32515 }, { "epoch": 0.03275448938953972, "grad_norm": 16.327236137582013, "learning_rate": 3.275386257881272e-05, "loss": 2.7675, "mean_token_accuracy": 0.4034482777118683, "step": 32520 }, { "epoch": 0.032759525442643884, "grad_norm": 14.418837489538838, "learning_rate": 3.275889853554379e-05, "loss": 3.0492, "mean_token_accuracy": 0.335632187128067, "step": 32525 }, { "epoch": 0.03276456149574806, "grad_norm": 19.571838839152466, "learning_rate": 3.276393449227485e-05, "loss": 2.5628, "mean_token_accuracy": 0.44482759237289426, "step": 32530 }, { "epoch": 0.03276959754885223, "grad_norm": 16.185364961414447, "learning_rate": 3.2768970449005906e-05, "loss": 2.7562, "mean_token_accuracy": 0.4103448152542114, "step": 32535 }, { "epoch": 0.032774633601956406, "grad_norm": 19.374238875990887, "learning_rate": 3.2774006405736965e-05, "loss": 2.6599, "mean_token_accuracy": 0.4103448331356049, "step": 32540 }, { "epoch": 0.03277966965506058, "grad_norm": 23.46081446532975, "learning_rate": 3.277904236246802e-05, "loss": 2.7965, "mean_token_accuracy": 0.3620689630508423, "step": 32545 }, { "epoch": 0.03278470570816475, "grad_norm": 43.8760298723066, "learning_rate": 3.2784078319199084e-05, "loss": 2.8466, "mean_token_accuracy": 0.3517241358757019, "step": 32550 }, { "epoch": 0.03278974176126893, "grad_norm": 14.556031174394118, "learning_rate": 3.278911427593014e-05, "loss": 2.4029, "mean_token_accuracy": 0.4, "step": 32555 }, { "epoch": 0.032794777814373094, "grad_norm": 14.165278450746793, "learning_rate": 3.27941502326612e-05, "loss": 2.7859, "mean_token_accuracy": 0.35517241060733795, "step": 32560 }, { "epoch": 0.03279981386747727, "grad_norm": 14.130001904812096, "learning_rate": 3.279918618939226e-05, "loss": 2.7425, "mean_token_accuracy": 0.358620685338974, "step": 32565 }, { "epoch": 0.03280484992058144, "grad_norm": 13.785734789087467, "learning_rate": 3.280422214612332e-05, "loss": 2.2672, "mean_token_accuracy": 0.44664247035980226, "step": 32570 }, { "epoch": 0.032809885973685615, "grad_norm": 17.009895786402716, "learning_rate": 3.280925810285439e-05, "loss": 2.5961, "mean_token_accuracy": 0.4137930989265442, "step": 32575 }, { "epoch": 0.03281492202678979, "grad_norm": 15.864609471595381, "learning_rate": 3.2814294059585446e-05, "loss": 2.5674, "mean_token_accuracy": 0.4172413766384125, "step": 32580 }, { "epoch": 0.03281995807989396, "grad_norm": 15.276121474227274, "learning_rate": 3.28193300163165e-05, "loss": 3.0003, "mean_token_accuracy": 0.33793103992938994, "step": 32585 }, { "epoch": 0.032824994132998137, "grad_norm": 17.341411846904155, "learning_rate": 3.282436597304756e-05, "loss": 2.6605, "mean_token_accuracy": 0.4084694445133209, "step": 32590 }, { "epoch": 0.0328300301861023, "grad_norm": 13.761140576288689, "learning_rate": 3.282940192977862e-05, "loss": 2.5202, "mean_token_accuracy": 0.4517241418361664, "step": 32595 }, { "epoch": 0.03283506623920648, "grad_norm": 14.170902206409414, "learning_rate": 3.2834437886509676e-05, "loss": 2.1625, "mean_token_accuracy": 0.4896551787853241, "step": 32600 }, { "epoch": 0.03284010229231065, "grad_norm": 22.920358752005935, "learning_rate": 3.283947384324074e-05, "loss": 2.3717, "mean_token_accuracy": 0.42758620381355283, "step": 32605 }, { "epoch": 0.032845138345414825, "grad_norm": 13.484745472748521, "learning_rate": 3.28445097999718e-05, "loss": 2.3805, "mean_token_accuracy": 0.4448275864124298, "step": 32610 }, { "epoch": 0.032850174398519, "grad_norm": 33.812520425550005, "learning_rate": 3.284954575670286e-05, "loss": 2.5333, "mean_token_accuracy": 0.42068964838981626, "step": 32615 }, { "epoch": 0.03285521045162317, "grad_norm": 13.50382108218205, "learning_rate": 3.285458171343392e-05, "loss": 2.3242, "mean_token_accuracy": 0.4241379201412201, "step": 32620 }, { "epoch": 0.032860246504727346, "grad_norm": 19.01702079389191, "learning_rate": 3.285961767016498e-05, "loss": 2.4528, "mean_token_accuracy": 0.42758620381355283, "step": 32625 }, { "epoch": 0.03286528255783151, "grad_norm": 17.360495059037017, "learning_rate": 3.286465362689604e-05, "loss": 2.444, "mean_token_accuracy": 0.39655172526836396, "step": 32630 }, { "epoch": 0.032870318610935687, "grad_norm": 13.637231170933239, "learning_rate": 3.28696895836271e-05, "loss": 2.6759, "mean_token_accuracy": 0.44283121824264526, "step": 32635 }, { "epoch": 0.03287535466403986, "grad_norm": 24.739942452078214, "learning_rate": 3.287472554035816e-05, "loss": 2.5579, "mean_token_accuracy": 0.39655172228813174, "step": 32640 }, { "epoch": 0.032880390717144034, "grad_norm": 16.647408438471913, "learning_rate": 3.2879761497089217e-05, "loss": 2.5326, "mean_token_accuracy": 0.44827585220336913, "step": 32645 }, { "epoch": 0.03288542677024821, "grad_norm": 16.424397996555225, "learning_rate": 3.2884797453820276e-05, "loss": 2.0253, "mean_token_accuracy": 0.4793103516101837, "step": 32650 }, { "epoch": 0.03289046282335238, "grad_norm": 15.33131228653515, "learning_rate": 3.288983341055134e-05, "loss": 2.4195, "mean_token_accuracy": 0.4034482717514038, "step": 32655 }, { "epoch": 0.032895498876456555, "grad_norm": 13.874489198003475, "learning_rate": 3.28948693672824e-05, "loss": 2.3916, "mean_token_accuracy": 0.4172413766384125, "step": 32660 }, { "epoch": 0.03290053492956072, "grad_norm": 13.700994907168901, "learning_rate": 3.289990532401346e-05, "loss": 2.872, "mean_token_accuracy": 0.3620689630508423, "step": 32665 }, { "epoch": 0.032905570982664896, "grad_norm": 15.233080226532602, "learning_rate": 3.290494128074452e-05, "loss": 2.5351, "mean_token_accuracy": 0.46061705946922304, "step": 32670 }, { "epoch": 0.03291060703576907, "grad_norm": 15.122612866384182, "learning_rate": 3.290997723747558e-05, "loss": 2.4321, "mean_token_accuracy": 0.4103448212146759, "step": 32675 }, { "epoch": 0.032915643088873243, "grad_norm": 15.5546120122505, "learning_rate": 3.291501319420663e-05, "loss": 2.6522, "mean_token_accuracy": 0.3758620649576187, "step": 32680 }, { "epoch": 0.03292067914197742, "grad_norm": 18.58651549224703, "learning_rate": 3.29200491509377e-05, "loss": 2.5107, "mean_token_accuracy": 0.3758620619773865, "step": 32685 }, { "epoch": 0.03292571519508159, "grad_norm": 16.77188971342295, "learning_rate": 3.2925085107668757e-05, "loss": 2.4989, "mean_token_accuracy": 0.3655172407627106, "step": 32690 }, { "epoch": 0.032930751248185765, "grad_norm": 15.064119263848445, "learning_rate": 3.2930121064399816e-05, "loss": 2.6803, "mean_token_accuracy": 0.32758620381355286, "step": 32695 }, { "epoch": 0.03293578730128993, "grad_norm": 15.629164831492382, "learning_rate": 3.2935157021130875e-05, "loss": 2.4515, "mean_token_accuracy": 0.441379314661026, "step": 32700 }, { "epoch": 0.032940823354394105, "grad_norm": 19.312839807927848, "learning_rate": 3.2940192977861934e-05, "loss": 2.7672, "mean_token_accuracy": 0.3827586233615875, "step": 32705 }, { "epoch": 0.03294585940749828, "grad_norm": 13.74508109066384, "learning_rate": 3.2945228934593e-05, "loss": 2.4376, "mean_token_accuracy": 0.39655172228813174, "step": 32710 }, { "epoch": 0.03295089546060245, "grad_norm": 14.264688306605068, "learning_rate": 3.295026489132406e-05, "loss": 2.7354, "mean_token_accuracy": 0.3931034505367279, "step": 32715 }, { "epoch": 0.03295593151370663, "grad_norm": 14.819957772864308, "learning_rate": 3.295530084805511e-05, "loss": 2.4244, "mean_token_accuracy": 0.4137930989265442, "step": 32720 }, { "epoch": 0.0329609675668108, "grad_norm": 13.415476694124704, "learning_rate": 3.296033680478617e-05, "loss": 2.5399, "mean_token_accuracy": 0.45862067937850953, "step": 32725 }, { "epoch": 0.032966003619914974, "grad_norm": 17.35436693927641, "learning_rate": 3.296537276151723e-05, "loss": 2.5272, "mean_token_accuracy": 0.3999999940395355, "step": 32730 }, { "epoch": 0.03297103967301914, "grad_norm": 14.651602809937497, "learning_rate": 3.29704087182483e-05, "loss": 2.2693, "mean_token_accuracy": 0.41379310488700866, "step": 32735 }, { "epoch": 0.032976075726123315, "grad_norm": 14.714131839229292, "learning_rate": 3.2975444674979356e-05, "loss": 2.9447, "mean_token_accuracy": 0.36896551847457887, "step": 32740 }, { "epoch": 0.03298111177922749, "grad_norm": 17.26957853112257, "learning_rate": 3.2980480631710415e-05, "loss": 2.5958, "mean_token_accuracy": 0.38275861740112305, "step": 32745 }, { "epoch": 0.03298614783233166, "grad_norm": 13.979668246781813, "learning_rate": 3.2985516588441474e-05, "loss": 2.4161, "mean_token_accuracy": 0.44482758045196535, "step": 32750 }, { "epoch": 0.032991183885435836, "grad_norm": 13.237033524918994, "learning_rate": 3.2990552545172534e-05, "loss": 2.0654, "mean_token_accuracy": 0.4551724076271057, "step": 32755 }, { "epoch": 0.03299621993854001, "grad_norm": 31.87565633030914, "learning_rate": 3.299558850190359e-05, "loss": 3.1275, "mean_token_accuracy": 0.37241379022598264, "step": 32760 }, { "epoch": 0.033001255991644184, "grad_norm": 19.30916172166212, "learning_rate": 3.300062445863465e-05, "loss": 2.9449, "mean_token_accuracy": 0.37586206793785093, "step": 32765 }, { "epoch": 0.03300629204474835, "grad_norm": 13.601206403633212, "learning_rate": 3.300566041536571e-05, "loss": 2.3829, "mean_token_accuracy": 0.4206896543502808, "step": 32770 }, { "epoch": 0.033011328097852524, "grad_norm": 16.141391459853192, "learning_rate": 3.301069637209677e-05, "loss": 2.4434, "mean_token_accuracy": 0.41379311084747317, "step": 32775 }, { "epoch": 0.0330163641509567, "grad_norm": 16.526577673594275, "learning_rate": 3.301573232882783e-05, "loss": 2.3123, "mean_token_accuracy": 0.4604355752468109, "step": 32780 }, { "epoch": 0.03302140020406087, "grad_norm": 24.643942491537437, "learning_rate": 3.302076828555889e-05, "loss": 2.6579, "mean_token_accuracy": 0.38275861740112305, "step": 32785 }, { "epoch": 0.033026436257165045, "grad_norm": 16.47801802227408, "learning_rate": 3.3025804242289955e-05, "loss": 2.5916, "mean_token_accuracy": 0.41875377893447874, "step": 32790 }, { "epoch": 0.03303147231026922, "grad_norm": 19.186217286286556, "learning_rate": 3.3030840199021014e-05, "loss": 2.322, "mean_token_accuracy": 0.44482758045196535, "step": 32795 }, { "epoch": 0.03303650836337339, "grad_norm": 14.606280679160797, "learning_rate": 3.3035876155752074e-05, "loss": 2.8743, "mean_token_accuracy": 0.39310343861579894, "step": 32800 }, { "epoch": 0.03304154441647756, "grad_norm": 26.2061474423102, "learning_rate": 3.304091211248313e-05, "loss": 2.9024, "mean_token_accuracy": 0.4275862067937851, "step": 32805 }, { "epoch": 0.033046580469581734, "grad_norm": 14.26947536340395, "learning_rate": 3.3045948069214185e-05, "loss": 2.7068, "mean_token_accuracy": 0.3931034505367279, "step": 32810 }, { "epoch": 0.03305161652268591, "grad_norm": 21.934174959490285, "learning_rate": 3.305098402594525e-05, "loss": 2.6867, "mean_token_accuracy": 0.40344828367233276, "step": 32815 }, { "epoch": 0.03305665257579008, "grad_norm": 19.71199654781282, "learning_rate": 3.305601998267631e-05, "loss": 2.8772, "mean_token_accuracy": 0.35862069129943847, "step": 32820 }, { "epoch": 0.033061688628894255, "grad_norm": 14.184218606520233, "learning_rate": 3.306105593940737e-05, "loss": 2.599, "mean_token_accuracy": 0.43103448748588563, "step": 32825 }, { "epoch": 0.03306672468199843, "grad_norm": 21.144547968067535, "learning_rate": 3.306609189613843e-05, "loss": 2.922, "mean_token_accuracy": 0.39310344457626345, "step": 32830 }, { "epoch": 0.0330717607351026, "grad_norm": 14.796514973272407, "learning_rate": 3.307112785286949e-05, "loss": 2.0287, "mean_token_accuracy": 0.5034482777118683, "step": 32835 }, { "epoch": 0.03307679678820677, "grad_norm": 15.233998379859829, "learning_rate": 3.3076163809600555e-05, "loss": 2.4836, "mean_token_accuracy": 0.4413793087005615, "step": 32840 }, { "epoch": 0.03308183284131094, "grad_norm": 16.242673312143374, "learning_rate": 3.3081199766331614e-05, "loss": 2.4771, "mean_token_accuracy": 0.43793103098869324, "step": 32845 }, { "epoch": 0.03308686889441512, "grad_norm": 13.5680368270543, "learning_rate": 3.3086235723062666e-05, "loss": 1.9275, "mean_token_accuracy": 0.4344827592372894, "step": 32850 }, { "epoch": 0.03309190494751929, "grad_norm": 14.857133629414871, "learning_rate": 3.3091271679793725e-05, "loss": 2.6523, "mean_token_accuracy": 0.38620689511299133, "step": 32855 }, { "epoch": 0.033096941000623464, "grad_norm": 19.470643124619098, "learning_rate": 3.3096307636524785e-05, "loss": 2.8632, "mean_token_accuracy": 0.3896551787853241, "step": 32860 }, { "epoch": 0.03310197705372764, "grad_norm": 15.572803051558685, "learning_rate": 3.3101343593255844e-05, "loss": 2.5144, "mean_token_accuracy": 0.43414571285247805, "step": 32865 }, { "epoch": 0.03310701310683181, "grad_norm": 29.17644828434295, "learning_rate": 3.310637954998691e-05, "loss": 2.7512, "mean_token_accuracy": 0.3999999940395355, "step": 32870 }, { "epoch": 0.03311204915993598, "grad_norm": 27.802633706794726, "learning_rate": 3.311141550671797e-05, "loss": 2.6099, "mean_token_accuracy": 0.37586206793785093, "step": 32875 }, { "epoch": 0.03311708521304015, "grad_norm": 21.33415304577971, "learning_rate": 3.311645146344903e-05, "loss": 2.5907, "mean_token_accuracy": 0.41034482717514037, "step": 32880 }, { "epoch": 0.033122121266144326, "grad_norm": 13.503505508098185, "learning_rate": 3.312148742018009e-05, "loss": 2.3657, "mean_token_accuracy": 0.4878402888774872, "step": 32885 }, { "epoch": 0.0331271573192485, "grad_norm": 15.443038331686399, "learning_rate": 3.312652337691115e-05, "loss": 3.0062, "mean_token_accuracy": 0.3448275923728943, "step": 32890 }, { "epoch": 0.033132193372352674, "grad_norm": 15.225491335064921, "learning_rate": 3.3131559333642206e-05, "loss": 2.9249, "mean_token_accuracy": 0.36551724672317504, "step": 32895 }, { "epoch": 0.03313722942545685, "grad_norm": 19.099019693838706, "learning_rate": 3.3136595290373266e-05, "loss": 2.7016, "mean_token_accuracy": 0.34827585220336915, "step": 32900 }, { "epoch": 0.03314226547856102, "grad_norm": 15.563578133207391, "learning_rate": 3.3141631247104325e-05, "loss": 2.5884, "mean_token_accuracy": 0.44137930274009707, "step": 32905 }, { "epoch": 0.03314730153166519, "grad_norm": 21.3845044468222, "learning_rate": 3.3146667203835384e-05, "loss": 2.7174, "mean_token_accuracy": 0.38965516686439516, "step": 32910 }, { "epoch": 0.03315233758476936, "grad_norm": 13.745378551792657, "learning_rate": 3.315170316056644e-05, "loss": 3.0389, "mean_token_accuracy": 0.33260738253593447, "step": 32915 }, { "epoch": 0.033157373637873536, "grad_norm": 13.521368173600575, "learning_rate": 3.315673911729751e-05, "loss": 2.2271, "mean_token_accuracy": 0.44482758045196535, "step": 32920 }, { "epoch": 0.03316240969097771, "grad_norm": 12.712620518427432, "learning_rate": 3.316177507402857e-05, "loss": 2.5467, "mean_token_accuracy": 0.4034482777118683, "step": 32925 }, { "epoch": 0.03316744574408188, "grad_norm": 17.42330842763052, "learning_rate": 3.316681103075963e-05, "loss": 2.4931, "mean_token_accuracy": 0.358620685338974, "step": 32930 }, { "epoch": 0.03317248179718606, "grad_norm": 14.075211790528742, "learning_rate": 3.317184698749069e-05, "loss": 2.5371, "mean_token_accuracy": 0.3793103456497192, "step": 32935 }, { "epoch": 0.03317751785029023, "grad_norm": 14.758830134843063, "learning_rate": 3.3176882944221746e-05, "loss": 2.3068, "mean_token_accuracy": 0.4344827592372894, "step": 32940 }, { "epoch": 0.0331825539033944, "grad_norm": 17.388416649584013, "learning_rate": 3.31819189009528e-05, "loss": 2.1321, "mean_token_accuracy": 0.4713248610496521, "step": 32945 }, { "epoch": 0.03318758995649857, "grad_norm": 16.25904088259404, "learning_rate": 3.3186954857683865e-05, "loss": 2.5435, "mean_token_accuracy": 0.4172413766384125, "step": 32950 }, { "epoch": 0.033192626009602745, "grad_norm": 14.010933277384302, "learning_rate": 3.3191990814414924e-05, "loss": 2.8915, "mean_token_accuracy": 0.3758620709180832, "step": 32955 }, { "epoch": 0.03319766206270692, "grad_norm": 18.915387018368037, "learning_rate": 3.319702677114598e-05, "loss": 2.4008, "mean_token_accuracy": 0.41034482717514037, "step": 32960 }, { "epoch": 0.03320269811581109, "grad_norm": 15.52182085524452, "learning_rate": 3.320206272787704e-05, "loss": 2.1514, "mean_token_accuracy": 0.458620685338974, "step": 32965 }, { "epoch": 0.033207734168915266, "grad_norm": 11.995706018739137, "learning_rate": 3.32070986846081e-05, "loss": 2.2944, "mean_token_accuracy": 0.458620685338974, "step": 32970 }, { "epoch": 0.03321277022201944, "grad_norm": 18.089039180390436, "learning_rate": 3.321213464133917e-05, "loss": 2.791, "mean_token_accuracy": 0.35172413289546967, "step": 32975 }, { "epoch": 0.03321780627512361, "grad_norm": 16.80464875533471, "learning_rate": 3.321717059807023e-05, "loss": 2.5939, "mean_token_accuracy": 0.4432546854019165, "step": 32980 }, { "epoch": 0.03322284232822778, "grad_norm": 17.76193985514041, "learning_rate": 3.322220655480128e-05, "loss": 2.4068, "mean_token_accuracy": 0.41034482717514037, "step": 32985 }, { "epoch": 0.033227878381331954, "grad_norm": 18.892986445589255, "learning_rate": 3.322724251153234e-05, "loss": 2.5575, "mean_token_accuracy": 0.4068965494632721, "step": 32990 }, { "epoch": 0.03323291443443613, "grad_norm": 15.808080648279875, "learning_rate": 3.32322784682634e-05, "loss": 2.5703, "mean_token_accuracy": 0.3931034505367279, "step": 32995 }, { "epoch": 0.0332379504875403, "grad_norm": 13.54889599129294, "learning_rate": 3.3237314424994464e-05, "loss": 2.9481, "mean_token_accuracy": 0.36551723480224607, "step": 33000 }, { "epoch": 0.033242986540644476, "grad_norm": 12.552144263318109, "learning_rate": 3.3242350381725523e-05, "loss": 2.3748, "mean_token_accuracy": 0.4448275864124298, "step": 33005 }, { "epoch": 0.03324802259374865, "grad_norm": 16.066591431698455, "learning_rate": 3.324738633845658e-05, "loss": 3.0322, "mean_token_accuracy": 0.35983061194419863, "step": 33010 }, { "epoch": 0.033253058646852816, "grad_norm": 15.490126717108064, "learning_rate": 3.325242229518764e-05, "loss": 2.3441, "mean_token_accuracy": 0.41379310488700866, "step": 33015 }, { "epoch": 0.03325809469995699, "grad_norm": 16.990774496164942, "learning_rate": 3.32574582519187e-05, "loss": 2.793, "mean_token_accuracy": 0.42262552976608275, "step": 33020 }, { "epoch": 0.033263130753061164, "grad_norm": 15.766862528613073, "learning_rate": 3.326249420864976e-05, "loss": 2.7703, "mean_token_accuracy": 0.39655172228813174, "step": 33025 }, { "epoch": 0.03326816680616534, "grad_norm": 16.56155977292928, "learning_rate": 3.326753016538082e-05, "loss": 2.7486, "mean_token_accuracy": 0.4068965494632721, "step": 33030 }, { "epoch": 0.03327320285926951, "grad_norm": 14.240409954299082, "learning_rate": 3.327256612211188e-05, "loss": 2.2333, "mean_token_accuracy": 0.4068965494632721, "step": 33035 }, { "epoch": 0.033278238912373685, "grad_norm": 16.806186328435505, "learning_rate": 3.327760207884294e-05, "loss": 3.1286, "mean_token_accuracy": 0.3620689630508423, "step": 33040 }, { "epoch": 0.03328327496547786, "grad_norm": 15.277516432419226, "learning_rate": 3.3282638035574e-05, "loss": 2.5292, "mean_token_accuracy": 0.40859044194221494, "step": 33045 }, { "epoch": 0.033288311018582026, "grad_norm": 14.104097226528332, "learning_rate": 3.328767399230506e-05, "loss": 2.529, "mean_token_accuracy": 0.4206896543502808, "step": 33050 }, { "epoch": 0.0332933470716862, "grad_norm": 18.28627326729721, "learning_rate": 3.329270994903612e-05, "loss": 2.5753, "mean_token_accuracy": 0.43793103098869324, "step": 33055 }, { "epoch": 0.03329838312479037, "grad_norm": 15.083040234095684, "learning_rate": 3.329774590576718e-05, "loss": 2.2659, "mean_token_accuracy": 0.45172414779663084, "step": 33060 }, { "epoch": 0.03330341917789455, "grad_norm": 13.677435807551833, "learning_rate": 3.330278186249824e-05, "loss": 2.3495, "mean_token_accuracy": 0.4206896543502808, "step": 33065 }, { "epoch": 0.03330845523099872, "grad_norm": 14.239537944036051, "learning_rate": 3.33078178192293e-05, "loss": 2.1205, "mean_token_accuracy": 0.49522079825401305, "step": 33070 }, { "epoch": 0.033313491284102895, "grad_norm": 15.061175451136954, "learning_rate": 3.331285377596036e-05, "loss": 2.5303, "mean_token_accuracy": 0.42413792610168455, "step": 33075 }, { "epoch": 0.03331852733720707, "grad_norm": 21.753106809082745, "learning_rate": 3.331788973269142e-05, "loss": 2.2662, "mean_token_accuracy": 0.4655172348022461, "step": 33080 }, { "epoch": 0.033323563390311235, "grad_norm": 12.278847152702175, "learning_rate": 3.332292568942248e-05, "loss": 2.4451, "mean_token_accuracy": 0.41379311084747317, "step": 33085 }, { "epoch": 0.03332859944341541, "grad_norm": 15.400948490820884, "learning_rate": 3.332796164615354e-05, "loss": 2.1137, "mean_token_accuracy": 0.4642468154430389, "step": 33090 }, { "epoch": 0.03333363549651958, "grad_norm": 15.37636375666732, "learning_rate": 3.33329976028846e-05, "loss": 2.5247, "mean_token_accuracy": 0.3827586233615875, "step": 33095 }, { "epoch": 0.033338671549623757, "grad_norm": 15.926229810228714, "learning_rate": 3.3338033559615656e-05, "loss": 2.7152, "mean_token_accuracy": 0.3862069010734558, "step": 33100 }, { "epoch": 0.03334370760272793, "grad_norm": 15.528552504465917, "learning_rate": 3.3343069516346715e-05, "loss": 2.6528, "mean_token_accuracy": 0.39310344457626345, "step": 33105 }, { "epoch": 0.033348743655832104, "grad_norm": 27.259088801723504, "learning_rate": 3.334810547307778e-05, "loss": 2.8978, "mean_token_accuracy": 0.3482758641242981, "step": 33110 }, { "epoch": 0.03335377970893628, "grad_norm": 14.96597782987991, "learning_rate": 3.335314142980884e-05, "loss": 2.3211, "mean_token_accuracy": 0.4310344815254211, "step": 33115 }, { "epoch": 0.033358815762040445, "grad_norm": 12.354795096235605, "learning_rate": 3.335817738653989e-05, "loss": 2.1967, "mean_token_accuracy": 0.48054186105728147, "step": 33120 }, { "epoch": 0.03336385181514462, "grad_norm": 15.789989264706746, "learning_rate": 3.336321334327095e-05, "loss": 2.7695, "mean_token_accuracy": 0.3793103516101837, "step": 33125 }, { "epoch": 0.03336888786824879, "grad_norm": 16.110453248553732, "learning_rate": 3.336824930000201e-05, "loss": 2.214, "mean_token_accuracy": 0.4068965494632721, "step": 33130 }, { "epoch": 0.033373923921352966, "grad_norm": 18.17640855229856, "learning_rate": 3.337328525673308e-05, "loss": 2.6773, "mean_token_accuracy": 0.36896551251411436, "step": 33135 }, { "epoch": 0.03337895997445714, "grad_norm": 14.402292179960261, "learning_rate": 3.337832121346414e-05, "loss": 2.5751, "mean_token_accuracy": 0.3862069010734558, "step": 33140 }, { "epoch": 0.03338399602756131, "grad_norm": 18.257369049968627, "learning_rate": 3.3383357170195196e-05, "loss": 2.3465, "mean_token_accuracy": 0.4103448331356049, "step": 33145 }, { "epoch": 0.03338903208066549, "grad_norm": 17.06039855843876, "learning_rate": 3.3388393126926255e-05, "loss": 2.8489, "mean_token_accuracy": 0.37241379022598264, "step": 33150 }, { "epoch": 0.033394068133769654, "grad_norm": 15.628252019953884, "learning_rate": 3.3393429083657315e-05, "loss": 2.5832, "mean_token_accuracy": 0.40689654350280763, "step": 33155 }, { "epoch": 0.03339910418687383, "grad_norm": 12.962348523629823, "learning_rate": 3.3398465040388374e-05, "loss": 2.8061, "mean_token_accuracy": 0.4206896424293518, "step": 33160 }, { "epoch": 0.033404140239978, "grad_norm": 17.002289808605582, "learning_rate": 3.340350099711943e-05, "loss": 2.9136, "mean_token_accuracy": 0.3241379290819168, "step": 33165 }, { "epoch": 0.033409176293082175, "grad_norm": 16.62418367605977, "learning_rate": 3.340853695385049e-05, "loss": 2.6146, "mean_token_accuracy": 0.4034482777118683, "step": 33170 }, { "epoch": 0.03341421234618635, "grad_norm": 12.517377629188488, "learning_rate": 3.341357291058155e-05, "loss": 2.2509, "mean_token_accuracy": 0.44482758045196535, "step": 33175 }, { "epoch": 0.03341924839929052, "grad_norm": 23.45826937993233, "learning_rate": 3.341860886731261e-05, "loss": 2.8067, "mean_token_accuracy": 0.3620689630508423, "step": 33180 }, { "epoch": 0.0334242844523947, "grad_norm": 14.746604972155437, "learning_rate": 3.342364482404368e-05, "loss": 2.5117, "mean_token_accuracy": 0.3880217790603638, "step": 33185 }, { "epoch": 0.033429320505498863, "grad_norm": 17.976318886170816, "learning_rate": 3.3428680780774736e-05, "loss": 2.7867, "mean_token_accuracy": 0.3620689630508423, "step": 33190 }, { "epoch": 0.03343435655860304, "grad_norm": 15.689278863022123, "learning_rate": 3.3433716737505795e-05, "loss": 3.0675, "mean_token_accuracy": 0.3551724135875702, "step": 33195 }, { "epoch": 0.03343939261170721, "grad_norm": 17.345155392919605, "learning_rate": 3.3438752694236855e-05, "loss": 2.6537, "mean_token_accuracy": 0.3793103456497192, "step": 33200 }, { "epoch": 0.033444428664811385, "grad_norm": 15.97772378951692, "learning_rate": 3.3443788650967914e-05, "loss": 2.2864, "mean_token_accuracy": 0.42758620381355283, "step": 33205 }, { "epoch": 0.03344946471791556, "grad_norm": 15.058755767480124, "learning_rate": 3.344882460769897e-05, "loss": 2.5346, "mean_token_accuracy": 0.3999999940395355, "step": 33210 }, { "epoch": 0.03345450077101973, "grad_norm": 16.07652448115746, "learning_rate": 3.345386056443003e-05, "loss": 2.5148, "mean_token_accuracy": 0.4379310369491577, "step": 33215 }, { "epoch": 0.033459536824123906, "grad_norm": 17.095413394026565, "learning_rate": 3.345889652116109e-05, "loss": 2.6228, "mean_token_accuracy": 0.38965516686439516, "step": 33220 }, { "epoch": 0.03346457287722807, "grad_norm": 22.11982129443817, "learning_rate": 3.346393247789215e-05, "loss": 3.0668, "mean_token_accuracy": 0.3724137842655182, "step": 33225 }, { "epoch": 0.03346960893033225, "grad_norm": 15.029078757370373, "learning_rate": 3.346896843462321e-05, "loss": 2.9051, "mean_token_accuracy": 0.41379311084747317, "step": 33230 }, { "epoch": 0.03347464498343642, "grad_norm": 16.191446970451178, "learning_rate": 3.347400439135427e-05, "loss": 2.3787, "mean_token_accuracy": 0.4225045382976532, "step": 33235 }, { "epoch": 0.033479681036540594, "grad_norm": 15.824360435998495, "learning_rate": 3.3479040348085335e-05, "loss": 2.7137, "mean_token_accuracy": 0.40344826579093934, "step": 33240 }, { "epoch": 0.03348471708964477, "grad_norm": 15.169593277686035, "learning_rate": 3.3484076304816395e-05, "loss": 2.7675, "mean_token_accuracy": 0.38965516686439516, "step": 33245 }, { "epoch": 0.03348975314274894, "grad_norm": 15.369812641168073, "learning_rate": 3.3489112261547454e-05, "loss": 2.5108, "mean_token_accuracy": 0.3896551728248596, "step": 33250 }, { "epoch": 0.033494789195853115, "grad_norm": 14.809585461815166, "learning_rate": 3.3494148218278506e-05, "loss": 2.7273, "mean_token_accuracy": 0.38620689511299133, "step": 33255 }, { "epoch": 0.03349982524895728, "grad_norm": 14.808301768584974, "learning_rate": 3.3499184175009566e-05, "loss": 2.7721, "mean_token_accuracy": 0.3379310339689255, "step": 33260 }, { "epoch": 0.033504861302061456, "grad_norm": 13.342922547886442, "learning_rate": 3.350422013174063e-05, "loss": 2.3938, "mean_token_accuracy": 0.43448275327682495, "step": 33265 }, { "epoch": 0.03350989735516563, "grad_norm": 16.561012974349115, "learning_rate": 3.350925608847169e-05, "loss": 2.7407, "mean_token_accuracy": 0.36551723480224607, "step": 33270 }, { "epoch": 0.033514933408269804, "grad_norm": 13.60932517338548, "learning_rate": 3.351429204520275e-05, "loss": 2.5665, "mean_token_accuracy": 0.3793103456497192, "step": 33275 }, { "epoch": 0.03351996946137398, "grad_norm": 16.107395200085517, "learning_rate": 3.351932800193381e-05, "loss": 2.4879, "mean_token_accuracy": 0.42068966031074523, "step": 33280 }, { "epoch": 0.03352500551447815, "grad_norm": 16.258837189437735, "learning_rate": 3.352436395866487e-05, "loss": 2.4455, "mean_token_accuracy": 0.44482758045196535, "step": 33285 }, { "epoch": 0.033530041567582325, "grad_norm": 12.612442365852178, "learning_rate": 3.352939991539593e-05, "loss": 2.5474, "mean_token_accuracy": 0.4241379380226135, "step": 33290 }, { "epoch": 0.03353507762068649, "grad_norm": 15.285059120712274, "learning_rate": 3.353443587212699e-05, "loss": 2.6941, "mean_token_accuracy": 0.4034482777118683, "step": 33295 }, { "epoch": 0.033540113673790665, "grad_norm": 17.74235595323413, "learning_rate": 3.3539471828858046e-05, "loss": 2.1123, "mean_token_accuracy": 0.44827585816383364, "step": 33300 }, { "epoch": 0.03354514972689484, "grad_norm": 15.476502851511148, "learning_rate": 3.3544507785589106e-05, "loss": 2.7773, "mean_token_accuracy": 0.34827585220336915, "step": 33305 }, { "epoch": 0.03355018577999901, "grad_norm": 14.008121096277222, "learning_rate": 3.3549543742320165e-05, "loss": 2.7597, "mean_token_accuracy": 0.3896551728248596, "step": 33310 }, { "epoch": 0.03355522183310319, "grad_norm": 16.605581340323674, "learning_rate": 3.3554579699051224e-05, "loss": 2.4237, "mean_token_accuracy": 0.46896551847457885, "step": 33315 }, { "epoch": 0.03356025788620736, "grad_norm": 18.331301797876606, "learning_rate": 3.355961565578229e-05, "loss": 2.5487, "mean_token_accuracy": 0.4068965494632721, "step": 33320 }, { "epoch": 0.033565293939311534, "grad_norm": 13.460174678949524, "learning_rate": 3.356465161251335e-05, "loss": 2.7869, "mean_token_accuracy": 0.3793103456497192, "step": 33325 }, { "epoch": 0.0335703299924157, "grad_norm": 14.242020904590808, "learning_rate": 3.356968756924441e-05, "loss": 2.6531, "mean_token_accuracy": 0.3482758641242981, "step": 33330 }, { "epoch": 0.033575366045519875, "grad_norm": 27.131183669236453, "learning_rate": 3.357472352597547e-05, "loss": 2.6675, "mean_token_accuracy": 0.4137930989265442, "step": 33335 }, { "epoch": 0.03358040209862405, "grad_norm": 14.409381538665848, "learning_rate": 3.357975948270653e-05, "loss": 2.3833, "mean_token_accuracy": 0.42413793206214906, "step": 33340 }, { "epoch": 0.03358543815172822, "grad_norm": 15.404906589490066, "learning_rate": 3.3584795439437586e-05, "loss": 2.3128, "mean_token_accuracy": 0.4344827592372894, "step": 33345 }, { "epoch": 0.033590474204832396, "grad_norm": 14.492186166910189, "learning_rate": 3.3589831396168646e-05, "loss": 2.5889, "mean_token_accuracy": 0.42068966031074523, "step": 33350 }, { "epoch": 0.03359551025793657, "grad_norm": 16.30733284922257, "learning_rate": 3.3594867352899705e-05, "loss": 2.7639, "mean_token_accuracy": 0.37241379022598264, "step": 33355 }, { "epoch": 0.033600546311040744, "grad_norm": 21.649081301761097, "learning_rate": 3.3599903309630764e-05, "loss": 2.5192, "mean_token_accuracy": 0.43103447556495667, "step": 33360 }, { "epoch": 0.03360558236414491, "grad_norm": 20.0866325306752, "learning_rate": 3.3604939266361824e-05, "loss": 2.4209, "mean_token_accuracy": 0.3808832406997681, "step": 33365 }, { "epoch": 0.033610618417249084, "grad_norm": 13.063299039731445, "learning_rate": 3.360997522309288e-05, "loss": 2.3627, "mean_token_accuracy": 0.4275861978530884, "step": 33370 }, { "epoch": 0.03361565447035326, "grad_norm": 20.861546449478343, "learning_rate": 3.361501117982395e-05, "loss": 2.1489, "mean_token_accuracy": 0.44313369393348695, "step": 33375 }, { "epoch": 0.03362069052345743, "grad_norm": 20.11652574373875, "learning_rate": 3.362004713655501e-05, "loss": 2.5937, "mean_token_accuracy": 0.36896551847457887, "step": 33380 }, { "epoch": 0.033625726576561606, "grad_norm": 13.84720456585736, "learning_rate": 3.362508309328606e-05, "loss": 2.4155, "mean_token_accuracy": 0.43944342732429503, "step": 33385 }, { "epoch": 0.03363076262966578, "grad_norm": 22.944019659650202, "learning_rate": 3.363011905001712e-05, "loss": 2.5964, "mean_token_accuracy": 0.43284936547279357, "step": 33390 }, { "epoch": 0.03363579868276995, "grad_norm": 15.677187462539987, "learning_rate": 3.363515500674818e-05, "loss": 2.2788, "mean_token_accuracy": 0.44137930274009707, "step": 33395 }, { "epoch": 0.03364083473587412, "grad_norm": 26.760753721145292, "learning_rate": 3.3640190963479245e-05, "loss": 2.6794, "mean_token_accuracy": 0.3827586233615875, "step": 33400 }, { "epoch": 0.033645870788978294, "grad_norm": 16.055333245280966, "learning_rate": 3.3645226920210304e-05, "loss": 2.5818, "mean_token_accuracy": 0.42909860610961914, "step": 33405 }, { "epoch": 0.03365090684208247, "grad_norm": 15.189732950487997, "learning_rate": 3.3650262876941364e-05, "loss": 2.651, "mean_token_accuracy": 0.39655172228813174, "step": 33410 }, { "epoch": 0.03365594289518664, "grad_norm": 17.463146441261472, "learning_rate": 3.365529883367242e-05, "loss": 2.725, "mean_token_accuracy": 0.3241379201412201, "step": 33415 }, { "epoch": 0.033660978948290815, "grad_norm": 13.955130438432594, "learning_rate": 3.366033479040348e-05, "loss": 2.6173, "mean_token_accuracy": 0.358620685338974, "step": 33420 }, { "epoch": 0.03366601500139499, "grad_norm": 17.667702723989937, "learning_rate": 3.366537074713455e-05, "loss": 2.648, "mean_token_accuracy": 0.37241379618644715, "step": 33425 }, { "epoch": 0.03367105105449916, "grad_norm": 18.655066953109756, "learning_rate": 3.36704067038656e-05, "loss": 2.3834, "mean_token_accuracy": 0.46551724076271056, "step": 33430 }, { "epoch": 0.03367608710760333, "grad_norm": 11.717593924337502, "learning_rate": 3.367544266059666e-05, "loss": 2.4696, "mean_token_accuracy": 0.37586206793785093, "step": 33435 }, { "epoch": 0.0336811231607075, "grad_norm": 16.887628886810692, "learning_rate": 3.368047861732772e-05, "loss": 2.3334, "mean_token_accuracy": 0.3724137842655182, "step": 33440 }, { "epoch": 0.03368615921381168, "grad_norm": 21.30764226035458, "learning_rate": 3.368551457405878e-05, "loss": 2.3051, "mean_token_accuracy": 0.4448275864124298, "step": 33445 }, { "epoch": 0.03369119526691585, "grad_norm": 14.986759248956599, "learning_rate": 3.369055053078984e-05, "loss": 2.8938, "mean_token_accuracy": 0.4034482777118683, "step": 33450 }, { "epoch": 0.033696231320020024, "grad_norm": 14.806973340812089, "learning_rate": 3.3695586487520904e-05, "loss": 2.4146, "mean_token_accuracy": 0.4344827473163605, "step": 33455 }, { "epoch": 0.0337012673731242, "grad_norm": 14.143464002615191, "learning_rate": 3.370062244425196e-05, "loss": 2.4405, "mean_token_accuracy": 0.42928009629249575, "step": 33460 }, { "epoch": 0.03370630342622837, "grad_norm": 18.534554305573916, "learning_rate": 3.370565840098302e-05, "loss": 2.556, "mean_token_accuracy": 0.4068965554237366, "step": 33465 }, { "epoch": 0.03371133947933254, "grad_norm": 17.106716346576636, "learning_rate": 3.371069435771408e-05, "loss": 2.3672, "mean_token_accuracy": 0.4344827651977539, "step": 33470 }, { "epoch": 0.03371637553243671, "grad_norm": 26.831340390434864, "learning_rate": 3.371573031444514e-05, "loss": 2.6057, "mean_token_accuracy": 0.460591134428978, "step": 33475 }, { "epoch": 0.033721411585540886, "grad_norm": 15.727864745970095, "learning_rate": 3.37207662711762e-05, "loss": 2.5969, "mean_token_accuracy": 0.45172414779663084, "step": 33480 }, { "epoch": 0.03372644763864506, "grad_norm": 17.164861915220417, "learning_rate": 3.372580222790726e-05, "loss": 2.28, "mean_token_accuracy": 0.4793103516101837, "step": 33485 }, { "epoch": 0.033731483691749234, "grad_norm": 20.176578972725714, "learning_rate": 3.373083818463832e-05, "loss": 2.8377, "mean_token_accuracy": 0.41034482717514037, "step": 33490 }, { "epoch": 0.03373651974485341, "grad_norm": 15.936296248299778, "learning_rate": 3.373587414136938e-05, "loss": 2.3494, "mean_token_accuracy": 0.41724138259887694, "step": 33495 }, { "epoch": 0.03374155579795758, "grad_norm": 15.900457088743305, "learning_rate": 3.374091009810044e-05, "loss": 2.4265, "mean_token_accuracy": 0.3896551787853241, "step": 33500 }, { "epoch": 0.03374659185106175, "grad_norm": 33.268559112893676, "learning_rate": 3.37459460548315e-05, "loss": 2.3502, "mean_token_accuracy": 0.4379310369491577, "step": 33505 }, { "epoch": 0.03375162790416592, "grad_norm": 13.252180669358543, "learning_rate": 3.375098201156256e-05, "loss": 2.4637, "mean_token_accuracy": 0.41724138855934145, "step": 33510 }, { "epoch": 0.033756663957270096, "grad_norm": 15.363289733802445, "learning_rate": 3.375601796829362e-05, "loss": 2.5266, "mean_token_accuracy": 0.41034482717514037, "step": 33515 }, { "epoch": 0.03376170001037427, "grad_norm": 16.928419872285037, "learning_rate": 3.3761053925024674e-05, "loss": 2.523, "mean_token_accuracy": 0.37586206793785093, "step": 33520 }, { "epoch": 0.03376673606347844, "grad_norm": 15.919952134533274, "learning_rate": 3.376608988175573e-05, "loss": 2.4868, "mean_token_accuracy": 0.4103448212146759, "step": 33525 }, { "epoch": 0.03377177211658262, "grad_norm": 12.777495684930544, "learning_rate": 3.377112583848679e-05, "loss": 2.2396, "mean_token_accuracy": 0.4413793087005615, "step": 33530 }, { "epoch": 0.03377680816968679, "grad_norm": 17.476087042341497, "learning_rate": 3.377616179521786e-05, "loss": 2.4973, "mean_token_accuracy": 0.3965517282485962, "step": 33535 }, { "epoch": 0.03378184422279096, "grad_norm": 14.93544548891769, "learning_rate": 3.378119775194892e-05, "loss": 2.7043, "mean_token_accuracy": 0.40344828367233276, "step": 33540 }, { "epoch": 0.03378688027589513, "grad_norm": 15.358212390514966, "learning_rate": 3.378623370867998e-05, "loss": 2.5301, "mean_token_accuracy": 0.4103448212146759, "step": 33545 }, { "epoch": 0.033791916328999305, "grad_norm": 16.73696533977745, "learning_rate": 3.3791269665411036e-05, "loss": 2.6264, "mean_token_accuracy": 0.42413793206214906, "step": 33550 }, { "epoch": 0.03379695238210348, "grad_norm": 21.232846244977896, "learning_rate": 3.3796305622142095e-05, "loss": 2.3784, "mean_token_accuracy": 0.4137930989265442, "step": 33555 }, { "epoch": 0.03380198843520765, "grad_norm": 11.819675051600473, "learning_rate": 3.3801341578873155e-05, "loss": 2.1918, "mean_token_accuracy": 0.4689655125141144, "step": 33560 }, { "epoch": 0.033807024488311826, "grad_norm": 15.348407649157389, "learning_rate": 3.3806377535604214e-05, "loss": 2.5444, "mean_token_accuracy": 0.4172413766384125, "step": 33565 }, { "epoch": 0.033812060541416, "grad_norm": 17.649377102581006, "learning_rate": 3.381141349233527e-05, "loss": 2.6573, "mean_token_accuracy": 0.3827586203813553, "step": 33570 }, { "epoch": 0.03381709659452017, "grad_norm": 21.930410616442188, "learning_rate": 3.381644944906633e-05, "loss": 2.789, "mean_token_accuracy": 0.3879612863063812, "step": 33575 }, { "epoch": 0.03382213264762434, "grad_norm": 16.631705264386476, "learning_rate": 3.382148540579739e-05, "loss": 2.4717, "mean_token_accuracy": 0.45578818321228026, "step": 33580 }, { "epoch": 0.033827168700728515, "grad_norm": 13.766222915396035, "learning_rate": 3.382652136252846e-05, "loss": 2.4466, "mean_token_accuracy": 0.4034482717514038, "step": 33585 }, { "epoch": 0.03383220475383269, "grad_norm": 17.02146465759159, "learning_rate": 3.383155731925952e-05, "loss": 2.1857, "mean_token_accuracy": 0.46721112728118896, "step": 33590 }, { "epoch": 0.03383724080693686, "grad_norm": 13.118351357620305, "learning_rate": 3.3836593275990576e-05, "loss": 2.409, "mean_token_accuracy": 0.48342408537864684, "step": 33595 }, { "epoch": 0.033842276860041036, "grad_norm": 13.427074149318164, "learning_rate": 3.3841629232721635e-05, "loss": 2.4875, "mean_token_accuracy": 0.44482759237289426, "step": 33600 }, { "epoch": 0.03384731291314521, "grad_norm": 14.607017075836513, "learning_rate": 3.3846665189452695e-05, "loss": 2.431, "mean_token_accuracy": 0.43448275327682495, "step": 33605 }, { "epoch": 0.033852348966249377, "grad_norm": 17.751284243482814, "learning_rate": 3.3851701146183754e-05, "loss": 2.7856, "mean_token_accuracy": 0.3620689630508423, "step": 33610 }, { "epoch": 0.03385738501935355, "grad_norm": 15.42033638768905, "learning_rate": 3.385673710291481e-05, "loss": 2.44, "mean_token_accuracy": 0.4034482777118683, "step": 33615 }, { "epoch": 0.033862421072457724, "grad_norm": 12.611103891349403, "learning_rate": 3.386177305964587e-05, "loss": 2.193, "mean_token_accuracy": 0.42758620977401735, "step": 33620 }, { "epoch": 0.0338674571255619, "grad_norm": 13.668291883708857, "learning_rate": 3.386680901637693e-05, "loss": 2.6532, "mean_token_accuracy": 0.36206896901130675, "step": 33625 }, { "epoch": 0.03387249317866607, "grad_norm": 19.470713150273824, "learning_rate": 3.387184497310799e-05, "loss": 2.5028, "mean_token_accuracy": 0.39310344159603117, "step": 33630 }, { "epoch": 0.033877529231770245, "grad_norm": 15.18968727435688, "learning_rate": 3.387688092983905e-05, "loss": 2.0236, "mean_token_accuracy": 0.46600985527038574, "step": 33635 }, { "epoch": 0.03388256528487442, "grad_norm": 20.707502905794545, "learning_rate": 3.3881916886570116e-05, "loss": 3.0023, "mean_token_accuracy": 0.36551724672317504, "step": 33640 }, { "epoch": 0.033887601337978586, "grad_norm": 13.991047991894307, "learning_rate": 3.3886952843301176e-05, "loss": 2.5632, "mean_token_accuracy": 0.3793103456497192, "step": 33645 }, { "epoch": 0.03389263739108276, "grad_norm": 12.805463351154948, "learning_rate": 3.3891988800032235e-05, "loss": 2.2792, "mean_token_accuracy": 0.4, "step": 33650 }, { "epoch": 0.03389767344418693, "grad_norm": 13.114297774297848, "learning_rate": 3.389702475676329e-05, "loss": 2.0424, "mean_token_accuracy": 0.49655172824859617, "step": 33655 }, { "epoch": 0.03390270949729111, "grad_norm": 16.012034323220714, "learning_rate": 3.3902060713494347e-05, "loss": 2.6002, "mean_token_accuracy": 0.42413793206214906, "step": 33660 }, { "epoch": 0.03390774555039528, "grad_norm": 11.986109823559781, "learning_rate": 3.390709667022541e-05, "loss": 2.301, "mean_token_accuracy": 0.5103448271751404, "step": 33665 }, { "epoch": 0.033912781603499455, "grad_norm": 15.387117777741706, "learning_rate": 3.391213262695647e-05, "loss": 2.297, "mean_token_accuracy": 0.4241379201412201, "step": 33670 }, { "epoch": 0.03391781765660363, "grad_norm": 23.704366234664672, "learning_rate": 3.391716858368753e-05, "loss": 2.253, "mean_token_accuracy": 0.4241379380226135, "step": 33675 }, { "epoch": 0.033922853709707795, "grad_norm": 17.171684954468404, "learning_rate": 3.392220454041859e-05, "loss": 2.2524, "mean_token_accuracy": 0.39655172228813174, "step": 33680 }, { "epoch": 0.03392788976281197, "grad_norm": 15.154667727304059, "learning_rate": 3.392724049714965e-05, "loss": 2.4502, "mean_token_accuracy": 0.41034482717514037, "step": 33685 }, { "epoch": 0.03393292581591614, "grad_norm": 16.388028734471675, "learning_rate": 3.3932276453880716e-05, "loss": 2.3603, "mean_token_accuracy": 0.4344827592372894, "step": 33690 }, { "epoch": 0.03393796186902032, "grad_norm": 13.796465123146703, "learning_rate": 3.393731241061177e-05, "loss": 2.5177, "mean_token_accuracy": 0.417241370677948, "step": 33695 }, { "epoch": 0.03394299792212449, "grad_norm": 15.78286358304764, "learning_rate": 3.394234836734283e-05, "loss": 2.1044, "mean_token_accuracy": 0.4896551728248596, "step": 33700 }, { "epoch": 0.033948033975228664, "grad_norm": 12.053792809791872, "learning_rate": 3.3947384324073887e-05, "loss": 2.3663, "mean_token_accuracy": 0.4504537105560303, "step": 33705 }, { "epoch": 0.03395307002833284, "grad_norm": 14.741870524935232, "learning_rate": 3.3952420280804946e-05, "loss": 2.3069, "mean_token_accuracy": 0.4482758641242981, "step": 33710 }, { "epoch": 0.033958106081437005, "grad_norm": 15.119479877099353, "learning_rate": 3.3957456237536005e-05, "loss": 2.6241, "mean_token_accuracy": 0.3758620619773865, "step": 33715 }, { "epoch": 0.03396314213454118, "grad_norm": 15.363678421117212, "learning_rate": 3.396249219426707e-05, "loss": 2.5051, "mean_token_accuracy": 0.42413793206214906, "step": 33720 }, { "epoch": 0.03396817818764535, "grad_norm": 15.251076252583264, "learning_rate": 3.396752815099813e-05, "loss": 2.2943, "mean_token_accuracy": 0.4413793087005615, "step": 33725 }, { "epoch": 0.033973214240749526, "grad_norm": 15.115383566892831, "learning_rate": 3.397256410772919e-05, "loss": 2.6776, "mean_token_accuracy": 0.4137930989265442, "step": 33730 }, { "epoch": 0.0339782502938537, "grad_norm": 16.26241219602374, "learning_rate": 3.397760006446025e-05, "loss": 2.8019, "mean_token_accuracy": 0.37931033968925476, "step": 33735 }, { "epoch": 0.033983286346957874, "grad_norm": 15.49627672981393, "learning_rate": 3.398263602119131e-05, "loss": 2.7764, "mean_token_accuracy": 0.3275862097740173, "step": 33740 }, { "epoch": 0.03398832240006205, "grad_norm": 16.5952548377308, "learning_rate": 3.398767197792237e-05, "loss": 2.891, "mean_token_accuracy": 0.3793103486299515, "step": 33745 }, { "epoch": 0.033993358453166214, "grad_norm": 16.740261800217052, "learning_rate": 3.399270793465343e-05, "loss": 2.476, "mean_token_accuracy": 0.3965517282485962, "step": 33750 }, { "epoch": 0.03399839450627039, "grad_norm": 16.411786038470886, "learning_rate": 3.3997743891384486e-05, "loss": 2.5387, "mean_token_accuracy": 0.4275861978530884, "step": 33755 }, { "epoch": 0.03400343055937456, "grad_norm": 16.7520349544638, "learning_rate": 3.4002779848115545e-05, "loss": 2.6123, "mean_token_accuracy": 0.43793101906776427, "step": 33760 }, { "epoch": 0.034008466612478735, "grad_norm": 16.161476629062996, "learning_rate": 3.4007815804846604e-05, "loss": 2.5211, "mean_token_accuracy": 0.41465517282485964, "step": 33765 }, { "epoch": 0.03401350266558291, "grad_norm": 14.674103235717382, "learning_rate": 3.401285176157767e-05, "loss": 2.5252, "mean_token_accuracy": 0.3517241418361664, "step": 33770 }, { "epoch": 0.03401853871868708, "grad_norm": 19.462784442511474, "learning_rate": 3.401788771830873e-05, "loss": 2.5809, "mean_token_accuracy": 0.39655172228813174, "step": 33775 }, { "epoch": 0.03402357477179126, "grad_norm": 18.508202179021975, "learning_rate": 3.402292367503979e-05, "loss": 2.5125, "mean_token_accuracy": 0.4034482717514038, "step": 33780 }, { "epoch": 0.034028610824895424, "grad_norm": 15.539433873030923, "learning_rate": 3.402795963177085e-05, "loss": 2.3488, "mean_token_accuracy": 0.4551724135875702, "step": 33785 }, { "epoch": 0.0340336468779996, "grad_norm": 13.968372594787937, "learning_rate": 3.40329955885019e-05, "loss": 2.4434, "mean_token_accuracy": 0.4310344815254211, "step": 33790 }, { "epoch": 0.03403868293110377, "grad_norm": 14.07881312574245, "learning_rate": 3.403803154523296e-05, "loss": 2.4394, "mean_token_accuracy": 0.4448275864124298, "step": 33795 }, { "epoch": 0.034043718984207945, "grad_norm": 14.110041114670986, "learning_rate": 3.4043067501964026e-05, "loss": 2.8194, "mean_token_accuracy": 0.33103448152542114, "step": 33800 }, { "epoch": 0.03404875503731212, "grad_norm": 16.70874688718829, "learning_rate": 3.4048103458695085e-05, "loss": 2.8836, "mean_token_accuracy": 0.3620689630508423, "step": 33805 }, { "epoch": 0.03405379109041629, "grad_norm": 18.13144746709522, "learning_rate": 3.4053139415426144e-05, "loss": 2.7949, "mean_token_accuracy": 0.4068965494632721, "step": 33810 }, { "epoch": 0.034058827143520466, "grad_norm": 17.5810539581166, "learning_rate": 3.4058175372157204e-05, "loss": 2.3936, "mean_token_accuracy": 0.38620689511299133, "step": 33815 }, { "epoch": 0.03406386319662463, "grad_norm": 16.481199209900517, "learning_rate": 3.406321132888826e-05, "loss": 2.5263, "mean_token_accuracy": 0.39310344457626345, "step": 33820 }, { "epoch": 0.03406889924972881, "grad_norm": 14.92482879472569, "learning_rate": 3.406824728561933e-05, "loss": 2.472, "mean_token_accuracy": 0.41034482717514037, "step": 33825 }, { "epoch": 0.03407393530283298, "grad_norm": 19.171447277158492, "learning_rate": 3.407328324235038e-05, "loss": 2.8402, "mean_token_accuracy": 0.35172412991523744, "step": 33830 }, { "epoch": 0.034078971355937154, "grad_norm": 17.398853947876074, "learning_rate": 3.407831919908144e-05, "loss": 2.7976, "mean_token_accuracy": 0.3551724135875702, "step": 33835 }, { "epoch": 0.03408400740904133, "grad_norm": 15.27709314653025, "learning_rate": 3.40833551558125e-05, "loss": 2.1321, "mean_token_accuracy": 0.4827586054801941, "step": 33840 }, { "epoch": 0.0340890434621455, "grad_norm": 17.055838417665477, "learning_rate": 3.408839111254356e-05, "loss": 2.4088, "mean_token_accuracy": 0.4016333997249603, "step": 33845 }, { "epoch": 0.034094079515249676, "grad_norm": 18.134560739578674, "learning_rate": 3.4093427069274625e-05, "loss": 2.3745, "mean_token_accuracy": 0.4379310369491577, "step": 33850 }, { "epoch": 0.03409911556835384, "grad_norm": 13.060604675826816, "learning_rate": 3.4098463026005685e-05, "loss": 2.3797, "mean_token_accuracy": 0.4310344815254211, "step": 33855 }, { "epoch": 0.034104151621458016, "grad_norm": 16.85645883129703, "learning_rate": 3.4103498982736744e-05, "loss": 2.6108, "mean_token_accuracy": 0.32758620381355286, "step": 33860 }, { "epoch": 0.03410918767456219, "grad_norm": 14.268218037654616, "learning_rate": 3.41085349394678e-05, "loss": 2.618, "mean_token_accuracy": 0.40175438225269317, "step": 33865 }, { "epoch": 0.034114223727666364, "grad_norm": 15.902701959772452, "learning_rate": 3.411357089619886e-05, "loss": 2.6419, "mean_token_accuracy": 0.3724137932062149, "step": 33870 }, { "epoch": 0.03411925978077054, "grad_norm": 19.571085798288763, "learning_rate": 3.411860685292992e-05, "loss": 2.6711, "mean_token_accuracy": 0.4172413766384125, "step": 33875 }, { "epoch": 0.03412429583387471, "grad_norm": 15.741795653911232, "learning_rate": 3.412364280966098e-05, "loss": 2.756, "mean_token_accuracy": 0.39655172228813174, "step": 33880 }, { "epoch": 0.034129331886978885, "grad_norm": 15.847061074334219, "learning_rate": 3.412867876639204e-05, "loss": 2.3558, "mean_token_accuracy": 0.3758620619773865, "step": 33885 }, { "epoch": 0.03413436794008305, "grad_norm": 14.239911078513291, "learning_rate": 3.41337147231231e-05, "loss": 2.8119, "mean_token_accuracy": 0.38275861740112305, "step": 33890 }, { "epoch": 0.034139403993187226, "grad_norm": 16.770907420070984, "learning_rate": 3.413875067985416e-05, "loss": 2.1466, "mean_token_accuracy": 0.48275861144065857, "step": 33895 }, { "epoch": 0.0341444400462914, "grad_norm": 19.560814373759353, "learning_rate": 3.414378663658522e-05, "loss": 2.7282, "mean_token_accuracy": 0.4000000059604645, "step": 33900 }, { "epoch": 0.03414947609939557, "grad_norm": 16.67254395560786, "learning_rate": 3.4148822593316284e-05, "loss": 2.4749, "mean_token_accuracy": 0.40344828367233276, "step": 33905 }, { "epoch": 0.03415451215249975, "grad_norm": 16.066624157074884, "learning_rate": 3.415385855004734e-05, "loss": 2.4009, "mean_token_accuracy": 0.37586206793785093, "step": 33910 }, { "epoch": 0.03415954820560392, "grad_norm": 16.74701612385621, "learning_rate": 3.41588945067784e-05, "loss": 2.5473, "mean_token_accuracy": 0.3655172407627106, "step": 33915 }, { "epoch": 0.034164584258708094, "grad_norm": 14.490524628766174, "learning_rate": 3.4163930463509455e-05, "loss": 2.3936, "mean_token_accuracy": 0.36551723480224607, "step": 33920 }, { "epoch": 0.03416962031181226, "grad_norm": 15.803940977840652, "learning_rate": 3.4168966420240514e-05, "loss": 2.4214, "mean_token_accuracy": 0.42220206260681153, "step": 33925 }, { "epoch": 0.034174656364916435, "grad_norm": 12.956760557394777, "learning_rate": 3.417400237697158e-05, "loss": 2.5856, "mean_token_accuracy": 0.42607380747795104, "step": 33930 }, { "epoch": 0.03417969241802061, "grad_norm": 20.295608864424725, "learning_rate": 3.417903833370264e-05, "loss": 2.4486, "mean_token_accuracy": 0.4034482777118683, "step": 33935 }, { "epoch": 0.03418472847112478, "grad_norm": 17.694047148451265, "learning_rate": 3.41840742904337e-05, "loss": 2.216, "mean_token_accuracy": 0.44331517815589905, "step": 33940 }, { "epoch": 0.034189764524228956, "grad_norm": 18.025711657512737, "learning_rate": 3.418911024716476e-05, "loss": 2.4034, "mean_token_accuracy": 0.3793103516101837, "step": 33945 }, { "epoch": 0.03419480057733313, "grad_norm": 14.12918994842883, "learning_rate": 3.419414620389582e-05, "loss": 2.2498, "mean_token_accuracy": 0.44827585816383364, "step": 33950 }, { "epoch": 0.034199836630437304, "grad_norm": 21.605890947503223, "learning_rate": 3.419918216062688e-05, "loss": 3.018, "mean_token_accuracy": 0.358620685338974, "step": 33955 }, { "epoch": 0.03420487268354147, "grad_norm": 15.48640859595419, "learning_rate": 3.420421811735794e-05, "loss": 2.3261, "mean_token_accuracy": 0.417241370677948, "step": 33960 }, { "epoch": 0.034209908736645644, "grad_norm": 17.66462995738986, "learning_rate": 3.4209254074088995e-05, "loss": 2.6326, "mean_token_accuracy": 0.4, "step": 33965 }, { "epoch": 0.03421494478974982, "grad_norm": 15.671109317636722, "learning_rate": 3.4214290030820054e-05, "loss": 2.7434, "mean_token_accuracy": 0.42758620381355283, "step": 33970 }, { "epoch": 0.03421998084285399, "grad_norm": 15.748290969434409, "learning_rate": 3.421932598755111e-05, "loss": 2.4462, "mean_token_accuracy": 0.4172413766384125, "step": 33975 }, { "epoch": 0.034225016895958166, "grad_norm": 17.913842390778633, "learning_rate": 3.422436194428217e-05, "loss": 2.6313, "mean_token_accuracy": 0.34827586114406583, "step": 33980 }, { "epoch": 0.03423005294906234, "grad_norm": 12.879765159251058, "learning_rate": 3.422939790101324e-05, "loss": 2.2138, "mean_token_accuracy": 0.4310344815254211, "step": 33985 }, { "epoch": 0.03423508900216651, "grad_norm": 13.883531032037038, "learning_rate": 3.42344338577443e-05, "loss": 2.3723, "mean_token_accuracy": 0.41034482717514037, "step": 33990 }, { "epoch": 0.03424012505527068, "grad_norm": 17.25046061325281, "learning_rate": 3.423946981447536e-05, "loss": 2.6528, "mean_token_accuracy": 0.3517241418361664, "step": 33995 }, { "epoch": 0.034245161108374854, "grad_norm": 15.484828877820906, "learning_rate": 3.4244505771206416e-05, "loss": 2.3892, "mean_token_accuracy": 0.49848759174346924, "step": 34000 }, { "epoch": 0.03425019716147903, "grad_norm": 17.74652430956716, "learning_rate": 3.4249541727937476e-05, "loss": 2.978, "mean_token_accuracy": 0.4068965494632721, "step": 34005 }, { "epoch": 0.0342552332145832, "grad_norm": 14.218214831625037, "learning_rate": 3.4254577684668535e-05, "loss": 2.1288, "mean_token_accuracy": 0.47241379618644713, "step": 34010 }, { "epoch": 0.034260269267687375, "grad_norm": 15.711550929757276, "learning_rate": 3.4259613641399594e-05, "loss": 2.5334, "mean_token_accuracy": 0.41034482717514037, "step": 34015 }, { "epoch": 0.03426530532079155, "grad_norm": 19.41064982787321, "learning_rate": 3.4264649598130653e-05, "loss": 2.5724, "mean_token_accuracy": 0.4068965494632721, "step": 34020 }, { "epoch": 0.03427034137389572, "grad_norm": 15.481279276030175, "learning_rate": 3.426968555486171e-05, "loss": 2.6408, "mean_token_accuracy": 0.4034482717514038, "step": 34025 }, { "epoch": 0.03427537742699989, "grad_norm": 15.25332052565089, "learning_rate": 3.427472151159277e-05, "loss": 2.4959, "mean_token_accuracy": 0.4241379201412201, "step": 34030 }, { "epoch": 0.03428041348010406, "grad_norm": 15.780460568069019, "learning_rate": 3.427975746832384e-05, "loss": 2.701, "mean_token_accuracy": 0.37586207389831544, "step": 34035 }, { "epoch": 0.03428544953320824, "grad_norm": 13.895803619002006, "learning_rate": 3.42847934250549e-05, "loss": 2.5724, "mean_token_accuracy": 0.3982456147670746, "step": 34040 }, { "epoch": 0.03429048558631241, "grad_norm": 17.866298224704064, "learning_rate": 3.4289829381785956e-05, "loss": 2.5913, "mean_token_accuracy": 0.3946158528327942, "step": 34045 }, { "epoch": 0.034295521639416585, "grad_norm": 13.520626196344217, "learning_rate": 3.4294865338517016e-05, "loss": 2.6836, "mean_token_accuracy": 0.41034482717514037, "step": 34050 }, { "epoch": 0.03430055769252076, "grad_norm": 15.010275448022409, "learning_rate": 3.429990129524807e-05, "loss": 2.4978, "mean_token_accuracy": 0.4607380449771881, "step": 34055 }, { "epoch": 0.03430559374562493, "grad_norm": 13.074079396081535, "learning_rate": 3.430493725197913e-05, "loss": 2.7411, "mean_token_accuracy": 0.41034482717514037, "step": 34060 }, { "epoch": 0.0343106297987291, "grad_norm": 15.350148405145282, "learning_rate": 3.4309973208710193e-05, "loss": 2.6671, "mean_token_accuracy": 0.3482758581638336, "step": 34065 }, { "epoch": 0.03431566585183327, "grad_norm": 16.88966018226705, "learning_rate": 3.431500916544125e-05, "loss": 2.4739, "mean_token_accuracy": 0.3931034505367279, "step": 34070 }, { "epoch": 0.034320701904937446, "grad_norm": 16.44123458618899, "learning_rate": 3.432004512217231e-05, "loss": 2.3103, "mean_token_accuracy": 0.47428917288780215, "step": 34075 }, { "epoch": 0.03432573795804162, "grad_norm": 16.164549064725446, "learning_rate": 3.432508107890337e-05, "loss": 2.634, "mean_token_accuracy": 0.41724138259887694, "step": 34080 }, { "epoch": 0.034330774011145794, "grad_norm": 13.849358439860044, "learning_rate": 3.433011703563443e-05, "loss": 2.2913, "mean_token_accuracy": 0.4034482717514038, "step": 34085 }, { "epoch": 0.03433581006424997, "grad_norm": 17.608906695227233, "learning_rate": 3.4335152992365496e-05, "loss": 2.4927, "mean_token_accuracy": 0.4502117395401001, "step": 34090 }, { "epoch": 0.03434084611735414, "grad_norm": 16.19896311227932, "learning_rate": 3.434018894909655e-05, "loss": 2.7337, "mean_token_accuracy": 0.3793103456497192, "step": 34095 }, { "epoch": 0.03434588217045831, "grad_norm": 13.979191227233933, "learning_rate": 3.434522490582761e-05, "loss": 2.8432, "mean_token_accuracy": 0.37241379618644715, "step": 34100 }, { "epoch": 0.03435091822356248, "grad_norm": 16.60670509611578, "learning_rate": 3.435026086255867e-05, "loss": 2.7363, "mean_token_accuracy": 0.38275861740112305, "step": 34105 }, { "epoch": 0.034355954276666656, "grad_norm": 19.15045486615811, "learning_rate": 3.435529681928973e-05, "loss": 2.2882, "mean_token_accuracy": 0.4255898416042328, "step": 34110 }, { "epoch": 0.03436099032977083, "grad_norm": 17.265214814749402, "learning_rate": 3.436033277602079e-05, "loss": 2.9429, "mean_token_accuracy": 0.35172413289546967, "step": 34115 }, { "epoch": 0.034366026382875, "grad_norm": 22.96428250450586, "learning_rate": 3.436536873275185e-05, "loss": 2.5968, "mean_token_accuracy": 0.36896551847457887, "step": 34120 }, { "epoch": 0.03437106243597918, "grad_norm": 19.78464647723333, "learning_rate": 3.437040468948291e-05, "loss": 2.4, "mean_token_accuracy": 0.43103448748588563, "step": 34125 }, { "epoch": 0.03437609848908335, "grad_norm": 16.326230907138537, "learning_rate": 3.437544064621397e-05, "loss": 2.7009, "mean_token_accuracy": 0.3965517163276672, "step": 34130 }, { "epoch": 0.03438113454218752, "grad_norm": 17.20404656147665, "learning_rate": 3.438047660294503e-05, "loss": 2.5321, "mean_token_accuracy": 0.37241379618644715, "step": 34135 }, { "epoch": 0.03438617059529169, "grad_norm": 13.716251025793976, "learning_rate": 3.438551255967609e-05, "loss": 2.4028, "mean_token_accuracy": 0.44482759237289426, "step": 34140 }, { "epoch": 0.034391206648395865, "grad_norm": 14.239828525533264, "learning_rate": 3.439054851640715e-05, "loss": 2.2131, "mean_token_accuracy": 0.4620689690113068, "step": 34145 }, { "epoch": 0.03439624270150004, "grad_norm": 15.613954636993007, "learning_rate": 3.439558447313821e-05, "loss": 2.3042, "mean_token_accuracy": 0.4517241418361664, "step": 34150 }, { "epoch": 0.03440127875460421, "grad_norm": 13.50958402722549, "learning_rate": 3.440062042986927e-05, "loss": 2.9191, "mean_token_accuracy": 0.3551724135875702, "step": 34155 }, { "epoch": 0.03440631480770839, "grad_norm": 12.328542434053968, "learning_rate": 3.4405656386600326e-05, "loss": 2.2921, "mean_token_accuracy": 0.4068965494632721, "step": 34160 }, { "epoch": 0.03441135086081256, "grad_norm": 12.962218718339766, "learning_rate": 3.4410692343331385e-05, "loss": 2.5559, "mean_token_accuracy": 0.3965517163276672, "step": 34165 }, { "epoch": 0.03441638691391673, "grad_norm": 17.469871074217824, "learning_rate": 3.441572830006245e-05, "loss": 2.6591, "mean_token_accuracy": 0.38965516686439516, "step": 34170 }, { "epoch": 0.0344214229670209, "grad_norm": 16.45827663379989, "learning_rate": 3.442076425679351e-05, "loss": 2.6024, "mean_token_accuracy": 0.36896550953388213, "step": 34175 }, { "epoch": 0.034426459020125075, "grad_norm": 13.565074547159714, "learning_rate": 3.442580021352457e-05, "loss": 2.4708, "mean_token_accuracy": 0.453841495513916, "step": 34180 }, { "epoch": 0.03443149507322925, "grad_norm": 16.661736936377068, "learning_rate": 3.443083617025563e-05, "loss": 2.6495, "mean_token_accuracy": 0.41379310488700866, "step": 34185 }, { "epoch": 0.03443653112633342, "grad_norm": 18.075275490504943, "learning_rate": 3.443587212698668e-05, "loss": 2.7532, "mean_token_accuracy": 0.358620697259903, "step": 34190 }, { "epoch": 0.034441567179437596, "grad_norm": 10.993543614001391, "learning_rate": 3.444090808371775e-05, "loss": 2.3698, "mean_token_accuracy": 0.417241370677948, "step": 34195 }, { "epoch": 0.03444660323254177, "grad_norm": 19.57029793317187, "learning_rate": 3.444594404044881e-05, "loss": 2.5461, "mean_token_accuracy": 0.4310344815254211, "step": 34200 }, { "epoch": 0.03445163928564594, "grad_norm": 16.210564188101415, "learning_rate": 3.4450979997179866e-05, "loss": 2.9766, "mean_token_accuracy": 0.37586206793785093, "step": 34205 }, { "epoch": 0.03445667533875011, "grad_norm": 16.876624524013554, "learning_rate": 3.4456015953910925e-05, "loss": 2.9907, "mean_token_accuracy": 0.34482758343219755, "step": 34210 }, { "epoch": 0.034461711391854284, "grad_norm": 17.280675149070028, "learning_rate": 3.4461051910641985e-05, "loss": 2.5087, "mean_token_accuracy": 0.4172413766384125, "step": 34215 }, { "epoch": 0.03446674744495846, "grad_norm": 13.820146239724414, "learning_rate": 3.4466087867373044e-05, "loss": 2.9072, "mean_token_accuracy": 0.36896551251411436, "step": 34220 }, { "epoch": 0.03447178349806263, "grad_norm": 14.630898746427063, "learning_rate": 3.447112382410411e-05, "loss": 2.1276, "mean_token_accuracy": 0.46896551847457885, "step": 34225 }, { "epoch": 0.034476819551166805, "grad_norm": 19.257986269732434, "learning_rate": 3.447615978083516e-05, "loss": 2.868, "mean_token_accuracy": 0.3774954676628113, "step": 34230 }, { "epoch": 0.03448185560427098, "grad_norm": 16.86330253904032, "learning_rate": 3.448119573756622e-05, "loss": 2.3603, "mean_token_accuracy": 0.4434966742992401, "step": 34235 }, { "epoch": 0.034486891657375146, "grad_norm": 16.775332881608698, "learning_rate": 3.448623169429728e-05, "loss": 2.7784, "mean_token_accuracy": 0.36551723480224607, "step": 34240 }, { "epoch": 0.03449192771047932, "grad_norm": 19.983191333105317, "learning_rate": 3.449126765102834e-05, "loss": 2.3983, "mean_token_accuracy": 0.4294010937213898, "step": 34245 }, { "epoch": 0.034496963763583494, "grad_norm": 18.120551284710473, "learning_rate": 3.4496303607759406e-05, "loss": 2.7196, "mean_token_accuracy": 0.3827586233615875, "step": 34250 }, { "epoch": 0.03450199981668767, "grad_norm": 13.013714454753044, "learning_rate": 3.4501339564490465e-05, "loss": 2.734, "mean_token_accuracy": 0.36896551847457887, "step": 34255 }, { "epoch": 0.03450703586979184, "grad_norm": 14.127661099280106, "learning_rate": 3.4506375521221525e-05, "loss": 3.0503, "mean_token_accuracy": 0.3448275804519653, "step": 34260 }, { "epoch": 0.034512071922896015, "grad_norm": 15.841282738405992, "learning_rate": 3.4511411477952584e-05, "loss": 2.4476, "mean_token_accuracy": 0.37586206793785093, "step": 34265 }, { "epoch": 0.03451710797600019, "grad_norm": 15.76792098143515, "learning_rate": 3.451644743468364e-05, "loss": 2.2326, "mean_token_accuracy": 0.4068965494632721, "step": 34270 }, { "epoch": 0.034522144029104355, "grad_norm": 20.504027869138753, "learning_rate": 3.45214833914147e-05, "loss": 2.5074, "mean_token_accuracy": 0.4103448212146759, "step": 34275 }, { "epoch": 0.03452718008220853, "grad_norm": 17.15812435440335, "learning_rate": 3.452651934814576e-05, "loss": 2.5533, "mean_token_accuracy": 0.38657635152339936, "step": 34280 }, { "epoch": 0.0345322161353127, "grad_norm": 14.954269071815194, "learning_rate": 3.453155530487682e-05, "loss": 2.3977, "mean_token_accuracy": 0.4, "step": 34285 }, { "epoch": 0.03453725218841688, "grad_norm": 14.705818735346607, "learning_rate": 3.453659126160788e-05, "loss": 2.2945, "mean_token_accuracy": 0.4730295598506927, "step": 34290 }, { "epoch": 0.03454228824152105, "grad_norm": 15.915321480315606, "learning_rate": 3.454162721833894e-05, "loss": 2.4169, "mean_token_accuracy": 0.42758620381355283, "step": 34295 }, { "epoch": 0.034547324294625224, "grad_norm": 15.907780587149457, "learning_rate": 3.454666317507e-05, "loss": 2.6603, "mean_token_accuracy": 0.40689654350280763, "step": 34300 }, { "epoch": 0.0345523603477294, "grad_norm": 19.0911661738502, "learning_rate": 3.4551699131801065e-05, "loss": 2.3442, "mean_token_accuracy": 0.4332123339176178, "step": 34305 }, { "epoch": 0.034557396400833565, "grad_norm": 15.393271498408437, "learning_rate": 3.4556735088532124e-05, "loss": 2.8674, "mean_token_accuracy": 0.3482758581638336, "step": 34310 }, { "epoch": 0.03456243245393774, "grad_norm": 17.61540307012319, "learning_rate": 3.456177104526318e-05, "loss": 2.7625, "mean_token_accuracy": 0.3793103456497192, "step": 34315 }, { "epoch": 0.03456746850704191, "grad_norm": 16.679858098993158, "learning_rate": 3.456680700199424e-05, "loss": 3.1326, "mean_token_accuracy": 0.36896551251411436, "step": 34320 }, { "epoch": 0.034572504560146086, "grad_norm": 14.909692828759107, "learning_rate": 3.4571842958725295e-05, "loss": 2.7697, "mean_token_accuracy": 0.35862069129943847, "step": 34325 }, { "epoch": 0.03457754061325026, "grad_norm": 15.445383712202576, "learning_rate": 3.457687891545636e-05, "loss": 2.3825, "mean_token_accuracy": 0.4068965554237366, "step": 34330 }, { "epoch": 0.034582576666354434, "grad_norm": 13.930526148131197, "learning_rate": 3.458191487218742e-05, "loss": 2.4642, "mean_token_accuracy": 0.3999999940395355, "step": 34335 }, { "epoch": 0.03458761271945861, "grad_norm": 13.47338949714128, "learning_rate": 3.458695082891848e-05, "loss": 2.2247, "mean_token_accuracy": 0.4482758641242981, "step": 34340 }, { "epoch": 0.034592648772562774, "grad_norm": 14.514223383734237, "learning_rate": 3.459198678564954e-05, "loss": 2.5925, "mean_token_accuracy": 0.36551723480224607, "step": 34345 }, { "epoch": 0.03459768482566695, "grad_norm": 14.954291704610474, "learning_rate": 3.45970227423806e-05, "loss": 2.1633, "mean_token_accuracy": 0.4571687877178192, "step": 34350 }, { "epoch": 0.03460272087877112, "grad_norm": 19.142415862922576, "learning_rate": 3.4602058699111664e-05, "loss": 2.6894, "mean_token_accuracy": 0.4068965494632721, "step": 34355 }, { "epoch": 0.034607756931875296, "grad_norm": 16.734867101987728, "learning_rate": 3.460709465584272e-05, "loss": 2.5994, "mean_token_accuracy": 0.34827586114406583, "step": 34360 }, { "epoch": 0.03461279298497947, "grad_norm": 19.547124104281572, "learning_rate": 3.4612130612573776e-05, "loss": 2.5978, "mean_token_accuracy": 0.4, "step": 34365 }, { "epoch": 0.03461782903808364, "grad_norm": 16.724276952531444, "learning_rate": 3.4617166569304835e-05, "loss": 2.4564, "mean_token_accuracy": 0.42413793206214906, "step": 34370 }, { "epoch": 0.03462286509118782, "grad_norm": 15.59306249631678, "learning_rate": 3.4622202526035894e-05, "loss": 2.5072, "mean_token_accuracy": 0.358620685338974, "step": 34375 }, { "epoch": 0.034627901144291984, "grad_norm": 17.215472544268252, "learning_rate": 3.462723848276696e-05, "loss": 2.7569, "mean_token_accuracy": 0.3655172437429428, "step": 34380 }, { "epoch": 0.03463293719739616, "grad_norm": 17.4131492149336, "learning_rate": 3.463227443949802e-05, "loss": 2.6924, "mean_token_accuracy": 0.36551723778247835, "step": 34385 }, { "epoch": 0.03463797325050033, "grad_norm": 16.497925299239114, "learning_rate": 3.463731039622908e-05, "loss": 2.4546, "mean_token_accuracy": 0.43103448748588563, "step": 34390 }, { "epoch": 0.034643009303604505, "grad_norm": 13.008799173253962, "learning_rate": 3.464234635296014e-05, "loss": 2.4753, "mean_token_accuracy": 0.3862068921327591, "step": 34395 }, { "epoch": 0.03464804535670868, "grad_norm": 13.883261062337679, "learning_rate": 3.46473823096912e-05, "loss": 2.2983, "mean_token_accuracy": 0.4348457396030426, "step": 34400 }, { "epoch": 0.03465308140981285, "grad_norm": 14.406086326141299, "learning_rate": 3.4652418266422257e-05, "loss": 2.3777, "mean_token_accuracy": 0.4344827592372894, "step": 34405 }, { "epoch": 0.034658117462917026, "grad_norm": 15.925165431045617, "learning_rate": 3.4657454223153316e-05, "loss": 2.4159, "mean_token_accuracy": 0.4034482777118683, "step": 34410 }, { "epoch": 0.03466315351602119, "grad_norm": 14.472032126744317, "learning_rate": 3.4662490179884375e-05, "loss": 2.736, "mean_token_accuracy": 0.3793103456497192, "step": 34415 }, { "epoch": 0.03466818956912537, "grad_norm": 13.570949773151083, "learning_rate": 3.4667526136615434e-05, "loss": 2.4162, "mean_token_accuracy": 0.42758620977401735, "step": 34420 }, { "epoch": 0.03467322562222954, "grad_norm": 13.788320520304786, "learning_rate": 3.4672562093346494e-05, "loss": 2.8515, "mean_token_accuracy": 0.3896551728248596, "step": 34425 }, { "epoch": 0.034678261675333714, "grad_norm": 10.106099830762087, "learning_rate": 3.467759805007755e-05, "loss": 2.1511, "mean_token_accuracy": 0.42906404137611387, "step": 34430 }, { "epoch": 0.03468329772843789, "grad_norm": 16.513953567921412, "learning_rate": 3.468263400680862e-05, "loss": 2.4459, "mean_token_accuracy": 0.42413792610168455, "step": 34435 }, { "epoch": 0.03468833378154206, "grad_norm": 24.889540564844093, "learning_rate": 3.468766996353968e-05, "loss": 2.6426, "mean_token_accuracy": 0.3862069010734558, "step": 34440 }, { "epoch": 0.034693369834646236, "grad_norm": 20.250625369503194, "learning_rate": 3.469270592027074e-05, "loss": 2.6332, "mean_token_accuracy": 0.41034482717514037, "step": 34445 }, { "epoch": 0.0346984058877504, "grad_norm": 14.376829350731116, "learning_rate": 3.4697741877001797e-05, "loss": 2.7246, "mean_token_accuracy": 0.36896551251411436, "step": 34450 }, { "epoch": 0.034703441940854576, "grad_norm": 15.168909115388761, "learning_rate": 3.470277783373285e-05, "loss": 2.7558, "mean_token_accuracy": 0.35862069129943847, "step": 34455 }, { "epoch": 0.03470847799395875, "grad_norm": 13.919353414976857, "learning_rate": 3.4707813790463915e-05, "loss": 2.5143, "mean_token_accuracy": 0.42068964838981626, "step": 34460 }, { "epoch": 0.034713514047062924, "grad_norm": 16.886317898872004, "learning_rate": 3.4712849747194974e-05, "loss": 2.215, "mean_token_accuracy": 0.42758620977401735, "step": 34465 }, { "epoch": 0.0347185501001671, "grad_norm": 13.6069989621035, "learning_rate": 3.4717885703926034e-05, "loss": 2.5228, "mean_token_accuracy": 0.4206896543502808, "step": 34470 }, { "epoch": 0.03472358615327127, "grad_norm": 13.256595197787686, "learning_rate": 3.472292166065709e-05, "loss": 2.5866, "mean_token_accuracy": 0.417241370677948, "step": 34475 }, { "epoch": 0.034728622206375445, "grad_norm": 16.6983576774615, "learning_rate": 3.472795761738815e-05, "loss": 2.7045, "mean_token_accuracy": 0.37931033968925476, "step": 34480 }, { "epoch": 0.03473365825947961, "grad_norm": 14.284368255583972, "learning_rate": 3.473299357411921e-05, "loss": 2.3891, "mean_token_accuracy": 0.39310344457626345, "step": 34485 }, { "epoch": 0.034738694312583786, "grad_norm": 18.71597346867723, "learning_rate": 3.473802953085028e-05, "loss": 2.8608, "mean_token_accuracy": 0.3724137872457504, "step": 34490 }, { "epoch": 0.03474373036568796, "grad_norm": 18.778149507658444, "learning_rate": 3.474306548758134e-05, "loss": 2.6009, "mean_token_accuracy": 0.3482758581638336, "step": 34495 }, { "epoch": 0.03474876641879213, "grad_norm": 18.738622273976702, "learning_rate": 3.474810144431239e-05, "loss": 2.4465, "mean_token_accuracy": 0.37586206793785093, "step": 34500 }, { "epoch": 0.03475380247189631, "grad_norm": 15.907321198418725, "learning_rate": 3.475313740104345e-05, "loss": 2.4409, "mean_token_accuracy": 0.4172413766384125, "step": 34505 }, { "epoch": 0.03475883852500048, "grad_norm": 12.991090430191488, "learning_rate": 3.475817335777451e-05, "loss": 2.5997, "mean_token_accuracy": 0.41034482717514037, "step": 34510 }, { "epoch": 0.034763874578104655, "grad_norm": 16.413449966488265, "learning_rate": 3.4763209314505574e-05, "loss": 2.6908, "mean_token_accuracy": 0.3931034505367279, "step": 34515 }, { "epoch": 0.03476891063120882, "grad_norm": 17.890136832056744, "learning_rate": 3.476824527123663e-05, "loss": 2.7192, "mean_token_accuracy": 0.37586206793785093, "step": 34520 }, { "epoch": 0.034773946684312995, "grad_norm": 14.8285509932761, "learning_rate": 3.477328122796769e-05, "loss": 2.5435, "mean_token_accuracy": 0.4156079888343811, "step": 34525 }, { "epoch": 0.03477898273741717, "grad_norm": 13.982453703419386, "learning_rate": 3.477831718469875e-05, "loss": 2.1683, "mean_token_accuracy": 0.46551724076271056, "step": 34530 }, { "epoch": 0.03478401879052134, "grad_norm": 13.888591547304285, "learning_rate": 3.478335314142981e-05, "loss": 2.3196, "mean_token_accuracy": 0.45172414779663084, "step": 34535 }, { "epoch": 0.034789054843625516, "grad_norm": 15.662586782728326, "learning_rate": 3.478838909816087e-05, "loss": 2.5761, "mean_token_accuracy": 0.4379310369491577, "step": 34540 }, { "epoch": 0.03479409089672969, "grad_norm": 21.941888073634633, "learning_rate": 3.479342505489193e-05, "loss": 2.6451, "mean_token_accuracy": 0.42758620977401735, "step": 34545 }, { "epoch": 0.034799126949833864, "grad_norm": 14.658407588655695, "learning_rate": 3.479846101162299e-05, "loss": 2.5867, "mean_token_accuracy": 0.46551724076271056, "step": 34550 }, { "epoch": 0.03480416300293803, "grad_norm": 14.58469748594041, "learning_rate": 3.480349696835405e-05, "loss": 2.6283, "mean_token_accuracy": 0.41724138259887694, "step": 34555 }, { "epoch": 0.034809199056042205, "grad_norm": 14.613418263710889, "learning_rate": 3.480853292508511e-05, "loss": 2.705, "mean_token_accuracy": 0.3689655214548111, "step": 34560 }, { "epoch": 0.03481423510914638, "grad_norm": 16.590421665815768, "learning_rate": 3.4813568881816166e-05, "loss": 2.3128, "mean_token_accuracy": 0.43103447556495667, "step": 34565 }, { "epoch": 0.03481927116225055, "grad_norm": 17.28619264389333, "learning_rate": 3.481860483854723e-05, "loss": 2.5566, "mean_token_accuracy": 0.39310343861579894, "step": 34570 }, { "epoch": 0.034824307215354726, "grad_norm": 14.897544557703494, "learning_rate": 3.482364079527829e-05, "loss": 3.0206, "mean_token_accuracy": 0.3849364757537842, "step": 34575 }, { "epoch": 0.0348293432684589, "grad_norm": 29.75429863225802, "learning_rate": 3.482867675200935e-05, "loss": 2.957, "mean_token_accuracy": 0.3482758581638336, "step": 34580 }, { "epoch": 0.03483437932156307, "grad_norm": 12.777747861410624, "learning_rate": 3.483371270874041e-05, "loss": 2.4241, "mean_token_accuracy": 0.47586206793785096, "step": 34585 }, { "epoch": 0.03483941537466724, "grad_norm": 19.06062655928057, "learning_rate": 3.483874866547146e-05, "loss": 2.4463, "mean_token_accuracy": 0.4068965494632721, "step": 34590 }, { "epoch": 0.034844451427771414, "grad_norm": 14.880322484069044, "learning_rate": 3.484378462220253e-05, "loss": 2.5543, "mean_token_accuracy": 0.3655172407627106, "step": 34595 }, { "epoch": 0.03484948748087559, "grad_norm": 18.2088328779293, "learning_rate": 3.484882057893359e-05, "loss": 2.5652, "mean_token_accuracy": 0.4172413766384125, "step": 34600 }, { "epoch": 0.03485452353397976, "grad_norm": 17.248621778399723, "learning_rate": 3.485385653566465e-05, "loss": 2.6376, "mean_token_accuracy": 0.4206896543502808, "step": 34605 }, { "epoch": 0.034859559587083935, "grad_norm": 14.471448228827747, "learning_rate": 3.4858892492395706e-05, "loss": 2.1248, "mean_token_accuracy": 0.40344828367233276, "step": 34610 }, { "epoch": 0.03486459564018811, "grad_norm": 20.867260112306372, "learning_rate": 3.4863928449126765e-05, "loss": 2.8706, "mean_token_accuracy": 0.37241379022598264, "step": 34615 }, { "epoch": 0.03486963169329228, "grad_norm": 15.25798863459342, "learning_rate": 3.486896440585783e-05, "loss": 2.6889, "mean_token_accuracy": 0.4, "step": 34620 }, { "epoch": 0.03487466774639645, "grad_norm": 14.659442558739642, "learning_rate": 3.487400036258889e-05, "loss": 2.6817, "mean_token_accuracy": 0.38965516686439516, "step": 34625 }, { "epoch": 0.03487970379950062, "grad_norm": 15.560786355028446, "learning_rate": 3.487903631931994e-05, "loss": 2.3978, "mean_token_accuracy": 0.4448275864124298, "step": 34630 }, { "epoch": 0.0348847398526048, "grad_norm": 13.752442286182815, "learning_rate": 3.4884072276051e-05, "loss": 2.6445, "mean_token_accuracy": 0.4448275864124298, "step": 34635 }, { "epoch": 0.03488977590570897, "grad_norm": 15.973012628786337, "learning_rate": 3.488910823278206e-05, "loss": 2.7024, "mean_token_accuracy": 0.38275861740112305, "step": 34640 }, { "epoch": 0.034894811958813145, "grad_norm": 13.909753541908835, "learning_rate": 3.489414418951312e-05, "loss": 2.222, "mean_token_accuracy": 0.4413793087005615, "step": 34645 }, { "epoch": 0.03489984801191732, "grad_norm": 13.897338376267177, "learning_rate": 3.489918014624419e-05, "loss": 2.4116, "mean_token_accuracy": 0.38275861740112305, "step": 34650 }, { "epoch": 0.03490488406502149, "grad_norm": 19.41903406560839, "learning_rate": 3.4904216102975246e-05, "loss": 2.4505, "mean_token_accuracy": 0.39655172228813174, "step": 34655 }, { "epoch": 0.03490992011812566, "grad_norm": 20.658938205485974, "learning_rate": 3.4909252059706306e-05, "loss": 2.5236, "mean_token_accuracy": 0.42413793206214906, "step": 34660 }, { "epoch": 0.03491495617122983, "grad_norm": 17.096764502387536, "learning_rate": 3.4914288016437365e-05, "loss": 2.7408, "mean_token_accuracy": 0.41724138259887694, "step": 34665 }, { "epoch": 0.03491999222433401, "grad_norm": 15.998200593539226, "learning_rate": 3.4919323973168424e-05, "loss": 2.5951, "mean_token_accuracy": 0.3689655214548111, "step": 34670 }, { "epoch": 0.03492502827743818, "grad_norm": 13.182363512643608, "learning_rate": 3.492435992989948e-05, "loss": 2.6719, "mean_token_accuracy": 0.3931034505367279, "step": 34675 }, { "epoch": 0.034930064330542354, "grad_norm": 16.873519779320084, "learning_rate": 3.492939588663054e-05, "loss": 2.606, "mean_token_accuracy": 0.4256503343582153, "step": 34680 }, { "epoch": 0.03493510038364653, "grad_norm": 22.750745524513793, "learning_rate": 3.49344318433616e-05, "loss": 2.6748, "mean_token_accuracy": 0.3827586233615875, "step": 34685 }, { "epoch": 0.0349401364367507, "grad_norm": 24.074574644803008, "learning_rate": 3.493946780009266e-05, "loss": 2.6533, "mean_token_accuracy": 0.3793103456497192, "step": 34690 }, { "epoch": 0.03494517248985487, "grad_norm": 16.49382448473932, "learning_rate": 3.494450375682372e-05, "loss": 2.4331, "mean_token_accuracy": 0.4206896543502808, "step": 34695 }, { "epoch": 0.03495020854295904, "grad_norm": 15.617841678912903, "learning_rate": 3.4949539713554786e-05, "loss": 2.652, "mean_token_accuracy": 0.39310344457626345, "step": 34700 }, { "epoch": 0.034955244596063216, "grad_norm": 12.513930646665889, "learning_rate": 3.4954575670285846e-05, "loss": 2.5435, "mean_token_accuracy": 0.39655172228813174, "step": 34705 }, { "epoch": 0.03496028064916739, "grad_norm": 20.413935791725716, "learning_rate": 3.4959611627016905e-05, "loss": 3.162, "mean_token_accuracy": 0.3310344874858856, "step": 34710 }, { "epoch": 0.034965316702271564, "grad_norm": 17.720985862913732, "learning_rate": 3.4964647583747964e-05, "loss": 2.3776, "mean_token_accuracy": 0.4517241358757019, "step": 34715 }, { "epoch": 0.03497035275537574, "grad_norm": 14.364985700435367, "learning_rate": 3.496968354047902e-05, "loss": 2.6911, "mean_token_accuracy": 0.36896551251411436, "step": 34720 }, { "epoch": 0.03497538880847991, "grad_norm": 16.598430228124727, "learning_rate": 3.4974719497210076e-05, "loss": 2.4671, "mean_token_accuracy": 0.4137930989265442, "step": 34725 }, { "epoch": 0.03498042486158408, "grad_norm": 14.3754381441058, "learning_rate": 3.497975545394114e-05, "loss": 2.1248, "mean_token_accuracy": 0.4689655125141144, "step": 34730 }, { "epoch": 0.03498546091468825, "grad_norm": 13.646445526115247, "learning_rate": 3.49847914106722e-05, "loss": 2.5737, "mean_token_accuracy": 0.358620697259903, "step": 34735 }, { "epoch": 0.034990496967792425, "grad_norm": 17.308369773458114, "learning_rate": 3.498982736740326e-05, "loss": 2.7151, "mean_token_accuracy": 0.4255898416042328, "step": 34740 }, { "epoch": 0.0349955330208966, "grad_norm": 15.08407907683832, "learning_rate": 3.499486332413432e-05, "loss": 2.221, "mean_token_accuracy": 0.44827587008476255, "step": 34745 }, { "epoch": 0.03500056907400077, "grad_norm": 15.91755307247523, "learning_rate": 3.499989928086538e-05, "loss": 2.5029, "mean_token_accuracy": 0.441379314661026, "step": 34750 }, { "epoch": 0.03500560512710495, "grad_norm": 18.520511646762156, "learning_rate": 3.5004935237596445e-05, "loss": 2.3419, "mean_token_accuracy": 0.4362371563911438, "step": 34755 }, { "epoch": 0.03501064118020912, "grad_norm": 18.711625749401353, "learning_rate": 3.5009971194327504e-05, "loss": 2.8273, "mean_token_accuracy": 0.4, "step": 34760 }, { "epoch": 0.03501567723331329, "grad_norm": 19.699896135275633, "learning_rate": 3.501500715105856e-05, "loss": 2.7136, "mean_token_accuracy": 0.4068965554237366, "step": 34765 }, { "epoch": 0.03502071328641746, "grad_norm": 15.6250548239187, "learning_rate": 3.5020043107789616e-05, "loss": 3.0684, "mean_token_accuracy": 0.3655172407627106, "step": 34770 }, { "epoch": 0.035025749339521635, "grad_norm": 14.204615655023026, "learning_rate": 3.5025079064520675e-05, "loss": 2.4801, "mean_token_accuracy": 0.39655173420906065, "step": 34775 }, { "epoch": 0.03503078539262581, "grad_norm": 12.056004899335262, "learning_rate": 3.503011502125174e-05, "loss": 2.7502, "mean_token_accuracy": 0.3758620619773865, "step": 34780 }, { "epoch": 0.03503582144572998, "grad_norm": 12.752678351786745, "learning_rate": 3.50351509779828e-05, "loss": 2.6846, "mean_token_accuracy": 0.37241379022598264, "step": 34785 }, { "epoch": 0.035040857498834156, "grad_norm": 13.187990599671174, "learning_rate": 3.504018693471386e-05, "loss": 2.3376, "mean_token_accuracy": 0.4379310429096222, "step": 34790 }, { "epoch": 0.03504589355193833, "grad_norm": 15.3408474812755, "learning_rate": 3.504522289144492e-05, "loss": 2.5433, "mean_token_accuracy": 0.4034482777118683, "step": 34795 }, { "epoch": 0.0350509296050425, "grad_norm": 13.461076495520665, "learning_rate": 3.505025884817598e-05, "loss": 2.5766, "mean_token_accuracy": 0.3862068891525269, "step": 34800 }, { "epoch": 0.03505596565814667, "grad_norm": 13.60327195472584, "learning_rate": 3.505529480490704e-05, "loss": 2.196, "mean_token_accuracy": 0.4517241358757019, "step": 34805 }, { "epoch": 0.035061001711250844, "grad_norm": 14.874605979394588, "learning_rate": 3.50603307616381e-05, "loss": 2.4762, "mean_token_accuracy": 0.4448275864124298, "step": 34810 }, { "epoch": 0.03506603776435502, "grad_norm": 18.440378456217555, "learning_rate": 3.5065366718369156e-05, "loss": 2.3704, "mean_token_accuracy": 0.43448275327682495, "step": 34815 }, { "epoch": 0.03507107381745919, "grad_norm": 13.8060401283756, "learning_rate": 3.5070402675100215e-05, "loss": 2.5296, "mean_token_accuracy": 0.3827586203813553, "step": 34820 }, { "epoch": 0.035076109870563366, "grad_norm": 13.902660422625276, "learning_rate": 3.5075438631831274e-05, "loss": 2.4656, "mean_token_accuracy": 0.4, "step": 34825 }, { "epoch": 0.03508114592366753, "grad_norm": 14.433850535863852, "learning_rate": 3.5080474588562334e-05, "loss": 2.6364, "mean_token_accuracy": 0.4241379380226135, "step": 34830 }, { "epoch": 0.035086181976771706, "grad_norm": 15.626413363696457, "learning_rate": 3.50855105452934e-05, "loss": 2.7109, "mean_token_accuracy": 0.39655172228813174, "step": 34835 }, { "epoch": 0.03509121802987588, "grad_norm": 14.906736564406229, "learning_rate": 3.509054650202446e-05, "loss": 2.289, "mean_token_accuracy": 0.38620689511299133, "step": 34840 }, { "epoch": 0.035096254082980054, "grad_norm": 16.290965766880536, "learning_rate": 3.509558245875552e-05, "loss": 2.5371, "mean_token_accuracy": 0.39655172228813174, "step": 34845 }, { "epoch": 0.03510129013608423, "grad_norm": 14.443031955984175, "learning_rate": 3.510061841548658e-05, "loss": 2.3989, "mean_token_accuracy": 0.3896551728248596, "step": 34850 }, { "epoch": 0.0351063261891884, "grad_norm": 16.647428570156077, "learning_rate": 3.510565437221764e-05, "loss": 2.1607, "mean_token_accuracy": 0.4620689570903778, "step": 34855 }, { "epoch": 0.035111362242292575, "grad_norm": 20.02522224677574, "learning_rate": 3.5110690328948696e-05, "loss": 2.7904, "mean_token_accuracy": 0.34137930572032926, "step": 34860 }, { "epoch": 0.03511639829539674, "grad_norm": 13.799344289018075, "learning_rate": 3.5115726285679755e-05, "loss": 2.7876, "mean_token_accuracy": 0.4172413766384125, "step": 34865 }, { "epoch": 0.035121434348500916, "grad_norm": 21.597128027383445, "learning_rate": 3.5120762242410814e-05, "loss": 2.3981, "mean_token_accuracy": 0.4103448331356049, "step": 34870 }, { "epoch": 0.03512647040160509, "grad_norm": 14.612752673523506, "learning_rate": 3.5125798199141874e-05, "loss": 2.2843, "mean_token_accuracy": 0.41034482717514037, "step": 34875 }, { "epoch": 0.03513150645470926, "grad_norm": 17.293872710631845, "learning_rate": 3.513083415587293e-05, "loss": 2.7855, "mean_token_accuracy": 0.3655172407627106, "step": 34880 }, { "epoch": 0.03513654250781344, "grad_norm": 13.49845232578023, "learning_rate": 3.5135870112604e-05, "loss": 2.2603, "mean_token_accuracy": 0.46551724076271056, "step": 34885 }, { "epoch": 0.03514157856091761, "grad_norm": 20.155548607584144, "learning_rate": 3.514090606933506e-05, "loss": 2.6396, "mean_token_accuracy": 0.39310344457626345, "step": 34890 }, { "epoch": 0.035146614614021784, "grad_norm": 19.714118561070414, "learning_rate": 3.514594202606612e-05, "loss": 2.3158, "mean_token_accuracy": 0.4396249234676361, "step": 34895 }, { "epoch": 0.03515165066712595, "grad_norm": 16.650278951914416, "learning_rate": 3.515097798279717e-05, "loss": 2.9605, "mean_token_accuracy": 0.3448275804519653, "step": 34900 }, { "epoch": 0.035156686720230125, "grad_norm": 13.648467445858389, "learning_rate": 3.515601393952823e-05, "loss": 2.4078, "mean_token_accuracy": 0.4103448301553726, "step": 34905 }, { "epoch": 0.0351617227733343, "grad_norm": 14.467172873578784, "learning_rate": 3.516104989625929e-05, "loss": 2.3961, "mean_token_accuracy": 0.4448275864124298, "step": 34910 }, { "epoch": 0.03516675882643847, "grad_norm": 17.040386607396222, "learning_rate": 3.5166085852990355e-05, "loss": 2.6022, "mean_token_accuracy": 0.39310344457626345, "step": 34915 }, { "epoch": 0.035171794879542646, "grad_norm": 15.238184139793118, "learning_rate": 3.5171121809721414e-05, "loss": 2.3827, "mean_token_accuracy": 0.4068965554237366, "step": 34920 }, { "epoch": 0.03517683093264682, "grad_norm": 16.129892640271386, "learning_rate": 3.517615776645247e-05, "loss": 2.2233, "mean_token_accuracy": 0.46896551847457885, "step": 34925 }, { "epoch": 0.035181866985750994, "grad_norm": 11.842092803308393, "learning_rate": 3.518119372318353e-05, "loss": 3.1096, "mean_token_accuracy": 0.39655172228813174, "step": 34930 }, { "epoch": 0.03518690303885516, "grad_norm": 15.73663039863801, "learning_rate": 3.518622967991459e-05, "loss": 2.4456, "mean_token_accuracy": 0.4310344815254211, "step": 34935 }, { "epoch": 0.035191939091959334, "grad_norm": 14.288817774901093, "learning_rate": 3.519126563664565e-05, "loss": 2.2833, "mean_token_accuracy": 0.42413793206214906, "step": 34940 }, { "epoch": 0.03519697514506351, "grad_norm": 14.469159702042143, "learning_rate": 3.519630159337671e-05, "loss": 2.655, "mean_token_accuracy": 0.38965516686439516, "step": 34945 }, { "epoch": 0.03520201119816768, "grad_norm": 18.09243880448009, "learning_rate": 3.520133755010777e-05, "loss": 2.4444, "mean_token_accuracy": 0.424137943983078, "step": 34950 }, { "epoch": 0.035207047251271856, "grad_norm": 16.350324481796303, "learning_rate": 3.520637350683883e-05, "loss": 2.7826, "mean_token_accuracy": 0.4034482777118683, "step": 34955 }, { "epoch": 0.03521208330437603, "grad_norm": 17.700875394178883, "learning_rate": 3.521140946356989e-05, "loss": 3.0237, "mean_token_accuracy": 0.39655171930789945, "step": 34960 }, { "epoch": 0.0352171193574802, "grad_norm": 12.57047174349908, "learning_rate": 3.5216445420300954e-05, "loss": 2.6225, "mean_token_accuracy": 0.41379310488700866, "step": 34965 }, { "epoch": 0.03522215541058437, "grad_norm": 13.7806588895724, "learning_rate": 3.522148137703201e-05, "loss": 2.5445, "mean_token_accuracy": 0.42546883821487425, "step": 34970 }, { "epoch": 0.035227191463688544, "grad_norm": 15.1779189230023, "learning_rate": 3.522651733376307e-05, "loss": 2.4448, "mean_token_accuracy": 0.42068966031074523, "step": 34975 }, { "epoch": 0.03523222751679272, "grad_norm": 16.945905737412804, "learning_rate": 3.523155329049413e-05, "loss": 2.5292, "mean_token_accuracy": 0.41724138259887694, "step": 34980 }, { "epoch": 0.03523726356989689, "grad_norm": 15.156172836972884, "learning_rate": 3.523658924722519e-05, "loss": 2.3893, "mean_token_accuracy": 0.44482759237289426, "step": 34985 }, { "epoch": 0.035242299623001065, "grad_norm": 15.150424613832756, "learning_rate": 3.524162520395624e-05, "loss": 2.5776, "mean_token_accuracy": 0.4, "step": 34990 }, { "epoch": 0.03524733567610524, "grad_norm": 13.181538344579817, "learning_rate": 3.524666116068731e-05, "loss": 2.5919, "mean_token_accuracy": 0.38620689511299133, "step": 34995 }, { "epoch": 0.03525237172920941, "grad_norm": 14.545185599738529, "learning_rate": 3.525169711741837e-05, "loss": 2.4735, "mean_token_accuracy": 0.4292801022529602, "step": 35000 }, { "epoch": 0.03525740778231358, "grad_norm": 15.321789890941083, "learning_rate": 3.525673307414943e-05, "loss": 2.3938, "mean_token_accuracy": 0.4103448212146759, "step": 35005 }, { "epoch": 0.03526244383541775, "grad_norm": 11.889020495085513, "learning_rate": 3.526176903088049e-05, "loss": 2.1852, "mean_token_accuracy": 0.43793103098869324, "step": 35010 }, { "epoch": 0.03526747988852193, "grad_norm": 12.509413225799198, "learning_rate": 3.5266804987611546e-05, "loss": 2.5663, "mean_token_accuracy": 0.41379310488700866, "step": 35015 }, { "epoch": 0.0352725159416261, "grad_norm": 17.708691800389598, "learning_rate": 3.527184094434261e-05, "loss": 2.9172, "mean_token_accuracy": 0.3252268582582474, "step": 35020 }, { "epoch": 0.035277551994730275, "grad_norm": 15.933069921103863, "learning_rate": 3.527687690107367e-05, "loss": 2.6766, "mean_token_accuracy": 0.39806412160396576, "step": 35025 }, { "epoch": 0.03528258804783445, "grad_norm": 13.240292165154473, "learning_rate": 3.528191285780473e-05, "loss": 2.354, "mean_token_accuracy": 0.41724137365818026, "step": 35030 }, { "epoch": 0.03528762410093862, "grad_norm": 14.053145028200777, "learning_rate": 3.528694881453578e-05, "loss": 2.2076, "mean_token_accuracy": 0.4482758641242981, "step": 35035 }, { "epoch": 0.03529266015404279, "grad_norm": 23.347071559969965, "learning_rate": 3.529198477126684e-05, "loss": 3.0553, "mean_token_accuracy": 0.35172412991523744, "step": 35040 }, { "epoch": 0.03529769620714696, "grad_norm": 15.750660845861152, "learning_rate": 3.529702072799791e-05, "loss": 2.5436, "mean_token_accuracy": 0.33103448152542114, "step": 35045 }, { "epoch": 0.035302732260251136, "grad_norm": 17.313315106769004, "learning_rate": 3.530205668472897e-05, "loss": 2.7395, "mean_token_accuracy": 0.3344827562570572, "step": 35050 }, { "epoch": 0.03530776831335531, "grad_norm": 14.856269865060684, "learning_rate": 3.530709264146003e-05, "loss": 2.405, "mean_token_accuracy": 0.43793103098869324, "step": 35055 }, { "epoch": 0.035312804366459484, "grad_norm": 19.8915450732745, "learning_rate": 3.5312128598191086e-05, "loss": 2.6414, "mean_token_accuracy": 0.42068964838981626, "step": 35060 }, { "epoch": 0.03531784041956366, "grad_norm": 18.253268800770304, "learning_rate": 3.5317164554922146e-05, "loss": 2.8963, "mean_token_accuracy": 0.33793103098869326, "step": 35065 }, { "epoch": 0.03532287647266783, "grad_norm": 15.799863142352251, "learning_rate": 3.5322200511653205e-05, "loss": 2.3502, "mean_token_accuracy": 0.42413792610168455, "step": 35070 }, { "epoch": 0.035327912525772, "grad_norm": 12.160868357161332, "learning_rate": 3.5327236468384264e-05, "loss": 2.3084, "mean_token_accuracy": 0.46551724076271056, "step": 35075 }, { "epoch": 0.03533294857887617, "grad_norm": 16.148838752236983, "learning_rate": 3.5332272425115323e-05, "loss": 2.1896, "mean_token_accuracy": 0.4206896543502808, "step": 35080 }, { "epoch": 0.035337984631980346, "grad_norm": 15.60361082735607, "learning_rate": 3.533730838184638e-05, "loss": 2.775, "mean_token_accuracy": 0.358620685338974, "step": 35085 }, { "epoch": 0.03534302068508452, "grad_norm": 15.185444271243693, "learning_rate": 3.534234433857744e-05, "loss": 2.2067, "mean_token_accuracy": 0.5, "step": 35090 }, { "epoch": 0.03534805673818869, "grad_norm": 17.283525375310372, "learning_rate": 3.53473802953085e-05, "loss": 2.8294, "mean_token_accuracy": 0.39655172228813174, "step": 35095 }, { "epoch": 0.03535309279129287, "grad_norm": 13.296519780993291, "learning_rate": 3.535241625203957e-05, "loss": 2.9436, "mean_token_accuracy": 0.39655172228813174, "step": 35100 }, { "epoch": 0.03535812884439704, "grad_norm": 14.186059246960328, "learning_rate": 3.5357452208770626e-05, "loss": 2.7963, "mean_token_accuracy": 0.3793103456497192, "step": 35105 }, { "epoch": 0.03536316489750121, "grad_norm": 14.568811557967948, "learning_rate": 3.5362488165501686e-05, "loss": 2.6306, "mean_token_accuracy": 0.41379310488700866, "step": 35110 }, { "epoch": 0.03536820095060538, "grad_norm": 14.723056368932362, "learning_rate": 3.5367524122232745e-05, "loss": 2.6108, "mean_token_accuracy": 0.41034482717514037, "step": 35115 }, { "epoch": 0.035373237003709555, "grad_norm": 12.836738126795844, "learning_rate": 3.5372560078963804e-05, "loss": 2.3457, "mean_token_accuracy": 0.44827585816383364, "step": 35120 }, { "epoch": 0.03537827305681373, "grad_norm": 15.460265128156244, "learning_rate": 3.5377596035694863e-05, "loss": 2.3418, "mean_token_accuracy": 0.43103447556495667, "step": 35125 }, { "epoch": 0.0353833091099179, "grad_norm": 17.166241826087635, "learning_rate": 3.538263199242592e-05, "loss": 2.6302, "mean_token_accuracy": 0.39764065146446226, "step": 35130 }, { "epoch": 0.03538834516302208, "grad_norm": 13.816056938682053, "learning_rate": 3.538766794915698e-05, "loss": 2.3703, "mean_token_accuracy": 0.4413793087005615, "step": 35135 }, { "epoch": 0.03539338121612625, "grad_norm": 18.217601662834483, "learning_rate": 3.539270390588804e-05, "loss": 2.4424, "mean_token_accuracy": 0.42413793206214906, "step": 35140 }, { "epoch": 0.03539841726923042, "grad_norm": 19.779283108787794, "learning_rate": 3.53977398626191e-05, "loss": 2.4743, "mean_token_accuracy": 0.42068966031074523, "step": 35145 }, { "epoch": 0.03540345332233459, "grad_norm": 17.37813870869696, "learning_rate": 3.540277581935016e-05, "loss": 2.3088, "mean_token_accuracy": 0.46206897497177124, "step": 35150 }, { "epoch": 0.035408489375438765, "grad_norm": 23.35273640600897, "learning_rate": 3.5407811776081226e-05, "loss": 3.076, "mean_token_accuracy": 0.3379310369491577, "step": 35155 }, { "epoch": 0.03541352542854294, "grad_norm": 15.224470085218696, "learning_rate": 3.5412847732812285e-05, "loss": 2.4235, "mean_token_accuracy": 0.4068965554237366, "step": 35160 }, { "epoch": 0.03541856148164711, "grad_norm": 14.824299459795785, "learning_rate": 3.541788368954334e-05, "loss": 2.4214, "mean_token_accuracy": 0.47991530895233153, "step": 35165 }, { "epoch": 0.035423597534751286, "grad_norm": 13.590798711527745, "learning_rate": 3.54229196462744e-05, "loss": 2.5867, "mean_token_accuracy": 0.3896551728248596, "step": 35170 }, { "epoch": 0.03542863358785546, "grad_norm": 16.92664178823344, "learning_rate": 3.5427955603005456e-05, "loss": 2.844, "mean_token_accuracy": 0.36896551847457887, "step": 35175 }, { "epoch": 0.03543366964095963, "grad_norm": 15.888231070112926, "learning_rate": 3.543299155973652e-05, "loss": 2.5928, "mean_token_accuracy": 0.34827585220336915, "step": 35180 }, { "epoch": 0.0354387056940638, "grad_norm": 16.887654805323134, "learning_rate": 3.543802751646758e-05, "loss": 2.5761, "mean_token_accuracy": 0.37241379618644715, "step": 35185 }, { "epoch": 0.035443741747167974, "grad_norm": 11.686358097261753, "learning_rate": 3.544306347319864e-05, "loss": 2.3297, "mean_token_accuracy": 0.4206896543502808, "step": 35190 }, { "epoch": 0.03544877780027215, "grad_norm": 15.46425221067465, "learning_rate": 3.54480994299297e-05, "loss": 2.5449, "mean_token_accuracy": 0.3758620619773865, "step": 35195 }, { "epoch": 0.03545381385337632, "grad_norm": 13.720126020308937, "learning_rate": 3.545313538666076e-05, "loss": 2.5084, "mean_token_accuracy": 0.3931034505367279, "step": 35200 }, { "epoch": 0.035458849906480495, "grad_norm": 14.070012196115618, "learning_rate": 3.545817134339182e-05, "loss": 2.3758, "mean_token_accuracy": 0.41929824352264405, "step": 35205 }, { "epoch": 0.03546388595958467, "grad_norm": 16.72302871606665, "learning_rate": 3.546320730012288e-05, "loss": 2.6234, "mean_token_accuracy": 0.3965517282485962, "step": 35210 }, { "epoch": 0.035468922012688836, "grad_norm": 14.700955514728081, "learning_rate": 3.546824325685394e-05, "loss": 2.4101, "mean_token_accuracy": 0.46394434571266174, "step": 35215 }, { "epoch": 0.03547395806579301, "grad_norm": 13.019223114973107, "learning_rate": 3.5473279213584996e-05, "loss": 2.2539, "mean_token_accuracy": 0.4896551728248596, "step": 35220 }, { "epoch": 0.035478994118897184, "grad_norm": 17.608092260275658, "learning_rate": 3.5478315170316055e-05, "loss": 3.2916, "mean_token_accuracy": 0.29999999403953553, "step": 35225 }, { "epoch": 0.03548403017200136, "grad_norm": 15.38901232639779, "learning_rate": 3.548335112704712e-05, "loss": 2.2428, "mean_token_accuracy": 0.47931033968925474, "step": 35230 }, { "epoch": 0.03548906622510553, "grad_norm": 13.538477744787103, "learning_rate": 3.548838708377818e-05, "loss": 2.1607, "mean_token_accuracy": 0.482758629322052, "step": 35235 }, { "epoch": 0.035494102278209705, "grad_norm": 11.764179887355546, "learning_rate": 3.549342304050924e-05, "loss": 2.2296, "mean_token_accuracy": 0.44482759237289426, "step": 35240 }, { "epoch": 0.03549913833131388, "grad_norm": 16.424727649302117, "learning_rate": 3.54984589972403e-05, "loss": 2.6642, "mean_token_accuracy": 0.3758620709180832, "step": 35245 }, { "epoch": 0.035504174384418045, "grad_norm": 13.469770967869868, "learning_rate": 3.550349495397136e-05, "loss": 2.0929, "mean_token_accuracy": 0.5344827592372894, "step": 35250 }, { "epoch": 0.03550921043752222, "grad_norm": 24.100702334841046, "learning_rate": 3.550853091070242e-05, "loss": 2.7332, "mean_token_accuracy": 0.39449485540390017, "step": 35255 }, { "epoch": 0.03551424649062639, "grad_norm": 14.76522486670013, "learning_rate": 3.551356686743348e-05, "loss": 2.6202, "mean_token_accuracy": 0.358620685338974, "step": 35260 }, { "epoch": 0.03551928254373057, "grad_norm": 13.964797598028799, "learning_rate": 3.5518602824164536e-05, "loss": 2.4625, "mean_token_accuracy": 0.42928009629249575, "step": 35265 }, { "epoch": 0.03552431859683474, "grad_norm": 15.861037821874467, "learning_rate": 3.5523638780895595e-05, "loss": 2.4677, "mean_token_accuracy": 0.45517241954803467, "step": 35270 }, { "epoch": 0.035529354649938914, "grad_norm": 12.198624988777901, "learning_rate": 3.5528674737626655e-05, "loss": 2.4007, "mean_token_accuracy": 0.4672111332416534, "step": 35275 }, { "epoch": 0.03553439070304309, "grad_norm": 14.407577776710445, "learning_rate": 3.5533710694357714e-05, "loss": 2.4147, "mean_token_accuracy": 0.4172413766384125, "step": 35280 }, { "epoch": 0.035539426756147255, "grad_norm": 11.017067355657874, "learning_rate": 3.553874665108878e-05, "loss": 2.4584, "mean_token_accuracy": 0.43793103098869324, "step": 35285 }, { "epoch": 0.03554446280925143, "grad_norm": 21.129298779075278, "learning_rate": 3.554378260781984e-05, "loss": 2.404, "mean_token_accuracy": 0.4034482717514038, "step": 35290 }, { "epoch": 0.0355494988623556, "grad_norm": 13.060785112936866, "learning_rate": 3.55488185645509e-05, "loss": 2.7439, "mean_token_accuracy": 0.37380520105361936, "step": 35295 }, { "epoch": 0.035554534915459776, "grad_norm": 13.979399790958748, "learning_rate": 3.555385452128195e-05, "loss": 2.4058, "mean_token_accuracy": 0.4206896543502808, "step": 35300 }, { "epoch": 0.03555957096856395, "grad_norm": 15.099301494686506, "learning_rate": 3.555889047801301e-05, "loss": 2.5701, "mean_token_accuracy": 0.3896551787853241, "step": 35305 }, { "epoch": 0.035564607021668124, "grad_norm": 15.470220849342617, "learning_rate": 3.5563926434744076e-05, "loss": 2.511, "mean_token_accuracy": 0.3965517282485962, "step": 35310 }, { "epoch": 0.0355696430747723, "grad_norm": 17.419314818900872, "learning_rate": 3.5568962391475135e-05, "loss": 2.3176, "mean_token_accuracy": 0.43103447556495667, "step": 35315 }, { "epoch": 0.035574679127876464, "grad_norm": 16.95778330627382, "learning_rate": 3.5573998348206195e-05, "loss": 2.7032, "mean_token_accuracy": 0.4034482717514038, "step": 35320 }, { "epoch": 0.03557971518098064, "grad_norm": 16.69115441368345, "learning_rate": 3.5579034304937254e-05, "loss": 2.5185, "mean_token_accuracy": 0.44984874725341795, "step": 35325 }, { "epoch": 0.03558475123408481, "grad_norm": 11.871965701534956, "learning_rate": 3.558407026166831e-05, "loss": 2.5165, "mean_token_accuracy": 0.4137930989265442, "step": 35330 }, { "epoch": 0.035589787287188986, "grad_norm": 12.262484628889235, "learning_rate": 3.558910621839937e-05, "loss": 2.2712, "mean_token_accuracy": 0.45517241954803467, "step": 35335 }, { "epoch": 0.03559482334029316, "grad_norm": 14.97926085963146, "learning_rate": 3.559414217513043e-05, "loss": 2.7502, "mean_token_accuracy": 0.3965517282485962, "step": 35340 }, { "epoch": 0.03559985939339733, "grad_norm": 18.535705531532795, "learning_rate": 3.559917813186149e-05, "loss": 2.4407, "mean_token_accuracy": 0.41034482717514037, "step": 35345 }, { "epoch": 0.03560489544650151, "grad_norm": 15.634884967722432, "learning_rate": 3.560421408859255e-05, "loss": 2.3135, "mean_token_accuracy": 0.41724138259887694, "step": 35350 }, { "epoch": 0.035609931499605674, "grad_norm": 13.031608692625467, "learning_rate": 3.560925004532361e-05, "loss": 2.5981, "mean_token_accuracy": 0.42068966031074523, "step": 35355 }, { "epoch": 0.03561496755270985, "grad_norm": 18.14733925359142, "learning_rate": 3.561428600205467e-05, "loss": 2.8859, "mean_token_accuracy": 0.3793103456497192, "step": 35360 }, { "epoch": 0.03562000360581402, "grad_norm": 16.374472706408568, "learning_rate": 3.5619321958785735e-05, "loss": 2.473, "mean_token_accuracy": 0.4310344815254211, "step": 35365 }, { "epoch": 0.035625039658918195, "grad_norm": 23.510849869616205, "learning_rate": 3.5624357915516794e-05, "loss": 2.9885, "mean_token_accuracy": 0.34482758641242983, "step": 35370 }, { "epoch": 0.03563007571202237, "grad_norm": 14.792527617906579, "learning_rate": 3.562939387224785e-05, "loss": 2.6941, "mean_token_accuracy": 0.4172413766384125, "step": 35375 }, { "epoch": 0.03563511176512654, "grad_norm": 15.468006708389817, "learning_rate": 3.563442982897891e-05, "loss": 2.4908, "mean_token_accuracy": 0.4103448212146759, "step": 35380 }, { "epoch": 0.035640147818230716, "grad_norm": 14.71705712569485, "learning_rate": 3.563946578570997e-05, "loss": 2.5292, "mean_token_accuracy": 0.39310344457626345, "step": 35385 }, { "epoch": 0.03564518387133488, "grad_norm": 14.688470590413813, "learning_rate": 3.564450174244103e-05, "loss": 2.7157, "mean_token_accuracy": 0.4103448212146759, "step": 35390 }, { "epoch": 0.03565021992443906, "grad_norm": 11.901411495723053, "learning_rate": 3.564953769917209e-05, "loss": 2.8455, "mean_token_accuracy": 0.41923774480819703, "step": 35395 }, { "epoch": 0.03565525597754323, "grad_norm": 19.39408417714081, "learning_rate": 3.565457365590315e-05, "loss": 2.8126, "mean_token_accuracy": 0.33103448152542114, "step": 35400 }, { "epoch": 0.035660292030647404, "grad_norm": 16.166699125198168, "learning_rate": 3.565960961263421e-05, "loss": 2.3968, "mean_token_accuracy": 0.39310344457626345, "step": 35405 }, { "epoch": 0.03566532808375158, "grad_norm": 11.622290459096908, "learning_rate": 3.566464556936527e-05, "loss": 2.1377, "mean_token_accuracy": 0.4103448212146759, "step": 35410 }, { "epoch": 0.03567036413685575, "grad_norm": 15.372122823406604, "learning_rate": 3.566968152609633e-05, "loss": 2.3865, "mean_token_accuracy": 0.4323653936386108, "step": 35415 }, { "epoch": 0.035675400189959926, "grad_norm": 17.5870721417988, "learning_rate": 3.567471748282739e-05, "loss": 2.515, "mean_token_accuracy": 0.42068964838981626, "step": 35420 }, { "epoch": 0.03568043624306409, "grad_norm": 14.796916261664762, "learning_rate": 3.567975343955845e-05, "loss": 2.3273, "mean_token_accuracy": 0.46551724076271056, "step": 35425 }, { "epoch": 0.035685472296168266, "grad_norm": 16.627819932407505, "learning_rate": 3.568478939628951e-05, "loss": 2.7484, "mean_token_accuracy": 0.42758620381355283, "step": 35430 }, { "epoch": 0.03569050834927244, "grad_norm": 15.242096179375515, "learning_rate": 3.5689825353020564e-05, "loss": 2.2638, "mean_token_accuracy": 0.4448275864124298, "step": 35435 }, { "epoch": 0.035695544402376614, "grad_norm": 13.796157116777385, "learning_rate": 3.5694861309751624e-05, "loss": 3.0278, "mean_token_accuracy": 0.3482758581638336, "step": 35440 }, { "epoch": 0.03570058045548079, "grad_norm": 16.072140421760455, "learning_rate": 3.569989726648269e-05, "loss": 2.6115, "mean_token_accuracy": 0.38275861740112305, "step": 35445 }, { "epoch": 0.03570561650858496, "grad_norm": 16.774082359754804, "learning_rate": 3.570493322321375e-05, "loss": 2.2743, "mean_token_accuracy": 0.42068966031074523, "step": 35450 }, { "epoch": 0.035710652561689135, "grad_norm": 13.176571392850427, "learning_rate": 3.570996917994481e-05, "loss": 2.5807, "mean_token_accuracy": 0.4034482777118683, "step": 35455 }, { "epoch": 0.0357156886147933, "grad_norm": 19.210897493015615, "learning_rate": 3.571500513667587e-05, "loss": 2.565, "mean_token_accuracy": 0.41379311084747317, "step": 35460 }, { "epoch": 0.035720724667897476, "grad_norm": 15.797181810095896, "learning_rate": 3.5720041093406927e-05, "loss": 2.7047, "mean_token_accuracy": 0.40689654350280763, "step": 35465 }, { "epoch": 0.03572576072100165, "grad_norm": 14.169786013140444, "learning_rate": 3.572507705013799e-05, "loss": 2.4288, "mean_token_accuracy": 0.46896551847457885, "step": 35470 }, { "epoch": 0.03573079677410582, "grad_norm": 14.403208626634898, "learning_rate": 3.5730113006869045e-05, "loss": 2.9215, "mean_token_accuracy": 0.3551724076271057, "step": 35475 }, { "epoch": 0.03573583282721, "grad_norm": 17.32313323761732, "learning_rate": 3.5735148963600104e-05, "loss": 2.5442, "mean_token_accuracy": 0.4379310369491577, "step": 35480 }, { "epoch": 0.03574086888031417, "grad_norm": 15.908719857189077, "learning_rate": 3.5740184920331164e-05, "loss": 2.4375, "mean_token_accuracy": 0.4206896543502808, "step": 35485 }, { "epoch": 0.035745904933418345, "grad_norm": 14.23529070873796, "learning_rate": 3.574522087706222e-05, "loss": 2.5678, "mean_token_accuracy": 0.41379310488700866, "step": 35490 }, { "epoch": 0.03575094098652251, "grad_norm": 14.05360682853102, "learning_rate": 3.575025683379328e-05, "loss": 2.1873, "mean_token_accuracy": 0.4172413766384125, "step": 35495 }, { "epoch": 0.035755977039626685, "grad_norm": 20.427118954821168, "learning_rate": 3.575529279052435e-05, "loss": 2.8419, "mean_token_accuracy": 0.38965516686439516, "step": 35500 }, { "epoch": 0.03576101309273086, "grad_norm": 18.399676560837694, "learning_rate": 3.576032874725541e-05, "loss": 2.4122, "mean_token_accuracy": 0.43290985822677613, "step": 35505 }, { "epoch": 0.03576604914583503, "grad_norm": 13.685914706727317, "learning_rate": 3.576536470398647e-05, "loss": 2.3069, "mean_token_accuracy": 0.43448275327682495, "step": 35510 }, { "epoch": 0.035771085198939206, "grad_norm": 12.876838638126735, "learning_rate": 3.5770400660717526e-05, "loss": 2.4114, "mean_token_accuracy": 0.45862069725990295, "step": 35515 }, { "epoch": 0.03577612125204338, "grad_norm": 17.918828232646362, "learning_rate": 3.5775436617448585e-05, "loss": 2.713, "mean_token_accuracy": 0.3965517163276672, "step": 35520 }, { "epoch": 0.035781157305147554, "grad_norm": 15.185139507452968, "learning_rate": 3.5780472574179644e-05, "loss": 2.5344, "mean_token_accuracy": 0.40344828367233276, "step": 35525 }, { "epoch": 0.03578619335825172, "grad_norm": 16.28109091498346, "learning_rate": 3.5785508530910704e-05, "loss": 2.3429, "mean_token_accuracy": 0.4551724076271057, "step": 35530 }, { "epoch": 0.035791229411355895, "grad_norm": 18.947028597711554, "learning_rate": 3.579054448764176e-05, "loss": 2.3144, "mean_token_accuracy": 0.4379310369491577, "step": 35535 }, { "epoch": 0.03579626546446007, "grad_norm": 15.371822247827776, "learning_rate": 3.579558044437282e-05, "loss": 2.2236, "mean_token_accuracy": 0.4551724076271057, "step": 35540 }, { "epoch": 0.03580130151756424, "grad_norm": 14.49687517519227, "learning_rate": 3.580061640110388e-05, "loss": 2.351, "mean_token_accuracy": 0.4206896543502808, "step": 35545 }, { "epoch": 0.035806337570668416, "grad_norm": 25.636853136937354, "learning_rate": 3.580565235783495e-05, "loss": 2.6554, "mean_token_accuracy": 0.34137930572032926, "step": 35550 }, { "epoch": 0.03581137362377259, "grad_norm": 17.311992290896285, "learning_rate": 3.581068831456601e-05, "loss": 2.8273, "mean_token_accuracy": 0.4154264986515045, "step": 35555 }, { "epoch": 0.03581640967687676, "grad_norm": 13.910138230425096, "learning_rate": 3.5815724271297066e-05, "loss": 2.566, "mean_token_accuracy": 0.4011494338512421, "step": 35560 }, { "epoch": 0.03582144572998093, "grad_norm": 13.69434489961823, "learning_rate": 3.5820760228028125e-05, "loss": 2.4349, "mean_token_accuracy": 0.4482758462429047, "step": 35565 }, { "epoch": 0.035826481783085104, "grad_norm": 18.490324924281, "learning_rate": 3.582579618475918e-05, "loss": 2.9352, "mean_token_accuracy": 0.37241379618644715, "step": 35570 }, { "epoch": 0.03583151783618928, "grad_norm": 12.793083924396608, "learning_rate": 3.583083214149024e-05, "loss": 2.5553, "mean_token_accuracy": 0.4068965494632721, "step": 35575 }, { "epoch": 0.03583655388929345, "grad_norm": 15.210389993425762, "learning_rate": 3.58358680982213e-05, "loss": 2.5775, "mean_token_accuracy": 0.37707199454307555, "step": 35580 }, { "epoch": 0.035841589942397625, "grad_norm": 15.20890508494618, "learning_rate": 3.584090405495236e-05, "loss": 2.5868, "mean_token_accuracy": 0.41034482717514037, "step": 35585 }, { "epoch": 0.0358466259955018, "grad_norm": 13.381917511052428, "learning_rate": 3.584594001168342e-05, "loss": 2.9119, "mean_token_accuracy": 0.38275861740112305, "step": 35590 }, { "epoch": 0.03585166204860597, "grad_norm": 12.493109191625829, "learning_rate": 3.585097596841448e-05, "loss": 2.4386, "mean_token_accuracy": 0.38275861740112305, "step": 35595 }, { "epoch": 0.03585669810171014, "grad_norm": 17.824904916444197, "learning_rate": 3.585601192514554e-05, "loss": 2.5532, "mean_token_accuracy": 0.41554749608039854, "step": 35600 }, { "epoch": 0.03586173415481431, "grad_norm": 15.570239825143327, "learning_rate": 3.5861047881876606e-05, "loss": 2.5145, "mean_token_accuracy": 0.41034482717514037, "step": 35605 }, { "epoch": 0.03586677020791849, "grad_norm": 13.622352660830135, "learning_rate": 3.586608383860766e-05, "loss": 2.5926, "mean_token_accuracy": 0.40689654350280763, "step": 35610 }, { "epoch": 0.03587180626102266, "grad_norm": 12.536057487095968, "learning_rate": 3.587111979533872e-05, "loss": 2.7633, "mean_token_accuracy": 0.3896551698446274, "step": 35615 }, { "epoch": 0.035876842314126835, "grad_norm": 12.958877525157792, "learning_rate": 3.587615575206978e-05, "loss": 2.6776, "mean_token_accuracy": 0.45517241954803467, "step": 35620 }, { "epoch": 0.03588187836723101, "grad_norm": 16.389540740193812, "learning_rate": 3.5881191708800836e-05, "loss": 2.6803, "mean_token_accuracy": 0.3827586233615875, "step": 35625 }, { "epoch": 0.03588691442033518, "grad_norm": 17.51925880213451, "learning_rate": 3.58862276655319e-05, "loss": 2.5911, "mean_token_accuracy": 0.4482758641242981, "step": 35630 }, { "epoch": 0.03589195047343935, "grad_norm": 12.673485753549853, "learning_rate": 3.589126362226296e-05, "loss": 2.6774, "mean_token_accuracy": 0.40344828367233276, "step": 35635 }, { "epoch": 0.03589698652654352, "grad_norm": 14.917715377498274, "learning_rate": 3.589629957899402e-05, "loss": 2.6512, "mean_token_accuracy": 0.3344827562570572, "step": 35640 }, { "epoch": 0.0359020225796477, "grad_norm": 13.720853268146719, "learning_rate": 3.590133553572508e-05, "loss": 2.4033, "mean_token_accuracy": 0.42758620977401735, "step": 35645 }, { "epoch": 0.03590705863275187, "grad_norm": 12.609524471901185, "learning_rate": 3.590637149245614e-05, "loss": 2.4937, "mean_token_accuracy": 0.4620689630508423, "step": 35650 }, { "epoch": 0.035912094685856044, "grad_norm": 15.501070616356776, "learning_rate": 3.59114074491872e-05, "loss": 2.3365, "mean_token_accuracy": 0.41724138259887694, "step": 35655 }, { "epoch": 0.03591713073896022, "grad_norm": 14.93177713409494, "learning_rate": 3.591644340591826e-05, "loss": 2.5277, "mean_token_accuracy": 0.3999999940395355, "step": 35660 }, { "epoch": 0.03592216679206439, "grad_norm": 19.790333414116663, "learning_rate": 3.592147936264932e-05, "loss": 2.6167, "mean_token_accuracy": 0.41379310488700866, "step": 35665 }, { "epoch": 0.03592720284516856, "grad_norm": 15.731223545668247, "learning_rate": 3.5926515319380376e-05, "loss": 2.6356, "mean_token_accuracy": 0.3896551728248596, "step": 35670 }, { "epoch": 0.03593223889827273, "grad_norm": 17.809563326580847, "learning_rate": 3.5931551276111436e-05, "loss": 2.8249, "mean_token_accuracy": 0.341379314661026, "step": 35675 }, { "epoch": 0.035937274951376906, "grad_norm": 18.292534540978913, "learning_rate": 3.5936587232842495e-05, "loss": 2.7546, "mean_token_accuracy": 0.4206896543502808, "step": 35680 }, { "epoch": 0.03594231100448108, "grad_norm": 17.084287701479095, "learning_rate": 3.594162318957356e-05, "loss": 2.5702, "mean_token_accuracy": 0.41379310488700866, "step": 35685 }, { "epoch": 0.035947347057585254, "grad_norm": 15.509917716608, "learning_rate": 3.594665914630462e-05, "loss": 2.3557, "mean_token_accuracy": 0.44482758045196535, "step": 35690 }, { "epoch": 0.03595238311068943, "grad_norm": 13.82853505337028, "learning_rate": 3.595169510303568e-05, "loss": 2.3411, "mean_token_accuracy": 0.4103448331356049, "step": 35695 }, { "epoch": 0.0359574191637936, "grad_norm": 16.51209833487878, "learning_rate": 3.595673105976673e-05, "loss": 2.3065, "mean_token_accuracy": 0.41724138259887694, "step": 35700 }, { "epoch": 0.03596245521689777, "grad_norm": 15.547442680429047, "learning_rate": 3.596176701649779e-05, "loss": 2.7701, "mean_token_accuracy": 0.3862069010734558, "step": 35705 }, { "epoch": 0.03596749127000194, "grad_norm": 47.135882463260295, "learning_rate": 3.596680297322886e-05, "loss": 2.9303, "mean_token_accuracy": 0.37241379022598264, "step": 35710 }, { "epoch": 0.035972527323106115, "grad_norm": 19.794887651527695, "learning_rate": 3.5971838929959916e-05, "loss": 2.8959, "mean_token_accuracy": 0.34137930870056155, "step": 35715 }, { "epoch": 0.03597756337621029, "grad_norm": 14.356133745325254, "learning_rate": 3.5976874886690976e-05, "loss": 2.811, "mean_token_accuracy": 0.3551724195480347, "step": 35720 }, { "epoch": 0.03598259942931446, "grad_norm": 13.27706680193558, "learning_rate": 3.5981910843422035e-05, "loss": 2.3341, "mean_token_accuracy": 0.44827585220336913, "step": 35725 }, { "epoch": 0.03598763548241864, "grad_norm": 11.228835141586131, "learning_rate": 3.5986946800153094e-05, "loss": 2.3359, "mean_token_accuracy": 0.43793103098869324, "step": 35730 }, { "epoch": 0.03599267153552281, "grad_norm": 12.189650152744777, "learning_rate": 3.599198275688416e-05, "loss": 2.5131, "mean_token_accuracy": 0.4, "step": 35735 }, { "epoch": 0.03599770758862698, "grad_norm": 19.267046518760324, "learning_rate": 3.599701871361521e-05, "loss": 2.7015, "mean_token_accuracy": 0.34137930870056155, "step": 35740 }, { "epoch": 0.03600274364173115, "grad_norm": 14.870453562178106, "learning_rate": 3.600205467034627e-05, "loss": 2.5435, "mean_token_accuracy": 0.3620689630508423, "step": 35745 }, { "epoch": 0.036007779694835325, "grad_norm": 14.618246072353006, "learning_rate": 3.600709062707733e-05, "loss": 2.4896, "mean_token_accuracy": 0.38106473684310915, "step": 35750 }, { "epoch": 0.0360128157479395, "grad_norm": 15.901296860817029, "learning_rate": 3.601212658380839e-05, "loss": 2.5075, "mean_token_accuracy": 0.39655172228813174, "step": 35755 }, { "epoch": 0.03601785180104367, "grad_norm": 15.575862476953128, "learning_rate": 3.601716254053945e-05, "loss": 2.8814, "mean_token_accuracy": 0.334482753276825, "step": 35760 }, { "epoch": 0.036022887854147846, "grad_norm": 14.39027230414355, "learning_rate": 3.6022198497270516e-05, "loss": 2.3425, "mean_token_accuracy": 0.43448275327682495, "step": 35765 }, { "epoch": 0.03602792390725202, "grad_norm": 18.875038085220268, "learning_rate": 3.6027234454001575e-05, "loss": 2.7167, "mean_token_accuracy": 0.3724137872457504, "step": 35770 }, { "epoch": 0.03603295996035619, "grad_norm": 14.022362398138073, "learning_rate": 3.6032270410732634e-05, "loss": 2.3131, "mean_token_accuracy": 0.43793103098869324, "step": 35775 }, { "epoch": 0.03603799601346036, "grad_norm": 13.198568676296196, "learning_rate": 3.603730636746369e-05, "loss": 2.5847, "mean_token_accuracy": 0.4344827592372894, "step": 35780 }, { "epoch": 0.036043032066564534, "grad_norm": 16.359619141831356, "learning_rate": 3.604234232419475e-05, "loss": 2.2021, "mean_token_accuracy": 0.4482758641242981, "step": 35785 }, { "epoch": 0.03604806811966871, "grad_norm": 15.17237985572359, "learning_rate": 3.604737828092581e-05, "loss": 2.6876, "mean_token_accuracy": 0.32758620381355286, "step": 35790 }, { "epoch": 0.03605310417277288, "grad_norm": 17.46830585619335, "learning_rate": 3.605241423765687e-05, "loss": 2.7893, "mean_token_accuracy": 0.3517241358757019, "step": 35795 }, { "epoch": 0.036058140225877056, "grad_norm": 11.081676417264928, "learning_rate": 3.605745019438793e-05, "loss": 2.5076, "mean_token_accuracy": 0.37241379022598264, "step": 35800 }, { "epoch": 0.03606317627898123, "grad_norm": 17.246101171400483, "learning_rate": 3.606248615111899e-05, "loss": 2.3044, "mean_token_accuracy": 0.4586206912994385, "step": 35805 }, { "epoch": 0.036068212332085396, "grad_norm": 17.511476155750902, "learning_rate": 3.606752210785005e-05, "loss": 2.6091, "mean_token_accuracy": 0.4186932861804962, "step": 35810 }, { "epoch": 0.03607324838518957, "grad_norm": 32.674096194830184, "learning_rate": 3.6072558064581115e-05, "loss": 3.1546, "mean_token_accuracy": 0.34137930870056155, "step": 35815 }, { "epoch": 0.036078284438293744, "grad_norm": 19.707316532075957, "learning_rate": 3.6077594021312174e-05, "loss": 2.9366, "mean_token_accuracy": 0.3793103456497192, "step": 35820 }, { "epoch": 0.03608332049139792, "grad_norm": 13.387358123766912, "learning_rate": 3.6082629978043233e-05, "loss": 2.3367, "mean_token_accuracy": 0.44482759237289426, "step": 35825 }, { "epoch": 0.03608835654450209, "grad_norm": 17.429435847706067, "learning_rate": 3.608766593477429e-05, "loss": 2.4156, "mean_token_accuracy": 0.441379314661026, "step": 35830 }, { "epoch": 0.036093392597606265, "grad_norm": 15.6860832924542, "learning_rate": 3.6092701891505345e-05, "loss": 2.2817, "mean_token_accuracy": 0.42413793206214906, "step": 35835 }, { "epoch": 0.03609842865071044, "grad_norm": 29.688036388874316, "learning_rate": 3.6097737848236404e-05, "loss": 2.7454, "mean_token_accuracy": 0.36551723778247835, "step": 35840 }, { "epoch": 0.036103464703814606, "grad_norm": 15.220022821820365, "learning_rate": 3.610277380496747e-05, "loss": 2.4929, "mean_token_accuracy": 0.34482758343219755, "step": 35845 }, { "epoch": 0.03610850075691878, "grad_norm": 15.242905750411836, "learning_rate": 3.610780976169853e-05, "loss": 2.035, "mean_token_accuracy": 0.44827585816383364, "step": 35850 }, { "epoch": 0.03611353681002295, "grad_norm": 12.622525167184936, "learning_rate": 3.611284571842959e-05, "loss": 2.5185, "mean_token_accuracy": 0.43103448748588563, "step": 35855 }, { "epoch": 0.03611857286312713, "grad_norm": 15.561342733826251, "learning_rate": 3.611788167516065e-05, "loss": 2.3952, "mean_token_accuracy": 0.4172413766384125, "step": 35860 }, { "epoch": 0.0361236089162313, "grad_norm": 14.808678748066189, "learning_rate": 3.612291763189171e-05, "loss": 2.8236, "mean_token_accuracy": 0.3999999940395355, "step": 35865 }, { "epoch": 0.036128644969335474, "grad_norm": 12.96414199813315, "learning_rate": 3.6127953588622773e-05, "loss": 2.3862, "mean_token_accuracy": 0.4137930929660797, "step": 35870 }, { "epoch": 0.03613368102243965, "grad_norm": 14.804641055529913, "learning_rate": 3.6132989545353826e-05, "loss": 2.8642, "mean_token_accuracy": 0.3896551728248596, "step": 35875 }, { "epoch": 0.036138717075543815, "grad_norm": 15.46684701304198, "learning_rate": 3.6138025502084885e-05, "loss": 2.5977, "mean_token_accuracy": 0.37931033968925476, "step": 35880 }, { "epoch": 0.03614375312864799, "grad_norm": 16.907079577522516, "learning_rate": 3.6143061458815944e-05, "loss": 2.6537, "mean_token_accuracy": 0.4206896543502808, "step": 35885 }, { "epoch": 0.03614878918175216, "grad_norm": 17.363303991893545, "learning_rate": 3.6148097415547004e-05, "loss": 2.2237, "mean_token_accuracy": 0.5146400570869446, "step": 35890 }, { "epoch": 0.036153825234856336, "grad_norm": 17.480863184396082, "learning_rate": 3.615313337227807e-05, "loss": 2.7495, "mean_token_accuracy": 0.37241379618644715, "step": 35895 }, { "epoch": 0.03615886128796051, "grad_norm": 31.941424221703727, "learning_rate": 3.615816932900913e-05, "loss": 2.4973, "mean_token_accuracy": 0.4517241299152374, "step": 35900 }, { "epoch": 0.036163897341064684, "grad_norm": 14.23777291827755, "learning_rate": 3.616320528574019e-05, "loss": 2.6203, "mean_token_accuracy": 0.42758620977401735, "step": 35905 }, { "epoch": 0.03616893339416886, "grad_norm": 15.67736916485311, "learning_rate": 3.616824124247125e-05, "loss": 2.823, "mean_token_accuracy": 0.3620689630508423, "step": 35910 }, { "epoch": 0.036173969447273024, "grad_norm": 18.786597366271845, "learning_rate": 3.617327719920231e-05, "loss": 2.9431, "mean_token_accuracy": 0.40344828367233276, "step": 35915 }, { "epoch": 0.0361790055003772, "grad_norm": 13.235391011107163, "learning_rate": 3.6178313155933366e-05, "loss": 2.5783, "mean_token_accuracy": 0.3965517282485962, "step": 35920 }, { "epoch": 0.03618404155348137, "grad_norm": 16.089551166087016, "learning_rate": 3.6183349112664425e-05, "loss": 2.4544, "mean_token_accuracy": 0.42758620381355283, "step": 35925 }, { "epoch": 0.036189077606585546, "grad_norm": 11.968365440762746, "learning_rate": 3.6188385069395485e-05, "loss": 2.5632, "mean_token_accuracy": 0.37241379022598264, "step": 35930 }, { "epoch": 0.03619411365968972, "grad_norm": 15.917514254278657, "learning_rate": 3.6193421026126544e-05, "loss": 2.5223, "mean_token_accuracy": 0.4448275864124298, "step": 35935 }, { "epoch": 0.03619914971279389, "grad_norm": 15.30218224352409, "learning_rate": 3.61984569828576e-05, "loss": 2.5706, "mean_token_accuracy": 0.42165759205818176, "step": 35940 }, { "epoch": 0.03620418576589807, "grad_norm": 13.605617111862836, "learning_rate": 3.620349293958866e-05, "loss": 2.619, "mean_token_accuracy": 0.3848759800195694, "step": 35945 }, { "epoch": 0.036209221819002234, "grad_norm": 13.452115947324405, "learning_rate": 3.620852889631973e-05, "loss": 2.619, "mean_token_accuracy": 0.3793103456497192, "step": 35950 }, { "epoch": 0.03621425787210641, "grad_norm": 14.650236371455364, "learning_rate": 3.621356485305079e-05, "loss": 2.5377, "mean_token_accuracy": 0.4000000059604645, "step": 35955 }, { "epoch": 0.03621929392521058, "grad_norm": 13.849238041885808, "learning_rate": 3.621860080978185e-05, "loss": 2.3568, "mean_token_accuracy": 0.4154869973659515, "step": 35960 }, { "epoch": 0.036224329978314755, "grad_norm": 18.751799276184475, "learning_rate": 3.6223636766512906e-05, "loss": 2.3606, "mean_token_accuracy": 0.41034482419490814, "step": 35965 }, { "epoch": 0.03622936603141893, "grad_norm": 13.095282157229866, "learning_rate": 3.622867272324396e-05, "loss": 2.234, "mean_token_accuracy": 0.4034482717514038, "step": 35970 }, { "epoch": 0.0362344020845231, "grad_norm": 12.67767159801466, "learning_rate": 3.6233708679975025e-05, "loss": 2.3487, "mean_token_accuracy": 0.4448275864124298, "step": 35975 }, { "epoch": 0.036239438137627276, "grad_norm": 15.397382634031622, "learning_rate": 3.6238744636706084e-05, "loss": 2.5323, "mean_token_accuracy": 0.4034482777118683, "step": 35980 }, { "epoch": 0.03624447419073144, "grad_norm": 18.535575416031605, "learning_rate": 3.624378059343714e-05, "loss": 2.4384, "mean_token_accuracy": 0.5041871905326843, "step": 35985 }, { "epoch": 0.03624951024383562, "grad_norm": 17.667832184653868, "learning_rate": 3.62488165501682e-05, "loss": 2.6409, "mean_token_accuracy": 0.4137930989265442, "step": 35990 }, { "epoch": 0.03625454629693979, "grad_norm": 15.9274282836938, "learning_rate": 3.625385250689926e-05, "loss": 2.6306, "mean_token_accuracy": 0.3896551728248596, "step": 35995 }, { "epoch": 0.036259582350043965, "grad_norm": 16.615355871870868, "learning_rate": 3.625888846363033e-05, "loss": 2.3913, "mean_token_accuracy": 0.39310344457626345, "step": 36000 }, { "epoch": 0.03626461840314814, "grad_norm": 14.44093224721498, "learning_rate": 3.626392442036139e-05, "loss": 2.3716, "mean_token_accuracy": 0.4, "step": 36005 }, { "epoch": 0.03626965445625231, "grad_norm": 14.68171824857839, "learning_rate": 3.626896037709244e-05, "loss": 2.6463, "mean_token_accuracy": 0.3896551698446274, "step": 36010 }, { "epoch": 0.036274690509356486, "grad_norm": 18.59762143557379, "learning_rate": 3.62739963338235e-05, "loss": 2.3726, "mean_token_accuracy": 0.4551724135875702, "step": 36015 }, { "epoch": 0.03627972656246065, "grad_norm": 18.59015039302462, "learning_rate": 3.627903229055456e-05, "loss": 2.6263, "mean_token_accuracy": 0.4068965494632721, "step": 36020 }, { "epoch": 0.036284762615564826, "grad_norm": 14.565903595795094, "learning_rate": 3.628406824728562e-05, "loss": 2.478, "mean_token_accuracy": 0.41379310190677643, "step": 36025 }, { "epoch": 0.036289798668669, "grad_norm": 14.991762898592958, "learning_rate": 3.628910420401668e-05, "loss": 2.4585, "mean_token_accuracy": 0.4224440395832062, "step": 36030 }, { "epoch": 0.036294834721773174, "grad_norm": 14.522640122472096, "learning_rate": 3.629414016074774e-05, "loss": 2.355, "mean_token_accuracy": 0.43103447556495667, "step": 36035 }, { "epoch": 0.03629987077487735, "grad_norm": 15.520254692439453, "learning_rate": 3.62991761174788e-05, "loss": 2.9153, "mean_token_accuracy": 0.37586207389831544, "step": 36040 }, { "epoch": 0.03630490682798152, "grad_norm": 14.91854509133642, "learning_rate": 3.630421207420986e-05, "loss": 2.6491, "mean_token_accuracy": 0.4, "step": 36045 }, { "epoch": 0.036309942881085695, "grad_norm": 14.574495053276056, "learning_rate": 3.630924803094092e-05, "loss": 2.32, "mean_token_accuracy": 0.44137930274009707, "step": 36050 }, { "epoch": 0.03631497893418986, "grad_norm": 14.746171790010926, "learning_rate": 3.631428398767198e-05, "loss": 2.5833, "mean_token_accuracy": 0.37241379618644715, "step": 36055 }, { "epoch": 0.036320014987294036, "grad_norm": 13.121010824546477, "learning_rate": 3.631931994440304e-05, "loss": 2.6651, "mean_token_accuracy": 0.38620689809322356, "step": 36060 }, { "epoch": 0.03632505104039821, "grad_norm": 14.402146624586079, "learning_rate": 3.63243559011341e-05, "loss": 2.8832, "mean_token_accuracy": 0.35862069129943847, "step": 36065 }, { "epoch": 0.03633008709350238, "grad_norm": 17.299694859136945, "learning_rate": 3.632939185786516e-05, "loss": 2.5805, "mean_token_accuracy": 0.4, "step": 36070 }, { "epoch": 0.03633512314660656, "grad_norm": 15.853195381210904, "learning_rate": 3.6334427814596216e-05, "loss": 2.6807, "mean_token_accuracy": 0.4034482717514038, "step": 36075 }, { "epoch": 0.03634015919971073, "grad_norm": 8.66524039649621, "learning_rate": 3.633946377132728e-05, "loss": 2.3307, "mean_token_accuracy": 0.4631773352622986, "step": 36080 }, { "epoch": 0.036345195252814905, "grad_norm": 16.2393771521022, "learning_rate": 3.634449972805834e-05, "loss": 2.7093, "mean_token_accuracy": 0.3793103456497192, "step": 36085 }, { "epoch": 0.03635023130591907, "grad_norm": 14.447030893583047, "learning_rate": 3.63495356847894e-05, "loss": 2.6829, "mean_token_accuracy": 0.3862069010734558, "step": 36090 }, { "epoch": 0.036355267359023245, "grad_norm": 19.181049367800277, "learning_rate": 3.635457164152046e-05, "loss": 2.8938, "mean_token_accuracy": 0.37241379022598264, "step": 36095 }, { "epoch": 0.03636030341212742, "grad_norm": 13.578190266651461, "learning_rate": 3.635960759825152e-05, "loss": 2.6843, "mean_token_accuracy": 0.36206896901130675, "step": 36100 }, { "epoch": 0.03636533946523159, "grad_norm": 14.615518215360693, "learning_rate": 3.636464355498257e-05, "loss": 2.8081, "mean_token_accuracy": 0.3827586233615875, "step": 36105 }, { "epoch": 0.03637037551833577, "grad_norm": 15.719146325221544, "learning_rate": 3.636967951171364e-05, "loss": 2.4677, "mean_token_accuracy": 0.4068965494632721, "step": 36110 }, { "epoch": 0.03637541157143994, "grad_norm": 16.92111035466899, "learning_rate": 3.63747154684447e-05, "loss": 2.6666, "mean_token_accuracy": 0.37241379022598264, "step": 36115 }, { "epoch": 0.036380447624544114, "grad_norm": 28.581161192996884, "learning_rate": 3.6379751425175756e-05, "loss": 2.6894, "mean_token_accuracy": 0.3655172407627106, "step": 36120 }, { "epoch": 0.03638548367764828, "grad_norm": 13.837011062664848, "learning_rate": 3.6384787381906816e-05, "loss": 2.3826, "mean_token_accuracy": 0.43448275327682495, "step": 36125 }, { "epoch": 0.036390519730752455, "grad_norm": 16.073770594099287, "learning_rate": 3.6389823338637875e-05, "loss": 2.2991, "mean_token_accuracy": 0.4448275864124298, "step": 36130 }, { "epoch": 0.03639555578385663, "grad_norm": 17.171784335485533, "learning_rate": 3.639485929536894e-05, "loss": 2.6711, "mean_token_accuracy": 0.36896551847457887, "step": 36135 }, { "epoch": 0.0364005918369608, "grad_norm": 20.97024408643152, "learning_rate": 3.63998952521e-05, "loss": 2.4862, "mean_token_accuracy": 0.36896551847457887, "step": 36140 }, { "epoch": 0.036405627890064976, "grad_norm": 15.290655460565825, "learning_rate": 3.640493120883105e-05, "loss": 2.9559, "mean_token_accuracy": 0.3655172407627106, "step": 36145 }, { "epoch": 0.03641066394316915, "grad_norm": 12.990436795360878, "learning_rate": 3.640996716556211e-05, "loss": 2.5069, "mean_token_accuracy": 0.37241379618644715, "step": 36150 }, { "epoch": 0.036415699996273324, "grad_norm": 16.559496268266315, "learning_rate": 3.641500312229317e-05, "loss": 2.6801, "mean_token_accuracy": 0.37241379022598264, "step": 36155 }, { "epoch": 0.03642073604937749, "grad_norm": 16.427517298038882, "learning_rate": 3.642003907902424e-05, "loss": 2.1478, "mean_token_accuracy": 0.4034482717514038, "step": 36160 }, { "epoch": 0.036425772102481664, "grad_norm": 12.08346055587758, "learning_rate": 3.6425075035755297e-05, "loss": 2.5865, "mean_token_accuracy": 0.41379310488700866, "step": 36165 }, { "epoch": 0.03643080815558584, "grad_norm": 17.181908043626212, "learning_rate": 3.6430110992486356e-05, "loss": 2.9156, "mean_token_accuracy": 0.3551724016666412, "step": 36170 }, { "epoch": 0.03643584420869001, "grad_norm": 16.86227056359431, "learning_rate": 3.6435146949217415e-05, "loss": 2.5197, "mean_token_accuracy": 0.4103448212146759, "step": 36175 }, { "epoch": 0.036440880261794185, "grad_norm": 14.590993247296248, "learning_rate": 3.6440182905948474e-05, "loss": 2.5491, "mean_token_accuracy": 0.3620689630508423, "step": 36180 }, { "epoch": 0.03644591631489836, "grad_norm": 34.86349183475435, "learning_rate": 3.6445218862679534e-05, "loss": 2.4711, "mean_token_accuracy": 0.3724137991666794, "step": 36185 }, { "epoch": 0.03645095236800253, "grad_norm": 13.361965798027391, "learning_rate": 3.645025481941059e-05, "loss": 2.3447, "mean_token_accuracy": 0.4241379201412201, "step": 36190 }, { "epoch": 0.0364559884211067, "grad_norm": 14.625573992905954, "learning_rate": 3.645529077614165e-05, "loss": 2.4696, "mean_token_accuracy": 0.3947973370552063, "step": 36195 }, { "epoch": 0.036461024474210874, "grad_norm": 16.121511202696592, "learning_rate": 3.646032673287271e-05, "loss": 2.4182, "mean_token_accuracy": 0.4344827592372894, "step": 36200 }, { "epoch": 0.03646606052731505, "grad_norm": 14.41206580746921, "learning_rate": 3.646536268960377e-05, "loss": 2.2879, "mean_token_accuracy": 0.4655172288417816, "step": 36205 }, { "epoch": 0.03647109658041922, "grad_norm": 16.489156414997574, "learning_rate": 3.647039864633483e-05, "loss": 3.0207, "mean_token_accuracy": 0.32758620381355286, "step": 36210 }, { "epoch": 0.036476132633523395, "grad_norm": 17.25028780882638, "learning_rate": 3.6475434603065896e-05, "loss": 2.8948, "mean_token_accuracy": 0.3137931048870087, "step": 36215 }, { "epoch": 0.03648116868662757, "grad_norm": 17.330294823616452, "learning_rate": 3.6480470559796955e-05, "loss": 2.3841, "mean_token_accuracy": 0.4379310250282288, "step": 36220 }, { "epoch": 0.03648620473973174, "grad_norm": 12.436920029521131, "learning_rate": 3.6485506516528014e-05, "loss": 2.4116, "mean_token_accuracy": 0.4034482777118683, "step": 36225 }, { "epoch": 0.03649124079283591, "grad_norm": 15.573834788462632, "learning_rate": 3.6490542473259074e-05, "loss": 2.6657, "mean_token_accuracy": 0.37241379022598264, "step": 36230 }, { "epoch": 0.03649627684594008, "grad_norm": 16.24419360018926, "learning_rate": 3.6495578429990126e-05, "loss": 2.5524, "mean_token_accuracy": 0.36896551847457887, "step": 36235 }, { "epoch": 0.03650131289904426, "grad_norm": 14.933289594571585, "learning_rate": 3.650061438672119e-05, "loss": 2.367, "mean_token_accuracy": 0.4482758641242981, "step": 36240 }, { "epoch": 0.03650634895214843, "grad_norm": 12.666380885376883, "learning_rate": 3.650565034345225e-05, "loss": 2.4119, "mean_token_accuracy": 0.3982456088066101, "step": 36245 }, { "epoch": 0.036511385005252604, "grad_norm": 13.100292262310145, "learning_rate": 3.651068630018331e-05, "loss": 2.6084, "mean_token_accuracy": 0.38784028887748717, "step": 36250 }, { "epoch": 0.03651642105835678, "grad_norm": 14.347002985750676, "learning_rate": 3.651572225691437e-05, "loss": 2.6537, "mean_token_accuracy": 0.3551724135875702, "step": 36255 }, { "epoch": 0.03652145711146095, "grad_norm": 15.716971561083701, "learning_rate": 3.652075821364543e-05, "loss": 2.6467, "mean_token_accuracy": 0.3551724076271057, "step": 36260 }, { "epoch": 0.03652649316456512, "grad_norm": 15.209175574867688, "learning_rate": 3.652579417037649e-05, "loss": 2.5953, "mean_token_accuracy": 0.4517241358757019, "step": 36265 }, { "epoch": 0.03653152921766929, "grad_norm": 13.446245082686808, "learning_rate": 3.6530830127107554e-05, "loss": 2.2906, "mean_token_accuracy": 0.4103448212146759, "step": 36270 }, { "epoch": 0.036536565270773466, "grad_norm": 15.435067096842143, "learning_rate": 3.653586608383861e-05, "loss": 2.5666, "mean_token_accuracy": 0.3896551728248596, "step": 36275 }, { "epoch": 0.03654160132387764, "grad_norm": 15.966678975802402, "learning_rate": 3.6540902040569666e-05, "loss": 2.6833, "mean_token_accuracy": 0.38620689511299133, "step": 36280 }, { "epoch": 0.036546637376981814, "grad_norm": 16.165294318984767, "learning_rate": 3.6545937997300725e-05, "loss": 2.5684, "mean_token_accuracy": 0.4000000059604645, "step": 36285 }, { "epoch": 0.03655167343008599, "grad_norm": 15.952097192636561, "learning_rate": 3.6550973954031785e-05, "loss": 2.4569, "mean_token_accuracy": 0.4053841412067413, "step": 36290 }, { "epoch": 0.03655670948319016, "grad_norm": 14.402081919933101, "learning_rate": 3.655600991076285e-05, "loss": 2.5428, "mean_token_accuracy": 0.42413793206214906, "step": 36295 }, { "epoch": 0.03656174553629433, "grad_norm": 12.426868957198204, "learning_rate": 3.656104586749391e-05, "loss": 2.8947, "mean_token_accuracy": 0.3896551787853241, "step": 36300 }, { "epoch": 0.0365667815893985, "grad_norm": 15.976688919586836, "learning_rate": 3.656608182422497e-05, "loss": 2.6303, "mean_token_accuracy": 0.3758620619773865, "step": 36305 }, { "epoch": 0.036571817642502676, "grad_norm": 16.111670452577425, "learning_rate": 3.657111778095603e-05, "loss": 2.9224, "mean_token_accuracy": 0.36551723480224607, "step": 36310 }, { "epoch": 0.03657685369560685, "grad_norm": 14.849957117665214, "learning_rate": 3.657615373768709e-05, "loss": 2.7598, "mean_token_accuracy": 0.36896551251411436, "step": 36315 }, { "epoch": 0.03658188974871102, "grad_norm": 14.2569943032712, "learning_rate": 3.658118969441815e-05, "loss": 2.3419, "mean_token_accuracy": 0.4172413766384125, "step": 36320 }, { "epoch": 0.0365869258018152, "grad_norm": 15.727984292075476, "learning_rate": 3.6586225651149206e-05, "loss": 2.8127, "mean_token_accuracy": 0.3517241418361664, "step": 36325 }, { "epoch": 0.03659196185491937, "grad_norm": 14.01803022610442, "learning_rate": 3.6591261607880265e-05, "loss": 2.3206, "mean_token_accuracy": 0.42583181858062746, "step": 36330 }, { "epoch": 0.03659699790802354, "grad_norm": 13.4828261759767, "learning_rate": 3.6596297564611325e-05, "loss": 2.3721, "mean_token_accuracy": 0.4620689630508423, "step": 36335 }, { "epoch": 0.03660203396112771, "grad_norm": 15.834492333253197, "learning_rate": 3.6601333521342384e-05, "loss": 2.4677, "mean_token_accuracy": 0.3896551728248596, "step": 36340 }, { "epoch": 0.036607070014231885, "grad_norm": 15.466322439234206, "learning_rate": 3.660636947807344e-05, "loss": 2.7789, "mean_token_accuracy": 0.38620689511299133, "step": 36345 }, { "epoch": 0.03661210606733606, "grad_norm": 14.374797629620218, "learning_rate": 3.661140543480451e-05, "loss": 3.0346, "mean_token_accuracy": 0.36896551847457887, "step": 36350 }, { "epoch": 0.03661714212044023, "grad_norm": 19.16983048800411, "learning_rate": 3.661644139153557e-05, "loss": 2.4667, "mean_token_accuracy": 0.41034482717514037, "step": 36355 }, { "epoch": 0.036622178173544406, "grad_norm": 12.010314798852168, "learning_rate": 3.662147734826663e-05, "loss": 2.4491, "mean_token_accuracy": 0.4034482717514038, "step": 36360 }, { "epoch": 0.03662721422664858, "grad_norm": 17.007335344439117, "learning_rate": 3.662651330499769e-05, "loss": 2.4829, "mean_token_accuracy": 0.4172413766384125, "step": 36365 }, { "epoch": 0.03663225027975275, "grad_norm": 15.392404324690036, "learning_rate": 3.663154926172874e-05, "loss": 2.7903, "mean_token_accuracy": 0.4103448331356049, "step": 36370 }, { "epoch": 0.03663728633285692, "grad_norm": 16.736351199627077, "learning_rate": 3.6636585218459805e-05, "loss": 2.4089, "mean_token_accuracy": 0.44137930274009707, "step": 36375 }, { "epoch": 0.036642322385961094, "grad_norm": 14.165160510607796, "learning_rate": 3.6641621175190865e-05, "loss": 2.134, "mean_token_accuracy": 0.42413793206214906, "step": 36380 }, { "epoch": 0.03664735843906527, "grad_norm": 10.178440378193777, "learning_rate": 3.6646657131921924e-05, "loss": 2.0906, "mean_token_accuracy": 0.48694581389427183, "step": 36385 }, { "epoch": 0.03665239449216944, "grad_norm": 13.532420242627989, "learning_rate": 3.665169308865298e-05, "loss": 2.5315, "mean_token_accuracy": 0.3620689570903778, "step": 36390 }, { "epoch": 0.036657430545273616, "grad_norm": 12.112287261908916, "learning_rate": 3.665672904538404e-05, "loss": 2.4276, "mean_token_accuracy": 0.4497882604598999, "step": 36395 }, { "epoch": 0.03666246659837779, "grad_norm": 13.824284013189242, "learning_rate": 3.666176500211511e-05, "loss": 2.3524, "mean_token_accuracy": 0.44827585816383364, "step": 36400 }, { "epoch": 0.036667502651481956, "grad_norm": 10.314050921608588, "learning_rate": 3.666680095884617e-05, "loss": 2.4139, "mean_token_accuracy": 0.41034482717514037, "step": 36405 }, { "epoch": 0.03667253870458613, "grad_norm": 14.157524875135051, "learning_rate": 3.667183691557722e-05, "loss": 2.6844, "mean_token_accuracy": 0.3827586233615875, "step": 36410 }, { "epoch": 0.036677574757690304, "grad_norm": 18.127502866511396, "learning_rate": 3.667687287230828e-05, "loss": 3.0694, "mean_token_accuracy": 0.37586207389831544, "step": 36415 }, { "epoch": 0.03668261081079448, "grad_norm": 20.93984412025965, "learning_rate": 3.668190882903934e-05, "loss": 2.2199, "mean_token_accuracy": 0.4586206912994385, "step": 36420 }, { "epoch": 0.03668764686389865, "grad_norm": 14.076807432408701, "learning_rate": 3.6686944785770405e-05, "loss": 2.483, "mean_token_accuracy": 0.37241379022598264, "step": 36425 }, { "epoch": 0.036692682917002825, "grad_norm": 14.034939192178593, "learning_rate": 3.6691980742501464e-05, "loss": 2.6646, "mean_token_accuracy": 0.37586206793785093, "step": 36430 }, { "epoch": 0.036697718970107, "grad_norm": 21.785662007451943, "learning_rate": 3.669701669923252e-05, "loss": 2.3723, "mean_token_accuracy": 0.39999998807907106, "step": 36435 }, { "epoch": 0.036702755023211166, "grad_norm": 16.98498210836726, "learning_rate": 3.670205265596358e-05, "loss": 2.5823, "mean_token_accuracy": 0.4206896543502808, "step": 36440 }, { "epoch": 0.03670779107631534, "grad_norm": 20.36475786215909, "learning_rate": 3.670708861269464e-05, "loss": 2.5037, "mean_token_accuracy": 0.42758620977401735, "step": 36445 }, { "epoch": 0.03671282712941951, "grad_norm": 15.70298399202541, "learning_rate": 3.67121245694257e-05, "loss": 2.7215, "mean_token_accuracy": 0.324137932062149, "step": 36450 }, { "epoch": 0.03671786318252369, "grad_norm": 15.341581918664403, "learning_rate": 3.671716052615676e-05, "loss": 2.9717, "mean_token_accuracy": 0.34482758343219755, "step": 36455 }, { "epoch": 0.03672289923562786, "grad_norm": 12.913492486317647, "learning_rate": 3.672219648288782e-05, "loss": 2.0034, "mean_token_accuracy": 0.493103438615799, "step": 36460 }, { "epoch": 0.036727935288732035, "grad_norm": 14.35918644896692, "learning_rate": 3.672723243961888e-05, "loss": 2.4398, "mean_token_accuracy": 0.41379310488700866, "step": 36465 }, { "epoch": 0.03673297134183621, "grad_norm": 25.070769274074035, "learning_rate": 3.673226839634994e-05, "loss": 2.9326, "mean_token_accuracy": 0.3655172437429428, "step": 36470 }, { "epoch": 0.036738007394940375, "grad_norm": 17.67738931458292, "learning_rate": 3.6737304353081e-05, "loss": 3.031, "mean_token_accuracy": 0.34137930870056155, "step": 36475 }, { "epoch": 0.03674304344804455, "grad_norm": 17.003119450609997, "learning_rate": 3.674234030981206e-05, "loss": 2.887, "mean_token_accuracy": 0.3103448301553726, "step": 36480 }, { "epoch": 0.03674807950114872, "grad_norm": 18.814124237527544, "learning_rate": 3.674737626654312e-05, "loss": 2.6444, "mean_token_accuracy": 0.37586206793785093, "step": 36485 }, { "epoch": 0.036753115554252896, "grad_norm": 12.884678802483899, "learning_rate": 3.675241222327418e-05, "loss": 2.3917, "mean_token_accuracy": 0.4, "step": 36490 }, { "epoch": 0.03675815160735707, "grad_norm": 17.245891982000895, "learning_rate": 3.675744818000524e-05, "loss": 2.3929, "mean_token_accuracy": 0.4206896543502808, "step": 36495 }, { "epoch": 0.036763187660461244, "grad_norm": 13.152215944301998, "learning_rate": 3.67624841367363e-05, "loss": 2.7032, "mean_token_accuracy": 0.3879612863063812, "step": 36500 }, { "epoch": 0.03676822371356542, "grad_norm": 16.350336362039627, "learning_rate": 3.676752009346736e-05, "loss": 2.3987, "mean_token_accuracy": 0.4241379380226135, "step": 36505 }, { "epoch": 0.036773259766669585, "grad_norm": 14.037244549377553, "learning_rate": 3.677255605019842e-05, "loss": 2.3596, "mean_token_accuracy": 0.43448275327682495, "step": 36510 }, { "epoch": 0.03677829581977376, "grad_norm": 13.857625168668877, "learning_rate": 3.677759200692948e-05, "loss": 2.2814, "mean_token_accuracy": 0.4172413766384125, "step": 36515 }, { "epoch": 0.03678333187287793, "grad_norm": 21.318733556134344, "learning_rate": 3.678262796366054e-05, "loss": 2.7173, "mean_token_accuracy": 0.4257713288068771, "step": 36520 }, { "epoch": 0.036788367925982106, "grad_norm": 17.55898067525928, "learning_rate": 3.6787663920391597e-05, "loss": 2.4871, "mean_token_accuracy": 0.39655172228813174, "step": 36525 }, { "epoch": 0.03679340397908628, "grad_norm": 14.315485446441079, "learning_rate": 3.6792699877122656e-05, "loss": 2.7248, "mean_token_accuracy": 0.38620689511299133, "step": 36530 }, { "epoch": 0.03679844003219045, "grad_norm": 14.217193766507231, "learning_rate": 3.679773583385372e-05, "loss": 3.0267, "mean_token_accuracy": 0.32758620381355286, "step": 36535 }, { "epoch": 0.03680347608529463, "grad_norm": 22.39215163177794, "learning_rate": 3.680277179058478e-05, "loss": 2.5361, "mean_token_accuracy": 0.43103448748588563, "step": 36540 }, { "epoch": 0.036808512138398794, "grad_norm": 11.005363590176186, "learning_rate": 3.6807807747315834e-05, "loss": 2.4369, "mean_token_accuracy": 0.47586206197738645, "step": 36545 }, { "epoch": 0.03681354819150297, "grad_norm": 12.471559578698317, "learning_rate": 3.681284370404689e-05, "loss": 2.6909, "mean_token_accuracy": 0.4172413766384125, "step": 36550 }, { "epoch": 0.03681858424460714, "grad_norm": 12.293882083586196, "learning_rate": 3.681787966077795e-05, "loss": 2.2911, "mean_token_accuracy": 0.43448275327682495, "step": 36555 }, { "epoch": 0.036823620297711315, "grad_norm": 22.296073766017283, "learning_rate": 3.682291561750902e-05, "loss": 2.8727, "mean_token_accuracy": 0.42068964838981626, "step": 36560 }, { "epoch": 0.03682865635081549, "grad_norm": 21.091985036715986, "learning_rate": 3.682795157424008e-05, "loss": 2.6694, "mean_token_accuracy": 0.38100423812866213, "step": 36565 }, { "epoch": 0.03683369240391966, "grad_norm": 12.106428830467667, "learning_rate": 3.683298753097114e-05, "loss": 2.7159, "mean_token_accuracy": 0.32758620083332063, "step": 36570 }, { "epoch": 0.03683872845702384, "grad_norm": 13.394345368898799, "learning_rate": 3.6838023487702196e-05, "loss": 2.4586, "mean_token_accuracy": 0.42758620977401735, "step": 36575 }, { "epoch": 0.036843764510128, "grad_norm": 13.24967801914048, "learning_rate": 3.6843059444433255e-05, "loss": 2.3655, "mean_token_accuracy": 0.44301270246505736, "step": 36580 }, { "epoch": 0.03684880056323218, "grad_norm": 40.44384102988305, "learning_rate": 3.6848095401164314e-05, "loss": 2.5865, "mean_token_accuracy": 0.41034482717514037, "step": 36585 }, { "epoch": 0.03685383661633635, "grad_norm": 13.768364865999422, "learning_rate": 3.6853131357895374e-05, "loss": 2.4702, "mean_token_accuracy": 0.44827585220336913, "step": 36590 }, { "epoch": 0.036858872669440525, "grad_norm": 14.168180245258524, "learning_rate": 3.685816731462643e-05, "loss": 2.7905, "mean_token_accuracy": 0.37241379022598264, "step": 36595 }, { "epoch": 0.0368639087225447, "grad_norm": 16.62926980347543, "learning_rate": 3.686320327135749e-05, "loss": 2.4051, "mean_token_accuracy": 0.45862067937850953, "step": 36600 }, { "epoch": 0.03686894477564887, "grad_norm": 13.404983881374992, "learning_rate": 3.686823922808855e-05, "loss": 2.3596, "mean_token_accuracy": 0.4413793087005615, "step": 36605 }, { "epoch": 0.036873980828753046, "grad_norm": 19.508836861056597, "learning_rate": 3.687327518481961e-05, "loss": 2.1978, "mean_token_accuracy": 0.46551724076271056, "step": 36610 }, { "epoch": 0.03687901688185721, "grad_norm": 19.71569079372293, "learning_rate": 3.687831114155068e-05, "loss": 2.613, "mean_token_accuracy": 0.4448275864124298, "step": 36615 }, { "epoch": 0.03688405293496139, "grad_norm": 14.73028217038572, "learning_rate": 3.6883347098281736e-05, "loss": 2.7805, "mean_token_accuracy": 0.4052631616592407, "step": 36620 }, { "epoch": 0.03688908898806556, "grad_norm": 17.770338187386397, "learning_rate": 3.6888383055012795e-05, "loss": 2.7489, "mean_token_accuracy": 0.41034482717514037, "step": 36625 }, { "epoch": 0.036894125041169734, "grad_norm": 20.77891851891104, "learning_rate": 3.6893419011743854e-05, "loss": 3.2355, "mean_token_accuracy": 0.37241379618644715, "step": 36630 }, { "epoch": 0.03689916109427391, "grad_norm": 13.3385813629474, "learning_rate": 3.6898454968474914e-05, "loss": 2.4978, "mean_token_accuracy": 0.3965517282485962, "step": 36635 }, { "epoch": 0.03690419714737808, "grad_norm": 13.825000542230375, "learning_rate": 3.690349092520597e-05, "loss": 2.5561, "mean_token_accuracy": 0.41379311084747317, "step": 36640 }, { "epoch": 0.036909233200482255, "grad_norm": 15.368526569471111, "learning_rate": 3.690852688193703e-05, "loss": 2.8062, "mean_token_accuracy": 0.3551724076271057, "step": 36645 }, { "epoch": 0.03691426925358642, "grad_norm": 16.50336998233481, "learning_rate": 3.691356283866809e-05, "loss": 2.5526, "mean_token_accuracy": 0.38620689511299133, "step": 36650 }, { "epoch": 0.036919305306690596, "grad_norm": 15.469497237057583, "learning_rate": 3.691859879539915e-05, "loss": 2.6811, "mean_token_accuracy": 0.36896551847457887, "step": 36655 }, { "epoch": 0.03692434135979477, "grad_norm": 14.544419116863702, "learning_rate": 3.692363475213021e-05, "loss": 2.5209, "mean_token_accuracy": 0.4103448152542114, "step": 36660 }, { "epoch": 0.036929377412898944, "grad_norm": 12.83716851573263, "learning_rate": 3.6928670708861276e-05, "loss": 2.4483, "mean_token_accuracy": 0.4310344815254211, "step": 36665 }, { "epoch": 0.03693441346600312, "grad_norm": 13.17123543482321, "learning_rate": 3.6933706665592335e-05, "loss": 2.5752, "mean_token_accuracy": 0.42928009033203124, "step": 36670 }, { "epoch": 0.03693944951910729, "grad_norm": 13.88805913012588, "learning_rate": 3.6938742622323395e-05, "loss": 2.6828, "mean_token_accuracy": 0.40344826579093934, "step": 36675 }, { "epoch": 0.036944485572211465, "grad_norm": 17.179908879362976, "learning_rate": 3.694377857905445e-05, "loss": 2.8689, "mean_token_accuracy": 0.37241379022598264, "step": 36680 }, { "epoch": 0.03694952162531563, "grad_norm": 14.287607241169338, "learning_rate": 3.6948814535785506e-05, "loss": 2.5652, "mean_token_accuracy": 0.37586206793785093, "step": 36685 }, { "epoch": 0.036954557678419805, "grad_norm": 17.197553630448738, "learning_rate": 3.6953850492516566e-05, "loss": 2.0752, "mean_token_accuracy": 0.517241370677948, "step": 36690 }, { "epoch": 0.03695959373152398, "grad_norm": 14.713473174646044, "learning_rate": 3.695888644924763e-05, "loss": 2.5222, "mean_token_accuracy": 0.4034482717514038, "step": 36695 }, { "epoch": 0.03696462978462815, "grad_norm": 16.4823026652872, "learning_rate": 3.696392240597869e-05, "loss": 2.5325, "mean_token_accuracy": 0.4344827592372894, "step": 36700 }, { "epoch": 0.03696966583773233, "grad_norm": 17.061207594308204, "learning_rate": 3.696895836270975e-05, "loss": 2.5047, "mean_token_accuracy": 0.4172413766384125, "step": 36705 }, { "epoch": 0.0369747018908365, "grad_norm": 15.09426318919535, "learning_rate": 3.697399431944081e-05, "loss": 2.9041, "mean_token_accuracy": 0.3793103456497192, "step": 36710 }, { "epoch": 0.036979737943940674, "grad_norm": 12.310223699470448, "learning_rate": 3.697903027617187e-05, "loss": 2.5271, "mean_token_accuracy": 0.38620689511299133, "step": 36715 }, { "epoch": 0.03698477399704484, "grad_norm": 20.21067023310719, "learning_rate": 3.698406623290293e-05, "loss": 2.7052, "mean_token_accuracy": 0.3551724076271057, "step": 36720 }, { "epoch": 0.036989810050149015, "grad_norm": 13.128369039081202, "learning_rate": 3.698910218963399e-05, "loss": 2.5684, "mean_token_accuracy": 0.4034482777118683, "step": 36725 }, { "epoch": 0.03699484610325319, "grad_norm": 13.924755315227733, "learning_rate": 3.6994138146365046e-05, "loss": 2.6576, "mean_token_accuracy": 0.4034482777118683, "step": 36730 }, { "epoch": 0.03699988215635736, "grad_norm": 15.301539652949188, "learning_rate": 3.6999174103096106e-05, "loss": 2.7914, "mean_token_accuracy": 0.3620689570903778, "step": 36735 }, { "epoch": 0.037004918209461536, "grad_norm": 14.693466406679303, "learning_rate": 3.7004210059827165e-05, "loss": 2.6974, "mean_token_accuracy": 0.35862069129943847, "step": 36740 }, { "epoch": 0.03700995426256571, "grad_norm": 14.119999064467663, "learning_rate": 3.700924601655823e-05, "loss": 2.5673, "mean_token_accuracy": 0.400123143196106, "step": 36745 }, { "epoch": 0.037014990315669884, "grad_norm": 26.143715144810443, "learning_rate": 3.701428197328929e-05, "loss": 2.5585, "mean_token_accuracy": 0.4068965554237366, "step": 36750 }, { "epoch": 0.03702002636877405, "grad_norm": 15.057146841669313, "learning_rate": 3.701931793002035e-05, "loss": 2.9046, "mean_token_accuracy": 0.3620689630508423, "step": 36755 }, { "epoch": 0.037025062421878224, "grad_norm": 15.665040739350905, "learning_rate": 3.702435388675141e-05, "loss": 2.5491, "mean_token_accuracy": 0.3724137842655182, "step": 36760 }, { "epoch": 0.0370300984749824, "grad_norm": 13.046002718561619, "learning_rate": 3.702938984348247e-05, "loss": 2.2847, "mean_token_accuracy": 0.4103448331356049, "step": 36765 }, { "epoch": 0.03703513452808657, "grad_norm": 15.259623645284506, "learning_rate": 3.703442580021352e-05, "loss": 2.7348, "mean_token_accuracy": 0.39655172228813174, "step": 36770 }, { "epoch": 0.037040170581190746, "grad_norm": 15.786352190158714, "learning_rate": 3.7039461756944586e-05, "loss": 2.6891, "mean_token_accuracy": 0.3655172407627106, "step": 36775 }, { "epoch": 0.03704520663429492, "grad_norm": 10.804181689832195, "learning_rate": 3.7044497713675646e-05, "loss": 2.2104, "mean_token_accuracy": 0.4724137902259827, "step": 36780 }, { "epoch": 0.03705024268739909, "grad_norm": 12.783219528094989, "learning_rate": 3.7049533670406705e-05, "loss": 2.3824, "mean_token_accuracy": 0.37241379618644715, "step": 36785 }, { "epoch": 0.03705527874050326, "grad_norm": 15.897804868073115, "learning_rate": 3.7054569627137764e-05, "loss": 2.6225, "mean_token_accuracy": 0.38669951558113097, "step": 36790 }, { "epoch": 0.037060314793607434, "grad_norm": 13.347424377097235, "learning_rate": 3.705960558386882e-05, "loss": 2.574, "mean_token_accuracy": 0.34137930274009703, "step": 36795 }, { "epoch": 0.03706535084671161, "grad_norm": 13.7405612957663, "learning_rate": 3.706464154059989e-05, "loss": 2.5837, "mean_token_accuracy": 0.4068965494632721, "step": 36800 }, { "epoch": 0.03707038689981578, "grad_norm": 17.1206256919078, "learning_rate": 3.706967749733095e-05, "loss": 2.8932, "mean_token_accuracy": 0.3551724076271057, "step": 36805 }, { "epoch": 0.037075422952919955, "grad_norm": 13.876509502446755, "learning_rate": 3.7074713454062e-05, "loss": 2.3711, "mean_token_accuracy": 0.4517241418361664, "step": 36810 }, { "epoch": 0.03708045900602413, "grad_norm": 16.550653718997555, "learning_rate": 3.707974941079306e-05, "loss": 2.8437, "mean_token_accuracy": 0.38275861740112305, "step": 36815 }, { "epoch": 0.0370854950591283, "grad_norm": 14.707804878946575, "learning_rate": 3.708478536752412e-05, "loss": 2.1756, "mean_token_accuracy": 0.45366001725196836, "step": 36820 }, { "epoch": 0.03709053111223247, "grad_norm": 14.860491872812407, "learning_rate": 3.7089821324255186e-05, "loss": 3.0737, "mean_token_accuracy": 0.3517241388559341, "step": 36825 }, { "epoch": 0.03709556716533664, "grad_norm": 14.935375948459374, "learning_rate": 3.7094857280986245e-05, "loss": 2.4475, "mean_token_accuracy": 0.4275861978530884, "step": 36830 }, { "epoch": 0.03710060321844082, "grad_norm": 18.9861745522136, "learning_rate": 3.7099893237717304e-05, "loss": 2.8778, "mean_token_accuracy": 0.35862069129943847, "step": 36835 }, { "epoch": 0.03710563927154499, "grad_norm": 16.36811136693674, "learning_rate": 3.7104929194448363e-05, "loss": 2.5483, "mean_token_accuracy": 0.4034482777118683, "step": 36840 }, { "epoch": 0.037110675324649164, "grad_norm": 13.987039601173036, "learning_rate": 3.710996515117942e-05, "loss": 2.4742, "mean_token_accuracy": 0.382758629322052, "step": 36845 }, { "epoch": 0.03711571137775334, "grad_norm": 12.528021198028817, "learning_rate": 3.711500110791049e-05, "loss": 2.1765, "mean_token_accuracy": 0.4517241299152374, "step": 36850 }, { "epoch": 0.03712074743085751, "grad_norm": 19.737135447565944, "learning_rate": 3.712003706464154e-05, "loss": 3.0308, "mean_token_accuracy": 0.37586206793785093, "step": 36855 }, { "epoch": 0.03712578348396168, "grad_norm": 13.328153713146431, "learning_rate": 3.71250730213726e-05, "loss": 3.016, "mean_token_accuracy": 0.3827586114406586, "step": 36860 }, { "epoch": 0.03713081953706585, "grad_norm": 13.034933300677164, "learning_rate": 3.713010897810366e-05, "loss": 2.4614, "mean_token_accuracy": 0.441379314661026, "step": 36865 }, { "epoch": 0.037135855590170026, "grad_norm": 13.418181249917087, "learning_rate": 3.713514493483472e-05, "loss": 2.4326, "mean_token_accuracy": 0.42758620977401735, "step": 36870 }, { "epoch": 0.0371408916432742, "grad_norm": 12.20188906408476, "learning_rate": 3.714018089156578e-05, "loss": 2.3056, "mean_token_accuracy": 0.4172413766384125, "step": 36875 }, { "epoch": 0.037145927696378374, "grad_norm": 21.345372961236794, "learning_rate": 3.7145216848296844e-05, "loss": 2.9974, "mean_token_accuracy": 0.3569872975349426, "step": 36880 }, { "epoch": 0.03715096374948255, "grad_norm": 15.597128550286877, "learning_rate": 3.7150252805027903e-05, "loss": 2.5023, "mean_token_accuracy": 0.4310344815254211, "step": 36885 }, { "epoch": 0.03715599980258672, "grad_norm": 15.9805739594926, "learning_rate": 3.715528876175896e-05, "loss": 2.5682, "mean_token_accuracy": 0.4517241299152374, "step": 36890 }, { "epoch": 0.03716103585569089, "grad_norm": 14.344108333865575, "learning_rate": 3.716032471849002e-05, "loss": 2.6906, "mean_token_accuracy": 0.3896551638841629, "step": 36895 }, { "epoch": 0.03716607190879506, "grad_norm": 14.01777929035498, "learning_rate": 3.716536067522108e-05, "loss": 2.7899, "mean_token_accuracy": 0.3275862067937851, "step": 36900 }, { "epoch": 0.037171107961899236, "grad_norm": 16.351207492175742, "learning_rate": 3.717039663195214e-05, "loss": 2.4145, "mean_token_accuracy": 0.4206896543502808, "step": 36905 }, { "epoch": 0.03717614401500341, "grad_norm": 15.645225942122792, "learning_rate": 3.71754325886832e-05, "loss": 2.4171, "mean_token_accuracy": 0.42758620977401735, "step": 36910 }, { "epoch": 0.03718118006810758, "grad_norm": 17.610599082166246, "learning_rate": 3.718046854541426e-05, "loss": 2.3899, "mean_token_accuracy": 0.3620689570903778, "step": 36915 }, { "epoch": 0.03718621612121176, "grad_norm": 19.235781277663015, "learning_rate": 3.718550450214532e-05, "loss": 2.2603, "mean_token_accuracy": 0.4862068951129913, "step": 36920 }, { "epoch": 0.03719125217431593, "grad_norm": 15.002460756484108, "learning_rate": 3.719054045887638e-05, "loss": 2.4973, "mean_token_accuracy": 0.41379310488700866, "step": 36925 }, { "epoch": 0.0371962882274201, "grad_norm": 16.61585405385401, "learning_rate": 3.7195576415607444e-05, "loss": 2.6413, "mean_token_accuracy": 0.41724138855934145, "step": 36930 }, { "epoch": 0.03720132428052427, "grad_norm": 15.083693627198638, "learning_rate": 3.72006123723385e-05, "loss": 2.588, "mean_token_accuracy": 0.39655172228813174, "step": 36935 }, { "epoch": 0.037206360333628445, "grad_norm": 15.073336156813328, "learning_rate": 3.720564832906956e-05, "loss": 2.4992, "mean_token_accuracy": 0.4430732011795044, "step": 36940 }, { "epoch": 0.03721139638673262, "grad_norm": 14.187064410263124, "learning_rate": 3.7210684285800615e-05, "loss": 2.2632, "mean_token_accuracy": 0.42758620381355283, "step": 36945 }, { "epoch": 0.03721643243983679, "grad_norm": 17.947019564046133, "learning_rate": 3.7215720242531674e-05, "loss": 2.5576, "mean_token_accuracy": 0.40344828367233276, "step": 36950 }, { "epoch": 0.037221468492940966, "grad_norm": 14.59329980851731, "learning_rate": 3.722075619926273e-05, "loss": 2.59, "mean_token_accuracy": 0.4154264986515045, "step": 36955 }, { "epoch": 0.03722650454604514, "grad_norm": 12.088184753972115, "learning_rate": 3.72257921559938e-05, "loss": 2.2358, "mean_token_accuracy": 0.4137930989265442, "step": 36960 }, { "epoch": 0.03723154059914931, "grad_norm": 19.499888568930547, "learning_rate": 3.723082811272486e-05, "loss": 2.61, "mean_token_accuracy": 0.3689655214548111, "step": 36965 }, { "epoch": 0.03723657665225348, "grad_norm": 15.421823701432386, "learning_rate": 3.723586406945592e-05, "loss": 2.2798, "mean_token_accuracy": 0.43623715043067934, "step": 36970 }, { "epoch": 0.037241612705357655, "grad_norm": 14.833428354424186, "learning_rate": 3.724090002618698e-05, "loss": 2.5679, "mean_token_accuracy": 0.3862069010734558, "step": 36975 }, { "epoch": 0.03724664875846183, "grad_norm": 18.577130206443186, "learning_rate": 3.7245935982918036e-05, "loss": 2.7312, "mean_token_accuracy": 0.3774349629878998, "step": 36980 }, { "epoch": 0.037251684811566, "grad_norm": 14.873891519078386, "learning_rate": 3.7250971939649095e-05, "loss": 2.6457, "mean_token_accuracy": 0.3724137842655182, "step": 36985 }, { "epoch": 0.037256720864670176, "grad_norm": 14.555570046125922, "learning_rate": 3.7256007896380155e-05, "loss": 2.9705, "mean_token_accuracy": 0.3551724195480347, "step": 36990 }, { "epoch": 0.03726175691777435, "grad_norm": 13.116953898879931, "learning_rate": 3.7261043853111214e-05, "loss": 2.5321, "mean_token_accuracy": 0.39999999701976774, "step": 36995 }, { "epoch": 0.037266792970878516, "grad_norm": 14.05964570090305, "learning_rate": 3.726607980984227e-05, "loss": 2.5995, "mean_token_accuracy": 0.358620685338974, "step": 37000 }, { "epoch": 0.03727182902398269, "grad_norm": 15.272355727642719, "learning_rate": 3.727111576657333e-05, "loss": 2.6403, "mean_token_accuracy": 0.42413793206214906, "step": 37005 }, { "epoch": 0.037276865077086864, "grad_norm": 17.128886054715, "learning_rate": 3.72761517233044e-05, "loss": 2.5104, "mean_token_accuracy": 0.42758620977401735, "step": 37010 }, { "epoch": 0.03728190113019104, "grad_norm": 15.086621158668816, "learning_rate": 3.728118768003546e-05, "loss": 2.5052, "mean_token_accuracy": 0.38275861740112305, "step": 37015 }, { "epoch": 0.03728693718329521, "grad_norm": 15.9754395426235, "learning_rate": 3.728622363676652e-05, "loss": 3.0067, "mean_token_accuracy": 0.31724137961864474, "step": 37020 }, { "epoch": 0.037291973236399385, "grad_norm": 15.26129420871469, "learning_rate": 3.7291259593497576e-05, "loss": 2.3938, "mean_token_accuracy": 0.45172412395477296, "step": 37025 }, { "epoch": 0.03729700928950356, "grad_norm": 16.882262991485778, "learning_rate": 3.7296295550228635e-05, "loss": 3.0585, "mean_token_accuracy": 0.3793103456497192, "step": 37030 }, { "epoch": 0.037302045342607726, "grad_norm": 15.417573989533633, "learning_rate": 3.7301331506959695e-05, "loss": 2.543, "mean_token_accuracy": 0.3862068891525269, "step": 37035 }, { "epoch": 0.0373070813957119, "grad_norm": 12.395973961518987, "learning_rate": 3.7306367463690754e-05, "loss": 2.3744, "mean_token_accuracy": 0.4344827592372894, "step": 37040 }, { "epoch": 0.03731211744881607, "grad_norm": 18.63392842726373, "learning_rate": 3.731140342042181e-05, "loss": 2.7318, "mean_token_accuracy": 0.3517241358757019, "step": 37045 }, { "epoch": 0.03731715350192025, "grad_norm": 25.509881745602566, "learning_rate": 3.731643937715287e-05, "loss": 2.405, "mean_token_accuracy": 0.43254687786102297, "step": 37050 }, { "epoch": 0.03732218955502442, "grad_norm": 15.158199915438326, "learning_rate": 3.732147533388393e-05, "loss": 2.6761, "mean_token_accuracy": 0.4079854846000671, "step": 37055 }, { "epoch": 0.037327225608128595, "grad_norm": 17.821497858560402, "learning_rate": 3.732651129061499e-05, "loss": 2.3383, "mean_token_accuracy": 0.45517240166664125, "step": 37060 }, { "epoch": 0.03733226166123277, "grad_norm": 14.962328222669854, "learning_rate": 3.733154724734606e-05, "loss": 2.4325, "mean_token_accuracy": 0.3896551728248596, "step": 37065 }, { "epoch": 0.037337297714336935, "grad_norm": 13.236523969216346, "learning_rate": 3.7336583204077116e-05, "loss": 2.4022, "mean_token_accuracy": 0.4431336998939514, "step": 37070 }, { "epoch": 0.03734233376744111, "grad_norm": 17.173124417103086, "learning_rate": 3.7341619160808175e-05, "loss": 2.9942, "mean_token_accuracy": 0.37241379022598264, "step": 37075 }, { "epoch": 0.03734736982054528, "grad_norm": 17.751673952909627, "learning_rate": 3.734665511753923e-05, "loss": 3.0673, "mean_token_accuracy": 0.33103448152542114, "step": 37080 }, { "epoch": 0.03735240587364946, "grad_norm": 15.133261018386692, "learning_rate": 3.735169107427029e-05, "loss": 2.8089, "mean_token_accuracy": 0.4000000059604645, "step": 37085 }, { "epoch": 0.03735744192675363, "grad_norm": 15.351146374087152, "learning_rate": 3.735672703100135e-05, "loss": 2.5436, "mean_token_accuracy": 0.3827586233615875, "step": 37090 }, { "epoch": 0.037362477979857804, "grad_norm": 15.842471067688862, "learning_rate": 3.736176298773241e-05, "loss": 2.8207, "mean_token_accuracy": 0.39655172228813174, "step": 37095 }, { "epoch": 0.03736751403296198, "grad_norm": 13.255546320989447, "learning_rate": 3.736679894446347e-05, "loss": 2.256, "mean_token_accuracy": 0.4482758641242981, "step": 37100 }, { "epoch": 0.037372550086066145, "grad_norm": 15.001262189703693, "learning_rate": 3.737183490119453e-05, "loss": 3.0586, "mean_token_accuracy": 0.3551724076271057, "step": 37105 }, { "epoch": 0.03737758613917032, "grad_norm": 15.08775160260167, "learning_rate": 3.737687085792559e-05, "loss": 2.3661, "mean_token_accuracy": 0.42413793206214906, "step": 37110 }, { "epoch": 0.03738262219227449, "grad_norm": 13.632981879661061, "learning_rate": 3.738190681465665e-05, "loss": 2.4577, "mean_token_accuracy": 0.4482758641242981, "step": 37115 }, { "epoch": 0.037387658245378666, "grad_norm": 12.969719664638676, "learning_rate": 3.738694277138771e-05, "loss": 2.6251, "mean_token_accuracy": 0.39086509346961973, "step": 37120 }, { "epoch": 0.03739269429848284, "grad_norm": 12.905571565015103, "learning_rate": 3.739197872811877e-05, "loss": 2.6843, "mean_token_accuracy": 0.37241379022598264, "step": 37125 }, { "epoch": 0.037397730351587014, "grad_norm": 18.638340379462747, "learning_rate": 3.739701468484983e-05, "loss": 2.7719, "mean_token_accuracy": 0.3827586233615875, "step": 37130 }, { "epoch": 0.03740276640469119, "grad_norm": 15.253056089411233, "learning_rate": 3.7402050641580886e-05, "loss": 2.5805, "mean_token_accuracy": 0.441379314661026, "step": 37135 }, { "epoch": 0.037407802457795354, "grad_norm": 14.242773377886216, "learning_rate": 3.7407086598311946e-05, "loss": 2.6559, "mean_token_accuracy": 0.3620689660310745, "step": 37140 }, { "epoch": 0.03741283851089953, "grad_norm": 15.260825564855491, "learning_rate": 3.741212255504301e-05, "loss": 2.7139, "mean_token_accuracy": 0.38965516686439516, "step": 37145 }, { "epoch": 0.0374178745640037, "grad_norm": 13.076380620682157, "learning_rate": 3.741715851177407e-05, "loss": 2.2292, "mean_token_accuracy": 0.41034482717514037, "step": 37150 }, { "epoch": 0.037422910617107875, "grad_norm": 19.260522588702823, "learning_rate": 3.742219446850513e-05, "loss": 2.4814, "mean_token_accuracy": 0.4379310369491577, "step": 37155 }, { "epoch": 0.03742794667021205, "grad_norm": 15.382215252876598, "learning_rate": 3.742723042523619e-05, "loss": 2.911, "mean_token_accuracy": 0.3551724076271057, "step": 37160 }, { "epoch": 0.03743298272331622, "grad_norm": 14.915892891451094, "learning_rate": 3.743226638196725e-05, "loss": 2.9596, "mean_token_accuracy": 0.3586206793785095, "step": 37165 }, { "epoch": 0.0374380187764204, "grad_norm": 21.018577224892457, "learning_rate": 3.743730233869831e-05, "loss": 2.8792, "mean_token_accuracy": 0.4103448331356049, "step": 37170 }, { "epoch": 0.037443054829524564, "grad_norm": 14.148458836166203, "learning_rate": 3.744233829542937e-05, "loss": 2.7112, "mean_token_accuracy": 0.3793103456497192, "step": 37175 }, { "epoch": 0.03744809088262874, "grad_norm": 10.999292119636277, "learning_rate": 3.7447374252160427e-05, "loss": 2.6739, "mean_token_accuracy": 0.41379310488700866, "step": 37180 }, { "epoch": 0.03745312693573291, "grad_norm": 15.551240900808741, "learning_rate": 3.7452410208891486e-05, "loss": 2.9199, "mean_token_accuracy": 0.35517241060733795, "step": 37185 }, { "epoch": 0.037458162988837085, "grad_norm": 16.05545767455831, "learning_rate": 3.7457446165622545e-05, "loss": 2.7567, "mean_token_accuracy": 0.3827586233615875, "step": 37190 }, { "epoch": 0.03746319904194126, "grad_norm": 12.091876989964625, "learning_rate": 3.7462482122353604e-05, "loss": 2.2669, "mean_token_accuracy": 0.4275862157344818, "step": 37195 }, { "epoch": 0.03746823509504543, "grad_norm": 15.017214059884669, "learning_rate": 3.746751807908467e-05, "loss": 2.915, "mean_token_accuracy": 0.35517241060733795, "step": 37200 }, { "epoch": 0.037473271148149606, "grad_norm": 17.15534291637143, "learning_rate": 3.747255403581573e-05, "loss": 3.0101, "mean_token_accuracy": 0.3206896513700485, "step": 37205 }, { "epoch": 0.03747830720125377, "grad_norm": 12.500291941285042, "learning_rate": 3.747758999254679e-05, "loss": 2.4347, "mean_token_accuracy": 0.41034482717514037, "step": 37210 }, { "epoch": 0.03748334325435795, "grad_norm": 16.522644581295456, "learning_rate": 3.748262594927784e-05, "loss": 2.9567, "mean_token_accuracy": 0.37586206793785093, "step": 37215 }, { "epoch": 0.03748837930746212, "grad_norm": 13.482915454090747, "learning_rate": 3.74876619060089e-05, "loss": 2.2989, "mean_token_accuracy": 0.42758620381355283, "step": 37220 }, { "epoch": 0.037493415360566294, "grad_norm": 13.095617954138069, "learning_rate": 3.7492697862739967e-05, "loss": 2.6189, "mean_token_accuracy": 0.3965517312288284, "step": 37225 }, { "epoch": 0.03749845141367047, "grad_norm": 15.214622762015352, "learning_rate": 3.7497733819471026e-05, "loss": 2.6386, "mean_token_accuracy": 0.4116757452487946, "step": 37230 }, { "epoch": 0.03750348746677464, "grad_norm": 13.51527068099464, "learning_rate": 3.7502769776202085e-05, "loss": 2.4337, "mean_token_accuracy": 0.4379310369491577, "step": 37235 }, { "epoch": 0.037508523519878816, "grad_norm": 18.819319508182396, "learning_rate": 3.7507805732933144e-05, "loss": 2.528, "mean_token_accuracy": 0.4017543852329254, "step": 37240 }, { "epoch": 0.03751355957298298, "grad_norm": 18.02407642550917, "learning_rate": 3.7512841689664204e-05, "loss": 2.6443, "mean_token_accuracy": 0.36896551847457887, "step": 37245 }, { "epoch": 0.037518595626087156, "grad_norm": 22.40748886666479, "learning_rate": 3.751787764639527e-05, "loss": 2.5944, "mean_token_accuracy": 0.4137930989265442, "step": 37250 }, { "epoch": 0.03752363167919133, "grad_norm": 15.073655982234948, "learning_rate": 3.752291360312632e-05, "loss": 2.2824, "mean_token_accuracy": 0.41379310488700866, "step": 37255 }, { "epoch": 0.037528667732295504, "grad_norm": 15.205477085343581, "learning_rate": 3.752794955985738e-05, "loss": 2.7656, "mean_token_accuracy": 0.42758620977401735, "step": 37260 }, { "epoch": 0.03753370378539968, "grad_norm": 14.833307725558798, "learning_rate": 3.753298551658844e-05, "loss": 2.428, "mean_token_accuracy": 0.4, "step": 37265 }, { "epoch": 0.03753873983850385, "grad_norm": 16.49894398318288, "learning_rate": 3.75380214733195e-05, "loss": 2.5599, "mean_token_accuracy": 0.37586206793785093, "step": 37270 }, { "epoch": 0.037543775891608025, "grad_norm": 15.12630260029372, "learning_rate": 3.7543057430050566e-05, "loss": 2.209, "mean_token_accuracy": 0.4068965494632721, "step": 37275 }, { "epoch": 0.03754881194471219, "grad_norm": 17.13705218859721, "learning_rate": 3.7548093386781625e-05, "loss": 2.6415, "mean_token_accuracy": 0.4034482717514038, "step": 37280 }, { "epoch": 0.037553847997816366, "grad_norm": 14.967548616468518, "learning_rate": 3.7553129343512684e-05, "loss": 2.7424, "mean_token_accuracy": 0.37931033968925476, "step": 37285 }, { "epoch": 0.03755888405092054, "grad_norm": 12.0525257702542, "learning_rate": 3.7558165300243744e-05, "loss": 2.4353, "mean_token_accuracy": 0.39310344457626345, "step": 37290 }, { "epoch": 0.03756392010402471, "grad_norm": 11.980874813087183, "learning_rate": 3.75632012569748e-05, "loss": 2.4278, "mean_token_accuracy": 0.36551723480224607, "step": 37295 }, { "epoch": 0.03756895615712889, "grad_norm": 14.102922157654511, "learning_rate": 3.756823721370586e-05, "loss": 2.4648, "mean_token_accuracy": 0.45517241954803467, "step": 37300 }, { "epoch": 0.03757399221023306, "grad_norm": 15.27001406095254, "learning_rate": 3.757327317043692e-05, "loss": 2.5155, "mean_token_accuracy": 0.4172413766384125, "step": 37305 }, { "epoch": 0.037579028263337234, "grad_norm": 17.021464613701173, "learning_rate": 3.757830912716798e-05, "loss": 3.1649, "mean_token_accuracy": 0.3103448271751404, "step": 37310 }, { "epoch": 0.0375840643164414, "grad_norm": 11.76766535096749, "learning_rate": 3.758334508389904e-05, "loss": 2.3518, "mean_token_accuracy": 0.4034482777118683, "step": 37315 }, { "epoch": 0.037589100369545575, "grad_norm": 18.2817168173513, "learning_rate": 3.75883810406301e-05, "loss": 2.7215, "mean_token_accuracy": 0.3620689630508423, "step": 37320 }, { "epoch": 0.03759413642264975, "grad_norm": 16.139894169946512, "learning_rate": 3.759341699736116e-05, "loss": 2.4469, "mean_token_accuracy": 0.41379310488700866, "step": 37325 }, { "epoch": 0.03759917247575392, "grad_norm": 12.959993246528642, "learning_rate": 3.7598452954092224e-05, "loss": 2.5899, "mean_token_accuracy": 0.4223835408687592, "step": 37330 }, { "epoch": 0.037604208528858096, "grad_norm": 12.562160644566669, "learning_rate": 3.7603488910823284e-05, "loss": 2.5406, "mean_token_accuracy": 0.42758620381355283, "step": 37335 }, { "epoch": 0.03760924458196227, "grad_norm": 11.388757829919916, "learning_rate": 3.760852486755434e-05, "loss": 2.604, "mean_token_accuracy": 0.44482759237289426, "step": 37340 }, { "epoch": 0.037614280635066444, "grad_norm": 17.859357054318874, "learning_rate": 3.7613560824285395e-05, "loss": 2.4748, "mean_token_accuracy": 0.4482758641242981, "step": 37345 }, { "epoch": 0.03761931668817061, "grad_norm": 12.795253385490184, "learning_rate": 3.7618596781016455e-05, "loss": 2.137, "mean_token_accuracy": 0.4896551609039307, "step": 37350 }, { "epoch": 0.037624352741274784, "grad_norm": 11.007730505909652, "learning_rate": 3.762363273774752e-05, "loss": 2.7909, "mean_token_accuracy": 0.39999999701976774, "step": 37355 }, { "epoch": 0.03762938879437896, "grad_norm": 17.49928757104641, "learning_rate": 3.762866869447858e-05, "loss": 2.909, "mean_token_accuracy": 0.3655172407627106, "step": 37360 }, { "epoch": 0.03763442484748313, "grad_norm": 16.18310434471742, "learning_rate": 3.763370465120964e-05, "loss": 2.7706, "mean_token_accuracy": 0.3551724195480347, "step": 37365 }, { "epoch": 0.037639460900587306, "grad_norm": 18.431244996763173, "learning_rate": 3.76387406079407e-05, "loss": 2.7118, "mean_token_accuracy": 0.4068965494632721, "step": 37370 }, { "epoch": 0.03764449695369148, "grad_norm": 19.73557123611002, "learning_rate": 3.764377656467176e-05, "loss": 2.7471, "mean_token_accuracy": 0.37241379022598264, "step": 37375 }, { "epoch": 0.03764953300679565, "grad_norm": 17.254990550462626, "learning_rate": 3.764881252140282e-05, "loss": 2.6761, "mean_token_accuracy": 0.36896551251411436, "step": 37380 }, { "epoch": 0.03765456905989982, "grad_norm": 15.233477798941589, "learning_rate": 3.765384847813388e-05, "loss": 2.4229, "mean_token_accuracy": 0.4, "step": 37385 }, { "epoch": 0.037659605113003994, "grad_norm": 14.567631397117367, "learning_rate": 3.7658884434864935e-05, "loss": 2.5189, "mean_token_accuracy": 0.39310344457626345, "step": 37390 }, { "epoch": 0.03766464116610817, "grad_norm": 13.663812497416789, "learning_rate": 3.7663920391595995e-05, "loss": 2.5284, "mean_token_accuracy": 0.42068964838981626, "step": 37395 }, { "epoch": 0.03766967721921234, "grad_norm": 12.843168890320923, "learning_rate": 3.7668956348327054e-05, "loss": 2.33, "mean_token_accuracy": 0.4310344815254211, "step": 37400 }, { "epoch": 0.037674713272316515, "grad_norm": 11.180394036435631, "learning_rate": 3.767399230505811e-05, "loss": 2.222, "mean_token_accuracy": 0.4532365322113037, "step": 37405 }, { "epoch": 0.03767974932542069, "grad_norm": 15.16200139613471, "learning_rate": 3.767902826178918e-05, "loss": 2.4349, "mean_token_accuracy": 0.43793103098869324, "step": 37410 }, { "epoch": 0.03768478537852486, "grad_norm": 15.00254684676308, "learning_rate": 3.768406421852024e-05, "loss": 2.4594, "mean_token_accuracy": 0.42413792610168455, "step": 37415 }, { "epoch": 0.03768982143162903, "grad_norm": 12.064764300251081, "learning_rate": 3.76891001752513e-05, "loss": 2.5965, "mean_token_accuracy": 0.4034482777118683, "step": 37420 }, { "epoch": 0.0376948574847332, "grad_norm": 14.782644649373715, "learning_rate": 3.769413613198236e-05, "loss": 2.2438, "mean_token_accuracy": 0.484634006023407, "step": 37425 }, { "epoch": 0.03769989353783738, "grad_norm": 16.088616604242418, "learning_rate": 3.7699172088713416e-05, "loss": 2.7424, "mean_token_accuracy": 0.4068965494632721, "step": 37430 }, { "epoch": 0.03770492959094155, "grad_norm": 14.861030761559944, "learning_rate": 3.7704208045444476e-05, "loss": 2.5604, "mean_token_accuracy": 0.3896551728248596, "step": 37435 }, { "epoch": 0.037709965644045725, "grad_norm": 14.942877241509203, "learning_rate": 3.7709244002175535e-05, "loss": 2.4307, "mean_token_accuracy": 0.41034482717514037, "step": 37440 }, { "epoch": 0.0377150016971499, "grad_norm": 11.554954658508198, "learning_rate": 3.7714279958906594e-05, "loss": 2.3134, "mean_token_accuracy": 0.4344827592372894, "step": 37445 }, { "epoch": 0.03772003775025407, "grad_norm": 13.234206052930565, "learning_rate": 3.771931591563765e-05, "loss": 2.4729, "mean_token_accuracy": 0.43629764914512636, "step": 37450 }, { "epoch": 0.03772507380335824, "grad_norm": 14.550100839923084, "learning_rate": 3.772435187236871e-05, "loss": 2.3391, "mean_token_accuracy": 0.46067755818367007, "step": 37455 }, { "epoch": 0.03773010985646241, "grad_norm": 13.860644732971673, "learning_rate": 3.772938782909977e-05, "loss": 2.4702, "mean_token_accuracy": 0.4172413766384125, "step": 37460 }, { "epoch": 0.037735145909566586, "grad_norm": 14.26105951773389, "learning_rate": 3.773442378583084e-05, "loss": 2.4257, "mean_token_accuracy": 0.4103448331356049, "step": 37465 }, { "epoch": 0.03774018196267076, "grad_norm": 15.029553924801379, "learning_rate": 3.77394597425619e-05, "loss": 2.6831, "mean_token_accuracy": 0.3827586114406586, "step": 37470 }, { "epoch": 0.037745218015774934, "grad_norm": 15.884297512686892, "learning_rate": 3.7744495699292956e-05, "loss": 2.8042, "mean_token_accuracy": 0.38965516686439516, "step": 37475 }, { "epoch": 0.03775025406887911, "grad_norm": 17.54698322303089, "learning_rate": 3.774953165602401e-05, "loss": 2.6781, "mean_token_accuracy": 0.38620689511299133, "step": 37480 }, { "epoch": 0.03775529012198328, "grad_norm": 15.242176840196654, "learning_rate": 3.775456761275507e-05, "loss": 2.1139, "mean_token_accuracy": 0.44670296311378477, "step": 37485 }, { "epoch": 0.03776032617508745, "grad_norm": 22.901440042691785, "learning_rate": 3.7759603569486134e-05, "loss": 2.7446, "mean_token_accuracy": 0.39310344457626345, "step": 37490 }, { "epoch": 0.03776536222819162, "grad_norm": 14.824067473452105, "learning_rate": 3.776463952621719e-05, "loss": 2.6717, "mean_token_accuracy": 0.3586206823587418, "step": 37495 }, { "epoch": 0.037770398281295796, "grad_norm": 15.874462897133325, "learning_rate": 3.776967548294825e-05, "loss": 2.5705, "mean_token_accuracy": 0.41724138259887694, "step": 37500 }, { "epoch": 0.03777543433439997, "grad_norm": 16.136880666348233, "learning_rate": 3.777471143967931e-05, "loss": 2.3457, "mean_token_accuracy": 0.4137930989265442, "step": 37505 }, { "epoch": 0.03778047038750414, "grad_norm": 23.498264828852093, "learning_rate": 3.777974739641037e-05, "loss": 2.8405, "mean_token_accuracy": 0.3482758581638336, "step": 37510 }, { "epoch": 0.03778550644060832, "grad_norm": 16.352136124326414, "learning_rate": 3.778478335314144e-05, "loss": 2.6462, "mean_token_accuracy": 0.4103448331356049, "step": 37515 }, { "epoch": 0.03779054249371249, "grad_norm": 13.824106704885109, "learning_rate": 3.778981930987249e-05, "loss": 2.6651, "mean_token_accuracy": 0.37241379022598264, "step": 37520 }, { "epoch": 0.03779557854681666, "grad_norm": 18.44190869040901, "learning_rate": 3.779485526660355e-05, "loss": 2.6627, "mean_token_accuracy": 0.36551723480224607, "step": 37525 }, { "epoch": 0.03780061459992083, "grad_norm": 12.182179802825718, "learning_rate": 3.779989122333461e-05, "loss": 2.3572, "mean_token_accuracy": 0.47586206793785096, "step": 37530 }, { "epoch": 0.037805650653025005, "grad_norm": 30.57724416519205, "learning_rate": 3.780492718006567e-05, "loss": 2.6024, "mean_token_accuracy": 0.44827585518360136, "step": 37535 }, { "epoch": 0.03781068670612918, "grad_norm": 15.762529631656243, "learning_rate": 3.7809963136796727e-05, "loss": 2.3011, "mean_token_accuracy": 0.42758620977401735, "step": 37540 }, { "epoch": 0.03781572275923335, "grad_norm": 13.101366316160087, "learning_rate": 3.781499909352779e-05, "loss": 1.9965, "mean_token_accuracy": 0.4551724076271057, "step": 37545 }, { "epoch": 0.03782075881233753, "grad_norm": 17.902825756778125, "learning_rate": 3.782003505025885e-05, "loss": 2.8482, "mean_token_accuracy": 0.3896551728248596, "step": 37550 }, { "epoch": 0.0378257948654417, "grad_norm": 17.788013136817227, "learning_rate": 3.782507100698991e-05, "loss": 2.5551, "mean_token_accuracy": 0.38620689511299133, "step": 37555 }, { "epoch": 0.03783083091854587, "grad_norm": 18.931728739740624, "learning_rate": 3.783010696372097e-05, "loss": 2.7915, "mean_token_accuracy": 0.40344828367233276, "step": 37560 }, { "epoch": 0.03783586697165004, "grad_norm": 16.888980661762105, "learning_rate": 3.783514292045203e-05, "loss": 2.5336, "mean_token_accuracy": 0.3931034505367279, "step": 37565 }, { "epoch": 0.037840903024754215, "grad_norm": 12.659661308799716, "learning_rate": 3.784017887718309e-05, "loss": 2.298, "mean_token_accuracy": 0.4517241358757019, "step": 37570 }, { "epoch": 0.03784593907785839, "grad_norm": 14.696549667213176, "learning_rate": 3.784521483391415e-05, "loss": 2.6208, "mean_token_accuracy": 0.37241379022598264, "step": 37575 }, { "epoch": 0.03785097513096256, "grad_norm": 13.475020994118884, "learning_rate": 3.785025079064521e-05, "loss": 2.4669, "mean_token_accuracy": 0.4344827592372894, "step": 37580 }, { "epoch": 0.037856011184066736, "grad_norm": 15.540453173611905, "learning_rate": 3.785528674737627e-05, "loss": 2.5261, "mean_token_accuracy": 0.4448275864124298, "step": 37585 }, { "epoch": 0.03786104723717091, "grad_norm": 13.720858918634693, "learning_rate": 3.7860322704107326e-05, "loss": 3.4132, "mean_token_accuracy": 0.3896551787853241, "step": 37590 }, { "epoch": 0.03786608329027508, "grad_norm": 18.110051051886447, "learning_rate": 3.786535866083839e-05, "loss": 2.5888, "mean_token_accuracy": 0.36896551251411436, "step": 37595 }, { "epoch": 0.03787111934337925, "grad_norm": 15.025557668487194, "learning_rate": 3.787039461756945e-05, "loss": 2.7147, "mean_token_accuracy": 0.34482758641242983, "step": 37600 }, { "epoch": 0.037876155396483424, "grad_norm": 15.881645737485643, "learning_rate": 3.787543057430051e-05, "loss": 2.4904, "mean_token_accuracy": 0.43448275327682495, "step": 37605 }, { "epoch": 0.0378811914495876, "grad_norm": 16.244838333260162, "learning_rate": 3.788046653103157e-05, "loss": 2.2996, "mean_token_accuracy": 0.4188142716884613, "step": 37610 }, { "epoch": 0.03788622750269177, "grad_norm": 14.142268602458111, "learning_rate": 3.788550248776262e-05, "loss": 2.2497, "mean_token_accuracy": 0.4448275864124298, "step": 37615 }, { "epoch": 0.037891263555795945, "grad_norm": 17.428215889340503, "learning_rate": 3.789053844449368e-05, "loss": 2.5894, "mean_token_accuracy": 0.40562613010406495, "step": 37620 }, { "epoch": 0.03789629960890012, "grad_norm": 15.416147142399966, "learning_rate": 3.789557440122475e-05, "loss": 2.8734, "mean_token_accuracy": 0.38275861740112305, "step": 37625 }, { "epoch": 0.037901335662004286, "grad_norm": 13.53705135546813, "learning_rate": 3.790061035795581e-05, "loss": 2.6827, "mean_token_accuracy": 0.37241379618644715, "step": 37630 }, { "epoch": 0.03790637171510846, "grad_norm": 14.122650647766182, "learning_rate": 3.7905646314686866e-05, "loss": 2.8602, "mean_token_accuracy": 0.41034482717514037, "step": 37635 }, { "epoch": 0.037911407768212634, "grad_norm": 14.0121383662044, "learning_rate": 3.7910682271417925e-05, "loss": 2.3133, "mean_token_accuracy": 0.44827585220336913, "step": 37640 }, { "epoch": 0.03791644382131681, "grad_norm": 12.906907868383122, "learning_rate": 3.7915718228148984e-05, "loss": 2.5719, "mean_token_accuracy": 0.4, "step": 37645 }, { "epoch": 0.03792147987442098, "grad_norm": 14.71865617752947, "learning_rate": 3.792075418488005e-05, "loss": 2.3419, "mean_token_accuracy": 0.44337567687034607, "step": 37650 }, { "epoch": 0.037926515927525155, "grad_norm": 14.53414532772543, "learning_rate": 3.79257901416111e-05, "loss": 2.8254, "mean_token_accuracy": 0.3793103516101837, "step": 37655 }, { "epoch": 0.03793155198062933, "grad_norm": 17.960179634101433, "learning_rate": 3.793082609834216e-05, "loss": 2.6533, "mean_token_accuracy": 0.4137930929660797, "step": 37660 }, { "epoch": 0.037936588033733495, "grad_norm": 14.880145906706671, "learning_rate": 3.793586205507322e-05, "loss": 2.3652, "mean_token_accuracy": 0.40000001192092893, "step": 37665 }, { "epoch": 0.03794162408683767, "grad_norm": 12.882325953347065, "learning_rate": 3.794089801180428e-05, "loss": 2.6547, "mean_token_accuracy": 0.4206896543502808, "step": 37670 }, { "epoch": 0.03794666013994184, "grad_norm": 16.75878623042873, "learning_rate": 3.794593396853535e-05, "loss": 2.9485, "mean_token_accuracy": 0.34482758641242983, "step": 37675 }, { "epoch": 0.03795169619304602, "grad_norm": 14.950645580552777, "learning_rate": 3.7950969925266406e-05, "loss": 2.7763, "mean_token_accuracy": 0.38124622106552125, "step": 37680 }, { "epoch": 0.03795673224615019, "grad_norm": 12.873083036364172, "learning_rate": 3.7956005881997465e-05, "loss": 2.7165, "mean_token_accuracy": 0.4172413766384125, "step": 37685 }, { "epoch": 0.037961768299254364, "grad_norm": 13.341979325639846, "learning_rate": 3.7961041838728525e-05, "loss": 2.449, "mean_token_accuracy": 0.38275861740112305, "step": 37690 }, { "epoch": 0.03796680435235854, "grad_norm": 13.637899203896987, "learning_rate": 3.7966077795459584e-05, "loss": 2.7396, "mean_token_accuracy": 0.3655172407627106, "step": 37695 }, { "epoch": 0.037971840405462705, "grad_norm": 14.474543480835859, "learning_rate": 3.797111375219064e-05, "loss": 2.9682, "mean_token_accuracy": 0.31724137663841245, "step": 37700 }, { "epoch": 0.03797687645856688, "grad_norm": 13.604619763214215, "learning_rate": 3.79761497089217e-05, "loss": 2.6191, "mean_token_accuracy": 0.37241379618644715, "step": 37705 }, { "epoch": 0.03798191251167105, "grad_norm": 12.890961451079653, "learning_rate": 3.798118566565276e-05, "loss": 2.4378, "mean_token_accuracy": 0.441379314661026, "step": 37710 }, { "epoch": 0.037986948564775226, "grad_norm": 16.86646797943491, "learning_rate": 3.798622162238382e-05, "loss": 2.5003, "mean_token_accuracy": 0.4184728980064392, "step": 37715 }, { "epoch": 0.0379919846178794, "grad_norm": 15.261238527802787, "learning_rate": 3.799125757911488e-05, "loss": 3.0867, "mean_token_accuracy": 0.3172413736581802, "step": 37720 }, { "epoch": 0.037997020670983574, "grad_norm": 12.858486166253133, "learning_rate": 3.799629353584594e-05, "loss": 2.5633, "mean_token_accuracy": 0.4018148839473724, "step": 37725 }, { "epoch": 0.03800205672408775, "grad_norm": 16.522694020114727, "learning_rate": 3.8001329492577005e-05, "loss": 2.7418, "mean_token_accuracy": 0.38620689511299133, "step": 37730 }, { "epoch": 0.038007092777191914, "grad_norm": 19.660380668550253, "learning_rate": 3.8006365449308065e-05, "loss": 2.4952, "mean_token_accuracy": 0.44482758045196535, "step": 37735 }, { "epoch": 0.03801212883029609, "grad_norm": 14.345364093379661, "learning_rate": 3.8011401406039124e-05, "loss": 2.4168, "mean_token_accuracy": 0.4, "step": 37740 }, { "epoch": 0.03801716488340026, "grad_norm": 15.248943692346172, "learning_rate": 3.801643736277018e-05, "loss": 2.9315, "mean_token_accuracy": 0.3448275804519653, "step": 37745 }, { "epoch": 0.038022200936504436, "grad_norm": 13.586911914243494, "learning_rate": 3.8021473319501236e-05, "loss": 2.4199, "mean_token_accuracy": 0.43448275327682495, "step": 37750 }, { "epoch": 0.03802723698960861, "grad_norm": 14.05078082131833, "learning_rate": 3.80265092762323e-05, "loss": 3.0982, "mean_token_accuracy": 0.33103448152542114, "step": 37755 }, { "epoch": 0.03803227304271278, "grad_norm": 21.276259145202413, "learning_rate": 3.803154523296336e-05, "loss": 2.7434, "mean_token_accuracy": 0.3793103456497192, "step": 37760 }, { "epoch": 0.03803730909581696, "grad_norm": 12.703429595678553, "learning_rate": 3.803658118969442e-05, "loss": 2.4986, "mean_token_accuracy": 0.403448274731636, "step": 37765 }, { "epoch": 0.038042345148921124, "grad_norm": 14.28092880209114, "learning_rate": 3.804161714642548e-05, "loss": 2.5141, "mean_token_accuracy": 0.3551724135875702, "step": 37770 }, { "epoch": 0.0380473812020253, "grad_norm": 13.229322818394166, "learning_rate": 3.804665310315654e-05, "loss": 2.656, "mean_token_accuracy": 0.417241370677948, "step": 37775 }, { "epoch": 0.03805241725512947, "grad_norm": 15.277840811207552, "learning_rate": 3.8051689059887605e-05, "loss": 2.1946, "mean_token_accuracy": 0.4744101583957672, "step": 37780 }, { "epoch": 0.038057453308233645, "grad_norm": 13.83828537259868, "learning_rate": 3.8056725016618664e-05, "loss": 2.3753, "mean_token_accuracy": 0.4413793087005615, "step": 37785 }, { "epoch": 0.03806248936133782, "grad_norm": 16.748514017231923, "learning_rate": 3.8061760973349716e-05, "loss": 2.648, "mean_token_accuracy": 0.4413793087005615, "step": 37790 }, { "epoch": 0.03806752541444199, "grad_norm": 12.965549435518312, "learning_rate": 3.8066796930080776e-05, "loss": 2.2892, "mean_token_accuracy": 0.4137930989265442, "step": 37795 }, { "epoch": 0.038072561467546166, "grad_norm": 13.488241851475362, "learning_rate": 3.8071832886811835e-05, "loss": 2.2491, "mean_token_accuracy": 0.42068964838981626, "step": 37800 }, { "epoch": 0.03807759752065033, "grad_norm": 15.05901351740915, "learning_rate": 3.8076868843542894e-05, "loss": 3.1551, "mean_token_accuracy": 0.34137930870056155, "step": 37805 }, { "epoch": 0.03808263357375451, "grad_norm": 14.649521441943392, "learning_rate": 3.808190480027396e-05, "loss": 2.3197, "mean_token_accuracy": 0.4241379380226135, "step": 37810 }, { "epoch": 0.03808766962685868, "grad_norm": 16.791017205358198, "learning_rate": 3.808694075700502e-05, "loss": 3.095, "mean_token_accuracy": 0.35862069129943847, "step": 37815 }, { "epoch": 0.038092705679962854, "grad_norm": 11.666733752996919, "learning_rate": 3.809197671373608e-05, "loss": 2.2199, "mean_token_accuracy": 0.4465214848518372, "step": 37820 }, { "epoch": 0.03809774173306703, "grad_norm": 16.63830817275079, "learning_rate": 3.809701267046714e-05, "loss": 3.0882, "mean_token_accuracy": 0.28620689213275907, "step": 37825 }, { "epoch": 0.0381027777861712, "grad_norm": 17.471730260980532, "learning_rate": 3.81020486271982e-05, "loss": 3.1173, "mean_token_accuracy": 0.35172413289546967, "step": 37830 }, { "epoch": 0.038107813839275376, "grad_norm": 12.163262364675393, "learning_rate": 3.8107084583929256e-05, "loss": 2.3014, "mean_token_accuracy": 0.44482758045196535, "step": 37835 }, { "epoch": 0.03811284989237954, "grad_norm": 16.145911724976326, "learning_rate": 3.8112120540660316e-05, "loss": 2.7963, "mean_token_accuracy": 0.3896551728248596, "step": 37840 }, { "epoch": 0.038117885945483716, "grad_norm": 19.010040289595896, "learning_rate": 3.8117156497391375e-05, "loss": 2.9046, "mean_token_accuracy": 0.32928009927272794, "step": 37845 }, { "epoch": 0.03812292199858789, "grad_norm": 14.727536539950835, "learning_rate": 3.8122192454122434e-05, "loss": 2.3857, "mean_token_accuracy": 0.43793103098869324, "step": 37850 }, { "epoch": 0.038127958051692064, "grad_norm": 13.411638013748485, "learning_rate": 3.8127228410853493e-05, "loss": 2.3252, "mean_token_accuracy": 0.42413792610168455, "step": 37855 }, { "epoch": 0.03813299410479624, "grad_norm": 16.2483222159373, "learning_rate": 3.813226436758456e-05, "loss": 2.7421, "mean_token_accuracy": 0.37241379618644715, "step": 37860 }, { "epoch": 0.03813803015790041, "grad_norm": 22.356008089113793, "learning_rate": 3.813730032431562e-05, "loss": 2.5209, "mean_token_accuracy": 0.3482758581638336, "step": 37865 }, { "epoch": 0.038143066211004585, "grad_norm": 23.040959417855625, "learning_rate": 3.814233628104668e-05, "loss": 2.5172, "mean_token_accuracy": 0.3999999940395355, "step": 37870 }, { "epoch": 0.03814810226410875, "grad_norm": 14.231902972127429, "learning_rate": 3.814737223777774e-05, "loss": 2.3011, "mean_token_accuracy": 0.3965517282485962, "step": 37875 }, { "epoch": 0.038153138317212926, "grad_norm": 12.251146197870462, "learning_rate": 3.815240819450879e-05, "loss": 2.5045, "mean_token_accuracy": 0.38275861740112305, "step": 37880 }, { "epoch": 0.0381581743703171, "grad_norm": 16.343568896723845, "learning_rate": 3.815744415123985e-05, "loss": 2.6235, "mean_token_accuracy": 0.3862069010734558, "step": 37885 }, { "epoch": 0.03816321042342127, "grad_norm": 12.614292507245805, "learning_rate": 3.8162480107970915e-05, "loss": 2.4736, "mean_token_accuracy": 0.4034482777118683, "step": 37890 }, { "epoch": 0.03816824647652545, "grad_norm": 17.410284974319527, "learning_rate": 3.8167516064701974e-05, "loss": 2.4558, "mean_token_accuracy": 0.42232305407524107, "step": 37895 }, { "epoch": 0.03817328252962962, "grad_norm": 15.264322979936674, "learning_rate": 3.8172552021433033e-05, "loss": 2.4544, "mean_token_accuracy": 0.4413793087005615, "step": 37900 }, { "epoch": 0.038178318582733795, "grad_norm": 11.33407289792448, "learning_rate": 3.817758797816409e-05, "loss": 2.398, "mean_token_accuracy": 0.47047791481018064, "step": 37905 }, { "epoch": 0.03818335463583796, "grad_norm": 18.236389684464193, "learning_rate": 3.818262393489515e-05, "loss": 2.8812, "mean_token_accuracy": 0.36896551549434664, "step": 37910 }, { "epoch": 0.038188390688942135, "grad_norm": 13.055690610454949, "learning_rate": 3.818765989162622e-05, "loss": 2.4693, "mean_token_accuracy": 0.4310344815254211, "step": 37915 }, { "epoch": 0.03819342674204631, "grad_norm": 14.602818689444494, "learning_rate": 3.819269584835728e-05, "loss": 2.3434, "mean_token_accuracy": 0.4592364549636841, "step": 37920 }, { "epoch": 0.03819846279515048, "grad_norm": 16.860791310310088, "learning_rate": 3.819773180508833e-05, "loss": 2.5999, "mean_token_accuracy": 0.36206896901130675, "step": 37925 }, { "epoch": 0.038203498848254656, "grad_norm": 17.676588156578468, "learning_rate": 3.820276776181939e-05, "loss": 2.662, "mean_token_accuracy": 0.3965517282485962, "step": 37930 }, { "epoch": 0.03820853490135883, "grad_norm": 14.424592910128728, "learning_rate": 3.820780371855045e-05, "loss": 2.5793, "mean_token_accuracy": 0.38965516686439516, "step": 37935 }, { "epoch": 0.038213570954463004, "grad_norm": 14.465523532596922, "learning_rate": 3.8212839675281514e-05, "loss": 2.4059, "mean_token_accuracy": 0.39999999701976774, "step": 37940 }, { "epoch": 0.03821860700756717, "grad_norm": 17.526842622394312, "learning_rate": 3.8217875632012574e-05, "loss": 2.5765, "mean_token_accuracy": 0.4206896543502808, "step": 37945 }, { "epoch": 0.038223643060671345, "grad_norm": 13.429102939720687, "learning_rate": 3.822291158874363e-05, "loss": 2.4583, "mean_token_accuracy": 0.4310344815254211, "step": 37950 }, { "epoch": 0.03822867911377552, "grad_norm": 14.515700993707537, "learning_rate": 3.822794754547469e-05, "loss": 2.4842, "mean_token_accuracy": 0.39310343861579894, "step": 37955 }, { "epoch": 0.03823371516687969, "grad_norm": 13.258251571760692, "learning_rate": 3.823298350220575e-05, "loss": 2.8482, "mean_token_accuracy": 0.3344827651977539, "step": 37960 }, { "epoch": 0.038238751219983866, "grad_norm": 12.469689986103063, "learning_rate": 3.823801945893681e-05, "loss": 2.5696, "mean_token_accuracy": 0.4103448212146759, "step": 37965 }, { "epoch": 0.03824378727308804, "grad_norm": 16.833768074662025, "learning_rate": 3.824305541566787e-05, "loss": 3.2654, "mean_token_accuracy": 0.36896551251411436, "step": 37970 }, { "epoch": 0.03824882332619221, "grad_norm": 29.877819259812252, "learning_rate": 3.824809137239893e-05, "loss": 2.4797, "mean_token_accuracy": 0.4088324248790741, "step": 37975 }, { "epoch": 0.03825385937929638, "grad_norm": 19.569854120700658, "learning_rate": 3.825312732912999e-05, "loss": 2.5189, "mean_token_accuracy": 0.42413793206214906, "step": 37980 }, { "epoch": 0.038258895432400554, "grad_norm": 14.168327821474506, "learning_rate": 3.825816328586105e-05, "loss": 2.3881, "mean_token_accuracy": 0.3689655244350433, "step": 37985 }, { "epoch": 0.03826393148550473, "grad_norm": 15.395164860575624, "learning_rate": 3.826319924259211e-05, "loss": 2.2982, "mean_token_accuracy": 0.43448275327682495, "step": 37990 }, { "epoch": 0.0382689675386089, "grad_norm": 16.82363297383371, "learning_rate": 3.826823519932317e-05, "loss": 2.5644, "mean_token_accuracy": 0.36896551251411436, "step": 37995 }, { "epoch": 0.038274003591713075, "grad_norm": 15.208807175783207, "learning_rate": 3.827327115605423e-05, "loss": 2.8182, "mean_token_accuracy": 0.35862068831920624, "step": 38000 }, { "epoch": 0.03827903964481725, "grad_norm": 10.16155184098454, "learning_rate": 3.827830711278529e-05, "loss": 2.3621, "mean_token_accuracy": 0.41834974884986875, "step": 38005 }, { "epoch": 0.03828407569792142, "grad_norm": 14.532251019456645, "learning_rate": 3.828334306951635e-05, "loss": 2.7053, "mean_token_accuracy": 0.37241379618644715, "step": 38010 }, { "epoch": 0.03828911175102559, "grad_norm": 20.829843655670896, "learning_rate": 3.82883790262474e-05, "loss": 2.8277, "mean_token_accuracy": 0.3827586114406586, "step": 38015 }, { "epoch": 0.03829414780412976, "grad_norm": 15.385980298208624, "learning_rate": 3.829341498297847e-05, "loss": 2.3331, "mean_token_accuracy": 0.4566848039627075, "step": 38020 }, { "epoch": 0.03829918385723394, "grad_norm": 13.378946118091237, "learning_rate": 3.829845093970953e-05, "loss": 2.7543, "mean_token_accuracy": 0.3793103456497192, "step": 38025 }, { "epoch": 0.03830421991033811, "grad_norm": 14.957470681282642, "learning_rate": 3.830348689644059e-05, "loss": 2.5885, "mean_token_accuracy": 0.3413793116807938, "step": 38030 }, { "epoch": 0.038309255963442285, "grad_norm": 13.276386945986111, "learning_rate": 3.830852285317165e-05, "loss": 2.532, "mean_token_accuracy": 0.42758620381355283, "step": 38035 }, { "epoch": 0.03831429201654646, "grad_norm": 18.05517090181714, "learning_rate": 3.8313558809902706e-05, "loss": 2.5982, "mean_token_accuracy": 0.38100423812866213, "step": 38040 }, { "epoch": 0.03831932806965063, "grad_norm": 13.84326636495037, "learning_rate": 3.831859476663377e-05, "loss": 2.5894, "mean_token_accuracy": 0.41034482717514037, "step": 38045 }, { "epoch": 0.0383243641227548, "grad_norm": 15.37424037919201, "learning_rate": 3.832363072336483e-05, "loss": 2.5127, "mean_token_accuracy": 0.41034482717514037, "step": 38050 }, { "epoch": 0.03832940017585897, "grad_norm": 19.149411362041537, "learning_rate": 3.8328666680095884e-05, "loss": 2.8354, "mean_token_accuracy": 0.36551723778247835, "step": 38055 }, { "epoch": 0.03833443622896315, "grad_norm": 22.01478028517961, "learning_rate": 3.833370263682694e-05, "loss": 2.7823, "mean_token_accuracy": 0.37755596041679385, "step": 38060 }, { "epoch": 0.03833947228206732, "grad_norm": 12.696778283793972, "learning_rate": 3.8338738593558e-05, "loss": 2.6861, "mean_token_accuracy": 0.36551724672317504, "step": 38065 }, { "epoch": 0.038344508335171494, "grad_norm": 14.099033853751777, "learning_rate": 3.834377455028906e-05, "loss": 3.0078, "mean_token_accuracy": 0.34482758641242983, "step": 38070 }, { "epoch": 0.03834954438827567, "grad_norm": 16.4955314916322, "learning_rate": 3.834881050702013e-05, "loss": 2.5724, "mean_token_accuracy": 0.36896551847457887, "step": 38075 }, { "epoch": 0.03835458044137984, "grad_norm": 14.968538978818405, "learning_rate": 3.835384646375119e-05, "loss": 2.6623, "mean_token_accuracy": 0.4068965554237366, "step": 38080 }, { "epoch": 0.03835961649448401, "grad_norm": 13.870731479022009, "learning_rate": 3.8358882420482246e-05, "loss": 2.4284, "mean_token_accuracy": 0.40689654350280763, "step": 38085 }, { "epoch": 0.03836465254758818, "grad_norm": 14.66610542187267, "learning_rate": 3.8363918377213305e-05, "loss": 2.3999, "mean_token_accuracy": 0.42068966031074523, "step": 38090 }, { "epoch": 0.038369688600692356, "grad_norm": 14.566716728432645, "learning_rate": 3.8368954333944365e-05, "loss": 2.4935, "mean_token_accuracy": 0.4310344815254211, "step": 38095 }, { "epoch": 0.03837472465379653, "grad_norm": 26.088833328390997, "learning_rate": 3.8373990290675424e-05, "loss": 2.5664, "mean_token_accuracy": 0.4448275864124298, "step": 38100 }, { "epoch": 0.038379760706900704, "grad_norm": 15.749096942871624, "learning_rate": 3.837902624740648e-05, "loss": 2.2186, "mean_token_accuracy": 0.44482758045196535, "step": 38105 }, { "epoch": 0.03838479676000488, "grad_norm": 14.358632283239768, "learning_rate": 3.838406220413754e-05, "loss": 2.339, "mean_token_accuracy": 0.4310344815254211, "step": 38110 }, { "epoch": 0.03838983281310905, "grad_norm": 14.259182207545305, "learning_rate": 3.83890981608686e-05, "loss": 2.1361, "mean_token_accuracy": 0.4310344815254211, "step": 38115 }, { "epoch": 0.03839486886621322, "grad_norm": 12.627322962958067, "learning_rate": 3.839413411759966e-05, "loss": 2.8337, "mean_token_accuracy": 0.35862068831920624, "step": 38120 }, { "epoch": 0.03839990491931739, "grad_norm": 14.694656489401094, "learning_rate": 3.839917007433073e-05, "loss": 2.2121, "mean_token_accuracy": 0.482758617401123, "step": 38125 }, { "epoch": 0.038404940972421565, "grad_norm": 13.25566767324547, "learning_rate": 3.8404206031061786e-05, "loss": 2.5515, "mean_token_accuracy": 0.39310344457626345, "step": 38130 }, { "epoch": 0.03840997702552574, "grad_norm": 13.66502754784551, "learning_rate": 3.8409241987792845e-05, "loss": 2.2646, "mean_token_accuracy": 0.42413793206214906, "step": 38135 }, { "epoch": 0.03841501307862991, "grad_norm": 14.138770750457414, "learning_rate": 3.8414277944523905e-05, "loss": 2.4893, "mean_token_accuracy": 0.4517241418361664, "step": 38140 }, { "epoch": 0.03842004913173409, "grad_norm": 13.672508425981114, "learning_rate": 3.8419313901254964e-05, "loss": 2.6837, "mean_token_accuracy": 0.324137932062149, "step": 38145 }, { "epoch": 0.03842508518483826, "grad_norm": 14.361431082391956, "learning_rate": 3.8424349857986016e-05, "loss": 2.3155, "mean_token_accuracy": 0.42068966031074523, "step": 38150 }, { "epoch": 0.03843012123794243, "grad_norm": 13.759563307405394, "learning_rate": 3.842938581471708e-05, "loss": 2.6316, "mean_token_accuracy": 0.4185722887516022, "step": 38155 }, { "epoch": 0.0384351572910466, "grad_norm": 13.97845361877045, "learning_rate": 3.843442177144814e-05, "loss": 2.7524, "mean_token_accuracy": 0.33793103098869326, "step": 38160 }, { "epoch": 0.038440193344150775, "grad_norm": 14.263126126280985, "learning_rate": 3.84394577281792e-05, "loss": 2.117, "mean_token_accuracy": 0.47586206197738645, "step": 38165 }, { "epoch": 0.03844522939725495, "grad_norm": 15.117477851425155, "learning_rate": 3.844449368491026e-05, "loss": 2.524, "mean_token_accuracy": 0.4, "step": 38170 }, { "epoch": 0.03845026545035912, "grad_norm": 24.503855781523896, "learning_rate": 3.844952964164132e-05, "loss": 2.8655, "mean_token_accuracy": 0.37586206793785093, "step": 38175 }, { "epoch": 0.038455301503463296, "grad_norm": 18.85976483675346, "learning_rate": 3.8454565598372386e-05, "loss": 2.4495, "mean_token_accuracy": 0.39310344457626345, "step": 38180 }, { "epoch": 0.03846033755656747, "grad_norm": 15.754784630369839, "learning_rate": 3.8459601555103445e-05, "loss": 2.5747, "mean_token_accuracy": 0.39310344159603117, "step": 38185 }, { "epoch": 0.03846537360967164, "grad_norm": 14.74041618737717, "learning_rate": 3.84646375118345e-05, "loss": 2.6815, "mean_token_accuracy": 0.3862069010734558, "step": 38190 }, { "epoch": 0.03847040966277581, "grad_norm": 15.490033775843019, "learning_rate": 3.8469673468565556e-05, "loss": 2.536, "mean_token_accuracy": 0.4344827592372894, "step": 38195 }, { "epoch": 0.038475445715879984, "grad_norm": 11.799882722567173, "learning_rate": 3.8474709425296616e-05, "loss": 2.3958, "mean_token_accuracy": 0.42068966031074523, "step": 38200 }, { "epoch": 0.03848048176898416, "grad_norm": 19.89391963915202, "learning_rate": 3.847974538202768e-05, "loss": 2.9041, "mean_token_accuracy": 0.38620689511299133, "step": 38205 }, { "epoch": 0.03848551782208833, "grad_norm": 13.99630180117272, "learning_rate": 3.848478133875874e-05, "loss": 2.4836, "mean_token_accuracy": 0.4296430706977844, "step": 38210 }, { "epoch": 0.038490553875192506, "grad_norm": 16.147191691371855, "learning_rate": 3.84898172954898e-05, "loss": 2.4556, "mean_token_accuracy": 0.4068965554237366, "step": 38215 }, { "epoch": 0.03849558992829668, "grad_norm": 14.0017089247059, "learning_rate": 3.849485325222086e-05, "loss": 3.0551, "mean_token_accuracy": 0.3413793116807938, "step": 38220 }, { "epoch": 0.038500625981400846, "grad_norm": 15.105778679561979, "learning_rate": 3.849988920895192e-05, "loss": 2.4309, "mean_token_accuracy": 0.4103448212146759, "step": 38225 }, { "epoch": 0.03850566203450502, "grad_norm": 14.903483998605717, "learning_rate": 3.850492516568298e-05, "loss": 2.1723, "mean_token_accuracy": 0.44137930274009707, "step": 38230 }, { "epoch": 0.038510698087609194, "grad_norm": 15.338867428282827, "learning_rate": 3.850996112241404e-05, "loss": 2.5887, "mean_token_accuracy": 0.3662561535835266, "step": 38235 }, { "epoch": 0.03851573414071337, "grad_norm": 11.924206404533528, "learning_rate": 3.8514997079145097e-05, "loss": 2.186, "mean_token_accuracy": 0.42758620977401735, "step": 38240 }, { "epoch": 0.03852077019381754, "grad_norm": 19.98645033335937, "learning_rate": 3.8520033035876156e-05, "loss": 2.5683, "mean_token_accuracy": 0.3896551787853241, "step": 38245 }, { "epoch": 0.038525806246921715, "grad_norm": 14.201366781945191, "learning_rate": 3.8525068992607215e-05, "loss": 2.7904, "mean_token_accuracy": 0.43103448748588563, "step": 38250 }, { "epoch": 0.03853084230002589, "grad_norm": 13.822016067985869, "learning_rate": 3.8530104949338274e-05, "loss": 2.7829, "mean_token_accuracy": 0.31724137663841245, "step": 38255 }, { "epoch": 0.038535878353130056, "grad_norm": 13.093084251038144, "learning_rate": 3.853514090606934e-05, "loss": 3.0322, "mean_token_accuracy": 0.3206896483898163, "step": 38260 }, { "epoch": 0.03854091440623423, "grad_norm": 12.687391736898395, "learning_rate": 3.85401768628004e-05, "loss": 2.3818, "mean_token_accuracy": 0.42068966031074523, "step": 38265 }, { "epoch": 0.0385459504593384, "grad_norm": 14.614551598448754, "learning_rate": 3.854521281953146e-05, "loss": 2.5805, "mean_token_accuracy": 0.3551724076271057, "step": 38270 }, { "epoch": 0.03855098651244258, "grad_norm": 14.503808518369269, "learning_rate": 3.855024877626252e-05, "loss": 2.6157, "mean_token_accuracy": 0.34482758343219755, "step": 38275 }, { "epoch": 0.03855602256554675, "grad_norm": 21.50399550649558, "learning_rate": 3.855528473299358e-05, "loss": 2.9316, "mean_token_accuracy": 0.35862069129943847, "step": 38280 }, { "epoch": 0.038561058618650924, "grad_norm": 13.787310494450276, "learning_rate": 3.8560320689724637e-05, "loss": 2.8589, "mean_token_accuracy": 0.3482758581638336, "step": 38285 }, { "epoch": 0.0385660946717551, "grad_norm": 21.51656683130556, "learning_rate": 3.8565356646455696e-05, "loss": 2.9416, "mean_token_accuracy": 0.36896551251411436, "step": 38290 }, { "epoch": 0.038571130724859265, "grad_norm": 16.036620720865905, "learning_rate": 3.8570392603186755e-05, "loss": 2.8207, "mean_token_accuracy": 0.38784028887748717, "step": 38295 }, { "epoch": 0.03857616677796344, "grad_norm": 13.414725476163719, "learning_rate": 3.8575428559917814e-05, "loss": 2.8088, "mean_token_accuracy": 0.38275861740112305, "step": 38300 }, { "epoch": 0.03858120283106761, "grad_norm": 13.735236926175432, "learning_rate": 3.8580464516648874e-05, "loss": 2.7301, "mean_token_accuracy": 0.38620689511299133, "step": 38305 }, { "epoch": 0.038586238884171786, "grad_norm": 12.937496294003896, "learning_rate": 3.858550047337993e-05, "loss": 2.6606, "mean_token_accuracy": 0.4, "step": 38310 }, { "epoch": 0.03859127493727596, "grad_norm": 15.802319890417898, "learning_rate": 3.8590536430111e-05, "loss": 2.7987, "mean_token_accuracy": 0.3551724135875702, "step": 38315 }, { "epoch": 0.038596310990380134, "grad_norm": 13.50307738162297, "learning_rate": 3.859557238684206e-05, "loss": 2.4511, "mean_token_accuracy": 0.4482758641242981, "step": 38320 }, { "epoch": 0.03860134704348431, "grad_norm": 15.050724093542225, "learning_rate": 3.860060834357311e-05, "loss": 2.6892, "mean_token_accuracy": 0.3689655065536499, "step": 38325 }, { "epoch": 0.038606383096588474, "grad_norm": 15.342623046038343, "learning_rate": 3.860564430030417e-05, "loss": 2.402, "mean_token_accuracy": 0.4172413766384125, "step": 38330 }, { "epoch": 0.03861141914969265, "grad_norm": 12.287894148491352, "learning_rate": 3.861068025703523e-05, "loss": 2.4966, "mean_token_accuracy": 0.38965516090393065, "step": 38335 }, { "epoch": 0.03861645520279682, "grad_norm": 12.951531416883368, "learning_rate": 3.8615716213766295e-05, "loss": 2.4105, "mean_token_accuracy": 0.4137930989265442, "step": 38340 }, { "epoch": 0.038621491255900996, "grad_norm": 17.00901406882622, "learning_rate": 3.8620752170497354e-05, "loss": 2.5555, "mean_token_accuracy": 0.3517241358757019, "step": 38345 }, { "epoch": 0.03862652730900517, "grad_norm": 15.691564248157894, "learning_rate": 3.8625788127228414e-05, "loss": 2.5077, "mean_token_accuracy": 0.42643678188323975, "step": 38350 }, { "epoch": 0.03863156336210934, "grad_norm": 17.140350545218983, "learning_rate": 3.863082408395947e-05, "loss": 2.6344, "mean_token_accuracy": 0.3896551728248596, "step": 38355 }, { "epoch": 0.03863659941521352, "grad_norm": 13.81575954331368, "learning_rate": 3.863586004069053e-05, "loss": 2.3762, "mean_token_accuracy": 0.4344827592372894, "step": 38360 }, { "epoch": 0.038641635468317684, "grad_norm": 15.708941526422842, "learning_rate": 3.864089599742159e-05, "loss": 2.4715, "mean_token_accuracy": 0.4379310369491577, "step": 38365 }, { "epoch": 0.03864667152142186, "grad_norm": 13.503567091318997, "learning_rate": 3.864593195415265e-05, "loss": 2.4124, "mean_token_accuracy": 0.41379310488700866, "step": 38370 }, { "epoch": 0.03865170757452603, "grad_norm": 14.337966375308937, "learning_rate": 3.865096791088371e-05, "loss": 2.4697, "mean_token_accuracy": 0.3827586233615875, "step": 38375 }, { "epoch": 0.038656743627630205, "grad_norm": 17.012538545446787, "learning_rate": 3.865600386761477e-05, "loss": 2.6393, "mean_token_accuracy": 0.37755596041679385, "step": 38380 }, { "epoch": 0.03866177968073438, "grad_norm": 13.001729921080834, "learning_rate": 3.866103982434583e-05, "loss": 2.6144, "mean_token_accuracy": 0.3931034505367279, "step": 38385 }, { "epoch": 0.03866681573383855, "grad_norm": 17.07054699206022, "learning_rate": 3.866607578107689e-05, "loss": 2.2787, "mean_token_accuracy": 0.42758620381355283, "step": 38390 }, { "epoch": 0.038671851786942726, "grad_norm": 12.111233371820354, "learning_rate": 3.8671111737807954e-05, "loss": 2.7196, "mean_token_accuracy": 0.3241379290819168, "step": 38395 }, { "epoch": 0.03867688784004689, "grad_norm": 12.117422535933377, "learning_rate": 3.867614769453901e-05, "loss": 2.3982, "mean_token_accuracy": 0.4428312063217163, "step": 38400 }, { "epoch": 0.03868192389315107, "grad_norm": 19.069326130729035, "learning_rate": 3.868118365127007e-05, "loss": 2.5136, "mean_token_accuracy": 0.43103447556495667, "step": 38405 }, { "epoch": 0.03868695994625524, "grad_norm": 12.372257511405454, "learning_rate": 3.868621960800113e-05, "loss": 2.5031, "mean_token_accuracy": 0.4241379380226135, "step": 38410 }, { "epoch": 0.038691995999359415, "grad_norm": 16.10223006243153, "learning_rate": 3.8691255564732184e-05, "loss": 2.8531, "mean_token_accuracy": 0.3310344785451889, "step": 38415 }, { "epoch": 0.03869703205246359, "grad_norm": 13.03977126188214, "learning_rate": 3.869629152146325e-05, "loss": 2.5734, "mean_token_accuracy": 0.40344826579093934, "step": 38420 }, { "epoch": 0.03870206810556776, "grad_norm": 13.234773980211228, "learning_rate": 3.870132747819431e-05, "loss": 2.7245, "mean_token_accuracy": 0.3482758641242981, "step": 38425 }, { "epoch": 0.038707104158671936, "grad_norm": 13.895064033941393, "learning_rate": 3.870636343492537e-05, "loss": 2.5547, "mean_token_accuracy": 0.35862069129943847, "step": 38430 }, { "epoch": 0.0387121402117761, "grad_norm": 15.122056472264191, "learning_rate": 3.871139939165643e-05, "loss": 2.5987, "mean_token_accuracy": 0.4188142716884613, "step": 38435 }, { "epoch": 0.038717176264880276, "grad_norm": 13.610495687546567, "learning_rate": 3.871643534838749e-05, "loss": 2.1821, "mean_token_accuracy": 0.4068965554237366, "step": 38440 }, { "epoch": 0.03872221231798445, "grad_norm": 17.14199740429114, "learning_rate": 3.872147130511855e-05, "loss": 2.4604, "mean_token_accuracy": 0.4295220851898193, "step": 38445 }, { "epoch": 0.038727248371088624, "grad_norm": 17.654721704807155, "learning_rate": 3.872650726184961e-05, "loss": 2.7951, "mean_token_accuracy": 0.36206896901130675, "step": 38450 }, { "epoch": 0.0387322844241928, "grad_norm": 12.92812197832319, "learning_rate": 3.873154321858067e-05, "loss": 2.2783, "mean_token_accuracy": 0.42413793206214906, "step": 38455 }, { "epoch": 0.03873732047729697, "grad_norm": 13.32601949336529, "learning_rate": 3.8736579175311724e-05, "loss": 2.613, "mean_token_accuracy": 0.37931033968925476, "step": 38460 }, { "epoch": 0.038742356530401145, "grad_norm": 20.302579900468036, "learning_rate": 3.874161513204278e-05, "loss": 2.9831, "mean_token_accuracy": 0.3896551728248596, "step": 38465 }, { "epoch": 0.03874739258350531, "grad_norm": 19.702416163403022, "learning_rate": 3.874665108877385e-05, "loss": 3.0987, "mean_token_accuracy": 0.324137932062149, "step": 38470 }, { "epoch": 0.038752428636609486, "grad_norm": 15.047604190283767, "learning_rate": 3.875168704550491e-05, "loss": 2.5964, "mean_token_accuracy": 0.41724138259887694, "step": 38475 }, { "epoch": 0.03875746468971366, "grad_norm": 17.461245736093673, "learning_rate": 3.875672300223597e-05, "loss": 2.8901, "mean_token_accuracy": 0.41379311084747317, "step": 38480 }, { "epoch": 0.03876250074281783, "grad_norm": 14.793532148710232, "learning_rate": 3.876175895896703e-05, "loss": 2.8375, "mean_token_accuracy": 0.3965517282485962, "step": 38485 }, { "epoch": 0.03876753679592201, "grad_norm": 13.94459817380488, "learning_rate": 3.8766794915698086e-05, "loss": 2.4298, "mean_token_accuracy": 0.40508166551589964, "step": 38490 }, { "epoch": 0.03877257284902618, "grad_norm": 16.015068346315935, "learning_rate": 3.8771830872429146e-05, "loss": 2.8024, "mean_token_accuracy": 0.37586206793785093, "step": 38495 }, { "epoch": 0.03877760890213035, "grad_norm": 28.052871323987514, "learning_rate": 3.8776866829160205e-05, "loss": 2.8827, "mean_token_accuracy": 0.3655172437429428, "step": 38500 }, { "epoch": 0.03878264495523452, "grad_norm": 14.451302722836752, "learning_rate": 3.8781902785891264e-05, "loss": 2.8041, "mean_token_accuracy": 0.37241379618644715, "step": 38505 }, { "epoch": 0.038787681008338695, "grad_norm": 16.237444178506518, "learning_rate": 3.878693874262232e-05, "loss": 2.6191, "mean_token_accuracy": 0.3413793116807938, "step": 38510 }, { "epoch": 0.03879271706144287, "grad_norm": 13.54112577823189, "learning_rate": 3.879197469935338e-05, "loss": 2.6762, "mean_token_accuracy": 0.39310344457626345, "step": 38515 }, { "epoch": 0.03879775311454704, "grad_norm": 13.254402656175895, "learning_rate": 3.879701065608444e-05, "loss": 2.5798, "mean_token_accuracy": 0.4068965554237366, "step": 38520 }, { "epoch": 0.03880278916765122, "grad_norm": 12.511698364984422, "learning_rate": 3.880204661281551e-05, "loss": 2.683, "mean_token_accuracy": 0.40344826579093934, "step": 38525 }, { "epoch": 0.03880782522075539, "grad_norm": 17.68335189930687, "learning_rate": 3.880708256954657e-05, "loss": 2.6896, "mean_token_accuracy": 0.3793103516101837, "step": 38530 }, { "epoch": 0.03881286127385956, "grad_norm": 13.259262737933796, "learning_rate": 3.8812118526277626e-05, "loss": 2.2989, "mean_token_accuracy": 0.39655172228813174, "step": 38535 }, { "epoch": 0.03881789732696373, "grad_norm": 12.791777285225903, "learning_rate": 3.8817154483008686e-05, "loss": 2.3641, "mean_token_accuracy": 0.40689654350280763, "step": 38540 }, { "epoch": 0.038822933380067905, "grad_norm": 17.316074796847175, "learning_rate": 3.8822190439739745e-05, "loss": 2.7057, "mean_token_accuracy": 0.4206896543502808, "step": 38545 }, { "epoch": 0.03882796943317208, "grad_norm": 20.112405905653954, "learning_rate": 3.8827226396470804e-05, "loss": 2.2278, "mean_token_accuracy": 0.47586206197738645, "step": 38550 }, { "epoch": 0.03883300548627625, "grad_norm": 14.511426742591292, "learning_rate": 3.883226235320186e-05, "loss": 2.9229, "mean_token_accuracy": 0.39443435668945315, "step": 38555 }, { "epoch": 0.038838041539380426, "grad_norm": 15.875742211139539, "learning_rate": 3.883729830993292e-05, "loss": 2.6791, "mean_token_accuracy": 0.35862069129943847, "step": 38560 }, { "epoch": 0.0388430775924846, "grad_norm": 14.829180220728038, "learning_rate": 3.884233426666398e-05, "loss": 2.5868, "mean_token_accuracy": 0.37931033968925476, "step": 38565 }, { "epoch": 0.03884811364558877, "grad_norm": 12.364730572976033, "learning_rate": 3.884737022339504e-05, "loss": 2.3576, "mean_token_accuracy": 0.41034482717514037, "step": 38570 }, { "epoch": 0.03885314969869294, "grad_norm": 14.087908530494651, "learning_rate": 3.88524061801261e-05, "loss": 2.5741, "mean_token_accuracy": 0.37931033968925476, "step": 38575 }, { "epoch": 0.038858185751797114, "grad_norm": 16.571447705206264, "learning_rate": 3.8857442136857166e-05, "loss": 2.336, "mean_token_accuracy": 0.44482757449150084, "step": 38580 }, { "epoch": 0.03886322180490129, "grad_norm": 13.680760293574012, "learning_rate": 3.8862478093588226e-05, "loss": 2.8325, "mean_token_accuracy": 0.36896551251411436, "step": 38585 }, { "epoch": 0.03886825785800546, "grad_norm": 14.890020421915905, "learning_rate": 3.886751405031928e-05, "loss": 2.4543, "mean_token_accuracy": 0.41724138855934145, "step": 38590 }, { "epoch": 0.038873293911109635, "grad_norm": 17.447794546387453, "learning_rate": 3.887255000705034e-05, "loss": 3.1763, "mean_token_accuracy": 0.3793103456497192, "step": 38595 }, { "epoch": 0.03887832996421381, "grad_norm": 12.780789311644094, "learning_rate": 3.88775859637814e-05, "loss": 2.2954, "mean_token_accuracy": 0.3965517282485962, "step": 38600 }, { "epoch": 0.038883366017317976, "grad_norm": 11.908152044690178, "learning_rate": 3.888262192051246e-05, "loss": 2.6285, "mean_token_accuracy": 0.43793103098869324, "step": 38605 }, { "epoch": 0.03888840207042215, "grad_norm": 19.272278434805227, "learning_rate": 3.888765787724352e-05, "loss": 2.7464, "mean_token_accuracy": 0.419252872467041, "step": 38610 }, { "epoch": 0.038893438123526324, "grad_norm": 18.916175007153512, "learning_rate": 3.889269383397458e-05, "loss": 2.334, "mean_token_accuracy": 0.46896551847457885, "step": 38615 }, { "epoch": 0.0388984741766305, "grad_norm": 15.437027348007515, "learning_rate": 3.889772979070564e-05, "loss": 2.6108, "mean_token_accuracy": 0.37241379618644715, "step": 38620 }, { "epoch": 0.03890351022973467, "grad_norm": 14.580554486029213, "learning_rate": 3.89027657474367e-05, "loss": 2.6533, "mean_token_accuracy": 0.3862069010734558, "step": 38625 }, { "epoch": 0.038908546282838845, "grad_norm": 11.135602105148733, "learning_rate": 3.890780170416776e-05, "loss": 2.0744, "mean_token_accuracy": 0.49165154099464414, "step": 38630 }, { "epoch": 0.03891358233594302, "grad_norm": 21.541042813984255, "learning_rate": 3.891283766089882e-05, "loss": 3.0318, "mean_token_accuracy": 0.34137930870056155, "step": 38635 }, { "epoch": 0.038918618389047185, "grad_norm": 13.575323253365715, "learning_rate": 3.891787361762988e-05, "loss": 2.4308, "mean_token_accuracy": 0.4068965554237366, "step": 38640 }, { "epoch": 0.03892365444215136, "grad_norm": 29.619872340594796, "learning_rate": 3.892290957436094e-05, "loss": 2.2155, "mean_token_accuracy": 0.4448275864124298, "step": 38645 }, { "epoch": 0.03892869049525553, "grad_norm": 17.214147013841465, "learning_rate": 3.8927945531091996e-05, "loss": 2.5681, "mean_token_accuracy": 0.4, "step": 38650 }, { "epoch": 0.03893372654835971, "grad_norm": 15.809873276925861, "learning_rate": 3.8932981487823055e-05, "loss": 2.553, "mean_token_accuracy": 0.4034482717514038, "step": 38655 }, { "epoch": 0.03893876260146388, "grad_norm": 19.529083004269495, "learning_rate": 3.893801744455412e-05, "loss": 2.8468, "mean_token_accuracy": 0.3931034475564957, "step": 38660 }, { "epoch": 0.038943798654568054, "grad_norm": 11.687554079836254, "learning_rate": 3.894305340128518e-05, "loss": 2.529, "mean_token_accuracy": 0.43641862869262693, "step": 38665 }, { "epoch": 0.03894883470767223, "grad_norm": 11.91961109614288, "learning_rate": 3.894808935801624e-05, "loss": 2.3753, "mean_token_accuracy": 0.439310348033905, "step": 38670 }, { "epoch": 0.038953870760776395, "grad_norm": 17.282072650647486, "learning_rate": 3.89531253147473e-05, "loss": 2.6406, "mean_token_accuracy": 0.34827586114406583, "step": 38675 }, { "epoch": 0.03895890681388057, "grad_norm": 15.9014575120477, "learning_rate": 3.895816127147836e-05, "loss": 2.5613, "mean_token_accuracy": 0.3551724076271057, "step": 38680 }, { "epoch": 0.03896394286698474, "grad_norm": 16.78874647017567, "learning_rate": 3.896319722820942e-05, "loss": 2.5114, "mean_token_accuracy": 0.42758620977401735, "step": 38685 }, { "epoch": 0.038968978920088916, "grad_norm": 13.132809863819832, "learning_rate": 3.896823318494048e-05, "loss": 2.822, "mean_token_accuracy": 0.3551724076271057, "step": 38690 }, { "epoch": 0.03897401497319309, "grad_norm": 15.495603375488376, "learning_rate": 3.8973269141671536e-05, "loss": 2.5422, "mean_token_accuracy": 0.4310344815254211, "step": 38695 }, { "epoch": 0.038979051026297264, "grad_norm": 19.12854806315473, "learning_rate": 3.8978305098402595e-05, "loss": 2.701, "mean_token_accuracy": 0.3620689570903778, "step": 38700 }, { "epoch": 0.03898408707940144, "grad_norm": 14.376982918016251, "learning_rate": 3.8983341055133654e-05, "loss": 2.4269, "mean_token_accuracy": 0.3965517282485962, "step": 38705 }, { "epoch": 0.038989123132505604, "grad_norm": 15.798646169923915, "learning_rate": 3.898837701186472e-05, "loss": 2.3501, "mean_token_accuracy": 0.4517241358757019, "step": 38710 }, { "epoch": 0.03899415918560978, "grad_norm": 13.119479750589646, "learning_rate": 3.899341296859578e-05, "loss": 2.6241, "mean_token_accuracy": 0.4149425268173218, "step": 38715 }, { "epoch": 0.03899919523871395, "grad_norm": 16.755598681688472, "learning_rate": 3.899844892532684e-05, "loss": 2.6403, "mean_token_accuracy": 0.43103448748588563, "step": 38720 }, { "epoch": 0.039004231291818126, "grad_norm": 12.280810564057813, "learning_rate": 3.900348488205789e-05, "loss": 2.7568, "mean_token_accuracy": 0.34827586114406583, "step": 38725 }, { "epoch": 0.0390092673449223, "grad_norm": 15.87289000272071, "learning_rate": 3.900852083878895e-05, "loss": 2.6659, "mean_token_accuracy": 0.3793103456497192, "step": 38730 }, { "epoch": 0.03901430339802647, "grad_norm": 14.59812564396407, "learning_rate": 3.901355679552001e-05, "loss": 2.6119, "mean_token_accuracy": 0.38620689511299133, "step": 38735 }, { "epoch": 0.03901933945113065, "grad_norm": 13.686110460533895, "learning_rate": 3.9018592752251076e-05, "loss": 2.8186, "mean_token_accuracy": 0.37931033968925476, "step": 38740 }, { "epoch": 0.039024375504234814, "grad_norm": 13.179596708545821, "learning_rate": 3.9023628708982135e-05, "loss": 2.7055, "mean_token_accuracy": 0.40871143341064453, "step": 38745 }, { "epoch": 0.03902941155733899, "grad_norm": 13.61369397773637, "learning_rate": 3.9028664665713195e-05, "loss": 2.5917, "mean_token_accuracy": 0.41379310488700866, "step": 38750 }, { "epoch": 0.03903444761044316, "grad_norm": 13.795705824078269, "learning_rate": 3.9033700622444254e-05, "loss": 2.5287, "mean_token_accuracy": 0.43103447556495667, "step": 38755 }, { "epoch": 0.039039483663547335, "grad_norm": 12.983170848811769, "learning_rate": 3.903873657917531e-05, "loss": 2.5331, "mean_token_accuracy": 0.41379310488700866, "step": 38760 }, { "epoch": 0.03904451971665151, "grad_norm": 17.449488767719775, "learning_rate": 3.904377253590637e-05, "loss": 2.4696, "mean_token_accuracy": 0.4000000059604645, "step": 38765 }, { "epoch": 0.03904955576975568, "grad_norm": 13.248763612637218, "learning_rate": 3.904880849263743e-05, "loss": 2.7131, "mean_token_accuracy": 0.37241379022598264, "step": 38770 }, { "epoch": 0.039054591822859856, "grad_norm": 18.29404768110175, "learning_rate": 3.905384444936849e-05, "loss": 2.8362, "mean_token_accuracy": 0.4172413766384125, "step": 38775 }, { "epoch": 0.03905962787596402, "grad_norm": 16.547322035915307, "learning_rate": 3.905888040609955e-05, "loss": 2.997, "mean_token_accuracy": 0.3551724076271057, "step": 38780 }, { "epoch": 0.0390646639290682, "grad_norm": 12.317070802377199, "learning_rate": 3.906391636283061e-05, "loss": 2.145, "mean_token_accuracy": 0.45862069725990295, "step": 38785 }, { "epoch": 0.03906969998217237, "grad_norm": 17.770034437052445, "learning_rate": 3.9068952319561675e-05, "loss": 3.0065, "mean_token_accuracy": 0.32068965435028074, "step": 38790 }, { "epoch": 0.039074736035276544, "grad_norm": 16.797171795798455, "learning_rate": 3.9073988276292735e-05, "loss": 3.0249, "mean_token_accuracy": 0.36551723480224607, "step": 38795 }, { "epoch": 0.03907977208838072, "grad_norm": 20.22679474979558, "learning_rate": 3.9079024233023794e-05, "loss": 2.7456, "mean_token_accuracy": 0.441379314661026, "step": 38800 }, { "epoch": 0.03908480814148489, "grad_norm": 16.148022244865817, "learning_rate": 3.908406018975485e-05, "loss": 2.4045, "mean_token_accuracy": 0.4068965494632721, "step": 38805 }, { "epoch": 0.039089844194589066, "grad_norm": 13.504311675405004, "learning_rate": 3.908909614648591e-05, "loss": 2.4682, "mean_token_accuracy": 0.4068965554237366, "step": 38810 }, { "epoch": 0.03909488024769323, "grad_norm": 19.056439954903873, "learning_rate": 3.909413210321697e-05, "loss": 2.4998, "mean_token_accuracy": 0.42413793206214906, "step": 38815 }, { "epoch": 0.039099916300797406, "grad_norm": 22.052896919589823, "learning_rate": 3.909916805994803e-05, "loss": 2.618, "mean_token_accuracy": 0.37586207389831544, "step": 38820 }, { "epoch": 0.03910495235390158, "grad_norm": 11.851922815075556, "learning_rate": 3.910420401667909e-05, "loss": 2.2593, "mean_token_accuracy": 0.4482758641242981, "step": 38825 }, { "epoch": 0.039109988407005754, "grad_norm": 19.939857952714195, "learning_rate": 3.910923997341015e-05, "loss": 2.8097, "mean_token_accuracy": 0.3241379290819168, "step": 38830 }, { "epoch": 0.03911502446010993, "grad_norm": 11.604616431471497, "learning_rate": 3.911427593014121e-05, "loss": 2.5012, "mean_token_accuracy": 0.39310344457626345, "step": 38835 }, { "epoch": 0.0391200605132141, "grad_norm": 13.006562505662664, "learning_rate": 3.911931188687227e-05, "loss": 2.4535, "mean_token_accuracy": 0.4172413766384125, "step": 38840 }, { "epoch": 0.039125096566318275, "grad_norm": 17.154948578072254, "learning_rate": 3.9124347843603334e-05, "loss": 2.3754, "mean_token_accuracy": 0.4068965494632721, "step": 38845 }, { "epoch": 0.03913013261942244, "grad_norm": 13.767325408131235, "learning_rate": 3.912938380033439e-05, "loss": 2.566, "mean_token_accuracy": 0.36551723480224607, "step": 38850 }, { "epoch": 0.039135168672526616, "grad_norm": 14.13204342923202, "learning_rate": 3.913441975706545e-05, "loss": 2.6176, "mean_token_accuracy": 0.3999999940395355, "step": 38855 }, { "epoch": 0.03914020472563079, "grad_norm": 23.92223017244613, "learning_rate": 3.9139455713796505e-05, "loss": 2.5893, "mean_token_accuracy": 0.41379310488700866, "step": 38860 }, { "epoch": 0.03914524077873496, "grad_norm": 12.696961875721472, "learning_rate": 3.9144491670527564e-05, "loss": 2.4454, "mean_token_accuracy": 0.43793103098869324, "step": 38865 }, { "epoch": 0.03915027683183914, "grad_norm": 20.84491659432228, "learning_rate": 3.914952762725863e-05, "loss": 2.7846, "mean_token_accuracy": 0.4, "step": 38870 }, { "epoch": 0.03915531288494331, "grad_norm": 17.036519454695824, "learning_rate": 3.915456358398969e-05, "loss": 2.3603, "mean_token_accuracy": 0.412522679567337, "step": 38875 }, { "epoch": 0.039160348938047485, "grad_norm": 14.236087343210915, "learning_rate": 3.915959954072075e-05, "loss": 2.7244, "mean_token_accuracy": 0.42758620977401735, "step": 38880 }, { "epoch": 0.03916538499115165, "grad_norm": 13.960424645657923, "learning_rate": 3.916463549745181e-05, "loss": 2.4017, "mean_token_accuracy": 0.4068965494632721, "step": 38885 }, { "epoch": 0.039170421044255825, "grad_norm": 14.167016912206758, "learning_rate": 3.916967145418287e-05, "loss": 2.5066, "mean_token_accuracy": 0.41034482717514037, "step": 38890 }, { "epoch": 0.03917545709736, "grad_norm": 15.96352251101983, "learning_rate": 3.917470741091393e-05, "loss": 2.4379, "mean_token_accuracy": 0.4103448152542114, "step": 38895 }, { "epoch": 0.03918049315046417, "grad_norm": 16.13910166269604, "learning_rate": 3.9179743367644986e-05, "loss": 2.5056, "mean_token_accuracy": 0.4586206912994385, "step": 38900 }, { "epoch": 0.039185529203568346, "grad_norm": 14.3241108631312, "learning_rate": 3.9184779324376045e-05, "loss": 3.2258, "mean_token_accuracy": 0.31379309892654417, "step": 38905 }, { "epoch": 0.03919056525667252, "grad_norm": 13.893291154046372, "learning_rate": 3.9189815281107104e-05, "loss": 2.6545, "mean_token_accuracy": 0.43998789191246035, "step": 38910 }, { "epoch": 0.039195601309776694, "grad_norm": 38.90380102099931, "learning_rate": 3.9194851237838163e-05, "loss": 2.6143, "mean_token_accuracy": 0.36896551251411436, "step": 38915 }, { "epoch": 0.03920063736288086, "grad_norm": 10.425371685548125, "learning_rate": 3.919988719456922e-05, "loss": 2.4644, "mean_token_accuracy": 0.441379314661026, "step": 38920 }, { "epoch": 0.039205673415985035, "grad_norm": 12.775357601215736, "learning_rate": 3.920492315130029e-05, "loss": 2.5944, "mean_token_accuracy": 0.41724138259887694, "step": 38925 }, { "epoch": 0.03921070946908921, "grad_norm": 13.687075921960881, "learning_rate": 3.920995910803135e-05, "loss": 2.2032, "mean_token_accuracy": 0.482758617401123, "step": 38930 }, { "epoch": 0.03921574552219338, "grad_norm": 15.081052104354697, "learning_rate": 3.921499506476241e-05, "loss": 2.6906, "mean_token_accuracy": 0.36896551251411436, "step": 38935 }, { "epoch": 0.039220781575297556, "grad_norm": 13.717081588147721, "learning_rate": 3.9220031021493466e-05, "loss": 2.7863, "mean_token_accuracy": 0.3827586203813553, "step": 38940 }, { "epoch": 0.03922581762840173, "grad_norm": 13.848264871103488, "learning_rate": 3.9225066978224526e-05, "loss": 2.6957, "mean_token_accuracy": 0.4034482717514038, "step": 38945 }, { "epoch": 0.0392308536815059, "grad_norm": 13.372593277872918, "learning_rate": 3.9230102934955585e-05, "loss": 2.4021, "mean_token_accuracy": 0.39310344457626345, "step": 38950 }, { "epoch": 0.03923588973461007, "grad_norm": 16.369399696426182, "learning_rate": 3.9235138891686644e-05, "loss": 2.4953, "mean_token_accuracy": 0.4310344815254211, "step": 38955 }, { "epoch": 0.039240925787714244, "grad_norm": 17.954263836056256, "learning_rate": 3.9240174848417704e-05, "loss": 2.6452, "mean_token_accuracy": 0.36896551847457887, "step": 38960 }, { "epoch": 0.03924596184081842, "grad_norm": 13.82868095935141, "learning_rate": 3.924521080514876e-05, "loss": 2.702, "mean_token_accuracy": 0.36896551251411436, "step": 38965 }, { "epoch": 0.03925099789392259, "grad_norm": 19.05901643465952, "learning_rate": 3.925024676187982e-05, "loss": 2.6551, "mean_token_accuracy": 0.4068965494632721, "step": 38970 }, { "epoch": 0.039256033947026765, "grad_norm": 22.73567806521048, "learning_rate": 3.925528271861089e-05, "loss": 2.6067, "mean_token_accuracy": 0.41379311084747317, "step": 38975 }, { "epoch": 0.03926107000013094, "grad_norm": 16.425022425374742, "learning_rate": 3.926031867534195e-05, "loss": 2.446, "mean_token_accuracy": 0.43793103098869324, "step": 38980 }, { "epoch": 0.03926610605323511, "grad_norm": 13.713862656993555, "learning_rate": 3.9265354632073007e-05, "loss": 2.4991, "mean_token_accuracy": 0.42758620977401735, "step": 38985 }, { "epoch": 0.03927114210633928, "grad_norm": 13.212786090280394, "learning_rate": 3.9270390588804066e-05, "loss": 2.6537, "mean_token_accuracy": 0.41034482717514037, "step": 38990 }, { "epoch": 0.03927617815944345, "grad_norm": 14.63901908967399, "learning_rate": 3.927542654553512e-05, "loss": 2.9621, "mean_token_accuracy": 0.32068965435028074, "step": 38995 }, { "epoch": 0.03928121421254763, "grad_norm": 13.373920777160192, "learning_rate": 3.928046250226618e-05, "loss": 2.4328, "mean_token_accuracy": 0.4137930989265442, "step": 39000 }, { "epoch": 0.0392862502656518, "grad_norm": 18.92285610909184, "learning_rate": 3.9285498458997244e-05, "loss": 2.5265, "mean_token_accuracy": 0.39655172228813174, "step": 39005 }, { "epoch": 0.039291286318755975, "grad_norm": 12.47323915534808, "learning_rate": 3.92905344157283e-05, "loss": 2.6145, "mean_token_accuracy": 0.38620689511299133, "step": 39010 }, { "epoch": 0.03929632237186015, "grad_norm": 15.928124103717755, "learning_rate": 3.929557037245936e-05, "loss": 2.8476, "mean_token_accuracy": 0.334482753276825, "step": 39015 }, { "epoch": 0.03930135842496432, "grad_norm": 16.586202455818835, "learning_rate": 3.930060632919042e-05, "loss": 2.6401, "mean_token_accuracy": 0.39310343861579894, "step": 39020 }, { "epoch": 0.03930639447806849, "grad_norm": 16.052724379771487, "learning_rate": 3.930564228592148e-05, "loss": 2.5165, "mean_token_accuracy": 0.33103448152542114, "step": 39025 }, { "epoch": 0.03931143053117266, "grad_norm": 16.04497080388729, "learning_rate": 3.9310678242652547e-05, "loss": 2.855, "mean_token_accuracy": 0.39655172228813174, "step": 39030 }, { "epoch": 0.03931646658427684, "grad_norm": 16.88602485430087, "learning_rate": 3.93157141993836e-05, "loss": 2.8025, "mean_token_accuracy": 0.3344827562570572, "step": 39035 }, { "epoch": 0.03932150263738101, "grad_norm": 13.334421150265165, "learning_rate": 3.932075015611466e-05, "loss": 2.7785, "mean_token_accuracy": 0.40852995216846466, "step": 39040 }, { "epoch": 0.039326538690485184, "grad_norm": 13.173590983803653, "learning_rate": 3.932578611284572e-05, "loss": 2.5949, "mean_token_accuracy": 0.39310344457626345, "step": 39045 }, { "epoch": 0.03933157474358936, "grad_norm": 13.155064135401226, "learning_rate": 3.933082206957678e-05, "loss": 2.6883, "mean_token_accuracy": 0.42758620977401735, "step": 39050 }, { "epoch": 0.03933661079669353, "grad_norm": 15.165735716401644, "learning_rate": 3.933585802630784e-05, "loss": 2.6232, "mean_token_accuracy": 0.34137930274009703, "step": 39055 }, { "epoch": 0.0393416468497977, "grad_norm": 11.886119822368393, "learning_rate": 3.93408939830389e-05, "loss": 2.8737, "mean_token_accuracy": 0.3551724135875702, "step": 39060 }, { "epoch": 0.03934668290290187, "grad_norm": 12.350236250893216, "learning_rate": 3.934592993976996e-05, "loss": 2.5434, "mean_token_accuracy": 0.42413793206214906, "step": 39065 }, { "epoch": 0.039351718956006046, "grad_norm": 13.604534412022453, "learning_rate": 3.935096589650102e-05, "loss": 2.579, "mean_token_accuracy": 0.4448275864124298, "step": 39070 }, { "epoch": 0.03935675500911022, "grad_norm": 17.637371062875392, "learning_rate": 3.935600185323208e-05, "loss": 2.4638, "mean_token_accuracy": 0.4, "step": 39075 }, { "epoch": 0.039361791062214393, "grad_norm": 14.819936343905479, "learning_rate": 3.936103780996314e-05, "loss": 2.6425, "mean_token_accuracy": 0.35862069129943847, "step": 39080 }, { "epoch": 0.03936682711531857, "grad_norm": 20.073671155662876, "learning_rate": 3.93660737666942e-05, "loss": 2.799, "mean_token_accuracy": 0.37410768270492556, "step": 39085 }, { "epoch": 0.03937186316842274, "grad_norm": 16.782005443273068, "learning_rate": 3.937110972342526e-05, "loss": 2.3635, "mean_token_accuracy": 0.41379310488700866, "step": 39090 }, { "epoch": 0.03937689922152691, "grad_norm": 13.326606143731963, "learning_rate": 3.937614568015632e-05, "loss": 2.8715, "mean_token_accuracy": 0.358620685338974, "step": 39095 }, { "epoch": 0.03938193527463108, "grad_norm": 17.283200358835902, "learning_rate": 3.9381181636887376e-05, "loss": 2.7853, "mean_token_accuracy": 0.36364186406135557, "step": 39100 }, { "epoch": 0.039386971327735255, "grad_norm": 12.977211605237093, "learning_rate": 3.9386217593618435e-05, "loss": 2.7324, "mean_token_accuracy": 0.3827586114406586, "step": 39105 }, { "epoch": 0.03939200738083943, "grad_norm": 12.916576388942303, "learning_rate": 3.93912535503495e-05, "loss": 2.7038, "mean_token_accuracy": 0.3551724076271057, "step": 39110 }, { "epoch": 0.0393970434339436, "grad_norm": 17.86370132021268, "learning_rate": 3.939628950708056e-05, "loss": 3.2147, "mean_token_accuracy": 0.36896551251411436, "step": 39115 }, { "epoch": 0.03940207948704778, "grad_norm": 13.045852787930244, "learning_rate": 3.940132546381162e-05, "loss": 2.7357, "mean_token_accuracy": 0.36551723480224607, "step": 39120 }, { "epoch": 0.03940711554015195, "grad_norm": 15.757778651201477, "learning_rate": 3.940636142054267e-05, "loss": 2.2894, "mean_token_accuracy": 0.3931034505367279, "step": 39125 }, { "epoch": 0.03941215159325612, "grad_norm": 16.8276276516325, "learning_rate": 3.941139737727373e-05, "loss": 2.9455, "mean_token_accuracy": 0.3655172407627106, "step": 39130 }, { "epoch": 0.03941718764636029, "grad_norm": 14.071941602518693, "learning_rate": 3.94164333340048e-05, "loss": 2.4264, "mean_token_accuracy": 0.4329703629016876, "step": 39135 }, { "epoch": 0.039422223699464465, "grad_norm": 17.58594611345993, "learning_rate": 3.942146929073586e-05, "loss": 2.6935, "mean_token_accuracy": 0.4034482777118683, "step": 39140 }, { "epoch": 0.03942725975256864, "grad_norm": 16.02610329655536, "learning_rate": 3.9426505247466916e-05, "loss": 2.5236, "mean_token_accuracy": 0.39310345649719236, "step": 39145 }, { "epoch": 0.03943229580567281, "grad_norm": 11.940800327529, "learning_rate": 3.9431541204197975e-05, "loss": 2.2514, "mean_token_accuracy": 0.4814881980419159, "step": 39150 }, { "epoch": 0.039437331858776986, "grad_norm": 16.033471299773147, "learning_rate": 3.9436577160929035e-05, "loss": 2.5791, "mean_token_accuracy": 0.4256503343582153, "step": 39155 }, { "epoch": 0.03944236791188116, "grad_norm": 14.03174889257257, "learning_rate": 3.9441613117660094e-05, "loss": 2.8602, "mean_token_accuracy": 0.37241379022598264, "step": 39160 }, { "epoch": 0.03944740396498533, "grad_norm": 13.625744841911843, "learning_rate": 3.944664907439115e-05, "loss": 2.3798, "mean_token_accuracy": 0.4676950931549072, "step": 39165 }, { "epoch": 0.0394524400180895, "grad_norm": 13.660925290153212, "learning_rate": 3.945168503112221e-05, "loss": 2.6829, "mean_token_accuracy": 0.3896551728248596, "step": 39170 }, { "epoch": 0.039457476071193674, "grad_norm": 17.056419438956265, "learning_rate": 3.945672098785327e-05, "loss": 2.7271, "mean_token_accuracy": 0.36896551847457887, "step": 39175 }, { "epoch": 0.03946251212429785, "grad_norm": 13.605546504044312, "learning_rate": 3.946175694458433e-05, "loss": 2.2995, "mean_token_accuracy": 0.4482758641242981, "step": 39180 }, { "epoch": 0.03946754817740202, "grad_norm": 14.499820582181188, "learning_rate": 3.946679290131539e-05, "loss": 2.6638, "mean_token_accuracy": 0.4103448331356049, "step": 39185 }, { "epoch": 0.039472584230506196, "grad_norm": 16.687328248807603, "learning_rate": 3.9471828858046456e-05, "loss": 2.6975, "mean_token_accuracy": 0.36551724672317504, "step": 39190 }, { "epoch": 0.03947762028361037, "grad_norm": 13.185617452566404, "learning_rate": 3.9476864814777515e-05, "loss": 2.3954, "mean_token_accuracy": 0.46551724672317507, "step": 39195 }, { "epoch": 0.039482656336714536, "grad_norm": 13.915040673167244, "learning_rate": 3.9481900771508575e-05, "loss": 2.3264, "mean_token_accuracy": 0.42413793206214906, "step": 39200 }, { "epoch": 0.03948769238981871, "grad_norm": 14.276362012705954, "learning_rate": 3.9486936728239634e-05, "loss": 2.5525, "mean_token_accuracy": 0.38965518176555636, "step": 39205 }, { "epoch": 0.039492728442922884, "grad_norm": 12.361969557379272, "learning_rate": 3.949197268497069e-05, "loss": 2.5832, "mean_token_accuracy": 0.4366606116294861, "step": 39210 }, { "epoch": 0.03949776449602706, "grad_norm": 19.32632648050562, "learning_rate": 3.949700864170175e-05, "loss": 2.5092, "mean_token_accuracy": 0.4344827592372894, "step": 39215 }, { "epoch": 0.03950280054913123, "grad_norm": 20.43910138073725, "learning_rate": 3.950204459843281e-05, "loss": 2.8964, "mean_token_accuracy": 0.3793103456497192, "step": 39220 }, { "epoch": 0.039507836602235405, "grad_norm": 25.823660481786103, "learning_rate": 3.950708055516387e-05, "loss": 2.8454, "mean_token_accuracy": 0.3482758581638336, "step": 39225 }, { "epoch": 0.03951287265533958, "grad_norm": 13.509988081653239, "learning_rate": 3.951211651189493e-05, "loss": 2.3458, "mean_token_accuracy": 0.44827585220336913, "step": 39230 }, { "epoch": 0.039517908708443746, "grad_norm": 14.843754017966694, "learning_rate": 3.951715246862599e-05, "loss": 2.5759, "mean_token_accuracy": 0.4034482717514038, "step": 39235 }, { "epoch": 0.03952294476154792, "grad_norm": 18.787312345872902, "learning_rate": 3.952218842535705e-05, "loss": 2.906, "mean_token_accuracy": 0.3931034505367279, "step": 39240 }, { "epoch": 0.03952798081465209, "grad_norm": 16.082131899332857, "learning_rate": 3.9527224382088115e-05, "loss": 2.7071, "mean_token_accuracy": 0.3862069010734558, "step": 39245 }, { "epoch": 0.03953301686775627, "grad_norm": 16.29557245253997, "learning_rate": 3.9532260338819174e-05, "loss": 2.9652, "mean_token_accuracy": 0.36896551847457887, "step": 39250 }, { "epoch": 0.03953805292086044, "grad_norm": 18.533207059767904, "learning_rate": 3.953729629555023e-05, "loss": 2.3665, "mean_token_accuracy": 0.41724138259887694, "step": 39255 }, { "epoch": 0.039543088973964614, "grad_norm": 10.727089125913102, "learning_rate": 3.9542332252281286e-05, "loss": 2.5451, "mean_token_accuracy": 0.37241379022598264, "step": 39260 }, { "epoch": 0.03954812502706879, "grad_norm": 12.924395723644993, "learning_rate": 3.9547368209012345e-05, "loss": 2.3814, "mean_token_accuracy": 0.4068965554237366, "step": 39265 }, { "epoch": 0.039553161080172955, "grad_norm": 11.500845381146906, "learning_rate": 3.955240416574341e-05, "loss": 2.1867, "mean_token_accuracy": 0.4689655125141144, "step": 39270 }, { "epoch": 0.03955819713327713, "grad_norm": 11.441086676486723, "learning_rate": 3.955744012247447e-05, "loss": 2.2728, "mean_token_accuracy": 0.4402298927307129, "step": 39275 }, { "epoch": 0.0395632331863813, "grad_norm": 18.25151814645436, "learning_rate": 3.956247607920553e-05, "loss": 2.6677, "mean_token_accuracy": 0.4103448212146759, "step": 39280 }, { "epoch": 0.039568269239485476, "grad_norm": 12.327168727445336, "learning_rate": 3.956751203593659e-05, "loss": 2.4921, "mean_token_accuracy": 0.42413793206214906, "step": 39285 }, { "epoch": 0.03957330529258965, "grad_norm": 13.901024123853642, "learning_rate": 3.957254799266765e-05, "loss": 3.0384, "mean_token_accuracy": 0.3517241388559341, "step": 39290 }, { "epoch": 0.039578341345693824, "grad_norm": 16.37573327854223, "learning_rate": 3.9577583949398714e-05, "loss": 2.5583, "mean_token_accuracy": 0.38275861740112305, "step": 39295 }, { "epoch": 0.039583377398798, "grad_norm": 14.923322933260472, "learning_rate": 3.9582619906129767e-05, "loss": 2.4811, "mean_token_accuracy": 0.3896551728248596, "step": 39300 }, { "epoch": 0.039588413451902164, "grad_norm": 15.537843465630097, "learning_rate": 3.9587655862860826e-05, "loss": 2.7747, "mean_token_accuracy": 0.36896551251411436, "step": 39305 }, { "epoch": 0.03959344950500634, "grad_norm": 15.14466450252084, "learning_rate": 3.9592691819591885e-05, "loss": 2.7147, "mean_token_accuracy": 0.36551723480224607, "step": 39310 }, { "epoch": 0.03959848555811051, "grad_norm": 13.217036898690914, "learning_rate": 3.9597727776322944e-05, "loss": 2.5231, "mean_token_accuracy": 0.38965516686439516, "step": 39315 }, { "epoch": 0.039603521611214686, "grad_norm": 20.50421717083436, "learning_rate": 3.960276373305401e-05, "loss": 2.9057, "mean_token_accuracy": 0.3896551728248596, "step": 39320 }, { "epoch": 0.03960855766431886, "grad_norm": 13.158844564619992, "learning_rate": 3.960779968978507e-05, "loss": 2.8296, "mean_token_accuracy": 0.36551723480224607, "step": 39325 }, { "epoch": 0.03961359371742303, "grad_norm": 14.51961099041043, "learning_rate": 3.961283564651613e-05, "loss": 2.3579, "mean_token_accuracy": 0.41724138259887694, "step": 39330 }, { "epoch": 0.03961862977052721, "grad_norm": 12.246682889074927, "learning_rate": 3.961787160324719e-05, "loss": 2.4549, "mean_token_accuracy": 0.4068965494632721, "step": 39335 }, { "epoch": 0.039623665823631374, "grad_norm": 18.642027729901674, "learning_rate": 3.962290755997825e-05, "loss": 2.538, "mean_token_accuracy": 0.3827586233615875, "step": 39340 }, { "epoch": 0.03962870187673555, "grad_norm": 14.764659301778105, "learning_rate": 3.962794351670931e-05, "loss": 2.5658, "mean_token_accuracy": 0.4241379201412201, "step": 39345 }, { "epoch": 0.03963373792983972, "grad_norm": 15.888077346363804, "learning_rate": 3.9632979473440366e-05, "loss": 2.7151, "mean_token_accuracy": 0.4413793087005615, "step": 39350 }, { "epoch": 0.039638773982943895, "grad_norm": 13.022821189394236, "learning_rate": 3.9638015430171425e-05, "loss": 2.4815, "mean_token_accuracy": 0.4223835408687592, "step": 39355 }, { "epoch": 0.03964381003604807, "grad_norm": 17.24888371910432, "learning_rate": 3.9643051386902484e-05, "loss": 2.5759, "mean_token_accuracy": 0.417241370677948, "step": 39360 }, { "epoch": 0.03964884608915224, "grad_norm": 14.778974633246731, "learning_rate": 3.9648087343633544e-05, "loss": 2.2983, "mean_token_accuracy": 0.4620689570903778, "step": 39365 }, { "epoch": 0.039653882142256416, "grad_norm": 18.294872011754546, "learning_rate": 3.96531233003646e-05, "loss": 2.9639, "mean_token_accuracy": 0.4103448331356049, "step": 39370 }, { "epoch": 0.03965891819536058, "grad_norm": 15.05311614354758, "learning_rate": 3.965815925709567e-05, "loss": 2.7486, "mean_token_accuracy": 0.3586206942796707, "step": 39375 }, { "epoch": 0.03966395424846476, "grad_norm": 13.28741748287684, "learning_rate": 3.966319521382673e-05, "loss": 2.5122, "mean_token_accuracy": 0.4517241358757019, "step": 39380 }, { "epoch": 0.03966899030156893, "grad_norm": 12.48948293722184, "learning_rate": 3.966823117055779e-05, "loss": 2.3586, "mean_token_accuracy": 0.4034482717514038, "step": 39385 }, { "epoch": 0.039674026354673105, "grad_norm": 11.478660292164042, "learning_rate": 3.967326712728885e-05, "loss": 2.5666, "mean_token_accuracy": 0.3862068891525269, "step": 39390 }, { "epoch": 0.03967906240777728, "grad_norm": 151.94760866317085, "learning_rate": 3.96783030840199e-05, "loss": 2.7741, "mean_token_accuracy": 0.3793103456497192, "step": 39395 }, { "epoch": 0.03968409846088145, "grad_norm": 17.343988304911704, "learning_rate": 3.9683339040750965e-05, "loss": 2.5792, "mean_token_accuracy": 0.42068966031074523, "step": 39400 }, { "epoch": 0.039689134513985626, "grad_norm": 12.411400232651037, "learning_rate": 3.9688374997482024e-05, "loss": 2.3402, "mean_token_accuracy": 0.43103448748588563, "step": 39405 }, { "epoch": 0.03969417056708979, "grad_norm": 13.451871682944464, "learning_rate": 3.9693410954213084e-05, "loss": 2.7724, "mean_token_accuracy": 0.37241379022598264, "step": 39410 }, { "epoch": 0.039699206620193966, "grad_norm": 13.462184975519026, "learning_rate": 3.969844691094414e-05, "loss": 2.4708, "mean_token_accuracy": 0.4137930989265442, "step": 39415 }, { "epoch": 0.03970424267329814, "grad_norm": 13.412206325538921, "learning_rate": 3.97034828676752e-05, "loss": 2.6791, "mean_token_accuracy": 0.3793103456497192, "step": 39420 }, { "epoch": 0.039709278726402314, "grad_norm": 13.708919603103682, "learning_rate": 3.970851882440626e-05, "loss": 2.4917, "mean_token_accuracy": 0.4000000059604645, "step": 39425 }, { "epoch": 0.03971431477950649, "grad_norm": 15.09815940955768, "learning_rate": 3.971355478113733e-05, "loss": 2.5872, "mean_token_accuracy": 0.40689654350280763, "step": 39430 }, { "epoch": 0.03971935083261066, "grad_norm": 16.309432701949717, "learning_rate": 3.971859073786838e-05, "loss": 2.7296, "mean_token_accuracy": 0.3586206793785095, "step": 39435 }, { "epoch": 0.039724386885714835, "grad_norm": 13.121613910149232, "learning_rate": 3.972362669459944e-05, "loss": 2.4803, "mean_token_accuracy": 0.43793103098869324, "step": 39440 }, { "epoch": 0.039729422938819, "grad_norm": 14.001426772261661, "learning_rate": 3.97286626513305e-05, "loss": 2.4129, "mean_token_accuracy": 0.3999999940395355, "step": 39445 }, { "epoch": 0.039734458991923176, "grad_norm": 12.247200506466168, "learning_rate": 3.973369860806156e-05, "loss": 2.6323, "mean_token_accuracy": 0.37586206793785093, "step": 39450 }, { "epoch": 0.03973949504502735, "grad_norm": 17.205158538958848, "learning_rate": 3.9738734564792624e-05, "loss": 2.4117, "mean_token_accuracy": 0.4068965494632721, "step": 39455 }, { "epoch": 0.03974453109813152, "grad_norm": 12.588708079672095, "learning_rate": 3.974377052152368e-05, "loss": 2.4337, "mean_token_accuracy": 0.4157894730567932, "step": 39460 }, { "epoch": 0.0397495671512357, "grad_norm": 15.630111012201981, "learning_rate": 3.974880647825474e-05, "loss": 2.7837, "mean_token_accuracy": 0.37241379618644715, "step": 39465 }, { "epoch": 0.03975460320433987, "grad_norm": 31.02993385507879, "learning_rate": 3.97538424349858e-05, "loss": 2.5506, "mean_token_accuracy": 0.39729064106941225, "step": 39470 }, { "epoch": 0.039759639257444045, "grad_norm": 12.081723926618887, "learning_rate": 3.975887839171686e-05, "loss": 2.4648, "mean_token_accuracy": 0.4103448331356049, "step": 39475 }, { "epoch": 0.03976467531054821, "grad_norm": 13.35006092217864, "learning_rate": 3.976391434844792e-05, "loss": 2.7746, "mean_token_accuracy": 0.37586206793785093, "step": 39480 }, { "epoch": 0.039769711363652385, "grad_norm": 12.747042574499629, "learning_rate": 3.976895030517898e-05, "loss": 2.1216, "mean_token_accuracy": 0.47749547362327577, "step": 39485 }, { "epoch": 0.03977474741675656, "grad_norm": 13.063347868763474, "learning_rate": 3.977398626191004e-05, "loss": 2.562, "mean_token_accuracy": 0.39310344457626345, "step": 39490 }, { "epoch": 0.03977978346986073, "grad_norm": 16.274363635852616, "learning_rate": 3.97790222186411e-05, "loss": 2.9617, "mean_token_accuracy": 0.33793103098869326, "step": 39495 }, { "epoch": 0.03978481952296491, "grad_norm": 18.248884440205874, "learning_rate": 3.978405817537216e-05, "loss": 2.9764, "mean_token_accuracy": 0.3827586233615875, "step": 39500 }, { "epoch": 0.03978985557606908, "grad_norm": 22.20417504610283, "learning_rate": 3.9789094132103216e-05, "loss": 2.7394, "mean_token_accuracy": 0.38620689511299133, "step": 39505 }, { "epoch": 0.039794891629173254, "grad_norm": 14.698414093180439, "learning_rate": 3.979413008883428e-05, "loss": 3.0379, "mean_token_accuracy": 0.33103448152542114, "step": 39510 }, { "epoch": 0.03979992768227742, "grad_norm": 10.820874692897485, "learning_rate": 3.979916604556534e-05, "loss": 2.25, "mean_token_accuracy": 0.4448275864124298, "step": 39515 }, { "epoch": 0.039804963735381595, "grad_norm": 16.08883257284815, "learning_rate": 3.98042020022964e-05, "loss": 2.6267, "mean_token_accuracy": 0.37241379022598264, "step": 39520 }, { "epoch": 0.03980999978848577, "grad_norm": 17.108290393163657, "learning_rate": 3.980923795902745e-05, "loss": 2.5097, "mean_token_accuracy": 0.41379310488700866, "step": 39525 }, { "epoch": 0.03981503584158994, "grad_norm": 12.990902019388828, "learning_rate": 3.981427391575851e-05, "loss": 2.7497, "mean_token_accuracy": 0.3482758581638336, "step": 39530 }, { "epoch": 0.039820071894694116, "grad_norm": 18.301559410203318, "learning_rate": 3.981930987248958e-05, "loss": 2.6243, "mean_token_accuracy": 0.3620689630508423, "step": 39535 }, { "epoch": 0.03982510794779829, "grad_norm": 15.60913427188001, "learning_rate": 3.982434582922064e-05, "loss": 2.6539, "mean_token_accuracy": 0.43103448748588563, "step": 39540 }, { "epoch": 0.039830144000902463, "grad_norm": 14.059613663406635, "learning_rate": 3.98293817859517e-05, "loss": 2.3813, "mean_token_accuracy": 0.41905626058578493, "step": 39545 }, { "epoch": 0.03983518005400663, "grad_norm": 14.835932478956138, "learning_rate": 3.9834417742682756e-05, "loss": 2.6615, "mean_token_accuracy": 0.39310345649719236, "step": 39550 }, { "epoch": 0.039840216107110804, "grad_norm": 18.976964715379268, "learning_rate": 3.9839453699413816e-05, "loss": 2.8048, "mean_token_accuracy": 0.37586206793785093, "step": 39555 }, { "epoch": 0.03984525216021498, "grad_norm": 13.570990325679105, "learning_rate": 3.984448965614488e-05, "loss": 2.7024, "mean_token_accuracy": 0.4, "step": 39560 }, { "epoch": 0.03985028821331915, "grad_norm": 15.49435482012221, "learning_rate": 3.984952561287594e-05, "loss": 2.8828, "mean_token_accuracy": 0.37241379022598264, "step": 39565 }, { "epoch": 0.039855324266423325, "grad_norm": 17.347070954245464, "learning_rate": 3.985456156960699e-05, "loss": 2.9726, "mean_token_accuracy": 0.36896551251411436, "step": 39570 }, { "epoch": 0.0398603603195275, "grad_norm": 15.129222708713614, "learning_rate": 3.985959752633805e-05, "loss": 2.7856, "mean_token_accuracy": 0.36896551847457887, "step": 39575 }, { "epoch": 0.03986539637263167, "grad_norm": 16.59690027805056, "learning_rate": 3.986463348306911e-05, "loss": 2.9819, "mean_token_accuracy": 0.38275861740112305, "step": 39580 }, { "epoch": 0.03987043242573584, "grad_norm": 12.391903664635596, "learning_rate": 3.986966943980017e-05, "loss": 2.3139, "mean_token_accuracy": 0.41034482717514037, "step": 39585 }, { "epoch": 0.039875468478840013, "grad_norm": 16.0721734222642, "learning_rate": 3.987470539653124e-05, "loss": 2.6243, "mean_token_accuracy": 0.41379310488700866, "step": 39590 }, { "epoch": 0.03988050453194419, "grad_norm": 14.767124030520424, "learning_rate": 3.9879741353262296e-05, "loss": 2.7639, "mean_token_accuracy": 0.3793103456497192, "step": 39595 }, { "epoch": 0.03988554058504836, "grad_norm": 12.71939357137036, "learning_rate": 3.9884777309993356e-05, "loss": 2.6112, "mean_token_accuracy": 0.4103448331356049, "step": 39600 }, { "epoch": 0.039890576638152535, "grad_norm": 19.177204748747915, "learning_rate": 3.9889813266724415e-05, "loss": 2.7197, "mean_token_accuracy": 0.3827586203813553, "step": 39605 }, { "epoch": 0.03989561269125671, "grad_norm": 14.171236057807544, "learning_rate": 3.9894849223455474e-05, "loss": 2.7357, "mean_token_accuracy": 0.37011494636535647, "step": 39610 }, { "epoch": 0.03990064874436088, "grad_norm": 13.365838389479187, "learning_rate": 3.9899885180186533e-05, "loss": 2.7664, "mean_token_accuracy": 0.38620689809322356, "step": 39615 }, { "epoch": 0.03990568479746505, "grad_norm": 15.91503837211013, "learning_rate": 3.990492113691759e-05, "loss": 2.2819, "mean_token_accuracy": 0.43103448748588563, "step": 39620 }, { "epoch": 0.03991072085056922, "grad_norm": 21.21639858593874, "learning_rate": 3.990995709364865e-05, "loss": 2.9169, "mean_token_accuracy": 0.3741681814193726, "step": 39625 }, { "epoch": 0.0399157569036734, "grad_norm": 15.074685093765835, "learning_rate": 3.991499305037971e-05, "loss": 2.5945, "mean_token_accuracy": 0.3862069010734558, "step": 39630 }, { "epoch": 0.03992079295677757, "grad_norm": 11.82394670560778, "learning_rate": 3.992002900711077e-05, "loss": 2.7318, "mean_token_accuracy": 0.3551724076271057, "step": 39635 }, { "epoch": 0.039925829009881744, "grad_norm": 34.84472649110662, "learning_rate": 3.9925064963841836e-05, "loss": 2.5183, "mean_token_accuracy": 0.3862069010734558, "step": 39640 }, { "epoch": 0.03993086506298592, "grad_norm": 14.963263901484002, "learning_rate": 3.9930100920572896e-05, "loss": 2.5927, "mean_token_accuracy": 0.38275861740112305, "step": 39645 }, { "epoch": 0.03993590111609009, "grad_norm": 15.574997832659685, "learning_rate": 3.9935136877303955e-05, "loss": 2.3676, "mean_token_accuracy": 0.4503327250480652, "step": 39650 }, { "epoch": 0.03994093716919426, "grad_norm": 13.07420189884145, "learning_rate": 3.9940172834035014e-05, "loss": 2.7193, "mean_token_accuracy": 0.35862069129943847, "step": 39655 }, { "epoch": 0.03994597322229843, "grad_norm": 15.804916855015218, "learning_rate": 3.994520879076607e-05, "loss": 2.6825, "mean_token_accuracy": 0.3931034505367279, "step": 39660 }, { "epoch": 0.039951009275402606, "grad_norm": 13.905909818659488, "learning_rate": 3.9950244747497126e-05, "loss": 2.3118, "mean_token_accuracy": 0.41379310488700866, "step": 39665 }, { "epoch": 0.03995604532850678, "grad_norm": 18.204565726185997, "learning_rate": 3.995528070422819e-05, "loss": 2.2973, "mean_token_accuracy": 0.4206896543502808, "step": 39670 }, { "epoch": 0.039961081381610954, "grad_norm": 12.825762135670029, "learning_rate": 3.996031666095925e-05, "loss": 2.9628, "mean_token_accuracy": 0.3793103456497192, "step": 39675 }, { "epoch": 0.03996611743471513, "grad_norm": 17.326167533405084, "learning_rate": 3.996535261769031e-05, "loss": 2.8315, "mean_token_accuracy": 0.37241379618644715, "step": 39680 }, { "epoch": 0.0399711534878193, "grad_norm": 15.953521243193723, "learning_rate": 3.997038857442137e-05, "loss": 2.9432, "mean_token_accuracy": 0.36896551251411436, "step": 39685 }, { "epoch": 0.03997618954092347, "grad_norm": 13.293369844590192, "learning_rate": 3.997542453115243e-05, "loss": 2.6419, "mean_token_accuracy": 0.3896551728248596, "step": 39690 }, { "epoch": 0.03998122559402764, "grad_norm": 12.508295016875014, "learning_rate": 3.9980460487883495e-05, "loss": 2.1332, "mean_token_accuracy": 0.49258058667182925, "step": 39695 }, { "epoch": 0.039986261647131816, "grad_norm": 12.979883233880802, "learning_rate": 3.998549644461455e-05, "loss": 2.2111, "mean_token_accuracy": 0.4241379380226135, "step": 39700 }, { "epoch": 0.03999129770023599, "grad_norm": 13.612385737416012, "learning_rate": 3.999053240134561e-05, "loss": 2.7444, "mean_token_accuracy": 0.38965516686439516, "step": 39705 }, { "epoch": 0.03999633375334016, "grad_norm": 19.859392133279393, "learning_rate": 3.9995568358076666e-05, "loss": 2.7312, "mean_token_accuracy": 0.37586207389831544, "step": 39710 }, { "epoch": 0.04000136980644434, "grad_norm": 13.474925896107422, "learning_rate": 4.0000604314807725e-05, "loss": 2.3764, "mean_token_accuracy": 0.43103447556495667, "step": 39715 }, { "epoch": 0.04000640585954851, "grad_norm": 14.251108773177421, "learning_rate": 4.000564027153879e-05, "loss": 2.3792, "mean_token_accuracy": 0.45517241954803467, "step": 39720 }, { "epoch": 0.04001144191265268, "grad_norm": 13.369294854956598, "learning_rate": 4.001067622826985e-05, "loss": 2.3234, "mean_token_accuracy": 0.5022383630275726, "step": 39725 }, { "epoch": 0.04001647796575685, "grad_norm": 29.233679026782127, "learning_rate": 4.001571218500091e-05, "loss": 2.6879, "mean_token_accuracy": 0.4551724076271057, "step": 39730 }, { "epoch": 0.040021514018861025, "grad_norm": 12.945092100152676, "learning_rate": 4.002074814173197e-05, "loss": 2.5479, "mean_token_accuracy": 0.42068966031074523, "step": 39735 }, { "epoch": 0.0400265500719652, "grad_norm": 17.017701734706634, "learning_rate": 4.002578409846303e-05, "loss": 2.4442, "mean_token_accuracy": 0.4517241358757019, "step": 39740 }, { "epoch": 0.04003158612506937, "grad_norm": 14.541000991914617, "learning_rate": 4.003082005519409e-05, "loss": 2.3415, "mean_token_accuracy": 0.4068965554237366, "step": 39745 }, { "epoch": 0.040036622178173546, "grad_norm": 12.55519627141762, "learning_rate": 4.003585601192515e-05, "loss": 2.5065, "mean_token_accuracy": 0.4, "step": 39750 }, { "epoch": 0.04004165823127772, "grad_norm": 12.702581169156028, "learning_rate": 4.0040891968656206e-05, "loss": 2.1775, "mean_token_accuracy": 0.4551724076271057, "step": 39755 }, { "epoch": 0.04004669428438189, "grad_norm": 12.78524961888808, "learning_rate": 4.0045927925387265e-05, "loss": 2.8534, "mean_token_accuracy": 0.3931034505367279, "step": 39760 }, { "epoch": 0.04005173033748606, "grad_norm": 13.276735221617152, "learning_rate": 4.0050963882118325e-05, "loss": 2.3624, "mean_token_accuracy": 0.42068965137004855, "step": 39765 }, { "epoch": 0.040056766390590234, "grad_norm": 12.696455481803552, "learning_rate": 4.0055999838849384e-05, "loss": 2.522, "mean_token_accuracy": 0.4034482717514038, "step": 39770 }, { "epoch": 0.04006180244369441, "grad_norm": 14.957941700938411, "learning_rate": 4.006103579558045e-05, "loss": 2.3928, "mean_token_accuracy": 0.41379310488700866, "step": 39775 }, { "epoch": 0.04006683849679858, "grad_norm": 21.41062825229992, "learning_rate": 4.006607175231151e-05, "loss": 2.8774, "mean_token_accuracy": 0.3999999940395355, "step": 39780 }, { "epoch": 0.040071874549902756, "grad_norm": 15.370285361584779, "learning_rate": 4.007110770904257e-05, "loss": 2.5028, "mean_token_accuracy": 0.4461822688579559, "step": 39785 }, { "epoch": 0.04007691060300693, "grad_norm": 14.842441323552666, "learning_rate": 4.007614366577363e-05, "loss": 2.5884, "mean_token_accuracy": 0.3999999940395355, "step": 39790 }, { "epoch": 0.040081946656111096, "grad_norm": 16.651649652264293, "learning_rate": 4.008117962250468e-05, "loss": 2.5498, "mean_token_accuracy": 0.4016938954591751, "step": 39795 }, { "epoch": 0.04008698270921527, "grad_norm": 12.660773969921829, "learning_rate": 4.0086215579235746e-05, "loss": 2.1226, "mean_token_accuracy": 0.4676950991153717, "step": 39800 }, { "epoch": 0.040092018762319444, "grad_norm": 12.70686696694512, "learning_rate": 4.0091251535966805e-05, "loss": 2.2087, "mean_token_accuracy": 0.4517241299152374, "step": 39805 }, { "epoch": 0.04009705481542362, "grad_norm": 16.460759926264725, "learning_rate": 4.0096287492697865e-05, "loss": 2.7078, "mean_token_accuracy": 0.38275861740112305, "step": 39810 }, { "epoch": 0.04010209086852779, "grad_norm": 14.536287357474801, "learning_rate": 4.0101323449428924e-05, "loss": 2.8086, "mean_token_accuracy": 0.3655172437429428, "step": 39815 }, { "epoch": 0.040107126921631965, "grad_norm": 13.195345427123746, "learning_rate": 4.010635940615998e-05, "loss": 2.3435, "mean_token_accuracy": 0.43684210777282717, "step": 39820 }, { "epoch": 0.04011216297473614, "grad_norm": 14.250115645395615, "learning_rate": 4.011139536289105e-05, "loss": 2.3767, "mean_token_accuracy": 0.41034482717514037, "step": 39825 }, { "epoch": 0.040117199027840306, "grad_norm": 14.260382150847615, "learning_rate": 4.011643131962211e-05, "loss": 2.6749, "mean_token_accuracy": 0.34827586114406583, "step": 39830 }, { "epoch": 0.04012223508094448, "grad_norm": 14.17119630028932, "learning_rate": 4.012146727635316e-05, "loss": 2.3429, "mean_token_accuracy": 0.46406532526016236, "step": 39835 }, { "epoch": 0.04012727113404865, "grad_norm": 13.240532046129761, "learning_rate": 4.012650323308422e-05, "loss": 2.309, "mean_token_accuracy": 0.4709618866443634, "step": 39840 }, { "epoch": 0.04013230718715283, "grad_norm": 15.600227316518486, "learning_rate": 4.013153918981528e-05, "loss": 2.6155, "mean_token_accuracy": 0.38275861740112305, "step": 39845 }, { "epoch": 0.040137343240257, "grad_norm": 14.03703090974591, "learning_rate": 4.013657514654634e-05, "loss": 2.7289, "mean_token_accuracy": 0.358620685338974, "step": 39850 }, { "epoch": 0.040142379293361174, "grad_norm": 11.053685476260426, "learning_rate": 4.0141611103277405e-05, "loss": 2.3202, "mean_token_accuracy": 0.48166969418525696, "step": 39855 }, { "epoch": 0.04014741534646535, "grad_norm": 14.032916466049887, "learning_rate": 4.0146647060008464e-05, "loss": 2.791, "mean_token_accuracy": 0.35862068831920624, "step": 39860 }, { "epoch": 0.040152451399569515, "grad_norm": 14.408653274915176, "learning_rate": 4.015168301673952e-05, "loss": 3.1839, "mean_token_accuracy": 0.37586207389831544, "step": 39865 }, { "epoch": 0.04015748745267369, "grad_norm": 18.680229848765556, "learning_rate": 4.015671897347058e-05, "loss": 2.5193, "mean_token_accuracy": 0.3562008500099182, "step": 39870 }, { "epoch": 0.04016252350577786, "grad_norm": 15.1808385690575, "learning_rate": 4.016175493020164e-05, "loss": 2.6264, "mean_token_accuracy": 0.3931034505367279, "step": 39875 }, { "epoch": 0.040167559558882036, "grad_norm": 15.970736521705348, "learning_rate": 4.01667908869327e-05, "loss": 2.8489, "mean_token_accuracy": 0.35862068831920624, "step": 39880 }, { "epoch": 0.04017259561198621, "grad_norm": 14.517567764246381, "learning_rate": 4.017182684366376e-05, "loss": 2.7939, "mean_token_accuracy": 0.36206896901130675, "step": 39885 }, { "epoch": 0.040177631665090384, "grad_norm": 13.234747788343773, "learning_rate": 4.017686280039482e-05, "loss": 2.6144, "mean_token_accuracy": 0.3586206942796707, "step": 39890 }, { "epoch": 0.04018266771819456, "grad_norm": 13.063504517567486, "learning_rate": 4.018189875712588e-05, "loss": 2.2507, "mean_token_accuracy": 0.42068966031074523, "step": 39895 }, { "epoch": 0.040187703771298725, "grad_norm": 11.50998517758508, "learning_rate": 4.018693471385694e-05, "loss": 2.3363, "mean_token_accuracy": 0.4724137902259827, "step": 39900 }, { "epoch": 0.0401927398244029, "grad_norm": 15.583452661683973, "learning_rate": 4.0191970670588004e-05, "loss": 2.6645, "mean_token_accuracy": 0.44271020889282225, "step": 39905 }, { "epoch": 0.04019777587750707, "grad_norm": 14.389492054247935, "learning_rate": 4.019700662731906e-05, "loss": 2.4437, "mean_token_accuracy": 0.4448275864124298, "step": 39910 }, { "epoch": 0.040202811930611246, "grad_norm": 15.431898764803782, "learning_rate": 4.020204258405012e-05, "loss": 2.8849, "mean_token_accuracy": 0.4068965494632721, "step": 39915 }, { "epoch": 0.04020784798371542, "grad_norm": 13.105860591824907, "learning_rate": 4.020707854078118e-05, "loss": 2.7021, "mean_token_accuracy": 0.35517241060733795, "step": 39920 }, { "epoch": 0.04021288403681959, "grad_norm": 13.510288944707405, "learning_rate": 4.021211449751224e-05, "loss": 2.3518, "mean_token_accuracy": 0.41379310488700866, "step": 39925 }, { "epoch": 0.04021792008992377, "grad_norm": 12.738166811618783, "learning_rate": 4.0217150454243293e-05, "loss": 2.0412, "mean_token_accuracy": 0.46376285552978513, "step": 39930 }, { "epoch": 0.040222956143027934, "grad_norm": 13.896454278918146, "learning_rate": 4.022218641097436e-05, "loss": 2.6781, "mean_token_accuracy": 0.42413792610168455, "step": 39935 }, { "epoch": 0.04022799219613211, "grad_norm": 19.12239498902643, "learning_rate": 4.022722236770542e-05, "loss": 2.2712, "mean_token_accuracy": 0.45172414779663084, "step": 39940 }, { "epoch": 0.04023302824923628, "grad_norm": 14.829419419894139, "learning_rate": 4.023225832443648e-05, "loss": 2.716, "mean_token_accuracy": 0.36896551251411436, "step": 39945 }, { "epoch": 0.040238064302340455, "grad_norm": 13.49299500020081, "learning_rate": 4.023729428116754e-05, "loss": 2.114, "mean_token_accuracy": 0.5009852230548859, "step": 39950 }, { "epoch": 0.04024310035544463, "grad_norm": 17.450758403217062, "learning_rate": 4.0242330237898596e-05, "loss": 2.546, "mean_token_accuracy": 0.42758620977401735, "step": 39955 }, { "epoch": 0.0402481364085488, "grad_norm": 14.608014522234031, "learning_rate": 4.024736619462966e-05, "loss": 2.8459, "mean_token_accuracy": 0.35862069129943847, "step": 39960 }, { "epoch": 0.040253172461652977, "grad_norm": 12.724956957101552, "learning_rate": 4.025240215136072e-05, "loss": 2.4664, "mean_token_accuracy": 0.4137930989265442, "step": 39965 }, { "epoch": 0.04025820851475714, "grad_norm": 19.27986363761355, "learning_rate": 4.0257438108091774e-05, "loss": 2.393, "mean_token_accuracy": 0.4517241358757019, "step": 39970 }, { "epoch": 0.04026324456786132, "grad_norm": 16.71052478726502, "learning_rate": 4.0262474064822833e-05, "loss": 3.0029, "mean_token_accuracy": 0.3482758641242981, "step": 39975 }, { "epoch": 0.04026828062096549, "grad_norm": 15.68861715413, "learning_rate": 4.026751002155389e-05, "loss": 2.4975, "mean_token_accuracy": 0.41034482717514037, "step": 39980 }, { "epoch": 0.040273316674069665, "grad_norm": 13.350757706504654, "learning_rate": 4.027254597828496e-05, "loss": 2.6966, "mean_token_accuracy": 0.35341801941394807, "step": 39985 }, { "epoch": 0.04027835272717384, "grad_norm": 14.178444111448552, "learning_rate": 4.027758193501602e-05, "loss": 2.4122, "mean_token_accuracy": 0.42262552976608275, "step": 39990 }, { "epoch": 0.04028338878027801, "grad_norm": 15.074160570797384, "learning_rate": 4.028261789174708e-05, "loss": 2.6751, "mean_token_accuracy": 0.4344827651977539, "step": 39995 }, { "epoch": 0.040288424833382186, "grad_norm": 13.633886330334724, "learning_rate": 4.0287653848478137e-05, "loss": 2.7902, "mean_token_accuracy": 0.35172412991523744, "step": 40000 }, { "epoch": 0.04029346088648635, "grad_norm": 15.819527152883886, "learning_rate": 4.0292689805209196e-05, "loss": 2.4424, "mean_token_accuracy": 0.4172413766384125, "step": 40005 }, { "epoch": 0.04029849693959053, "grad_norm": 14.588128465895133, "learning_rate": 4.0297725761940255e-05, "loss": 2.8978, "mean_token_accuracy": 0.3655172288417816, "step": 40010 }, { "epoch": 0.0403035329926947, "grad_norm": 19.418776498569628, "learning_rate": 4.0302761718671314e-05, "loss": 2.5772, "mean_token_accuracy": 0.41034482419490814, "step": 40015 }, { "epoch": 0.040308569045798874, "grad_norm": 14.371097075714992, "learning_rate": 4.0307797675402374e-05, "loss": 2.7407, "mean_token_accuracy": 0.341379314661026, "step": 40020 }, { "epoch": 0.04031360509890305, "grad_norm": 17.00403547905162, "learning_rate": 4.031283363213343e-05, "loss": 2.4783, "mean_token_accuracy": 0.3896551728248596, "step": 40025 }, { "epoch": 0.04031864115200722, "grad_norm": 12.852021026378056, "learning_rate": 4.031786958886449e-05, "loss": 2.1619, "mean_token_accuracy": 0.4586206912994385, "step": 40030 }, { "epoch": 0.040323677205111395, "grad_norm": 15.734888043913397, "learning_rate": 4.032290554559555e-05, "loss": 2.6248, "mean_token_accuracy": 0.3620689630508423, "step": 40035 }, { "epoch": 0.04032871325821556, "grad_norm": 15.702209025185407, "learning_rate": 4.032794150232662e-05, "loss": 2.6463, "mean_token_accuracy": 0.38620689511299133, "step": 40040 }, { "epoch": 0.040333749311319736, "grad_norm": 14.990376540388237, "learning_rate": 4.0332977459057677e-05, "loss": 2.4319, "mean_token_accuracy": 0.41034482717514037, "step": 40045 }, { "epoch": 0.04033878536442391, "grad_norm": 19.444441096499443, "learning_rate": 4.0338013415788736e-05, "loss": 2.4033, "mean_token_accuracy": 0.4620689541101456, "step": 40050 }, { "epoch": 0.040343821417528083, "grad_norm": 13.55350872524074, "learning_rate": 4.0343049372519795e-05, "loss": 2.5339, "mean_token_accuracy": 0.37241379618644715, "step": 40055 }, { "epoch": 0.04034885747063226, "grad_norm": 12.60640981805468, "learning_rate": 4.034808532925085e-05, "loss": 2.8221, "mean_token_accuracy": 0.35862069129943847, "step": 40060 }, { "epoch": 0.04035389352373643, "grad_norm": 15.14327400828799, "learning_rate": 4.0353121285981914e-05, "loss": 2.255, "mean_token_accuracy": 0.4264367878437042, "step": 40065 }, { "epoch": 0.040358929576840605, "grad_norm": 14.102725519669287, "learning_rate": 4.035815724271297e-05, "loss": 2.7173, "mean_token_accuracy": 0.3482758581638336, "step": 40070 }, { "epoch": 0.04036396562994477, "grad_norm": 15.683997247331774, "learning_rate": 4.036319319944403e-05, "loss": 2.55, "mean_token_accuracy": 0.41034482717514037, "step": 40075 }, { "epoch": 0.040369001683048945, "grad_norm": 25.78434112953753, "learning_rate": 4.036822915617509e-05, "loss": 2.9449, "mean_token_accuracy": 0.3551724076271057, "step": 40080 }, { "epoch": 0.04037403773615312, "grad_norm": 12.603977641579434, "learning_rate": 4.037326511290615e-05, "loss": 2.4442, "mean_token_accuracy": 0.4206896543502808, "step": 40085 }, { "epoch": 0.04037907378925729, "grad_norm": 13.792403791601364, "learning_rate": 4.037830106963722e-05, "loss": 2.5566, "mean_token_accuracy": 0.4344827592372894, "step": 40090 }, { "epoch": 0.04038410984236147, "grad_norm": 13.97060711966777, "learning_rate": 4.0383337026368276e-05, "loss": 2.4337, "mean_token_accuracy": 0.4344827592372894, "step": 40095 }, { "epoch": 0.04038914589546564, "grad_norm": 10.193224227996545, "learning_rate": 4.0388372983099335e-05, "loss": 2.5262, "mean_token_accuracy": 0.40344826579093934, "step": 40100 }, { "epoch": 0.040394181948569814, "grad_norm": 14.97493455306575, "learning_rate": 4.039340893983039e-05, "loss": 2.5593, "mean_token_accuracy": 0.3758620619773865, "step": 40105 }, { "epoch": 0.04039921800167398, "grad_norm": 14.145988218176424, "learning_rate": 4.039844489656145e-05, "loss": 2.6482, "mean_token_accuracy": 0.41379311084747317, "step": 40110 }, { "epoch": 0.040404254054778155, "grad_norm": 13.204790304664215, "learning_rate": 4.0403480853292506e-05, "loss": 2.8498, "mean_token_accuracy": 0.3413793116807938, "step": 40115 }, { "epoch": 0.04040929010788233, "grad_norm": 15.070572364371507, "learning_rate": 4.040851681002357e-05, "loss": 2.3916, "mean_token_accuracy": 0.4344827592372894, "step": 40120 }, { "epoch": 0.0404143261609865, "grad_norm": 12.063254560680296, "learning_rate": 4.041355276675463e-05, "loss": 2.3117, "mean_token_accuracy": 0.42413792610168455, "step": 40125 }, { "epoch": 0.040419362214090676, "grad_norm": 12.057546790790164, "learning_rate": 4.041858872348569e-05, "loss": 2.1039, "mean_token_accuracy": 0.4793103516101837, "step": 40130 }, { "epoch": 0.04042439826719485, "grad_norm": 14.310705467692763, "learning_rate": 4.042362468021675e-05, "loss": 2.5193, "mean_token_accuracy": 0.4068965554237366, "step": 40135 }, { "epoch": 0.040429434320299024, "grad_norm": 14.819268658965951, "learning_rate": 4.042866063694781e-05, "loss": 2.1075, "mean_token_accuracy": 0.458620685338974, "step": 40140 }, { "epoch": 0.04043447037340319, "grad_norm": 14.754042453235211, "learning_rate": 4.043369659367887e-05, "loss": 2.59, "mean_token_accuracy": 0.4434361755847931, "step": 40145 }, { "epoch": 0.040439506426507364, "grad_norm": 14.815716888624625, "learning_rate": 4.043873255040993e-05, "loss": 2.3313, "mean_token_accuracy": 0.43793103098869324, "step": 40150 }, { "epoch": 0.04044454247961154, "grad_norm": 10.111888274307887, "learning_rate": 4.044376850714099e-05, "loss": 2.0824, "mean_token_accuracy": 0.48965516686439514, "step": 40155 }, { "epoch": 0.04044957853271571, "grad_norm": 16.698105248302497, "learning_rate": 4.0448804463872046e-05, "loss": 2.7159, "mean_token_accuracy": 0.3999999940395355, "step": 40160 }, { "epoch": 0.040454614585819886, "grad_norm": 15.605077112515527, "learning_rate": 4.0453840420603105e-05, "loss": 2.3632, "mean_token_accuracy": 0.47426108121871946, "step": 40165 }, { "epoch": 0.04045965063892406, "grad_norm": 18.035391667495215, "learning_rate": 4.045887637733417e-05, "loss": 2.5201, "mean_token_accuracy": 0.4434361755847931, "step": 40170 }, { "epoch": 0.04046468669202823, "grad_norm": 14.36820091883167, "learning_rate": 4.046391233406523e-05, "loss": 2.4819, "mean_token_accuracy": 0.3914095640182495, "step": 40175 }, { "epoch": 0.0404697227451324, "grad_norm": 11.573553617957032, "learning_rate": 4.046894829079629e-05, "loss": 2.1786, "mean_token_accuracy": 0.4817906737327576, "step": 40180 }, { "epoch": 0.040474758798236574, "grad_norm": 15.42526592277216, "learning_rate": 4.047398424752735e-05, "loss": 2.5204, "mean_token_accuracy": 0.4137930989265442, "step": 40185 }, { "epoch": 0.04047979485134075, "grad_norm": 13.493049585503737, "learning_rate": 4.047902020425841e-05, "loss": 2.801, "mean_token_accuracy": 0.38275861740112305, "step": 40190 }, { "epoch": 0.04048483090444492, "grad_norm": 11.497841807995542, "learning_rate": 4.048405616098946e-05, "loss": 2.7018, "mean_token_accuracy": 0.43103448748588563, "step": 40195 }, { "epoch": 0.040489866957549095, "grad_norm": 12.361256404549827, "learning_rate": 4.048909211772053e-05, "loss": 2.5215, "mean_token_accuracy": 0.4068965494632721, "step": 40200 }, { "epoch": 0.04049490301065327, "grad_norm": 19.932367522102613, "learning_rate": 4.0494128074451586e-05, "loss": 2.5494, "mean_token_accuracy": 0.4224440395832062, "step": 40205 }, { "epoch": 0.04049993906375744, "grad_norm": 12.405396549533842, "learning_rate": 4.0499164031182645e-05, "loss": 2.2776, "mean_token_accuracy": 0.43103447556495667, "step": 40210 }, { "epoch": 0.04050497511686161, "grad_norm": 15.536607853808702, "learning_rate": 4.0504199987913705e-05, "loss": 2.8305, "mean_token_accuracy": 0.4082879602909088, "step": 40215 }, { "epoch": 0.04051001116996578, "grad_norm": 15.542400979739453, "learning_rate": 4.0509235944644764e-05, "loss": 2.619, "mean_token_accuracy": 0.37931033968925476, "step": 40220 }, { "epoch": 0.04051504722306996, "grad_norm": 13.723849518378993, "learning_rate": 4.051427190137583e-05, "loss": 2.6731, "mean_token_accuracy": 0.4137930989265442, "step": 40225 }, { "epoch": 0.04052008327617413, "grad_norm": 12.61960229675886, "learning_rate": 4.051930785810689e-05, "loss": 2.3781, "mean_token_accuracy": 0.42758620381355283, "step": 40230 }, { "epoch": 0.040525119329278304, "grad_norm": 14.500142262754462, "learning_rate": 4.052434381483794e-05, "loss": 2.174, "mean_token_accuracy": 0.4849364757537842, "step": 40235 }, { "epoch": 0.04053015538238248, "grad_norm": 13.429478477699076, "learning_rate": 4.0529379771569e-05, "loss": 2.1939, "mean_token_accuracy": 0.4758620738983154, "step": 40240 }, { "epoch": 0.04053519143548665, "grad_norm": 14.643173046804844, "learning_rate": 4.053441572830006e-05, "loss": 2.5536, "mean_token_accuracy": 0.42758620977401735, "step": 40245 }, { "epoch": 0.04054022748859082, "grad_norm": 13.157804366559992, "learning_rate": 4.0539451685031126e-05, "loss": 2.7284, "mean_token_accuracy": 0.3947973370552063, "step": 40250 }, { "epoch": 0.04054526354169499, "grad_norm": 17.95852209582341, "learning_rate": 4.0544487641762186e-05, "loss": 3.0836, "mean_token_accuracy": 0.3655172437429428, "step": 40255 }, { "epoch": 0.040550299594799166, "grad_norm": 13.817097223215429, "learning_rate": 4.0549523598493245e-05, "loss": 2.6153, "mean_token_accuracy": 0.42068966031074523, "step": 40260 }, { "epoch": 0.04055533564790334, "grad_norm": 12.605436879555116, "learning_rate": 4.0554559555224304e-05, "loss": 3.0235, "mean_token_accuracy": 0.35862069129943847, "step": 40265 }, { "epoch": 0.040560371701007514, "grad_norm": 15.177462165015818, "learning_rate": 4.055959551195536e-05, "loss": 2.8487, "mean_token_accuracy": 0.35862069129943847, "step": 40270 }, { "epoch": 0.04056540775411169, "grad_norm": 14.111872153324049, "learning_rate": 4.056463146868642e-05, "loss": 2.2194, "mean_token_accuracy": 0.45668481588363646, "step": 40275 }, { "epoch": 0.04057044380721586, "grad_norm": 15.197010766130397, "learning_rate": 4.056966742541748e-05, "loss": 2.596, "mean_token_accuracy": 0.3551724076271057, "step": 40280 }, { "epoch": 0.04057547986032003, "grad_norm": 12.49697646389517, "learning_rate": 4.057470338214854e-05, "loss": 2.5275, "mean_token_accuracy": 0.441379314661026, "step": 40285 }, { "epoch": 0.0405805159134242, "grad_norm": 13.131840291435024, "learning_rate": 4.05797393388796e-05, "loss": 2.3514, "mean_token_accuracy": 0.4673926174640656, "step": 40290 }, { "epoch": 0.040585551966528376, "grad_norm": 14.32031273790913, "learning_rate": 4.058477529561066e-05, "loss": 2.5451, "mean_token_accuracy": 0.3896551728248596, "step": 40295 }, { "epoch": 0.04059058801963255, "grad_norm": 16.39867204657384, "learning_rate": 4.058981125234172e-05, "loss": 2.4992, "mean_token_accuracy": 0.39655172228813174, "step": 40300 }, { "epoch": 0.04059562407273672, "grad_norm": 12.401328280417237, "learning_rate": 4.0594847209072785e-05, "loss": 2.7137, "mean_token_accuracy": 0.36896551847457887, "step": 40305 }, { "epoch": 0.0406006601258409, "grad_norm": 12.979115967932202, "learning_rate": 4.0599883165803844e-05, "loss": 2.6952, "mean_token_accuracy": 0.3793103456497192, "step": 40310 }, { "epoch": 0.04060569617894507, "grad_norm": 16.484774699584264, "learning_rate": 4.06049191225349e-05, "loss": 2.6009, "mean_token_accuracy": 0.391349059343338, "step": 40315 }, { "epoch": 0.04061073223204924, "grad_norm": 12.495178891047514, "learning_rate": 4.060995507926596e-05, "loss": 2.609, "mean_token_accuracy": 0.4137930989265442, "step": 40320 }, { "epoch": 0.04061576828515341, "grad_norm": 18.98296185885065, "learning_rate": 4.061499103599702e-05, "loss": 2.7579, "mean_token_accuracy": 0.37241379618644715, "step": 40325 }, { "epoch": 0.040620804338257585, "grad_norm": 14.085628153444143, "learning_rate": 4.062002699272808e-05, "loss": 2.5144, "mean_token_accuracy": 0.4275862157344818, "step": 40330 }, { "epoch": 0.04062584039136176, "grad_norm": 21.086182489396823, "learning_rate": 4.062506294945914e-05, "loss": 2.8383, "mean_token_accuracy": 0.3275862097740173, "step": 40335 }, { "epoch": 0.04063087644446593, "grad_norm": 14.218072081541653, "learning_rate": 4.06300989061902e-05, "loss": 2.8979, "mean_token_accuracy": 0.31379309892654417, "step": 40340 }, { "epoch": 0.040635912497570106, "grad_norm": 13.484947423457687, "learning_rate": 4.063513486292126e-05, "loss": 2.6697, "mean_token_accuracy": 0.4301875352859497, "step": 40345 }, { "epoch": 0.04064094855067428, "grad_norm": 12.268786992553803, "learning_rate": 4.064017081965232e-05, "loss": 2.5629, "mean_token_accuracy": 0.43509851694107055, "step": 40350 }, { "epoch": 0.04064598460377845, "grad_norm": 16.521492441440987, "learning_rate": 4.064520677638338e-05, "loss": 3.0524, "mean_token_accuracy": 0.36896551847457887, "step": 40355 }, { "epoch": 0.04065102065688262, "grad_norm": 13.281052066680923, "learning_rate": 4.0650242733114443e-05, "loss": 2.4557, "mean_token_accuracy": 0.39655172228813174, "step": 40360 }, { "epoch": 0.040656056709986794, "grad_norm": 11.180764094050256, "learning_rate": 4.06552786898455e-05, "loss": 2.6835, "mean_token_accuracy": 0.4156079888343811, "step": 40365 }, { "epoch": 0.04066109276309097, "grad_norm": 13.909859648467254, "learning_rate": 4.0660314646576555e-05, "loss": 2.4924, "mean_token_accuracy": 0.4103448212146759, "step": 40370 }, { "epoch": 0.04066612881619514, "grad_norm": 14.810362550633833, "learning_rate": 4.0665350603307614e-05, "loss": 2.8278, "mean_token_accuracy": 0.3896551728248596, "step": 40375 }, { "epoch": 0.040671164869299316, "grad_norm": 12.80696021388534, "learning_rate": 4.0670386560038674e-05, "loss": 2.9772, "mean_token_accuracy": 0.38620689809322356, "step": 40380 }, { "epoch": 0.04067620092240349, "grad_norm": 13.176419183861915, "learning_rate": 4.067542251676974e-05, "loss": 3.111, "mean_token_accuracy": 0.26896551847457884, "step": 40385 }, { "epoch": 0.040681236975507656, "grad_norm": 11.698790130399717, "learning_rate": 4.06804584735008e-05, "loss": 2.7296, "mean_token_accuracy": 0.3813672125339508, "step": 40390 }, { "epoch": 0.04068627302861183, "grad_norm": 17.973211855135062, "learning_rate": 4.068549443023186e-05, "loss": 2.5719, "mean_token_accuracy": 0.4517241418361664, "step": 40395 }, { "epoch": 0.040691309081716004, "grad_norm": 12.221812011902507, "learning_rate": 4.069053038696292e-05, "loss": 2.6374, "mean_token_accuracy": 0.3931034505367279, "step": 40400 }, { "epoch": 0.04069634513482018, "grad_norm": 13.467318321528381, "learning_rate": 4.069556634369398e-05, "loss": 2.8584, "mean_token_accuracy": 0.37241379022598264, "step": 40405 }, { "epoch": 0.04070138118792435, "grad_norm": 13.343336926680532, "learning_rate": 4.0700602300425036e-05, "loss": 2.9105, "mean_token_accuracy": 0.3551724076271057, "step": 40410 }, { "epoch": 0.040706417241028525, "grad_norm": 14.315625409395771, "learning_rate": 4.0705638257156095e-05, "loss": 2.938, "mean_token_accuracy": 0.33793103098869326, "step": 40415 }, { "epoch": 0.0407114532941327, "grad_norm": 13.184877751042649, "learning_rate": 4.0710674213887154e-05, "loss": 2.4272, "mean_token_accuracy": 0.3709013909101486, "step": 40420 }, { "epoch": 0.040716489347236866, "grad_norm": 17.975178885215517, "learning_rate": 4.0715710170618214e-05, "loss": 2.7943, "mean_token_accuracy": 0.3862069010734558, "step": 40425 }, { "epoch": 0.04072152540034104, "grad_norm": 12.249754702959212, "learning_rate": 4.072074612734927e-05, "loss": 2.7838, "mean_token_accuracy": 0.3827586203813553, "step": 40430 }, { "epoch": 0.04072656145344521, "grad_norm": 13.972745253645067, "learning_rate": 4.072578208408033e-05, "loss": 2.5193, "mean_token_accuracy": 0.4034482717514038, "step": 40435 }, { "epoch": 0.04073159750654939, "grad_norm": 12.950327748990452, "learning_rate": 4.07308180408114e-05, "loss": 2.3661, "mean_token_accuracy": 0.4034482777118683, "step": 40440 }, { "epoch": 0.04073663355965356, "grad_norm": 18.68515683284272, "learning_rate": 4.073585399754246e-05, "loss": 2.8108, "mean_token_accuracy": 0.3793103456497192, "step": 40445 }, { "epoch": 0.040741669612757735, "grad_norm": 14.957861412946045, "learning_rate": 4.074088995427352e-05, "loss": 2.5108, "mean_token_accuracy": 0.3793103456497192, "step": 40450 }, { "epoch": 0.04074670566586191, "grad_norm": 13.415675856912834, "learning_rate": 4.0745925911004576e-05, "loss": 2.4644, "mean_token_accuracy": 0.41379311084747317, "step": 40455 }, { "epoch": 0.040751741718966075, "grad_norm": 13.92069037022313, "learning_rate": 4.0750961867735635e-05, "loss": 2.2323, "mean_token_accuracy": 0.46551724076271056, "step": 40460 }, { "epoch": 0.04075677777207025, "grad_norm": 14.182163601131016, "learning_rate": 4.0755997824466694e-05, "loss": 2.8331, "mean_token_accuracy": 0.3758620619773865, "step": 40465 }, { "epoch": 0.04076181382517442, "grad_norm": 17.17798866693746, "learning_rate": 4.0761033781197754e-05, "loss": 2.8712, "mean_token_accuracy": 0.3758620619773865, "step": 40470 }, { "epoch": 0.040766849878278597, "grad_norm": 15.737960040166513, "learning_rate": 4.076606973792881e-05, "loss": 2.4134, "mean_token_accuracy": 0.47697044014930723, "step": 40475 }, { "epoch": 0.04077188593138277, "grad_norm": 25.74996968372903, "learning_rate": 4.077110569465987e-05, "loss": 2.6363, "mean_token_accuracy": 0.3896551728248596, "step": 40480 }, { "epoch": 0.040776921984486944, "grad_norm": 10.958362938132924, "learning_rate": 4.077614165139093e-05, "loss": 2.6637, "mean_token_accuracy": 0.3793103456497192, "step": 40485 }, { "epoch": 0.04078195803759112, "grad_norm": 13.927932063847432, "learning_rate": 4.0781177608122e-05, "loss": 2.2969, "mean_token_accuracy": 0.4517241358757019, "step": 40490 }, { "epoch": 0.040786994090695285, "grad_norm": 14.282961590902799, "learning_rate": 4.078621356485306e-05, "loss": 2.9345, "mean_token_accuracy": 0.37586206793785093, "step": 40495 }, { "epoch": 0.04079203014379946, "grad_norm": 14.193602746174795, "learning_rate": 4.0791249521584116e-05, "loss": 2.6052, "mean_token_accuracy": 0.4103448212146759, "step": 40500 }, { "epoch": 0.04079706619690363, "grad_norm": 15.185313988176489, "learning_rate": 4.079628547831517e-05, "loss": 2.386, "mean_token_accuracy": 0.41724138259887694, "step": 40505 }, { "epoch": 0.040802102250007806, "grad_norm": 13.097351522885797, "learning_rate": 4.080132143504623e-05, "loss": 2.8522, "mean_token_accuracy": 0.358620685338974, "step": 40510 }, { "epoch": 0.04080713830311198, "grad_norm": 16.253261592358637, "learning_rate": 4.0806357391777294e-05, "loss": 2.718, "mean_token_accuracy": 0.3999999940395355, "step": 40515 }, { "epoch": 0.040812174356216153, "grad_norm": 12.998398279697463, "learning_rate": 4.081139334850835e-05, "loss": 2.8613, "mean_token_accuracy": 0.41034482717514037, "step": 40520 }, { "epoch": 0.04081721040932033, "grad_norm": 15.977455032235298, "learning_rate": 4.081642930523941e-05, "loss": 2.8762, "mean_token_accuracy": 0.358620685338974, "step": 40525 }, { "epoch": 0.040822246462424494, "grad_norm": 13.013625306199334, "learning_rate": 4.082146526197047e-05, "loss": 2.3466, "mean_token_accuracy": 0.43103447556495667, "step": 40530 }, { "epoch": 0.04082728251552867, "grad_norm": 13.016290280937307, "learning_rate": 4.082650121870153e-05, "loss": 2.6598, "mean_token_accuracy": 0.4000000059604645, "step": 40535 }, { "epoch": 0.04083231856863284, "grad_norm": 12.037041020479435, "learning_rate": 4.083153717543259e-05, "loss": 2.3676, "mean_token_accuracy": 0.41034482717514037, "step": 40540 }, { "epoch": 0.040837354621737015, "grad_norm": 13.441339310365143, "learning_rate": 4.083657313216365e-05, "loss": 2.6254, "mean_token_accuracy": 0.40689654648303986, "step": 40545 }, { "epoch": 0.04084239067484119, "grad_norm": 13.127658224034365, "learning_rate": 4.084160908889471e-05, "loss": 2.2756, "mean_token_accuracy": 0.44827585220336913, "step": 40550 }, { "epoch": 0.04084742672794536, "grad_norm": 15.579187000630988, "learning_rate": 4.084664504562577e-05, "loss": 2.6314, "mean_token_accuracy": 0.34137930870056155, "step": 40555 }, { "epoch": 0.04085246278104954, "grad_norm": 13.347342572182912, "learning_rate": 4.085168100235683e-05, "loss": 2.3917, "mean_token_accuracy": 0.43103447556495667, "step": 40560 }, { "epoch": 0.040857498834153703, "grad_norm": 9.81468321774655, "learning_rate": 4.0856716959087886e-05, "loss": 2.4034, "mean_token_accuracy": 0.4384236454963684, "step": 40565 }, { "epoch": 0.04086253488725788, "grad_norm": 14.268267627089497, "learning_rate": 4.086175291581895e-05, "loss": 2.4621, "mean_token_accuracy": 0.42758620381355283, "step": 40570 }, { "epoch": 0.04086757094036205, "grad_norm": 11.908638131299766, "learning_rate": 4.086678887255001e-05, "loss": 2.5021, "mean_token_accuracy": 0.4413793087005615, "step": 40575 }, { "epoch": 0.040872606993466225, "grad_norm": 23.1438661088492, "learning_rate": 4.087182482928107e-05, "loss": 2.3299, "mean_token_accuracy": 0.42268602550029755, "step": 40580 }, { "epoch": 0.0408776430465704, "grad_norm": 12.996090611875152, "learning_rate": 4.087686078601213e-05, "loss": 2.36, "mean_token_accuracy": 0.4206896543502808, "step": 40585 }, { "epoch": 0.04088267909967457, "grad_norm": 12.399271185288155, "learning_rate": 4.088189674274319e-05, "loss": 2.4971, "mean_token_accuracy": 0.3793103456497192, "step": 40590 }, { "epoch": 0.040887715152778746, "grad_norm": 12.832666503248527, "learning_rate": 4.088693269947425e-05, "loss": 2.3295, "mean_token_accuracy": 0.441379314661026, "step": 40595 }, { "epoch": 0.04089275120588291, "grad_norm": 18.18384056212232, "learning_rate": 4.089196865620531e-05, "loss": 2.5061, "mean_token_accuracy": 0.4156079888343811, "step": 40600 }, { "epoch": 0.04089778725898709, "grad_norm": 16.654318686521577, "learning_rate": 4.089700461293637e-05, "loss": 2.672, "mean_token_accuracy": 0.3931034505367279, "step": 40605 }, { "epoch": 0.04090282331209126, "grad_norm": 16.24392108644982, "learning_rate": 4.0902040569667426e-05, "loss": 2.6491, "mean_token_accuracy": 0.3620689630508423, "step": 40610 }, { "epoch": 0.040907859365195434, "grad_norm": 12.401222077953115, "learning_rate": 4.0907076526398486e-05, "loss": 2.3224, "mean_token_accuracy": 0.4551724135875702, "step": 40615 }, { "epoch": 0.04091289541829961, "grad_norm": 15.732593651233447, "learning_rate": 4.0912112483129545e-05, "loss": 2.648, "mean_token_accuracy": 0.3896551787853241, "step": 40620 }, { "epoch": 0.04091793147140378, "grad_norm": 13.646305694808373, "learning_rate": 4.091714843986061e-05, "loss": 2.8443, "mean_token_accuracy": 0.3620689630508423, "step": 40625 }, { "epoch": 0.040922967524507955, "grad_norm": 11.843081124586387, "learning_rate": 4.092218439659167e-05, "loss": 2.5841, "mean_token_accuracy": 0.42413793206214906, "step": 40630 }, { "epoch": 0.04092800357761212, "grad_norm": 12.985064077592844, "learning_rate": 4.092722035332273e-05, "loss": 2.5685, "mean_token_accuracy": 0.3999999940395355, "step": 40635 }, { "epoch": 0.040933039630716296, "grad_norm": 16.936283220069605, "learning_rate": 4.093225631005378e-05, "loss": 2.9723, "mean_token_accuracy": 0.324137932062149, "step": 40640 }, { "epoch": 0.04093807568382047, "grad_norm": 13.206424090552508, "learning_rate": 4.093729226678484e-05, "loss": 2.2142, "mean_token_accuracy": 0.4517241358757019, "step": 40645 }, { "epoch": 0.040943111736924644, "grad_norm": 13.885542152833098, "learning_rate": 4.094232822351591e-05, "loss": 2.6863, "mean_token_accuracy": 0.34137931764125823, "step": 40650 }, { "epoch": 0.04094814779002882, "grad_norm": 11.657564970807094, "learning_rate": 4.0947364180246966e-05, "loss": 2.5548, "mean_token_accuracy": 0.42068966031074523, "step": 40655 }, { "epoch": 0.04095318384313299, "grad_norm": 14.127762236333332, "learning_rate": 4.0952400136978026e-05, "loss": 3.0577, "mean_token_accuracy": 0.358620685338974, "step": 40660 }, { "epoch": 0.040958219896237165, "grad_norm": 13.982090814797386, "learning_rate": 4.0957436093709085e-05, "loss": 2.3375, "mean_token_accuracy": 0.38275861740112305, "step": 40665 }, { "epoch": 0.04096325594934133, "grad_norm": 15.156656006116709, "learning_rate": 4.0962472050440144e-05, "loss": 2.4323, "mean_token_accuracy": 0.42068964838981626, "step": 40670 }, { "epoch": 0.040968292002445506, "grad_norm": 14.810746183551506, "learning_rate": 4.096750800717121e-05, "loss": 2.5467, "mean_token_accuracy": 0.4103448331356049, "step": 40675 }, { "epoch": 0.04097332805554968, "grad_norm": 16.328588017026107, "learning_rate": 4.097254396390226e-05, "loss": 2.4857, "mean_token_accuracy": 0.4206896543502808, "step": 40680 }, { "epoch": 0.04097836410865385, "grad_norm": 15.497218747085839, "learning_rate": 4.097757992063332e-05, "loss": 2.9045, "mean_token_accuracy": 0.37241379022598264, "step": 40685 }, { "epoch": 0.04098340016175803, "grad_norm": 15.320312370900012, "learning_rate": 4.098261587736438e-05, "loss": 2.3623, "mean_token_accuracy": 0.441379314661026, "step": 40690 }, { "epoch": 0.0409884362148622, "grad_norm": 12.993975810623441, "learning_rate": 4.098765183409544e-05, "loss": 2.4172, "mean_token_accuracy": 0.4034482717514038, "step": 40695 }, { "epoch": 0.040993472267966374, "grad_norm": 18.559909339732226, "learning_rate": 4.09926877908265e-05, "loss": 2.4974, "mean_token_accuracy": 0.4350985288619995, "step": 40700 }, { "epoch": 0.04099850832107054, "grad_norm": 16.04216373848578, "learning_rate": 4.0997723747557566e-05, "loss": 3.0837, "mean_token_accuracy": 0.3551724076271057, "step": 40705 }, { "epoch": 0.041003544374174715, "grad_norm": 14.55733688816893, "learning_rate": 4.1002759704288625e-05, "loss": 2.39, "mean_token_accuracy": 0.4310344815254211, "step": 40710 }, { "epoch": 0.04100858042727889, "grad_norm": 14.673416038199425, "learning_rate": 4.1007795661019684e-05, "loss": 2.7999, "mean_token_accuracy": 0.4312807857990265, "step": 40715 }, { "epoch": 0.04101361648038306, "grad_norm": 13.552807215623094, "learning_rate": 4.1012831617750743e-05, "loss": 2.4671, "mean_token_accuracy": 0.45160314440727234, "step": 40720 }, { "epoch": 0.041018652533487236, "grad_norm": 14.282672312221619, "learning_rate": 4.10178675744818e-05, "loss": 2.7927, "mean_token_accuracy": 0.3896551728248596, "step": 40725 }, { "epoch": 0.04102368858659141, "grad_norm": 13.053871180716206, "learning_rate": 4.102290353121286e-05, "loss": 2.6912, "mean_token_accuracy": 0.4034482717514038, "step": 40730 }, { "epoch": 0.041028724639695584, "grad_norm": 19.232153481280147, "learning_rate": 4.102793948794392e-05, "loss": 2.7318, "mean_token_accuracy": 0.3965517282485962, "step": 40735 }, { "epoch": 0.04103376069279975, "grad_norm": 14.51641334838814, "learning_rate": 4.103297544467498e-05, "loss": 2.5545, "mean_token_accuracy": 0.4068965554237366, "step": 40740 }, { "epoch": 0.041038796745903924, "grad_norm": 11.484792457384179, "learning_rate": 4.103801140140604e-05, "loss": 2.4851, "mean_token_accuracy": 0.43103448748588563, "step": 40745 }, { "epoch": 0.0410438327990081, "grad_norm": 18.21319086556733, "learning_rate": 4.10430473581371e-05, "loss": 2.9283, "mean_token_accuracy": 0.39776164293289185, "step": 40750 }, { "epoch": 0.04104886885211227, "grad_norm": 13.126895551105354, "learning_rate": 4.1048083314868165e-05, "loss": 2.665, "mean_token_accuracy": 0.4034482777118683, "step": 40755 }, { "epoch": 0.041053904905216446, "grad_norm": 14.165312620519686, "learning_rate": 4.1053119271599224e-05, "loss": 2.8806, "mean_token_accuracy": 0.3620689630508423, "step": 40760 }, { "epoch": 0.04105894095832062, "grad_norm": 12.450884132452959, "learning_rate": 4.1058155228330284e-05, "loss": 2.6023, "mean_token_accuracy": 0.3999999940395355, "step": 40765 }, { "epoch": 0.04106397701142479, "grad_norm": 11.562451201782611, "learning_rate": 4.1063191185061336e-05, "loss": 2.7228, "mean_token_accuracy": 0.3655172407627106, "step": 40770 }, { "epoch": 0.04106901306452896, "grad_norm": 12.769946615354918, "learning_rate": 4.1068227141792395e-05, "loss": 2.4852, "mean_token_accuracy": 0.4137930989265442, "step": 40775 }, { "epoch": 0.041074049117633134, "grad_norm": 12.795519257355705, "learning_rate": 4.1073263098523455e-05, "loss": 2.4785, "mean_token_accuracy": 0.37586206793785093, "step": 40780 }, { "epoch": 0.04107908517073731, "grad_norm": 16.381124918924257, "learning_rate": 4.107829905525452e-05, "loss": 2.4587, "mean_token_accuracy": 0.4551724135875702, "step": 40785 }, { "epoch": 0.04108412122384148, "grad_norm": 20.371574404168975, "learning_rate": 4.108333501198558e-05, "loss": 2.8692, "mean_token_accuracy": 0.3517241358757019, "step": 40790 }, { "epoch": 0.041089157276945655, "grad_norm": 16.14120071078137, "learning_rate": 4.108837096871664e-05, "loss": 3.2097, "mean_token_accuracy": 0.3394434332847595, "step": 40795 }, { "epoch": 0.04109419333004983, "grad_norm": 13.784063571533569, "learning_rate": 4.10934069254477e-05, "loss": 2.7411, "mean_token_accuracy": 0.3827586233615875, "step": 40800 }, { "epoch": 0.041099229383154, "grad_norm": 14.729662024621888, "learning_rate": 4.109844288217876e-05, "loss": 2.6535, "mean_token_accuracy": 0.36551723778247835, "step": 40805 }, { "epoch": 0.04110426543625817, "grad_norm": 15.303666740529385, "learning_rate": 4.1103478838909824e-05, "loss": 2.6376, "mean_token_accuracy": 0.3655172407627106, "step": 40810 }, { "epoch": 0.04110930148936234, "grad_norm": 35.77075823880536, "learning_rate": 4.1108514795640876e-05, "loss": 2.8342, "mean_token_accuracy": 0.4, "step": 40815 }, { "epoch": 0.04111433754246652, "grad_norm": 14.994347527814286, "learning_rate": 4.1113550752371935e-05, "loss": 2.6448, "mean_token_accuracy": 0.37241379618644715, "step": 40820 }, { "epoch": 0.04111937359557069, "grad_norm": 15.673555873825235, "learning_rate": 4.1118586709102995e-05, "loss": 2.6984, "mean_token_accuracy": 0.3896551728248596, "step": 40825 }, { "epoch": 0.041124409648674864, "grad_norm": 14.019267891140933, "learning_rate": 4.1123622665834054e-05, "loss": 2.6319, "mean_token_accuracy": 0.39310344457626345, "step": 40830 }, { "epoch": 0.04112944570177904, "grad_norm": 14.15602331940618, "learning_rate": 4.112865862256512e-05, "loss": 2.5096, "mean_token_accuracy": 0.4608590483665466, "step": 40835 }, { "epoch": 0.04113448175488321, "grad_norm": 11.89139031935125, "learning_rate": 4.113369457929618e-05, "loss": 2.7402, "mean_token_accuracy": 0.4241379231214523, "step": 40840 }, { "epoch": 0.04113951780798738, "grad_norm": 23.57044647796897, "learning_rate": 4.113873053602724e-05, "loss": 2.6675, "mean_token_accuracy": 0.3689655214548111, "step": 40845 }, { "epoch": 0.04114455386109155, "grad_norm": 10.605627096867416, "learning_rate": 4.11437664927583e-05, "loss": 2.14, "mean_token_accuracy": 0.4551724076271057, "step": 40850 }, { "epoch": 0.041149589914195726, "grad_norm": 14.140173398304333, "learning_rate": 4.114880244948936e-05, "loss": 2.1722, "mean_token_accuracy": 0.45862067937850953, "step": 40855 }, { "epoch": 0.0411546259672999, "grad_norm": 20.450035048048502, "learning_rate": 4.1153838406220416e-05, "loss": 2.6898, "mean_token_accuracy": 0.37931033968925476, "step": 40860 }, { "epoch": 0.041159662020404074, "grad_norm": 16.9297007223952, "learning_rate": 4.1158874362951475e-05, "loss": 2.9141, "mean_token_accuracy": 0.3551724135875702, "step": 40865 }, { "epoch": 0.04116469807350825, "grad_norm": 14.256559035071326, "learning_rate": 4.1163910319682535e-05, "loss": 2.5266, "mean_token_accuracy": 0.4068965554237366, "step": 40870 }, { "epoch": 0.04116973412661242, "grad_norm": 11.461617365367621, "learning_rate": 4.1168946276413594e-05, "loss": 2.5697, "mean_token_accuracy": 0.4379310429096222, "step": 40875 }, { "epoch": 0.04117477017971659, "grad_norm": 24.32770304821939, "learning_rate": 4.117398223314465e-05, "loss": 2.8787, "mean_token_accuracy": 0.42413793206214906, "step": 40880 }, { "epoch": 0.04117980623282076, "grad_norm": 13.423902885597512, "learning_rate": 4.117901818987571e-05, "loss": 2.5955, "mean_token_accuracy": 0.3482758581638336, "step": 40885 }, { "epoch": 0.041184842285924936, "grad_norm": 13.505336019905643, "learning_rate": 4.118405414660678e-05, "loss": 2.502, "mean_token_accuracy": 0.36206896901130675, "step": 40890 }, { "epoch": 0.04118987833902911, "grad_norm": 16.971950361154793, "learning_rate": 4.118909010333784e-05, "loss": 2.5578, "mean_token_accuracy": 0.4103448212146759, "step": 40895 }, { "epoch": 0.04119491439213328, "grad_norm": 20.999554845587554, "learning_rate": 4.11941260600689e-05, "loss": 3.1846, "mean_token_accuracy": 0.3655172407627106, "step": 40900 }, { "epoch": 0.04119995044523746, "grad_norm": 12.941858308243027, "learning_rate": 4.119916201679995e-05, "loss": 2.7599, "mean_token_accuracy": 0.38275861740112305, "step": 40905 }, { "epoch": 0.04120498649834163, "grad_norm": 15.099528176547238, "learning_rate": 4.120419797353101e-05, "loss": 2.7854, "mean_token_accuracy": 0.42607380747795104, "step": 40910 }, { "epoch": 0.0412100225514458, "grad_norm": 17.38236219110721, "learning_rate": 4.1209233930262075e-05, "loss": 2.7553, "mean_token_accuracy": 0.3931034505367279, "step": 40915 }, { "epoch": 0.04121505860454997, "grad_norm": 16.000960089504773, "learning_rate": 4.1214269886993134e-05, "loss": 2.9845, "mean_token_accuracy": 0.2965517222881317, "step": 40920 }, { "epoch": 0.041220094657654145, "grad_norm": 11.154266470026233, "learning_rate": 4.121930584372419e-05, "loss": 2.382, "mean_token_accuracy": 0.42413792610168455, "step": 40925 }, { "epoch": 0.04122513071075832, "grad_norm": 14.394508625650328, "learning_rate": 4.122434180045525e-05, "loss": 2.406, "mean_token_accuracy": 0.43448275327682495, "step": 40930 }, { "epoch": 0.04123016676386249, "grad_norm": 12.989327427270842, "learning_rate": 4.122937775718631e-05, "loss": 2.5044, "mean_token_accuracy": 0.4379310369491577, "step": 40935 }, { "epoch": 0.041235202816966667, "grad_norm": 13.531064565209123, "learning_rate": 4.123441371391738e-05, "loss": 2.484, "mean_token_accuracy": 0.43266788125038147, "step": 40940 }, { "epoch": 0.04124023887007084, "grad_norm": 14.223310015237725, "learning_rate": 4.123944967064843e-05, "loss": 2.8653, "mean_token_accuracy": 0.3655172288417816, "step": 40945 }, { "epoch": 0.04124527492317501, "grad_norm": 21.186694300253812, "learning_rate": 4.124448562737949e-05, "loss": 2.6835, "mean_token_accuracy": 0.4, "step": 40950 }, { "epoch": 0.04125031097627918, "grad_norm": 12.444712112429578, "learning_rate": 4.124952158411055e-05, "loss": 2.6251, "mean_token_accuracy": 0.37586206793785093, "step": 40955 }, { "epoch": 0.041255347029383355, "grad_norm": 16.970539582878, "learning_rate": 4.125455754084161e-05, "loss": 2.2374, "mean_token_accuracy": 0.42413793206214906, "step": 40960 }, { "epoch": 0.04126038308248753, "grad_norm": 22.215357268305826, "learning_rate": 4.125959349757267e-05, "loss": 2.9667, "mean_token_accuracy": 0.3655172407627106, "step": 40965 }, { "epoch": 0.0412654191355917, "grad_norm": 14.829487396237798, "learning_rate": 4.126462945430373e-05, "loss": 2.4951, "mean_token_accuracy": 0.37241379022598264, "step": 40970 }, { "epoch": 0.041270455188695876, "grad_norm": 15.993909160970029, "learning_rate": 4.126966541103479e-05, "loss": 2.405, "mean_token_accuracy": 0.43103448748588563, "step": 40975 }, { "epoch": 0.04127549124180005, "grad_norm": 16.908618265382852, "learning_rate": 4.127470136776585e-05, "loss": 2.9847, "mean_token_accuracy": 0.3517241358757019, "step": 40980 }, { "epoch": 0.041280527294904217, "grad_norm": 47.30137931843922, "learning_rate": 4.127973732449691e-05, "loss": 2.4514, "mean_token_accuracy": 0.379310342669487, "step": 40985 }, { "epoch": 0.04128556334800839, "grad_norm": 12.39812987397837, "learning_rate": 4.128477328122797e-05, "loss": 2.3535, "mean_token_accuracy": 0.4465214729309082, "step": 40990 }, { "epoch": 0.041290599401112564, "grad_norm": 13.83483188648493, "learning_rate": 4.128980923795903e-05, "loss": 2.6022, "mean_token_accuracy": 0.3620689630508423, "step": 40995 }, { "epoch": 0.04129563545421674, "grad_norm": 19.701850648418716, "learning_rate": 4.129484519469009e-05, "loss": 2.4212, "mean_token_accuracy": 0.4103448212146759, "step": 41000 }, { "epoch": 0.04130067150732091, "grad_norm": 13.143117623214184, "learning_rate": 4.129988115142115e-05, "loss": 2.6967, "mean_token_accuracy": 0.42413792610168455, "step": 41005 }, { "epoch": 0.041305707560425085, "grad_norm": 13.032528668268462, "learning_rate": 4.130491710815221e-05, "loss": 2.5074, "mean_token_accuracy": 0.39443435668945315, "step": 41010 }, { "epoch": 0.04131074361352926, "grad_norm": 15.299685760033984, "learning_rate": 4.1309953064883267e-05, "loss": 2.4205, "mean_token_accuracy": 0.3965517282485962, "step": 41015 }, { "epoch": 0.041315779666633426, "grad_norm": 15.801225287153569, "learning_rate": 4.131498902161433e-05, "loss": 2.4402, "mean_token_accuracy": 0.42413793206214906, "step": 41020 }, { "epoch": 0.0413208157197376, "grad_norm": 18.559612280637484, "learning_rate": 4.132002497834539e-05, "loss": 2.144, "mean_token_accuracy": 0.458620673418045, "step": 41025 }, { "epoch": 0.04132585177284177, "grad_norm": 18.751832344626706, "learning_rate": 4.132506093507645e-05, "loss": 2.4526, "mean_token_accuracy": 0.42413793206214906, "step": 41030 }, { "epoch": 0.04133088782594595, "grad_norm": 17.179923390174572, "learning_rate": 4.133009689180751e-05, "loss": 2.5182, "mean_token_accuracy": 0.4034482777118683, "step": 41035 }, { "epoch": 0.04133592387905012, "grad_norm": 12.729919369511332, "learning_rate": 4.133513284853856e-05, "loss": 2.4972, "mean_token_accuracy": 0.3793103456497192, "step": 41040 }, { "epoch": 0.041340959932154295, "grad_norm": 13.549367264662392, "learning_rate": 4.134016880526962e-05, "loss": 2.4035, "mean_token_accuracy": 0.41379310488700866, "step": 41045 }, { "epoch": 0.04134599598525847, "grad_norm": 17.554360802521778, "learning_rate": 4.134520476200069e-05, "loss": 2.9433, "mean_token_accuracy": 0.36551724672317504, "step": 41050 }, { "epoch": 0.041351032038362635, "grad_norm": 14.40718142106458, "learning_rate": 4.135024071873175e-05, "loss": 2.708, "mean_token_accuracy": 0.4034482717514038, "step": 41055 }, { "epoch": 0.04135606809146681, "grad_norm": 17.394742010907283, "learning_rate": 4.1355276675462807e-05, "loss": 2.7567, "mean_token_accuracy": 0.3793103456497192, "step": 41060 }, { "epoch": 0.04136110414457098, "grad_norm": 24.898044976946323, "learning_rate": 4.1360312632193866e-05, "loss": 2.5342, "mean_token_accuracy": 0.3793103456497192, "step": 41065 }, { "epoch": 0.04136614019767516, "grad_norm": 11.442759671823191, "learning_rate": 4.1365348588924925e-05, "loss": 2.382, "mean_token_accuracy": 0.43103448748588563, "step": 41070 }, { "epoch": 0.04137117625077933, "grad_norm": 16.668984143667156, "learning_rate": 4.137038454565599e-05, "loss": 2.2302, "mean_token_accuracy": 0.4034482717514038, "step": 41075 }, { "epoch": 0.041376212303883504, "grad_norm": 13.794265804700265, "learning_rate": 4.1375420502387044e-05, "loss": 2.4324, "mean_token_accuracy": 0.47586206793785096, "step": 41080 }, { "epoch": 0.04138124835698768, "grad_norm": 21.967810262894325, "learning_rate": 4.13804564591181e-05, "loss": 2.7261, "mean_token_accuracy": 0.3655172407627106, "step": 41085 }, { "epoch": 0.041386284410091845, "grad_norm": 14.290650095830253, "learning_rate": 4.138549241584916e-05, "loss": 2.6638, "mean_token_accuracy": 0.358620685338974, "step": 41090 }, { "epoch": 0.04139132046319602, "grad_norm": 17.955084166042813, "learning_rate": 4.139052837258022e-05, "loss": 2.6688, "mean_token_accuracy": 0.3999999940395355, "step": 41095 }, { "epoch": 0.04139635651630019, "grad_norm": 17.726595017148693, "learning_rate": 4.139556432931129e-05, "loss": 3.2398, "mean_token_accuracy": 0.37931033968925476, "step": 41100 }, { "epoch": 0.041401392569404366, "grad_norm": 13.803975215593738, "learning_rate": 4.140060028604235e-05, "loss": 2.6809, "mean_token_accuracy": 0.41724138259887694, "step": 41105 }, { "epoch": 0.04140642862250854, "grad_norm": 14.969865140735745, "learning_rate": 4.1405636242773406e-05, "loss": 2.7729, "mean_token_accuracy": 0.3810042321681976, "step": 41110 }, { "epoch": 0.041411464675612714, "grad_norm": 19.72395236966796, "learning_rate": 4.1410672199504465e-05, "loss": 2.6601, "mean_token_accuracy": 0.4068965554237366, "step": 41115 }, { "epoch": 0.04141650072871689, "grad_norm": 22.165726152241525, "learning_rate": 4.1415708156235524e-05, "loss": 3.0441, "mean_token_accuracy": 0.3344827562570572, "step": 41120 }, { "epoch": 0.041421536781821054, "grad_norm": 14.276038520085468, "learning_rate": 4.1420744112966584e-05, "loss": 2.6462, "mean_token_accuracy": 0.40344826579093934, "step": 41125 }, { "epoch": 0.04142657283492523, "grad_norm": 15.227849216636812, "learning_rate": 4.142578006969764e-05, "loss": 2.6021, "mean_token_accuracy": 0.3793103516101837, "step": 41130 }, { "epoch": 0.0414316088880294, "grad_norm": 14.88492297414623, "learning_rate": 4.14308160264287e-05, "loss": 3.1043, "mean_token_accuracy": 0.3068965494632721, "step": 41135 }, { "epoch": 0.041436644941133575, "grad_norm": 18.72426752622893, "learning_rate": 4.143585198315976e-05, "loss": 2.5418, "mean_token_accuracy": 0.38620689511299133, "step": 41140 }, { "epoch": 0.04144168099423775, "grad_norm": 13.057649633685267, "learning_rate": 4.144088793989082e-05, "loss": 2.9402, "mean_token_accuracy": 0.3482758641242981, "step": 41145 }, { "epoch": 0.04144671704734192, "grad_norm": 13.634383275664339, "learning_rate": 4.144592389662188e-05, "loss": 3.0117, "mean_token_accuracy": 0.37586207389831544, "step": 41150 }, { "epoch": 0.0414517531004461, "grad_norm": 15.3029209670836, "learning_rate": 4.1450959853352946e-05, "loss": 2.8006, "mean_token_accuracy": 0.38620689511299133, "step": 41155 }, { "epoch": 0.041456789153550264, "grad_norm": 22.389501960311364, "learning_rate": 4.1455995810084005e-05, "loss": 2.5555, "mean_token_accuracy": 0.47071990966796873, "step": 41160 }, { "epoch": 0.04146182520665444, "grad_norm": 14.786013433363001, "learning_rate": 4.1461031766815064e-05, "loss": 2.3108, "mean_token_accuracy": 0.4310344815254211, "step": 41165 }, { "epoch": 0.04146686125975861, "grad_norm": 18.08520394889536, "learning_rate": 4.1466067723546124e-05, "loss": 2.7539, "mean_token_accuracy": 0.3827586233615875, "step": 41170 }, { "epoch": 0.041471897312862785, "grad_norm": 14.63492886045943, "learning_rate": 4.1471103680277176e-05, "loss": 2.7212, "mean_token_accuracy": 0.3965517282485962, "step": 41175 }, { "epoch": 0.04147693336596696, "grad_norm": 15.744077436140618, "learning_rate": 4.147613963700824e-05, "loss": 2.6602, "mean_token_accuracy": 0.38275861740112305, "step": 41180 }, { "epoch": 0.04148196941907113, "grad_norm": 13.690409910973099, "learning_rate": 4.14811755937393e-05, "loss": 2.2618, "mean_token_accuracy": 0.4, "step": 41185 }, { "epoch": 0.041487005472175306, "grad_norm": 13.392073387141549, "learning_rate": 4.148621155047036e-05, "loss": 2.4978, "mean_token_accuracy": 0.449969744682312, "step": 41190 }, { "epoch": 0.04149204152527947, "grad_norm": 13.539970388178874, "learning_rate": 4.149124750720142e-05, "loss": 2.6495, "mean_token_accuracy": 0.3689655244350433, "step": 41195 }, { "epoch": 0.04149707757838365, "grad_norm": 13.971926197438785, "learning_rate": 4.149628346393248e-05, "loss": 2.6975, "mean_token_accuracy": 0.35862069129943847, "step": 41200 }, { "epoch": 0.04150211363148782, "grad_norm": 11.967713234288349, "learning_rate": 4.150131942066354e-05, "loss": 2.5316, "mean_token_accuracy": 0.3862068891525269, "step": 41205 }, { "epoch": 0.041507149684591994, "grad_norm": 15.188985601352249, "learning_rate": 4.1506355377394604e-05, "loss": 2.4204, "mean_token_accuracy": 0.3931034505367279, "step": 41210 }, { "epoch": 0.04151218573769617, "grad_norm": 12.031832599974363, "learning_rate": 4.151139133412566e-05, "loss": 2.2701, "mean_token_accuracy": 0.42413793206214906, "step": 41215 }, { "epoch": 0.04151722179080034, "grad_norm": 12.277975250778423, "learning_rate": 4.1516427290856716e-05, "loss": 2.5418, "mean_token_accuracy": 0.41724138259887694, "step": 41220 }, { "epoch": 0.041522257843904516, "grad_norm": 11.03156192235485, "learning_rate": 4.1521463247587775e-05, "loss": 2.3875, "mean_token_accuracy": 0.41379310488700866, "step": 41225 }, { "epoch": 0.04152729389700868, "grad_norm": 16.072934787809217, "learning_rate": 4.1526499204318835e-05, "loss": 2.7604, "mean_token_accuracy": 0.4, "step": 41230 }, { "epoch": 0.041532329950112856, "grad_norm": 15.122269921090608, "learning_rate": 4.15315351610499e-05, "loss": 2.5147, "mean_token_accuracy": 0.4225045382976532, "step": 41235 }, { "epoch": 0.04153736600321703, "grad_norm": 17.94022118267691, "learning_rate": 4.153657111778096e-05, "loss": 2.8398, "mean_token_accuracy": 0.3620689630508423, "step": 41240 }, { "epoch": 0.041542402056321204, "grad_norm": 18.800729543934647, "learning_rate": 4.154160707451202e-05, "loss": 2.4401, "mean_token_accuracy": 0.4206896543502808, "step": 41245 }, { "epoch": 0.04154743810942538, "grad_norm": 19.65109828813384, "learning_rate": 4.154664303124308e-05, "loss": 2.9384, "mean_token_accuracy": 0.3275862097740173, "step": 41250 }, { "epoch": 0.04155247416252955, "grad_norm": 13.310684574871418, "learning_rate": 4.155167898797414e-05, "loss": 2.6128, "mean_token_accuracy": 0.3724137842655182, "step": 41255 }, { "epoch": 0.041557510215633725, "grad_norm": 19.43584220063741, "learning_rate": 4.15567149447052e-05, "loss": 2.6336, "mean_token_accuracy": 0.45396249890327456, "step": 41260 }, { "epoch": 0.04156254626873789, "grad_norm": 14.50507161059811, "learning_rate": 4.1561750901436256e-05, "loss": 2.5908, "mean_token_accuracy": 0.3655172407627106, "step": 41265 }, { "epoch": 0.041567582321842066, "grad_norm": 11.650635073771703, "learning_rate": 4.1566786858167316e-05, "loss": 2.7806, "mean_token_accuracy": 0.358620685338974, "step": 41270 }, { "epoch": 0.04157261837494624, "grad_norm": 13.760952094118604, "learning_rate": 4.1571822814898375e-05, "loss": 2.7298, "mean_token_accuracy": 0.38620689511299133, "step": 41275 }, { "epoch": 0.04157765442805041, "grad_norm": 12.548975116886325, "learning_rate": 4.1576858771629434e-05, "loss": 2.876, "mean_token_accuracy": 0.34137931764125823, "step": 41280 }, { "epoch": 0.04158269048115459, "grad_norm": 13.956213457097428, "learning_rate": 4.158189472836049e-05, "loss": 2.24, "mean_token_accuracy": 0.4551724135875702, "step": 41285 }, { "epoch": 0.04158772653425876, "grad_norm": 49.69084829804417, "learning_rate": 4.158693068509156e-05, "loss": 3.1969, "mean_token_accuracy": 0.3655172437429428, "step": 41290 }, { "epoch": 0.041592762587362934, "grad_norm": 13.981379815087218, "learning_rate": 4.159196664182262e-05, "loss": 2.3562, "mean_token_accuracy": 0.44482757449150084, "step": 41295 }, { "epoch": 0.0415977986404671, "grad_norm": 15.403457467624529, "learning_rate": 4.159700259855368e-05, "loss": 2.9595, "mean_token_accuracy": 0.3724138021469116, "step": 41300 }, { "epoch": 0.041602834693571275, "grad_norm": 12.431090054433014, "learning_rate": 4.160203855528473e-05, "loss": 2.8006, "mean_token_accuracy": 0.3310344874858856, "step": 41305 }, { "epoch": 0.04160787074667545, "grad_norm": 13.18110872472664, "learning_rate": 4.160707451201579e-05, "loss": 2.771, "mean_token_accuracy": 0.35862069129943847, "step": 41310 }, { "epoch": 0.04161290679977962, "grad_norm": 14.861732553323238, "learning_rate": 4.1612110468746856e-05, "loss": 3.1349, "mean_token_accuracy": 0.33448275923728943, "step": 41315 }, { "epoch": 0.041617942852883796, "grad_norm": 10.614103874888148, "learning_rate": 4.1617146425477915e-05, "loss": 2.3604, "mean_token_accuracy": 0.42758620381355283, "step": 41320 }, { "epoch": 0.04162297890598797, "grad_norm": 18.579272521539604, "learning_rate": 4.1622182382208974e-05, "loss": 2.4029, "mean_token_accuracy": 0.46067755222320556, "step": 41325 }, { "epoch": 0.041628014959092144, "grad_norm": 12.405900187033808, "learning_rate": 4.162721833894003e-05, "loss": 2.3266, "mean_token_accuracy": 0.42758620977401735, "step": 41330 }, { "epoch": 0.04163305101219631, "grad_norm": 13.23086755774929, "learning_rate": 4.163225429567109e-05, "loss": 2.9464, "mean_token_accuracy": 0.36896551847457887, "step": 41335 }, { "epoch": 0.041638087065300484, "grad_norm": 14.364062007184435, "learning_rate": 4.163729025240216e-05, "loss": 2.5793, "mean_token_accuracy": 0.33793103098869326, "step": 41340 }, { "epoch": 0.04164312311840466, "grad_norm": 13.968180786689885, "learning_rate": 4.164232620913321e-05, "loss": 2.535, "mean_token_accuracy": 0.44827587008476255, "step": 41345 }, { "epoch": 0.04164815917150883, "grad_norm": 12.684564198780544, "learning_rate": 4.164736216586427e-05, "loss": 2.7156, "mean_token_accuracy": 0.4034482717514038, "step": 41350 }, { "epoch": 0.041653195224613006, "grad_norm": 14.57429857730503, "learning_rate": 4.165239812259533e-05, "loss": 2.3134, "mean_token_accuracy": 0.41034482717514037, "step": 41355 }, { "epoch": 0.04165823127771718, "grad_norm": 14.179740373168228, "learning_rate": 4.165743407932639e-05, "loss": 3.038, "mean_token_accuracy": 0.39165154099464417, "step": 41360 }, { "epoch": 0.04166326733082135, "grad_norm": 14.747401643973403, "learning_rate": 4.1662470036057455e-05, "loss": 2.7026, "mean_token_accuracy": 0.36551723480224607, "step": 41365 }, { "epoch": 0.04166830338392552, "grad_norm": 12.824293556730922, "learning_rate": 4.1667505992788514e-05, "loss": 2.5718, "mean_token_accuracy": 0.40145190358161925, "step": 41370 }, { "epoch": 0.041673339437029694, "grad_norm": 16.46905648852369, "learning_rate": 4.167254194951957e-05, "loss": 2.6636, "mean_token_accuracy": 0.4206896543502808, "step": 41375 }, { "epoch": 0.04167837549013387, "grad_norm": 14.382500247906147, "learning_rate": 4.167757790625063e-05, "loss": 2.8107, "mean_token_accuracy": 0.32413792610168457, "step": 41380 }, { "epoch": 0.04168341154323804, "grad_norm": 11.964911376859925, "learning_rate": 4.168261386298169e-05, "loss": 2.5012, "mean_token_accuracy": 0.38965516686439516, "step": 41385 }, { "epoch": 0.041688447596342215, "grad_norm": 15.359449529815512, "learning_rate": 4.168764981971275e-05, "loss": 2.5499, "mean_token_accuracy": 0.37931033968925476, "step": 41390 }, { "epoch": 0.04169348364944639, "grad_norm": 18.04229883972483, "learning_rate": 4.169268577644381e-05, "loss": 2.7145, "mean_token_accuracy": 0.39310344457626345, "step": 41395 }, { "epoch": 0.04169851970255056, "grad_norm": 20.652033830011202, "learning_rate": 4.169772173317487e-05, "loss": 2.8569, "mean_token_accuracy": 0.36896551251411436, "step": 41400 }, { "epoch": 0.04170355575565473, "grad_norm": 17.441893075424723, "learning_rate": 4.170275768990593e-05, "loss": 2.3557, "mean_token_accuracy": 0.37241379022598264, "step": 41405 }, { "epoch": 0.0417085918087589, "grad_norm": 13.76016651083468, "learning_rate": 4.170779364663699e-05, "loss": 2.749, "mean_token_accuracy": 0.3241379350423813, "step": 41410 }, { "epoch": 0.04171362786186308, "grad_norm": 12.151249614705563, "learning_rate": 4.171282960336805e-05, "loss": 2.2827, "mean_token_accuracy": 0.4517241418361664, "step": 41415 }, { "epoch": 0.04171866391496725, "grad_norm": 15.591512579545203, "learning_rate": 4.1717865560099113e-05, "loss": 2.4828, "mean_token_accuracy": 0.4068965554237366, "step": 41420 }, { "epoch": 0.041723699968071425, "grad_norm": 13.297030108195143, "learning_rate": 4.172290151683017e-05, "loss": 2.2595, "mean_token_accuracy": 0.4413793087005615, "step": 41425 }, { "epoch": 0.0417287360211756, "grad_norm": 14.025324888402166, "learning_rate": 4.172793747356123e-05, "loss": 2.2977, "mean_token_accuracy": 0.4568663060665131, "step": 41430 }, { "epoch": 0.04173377207427977, "grad_norm": 20.01779592351257, "learning_rate": 4.173297343029229e-05, "loss": 2.8114, "mean_token_accuracy": 0.36551723480224607, "step": 41435 }, { "epoch": 0.04173880812738394, "grad_norm": 13.91000361551655, "learning_rate": 4.1738009387023344e-05, "loss": 2.4125, "mean_token_accuracy": 0.4229280173778534, "step": 41440 }, { "epoch": 0.04174384418048811, "grad_norm": 15.754327259476941, "learning_rate": 4.174304534375441e-05, "loss": 2.6701, "mean_token_accuracy": 0.3689655065536499, "step": 41445 }, { "epoch": 0.041748880233592287, "grad_norm": 10.288310613628173, "learning_rate": 4.174808130048547e-05, "loss": 2.248, "mean_token_accuracy": 0.4551724135875702, "step": 41450 }, { "epoch": 0.04175391628669646, "grad_norm": 17.746416110538224, "learning_rate": 4.175311725721653e-05, "loss": 2.5861, "mean_token_accuracy": 0.36896551251411436, "step": 41455 }, { "epoch": 0.041758952339800634, "grad_norm": 13.815651047743707, "learning_rate": 4.175815321394759e-05, "loss": 2.4924, "mean_token_accuracy": 0.4, "step": 41460 }, { "epoch": 0.04176398839290481, "grad_norm": 13.006045671977251, "learning_rate": 4.176318917067865e-05, "loss": 2.1947, "mean_token_accuracy": 0.4448275864124298, "step": 41465 }, { "epoch": 0.04176902444600898, "grad_norm": 11.308526439456026, "learning_rate": 4.1768225127409706e-05, "loss": 2.3156, "mean_token_accuracy": 0.41379310488700866, "step": 41470 }, { "epoch": 0.04177406049911315, "grad_norm": 16.024539077379107, "learning_rate": 4.177326108414077e-05, "loss": 2.423, "mean_token_accuracy": 0.4310344815254211, "step": 41475 }, { "epoch": 0.04177909655221732, "grad_norm": 13.203006591749823, "learning_rate": 4.1778297040871824e-05, "loss": 2.6127, "mean_token_accuracy": 0.41034482717514037, "step": 41480 }, { "epoch": 0.041784132605321496, "grad_norm": 14.681668284189819, "learning_rate": 4.1783332997602884e-05, "loss": 2.4855, "mean_token_accuracy": 0.4310344815254211, "step": 41485 }, { "epoch": 0.04178916865842567, "grad_norm": 13.893456610697903, "learning_rate": 4.178836895433394e-05, "loss": 2.3494, "mean_token_accuracy": 0.47447065711021424, "step": 41490 }, { "epoch": 0.04179420471152984, "grad_norm": 11.891801505877606, "learning_rate": 4.1793404911065e-05, "loss": 2.3974, "mean_token_accuracy": 0.4551724135875702, "step": 41495 }, { "epoch": 0.04179924076463402, "grad_norm": 13.220660100746214, "learning_rate": 4.179844086779607e-05, "loss": 2.4542, "mean_token_accuracy": 0.3862069010734558, "step": 41500 }, { "epoch": 0.04180427681773819, "grad_norm": 13.73682151540009, "learning_rate": 4.180347682452713e-05, "loss": 2.5342, "mean_token_accuracy": 0.4344827651977539, "step": 41505 }, { "epoch": 0.04180931287084236, "grad_norm": 12.835747085722033, "learning_rate": 4.180851278125819e-05, "loss": 2.3436, "mean_token_accuracy": 0.417241370677948, "step": 41510 }, { "epoch": 0.04181434892394653, "grad_norm": 12.754565771149778, "learning_rate": 4.1813548737989246e-05, "loss": 2.3413, "mean_token_accuracy": 0.4620689630508423, "step": 41515 }, { "epoch": 0.041819384977050705, "grad_norm": 15.138618969940085, "learning_rate": 4.1818584694720305e-05, "loss": 2.606, "mean_token_accuracy": 0.4, "step": 41520 }, { "epoch": 0.04182442103015488, "grad_norm": 11.785215921888069, "learning_rate": 4.1823620651451365e-05, "loss": 2.7353, "mean_token_accuracy": 0.3999999940395355, "step": 41525 }, { "epoch": 0.04182945708325905, "grad_norm": 14.106720060299784, "learning_rate": 4.1828656608182424e-05, "loss": 2.8909, "mean_token_accuracy": 0.3896551728248596, "step": 41530 }, { "epoch": 0.04183449313636323, "grad_norm": 11.823814028080365, "learning_rate": 4.183369256491348e-05, "loss": 2.5415, "mean_token_accuracy": 0.3482758551836014, "step": 41535 }, { "epoch": 0.0418395291894674, "grad_norm": 16.316010495530996, "learning_rate": 4.183872852164454e-05, "loss": 2.7488, "mean_token_accuracy": 0.3931034505367279, "step": 41540 }, { "epoch": 0.04184456524257157, "grad_norm": 12.188700100630532, "learning_rate": 4.18437644783756e-05, "loss": 2.5227, "mean_token_accuracy": 0.42413792610168455, "step": 41545 }, { "epoch": 0.04184960129567574, "grad_norm": 24.860162493224838, "learning_rate": 4.184880043510666e-05, "loss": 2.7558, "mean_token_accuracy": 0.38620689511299133, "step": 41550 }, { "epoch": 0.041854637348779915, "grad_norm": 13.782909705436278, "learning_rate": 4.185383639183773e-05, "loss": 2.7235, "mean_token_accuracy": 0.3896551728248596, "step": 41555 }, { "epoch": 0.04185967340188409, "grad_norm": 12.80075285186288, "learning_rate": 4.1858872348568786e-05, "loss": 2.6448, "mean_token_accuracy": 0.42758620381355283, "step": 41560 }, { "epoch": 0.04186470945498826, "grad_norm": 11.58268670858098, "learning_rate": 4.1863908305299845e-05, "loss": 2.4803, "mean_token_accuracy": 0.44137930274009707, "step": 41565 }, { "epoch": 0.041869745508092436, "grad_norm": 18.895592544981422, "learning_rate": 4.1868944262030905e-05, "loss": 2.9573, "mean_token_accuracy": 0.3344827562570572, "step": 41570 }, { "epoch": 0.04187478156119661, "grad_norm": 12.4378210710087, "learning_rate": 4.187398021876196e-05, "loss": 2.6072, "mean_token_accuracy": 0.4, "step": 41575 }, { "epoch": 0.04187981761430078, "grad_norm": 13.857686376418835, "learning_rate": 4.187901617549302e-05, "loss": 2.3741, "mean_token_accuracy": 0.44827585816383364, "step": 41580 }, { "epoch": 0.04188485366740495, "grad_norm": 18.692672368729866, "learning_rate": 4.188405213222408e-05, "loss": 2.9586, "mean_token_accuracy": 0.32758620381355286, "step": 41585 }, { "epoch": 0.041889889720509124, "grad_norm": 10.414950504374703, "learning_rate": 4.188908808895514e-05, "loss": 1.9589, "mean_token_accuracy": 0.49999999403953554, "step": 41590 }, { "epoch": 0.0418949257736133, "grad_norm": 13.90200027800352, "learning_rate": 4.18941240456862e-05, "loss": 2.1934, "mean_token_accuracy": 0.4620689570903778, "step": 41595 }, { "epoch": 0.04189996182671747, "grad_norm": 12.058743006538208, "learning_rate": 4.189916000241726e-05, "loss": 2.9766, "mean_token_accuracy": 0.3344827562570572, "step": 41600 }, { "epoch": 0.041904997879821645, "grad_norm": 27.594381144469853, "learning_rate": 4.1904195959148326e-05, "loss": 2.8017, "mean_token_accuracy": 0.35862069129943847, "step": 41605 }, { "epoch": 0.04191003393292582, "grad_norm": 14.781095319543887, "learning_rate": 4.1909231915879385e-05, "loss": 2.6863, "mean_token_accuracy": 0.3655172407627106, "step": 41610 }, { "epoch": 0.041915069986029986, "grad_norm": 12.316890887684048, "learning_rate": 4.191426787261044e-05, "loss": 2.1493, "mean_token_accuracy": 0.4551724076271057, "step": 41615 }, { "epoch": 0.04192010603913416, "grad_norm": 12.57431801081382, "learning_rate": 4.19193038293415e-05, "loss": 2.7196, "mean_token_accuracy": 0.41034482717514037, "step": 41620 }, { "epoch": 0.041925142092238334, "grad_norm": 12.402311147187143, "learning_rate": 4.1924339786072556e-05, "loss": 2.9017, "mean_token_accuracy": 0.38100423812866213, "step": 41625 }, { "epoch": 0.04193017814534251, "grad_norm": 16.25893999664443, "learning_rate": 4.1929375742803616e-05, "loss": 2.7347, "mean_token_accuracy": 0.4310344815254211, "step": 41630 }, { "epoch": 0.04193521419844668, "grad_norm": 16.56348751435573, "learning_rate": 4.193441169953468e-05, "loss": 2.2631, "mean_token_accuracy": 0.4206896543502808, "step": 41635 }, { "epoch": 0.041940250251550855, "grad_norm": 15.226048619316597, "learning_rate": 4.193944765626574e-05, "loss": 2.7178, "mean_token_accuracy": 0.37931033968925476, "step": 41640 }, { "epoch": 0.04194528630465503, "grad_norm": 16.141172199150358, "learning_rate": 4.19444836129968e-05, "loss": 2.5711, "mean_token_accuracy": 0.4172413766384125, "step": 41645 }, { "epoch": 0.041950322357759195, "grad_norm": 14.289712125110631, "learning_rate": 4.194951956972786e-05, "loss": 2.6165, "mean_token_accuracy": 0.3705989122390747, "step": 41650 }, { "epoch": 0.04195535841086337, "grad_norm": 16.561235495062853, "learning_rate": 4.195455552645892e-05, "loss": 3.0758, "mean_token_accuracy": 0.36206896901130675, "step": 41655 }, { "epoch": 0.04196039446396754, "grad_norm": 14.517579475588162, "learning_rate": 4.195959148318998e-05, "loss": 2.6308, "mean_token_accuracy": 0.38620689511299133, "step": 41660 }, { "epoch": 0.04196543051707172, "grad_norm": 12.546968940414992, "learning_rate": 4.196462743992104e-05, "loss": 2.4348, "mean_token_accuracy": 0.41034482717514037, "step": 41665 }, { "epoch": 0.04197046657017589, "grad_norm": 19.664358909350923, "learning_rate": 4.1969663396652096e-05, "loss": 2.7112, "mean_token_accuracy": 0.3965517282485962, "step": 41670 }, { "epoch": 0.041975502623280064, "grad_norm": 12.196021361296129, "learning_rate": 4.1974699353383156e-05, "loss": 2.7838, "mean_token_accuracy": 0.3981246203184128, "step": 41675 }, { "epoch": 0.04198053867638424, "grad_norm": 12.196797249522211, "learning_rate": 4.1979735310114215e-05, "loss": 2.6309, "mean_token_accuracy": 0.41379310488700866, "step": 41680 }, { "epoch": 0.041985574729488405, "grad_norm": 11.845231325497963, "learning_rate": 4.198477126684528e-05, "loss": 2.5764, "mean_token_accuracy": 0.34137930572032926, "step": 41685 }, { "epoch": 0.04199061078259258, "grad_norm": 14.52205529075529, "learning_rate": 4.198980722357634e-05, "loss": 2.6205, "mean_token_accuracy": 0.4, "step": 41690 }, { "epoch": 0.04199564683569675, "grad_norm": 13.986418280413199, "learning_rate": 4.19948431803074e-05, "loss": 2.6281, "mean_token_accuracy": 0.37586206793785093, "step": 41695 }, { "epoch": 0.042000682888800926, "grad_norm": 21.69263993191705, "learning_rate": 4.199987913703846e-05, "loss": 2.7697, "mean_token_accuracy": 0.4486453115940094, "step": 41700 }, { "epoch": 0.0420057189419051, "grad_norm": 11.899754992699709, "learning_rate": 4.200491509376952e-05, "loss": 2.242, "mean_token_accuracy": 0.47931034564971925, "step": 41705 }, { "epoch": 0.042010754995009274, "grad_norm": 14.197205453966133, "learning_rate": 4.200995105050058e-05, "loss": 2.6726, "mean_token_accuracy": 0.4000000059604645, "step": 41710 }, { "epoch": 0.04201579104811345, "grad_norm": 14.319816827391895, "learning_rate": 4.2014987007231636e-05, "loss": 2.7133, "mean_token_accuracy": 0.42413792908191683, "step": 41715 }, { "epoch": 0.042020827101217614, "grad_norm": 18.49433381223356, "learning_rate": 4.2020022963962696e-05, "loss": 3.3711, "mean_token_accuracy": 0.36521475911140444, "step": 41720 }, { "epoch": 0.04202586315432179, "grad_norm": 12.832568538903356, "learning_rate": 4.2025058920693755e-05, "loss": 2.4101, "mean_token_accuracy": 0.39310344457626345, "step": 41725 }, { "epoch": 0.04203089920742596, "grad_norm": 19.72543660561526, "learning_rate": 4.2030094877424814e-05, "loss": 2.3535, "mean_token_accuracy": 0.46896551847457885, "step": 41730 }, { "epoch": 0.042035935260530136, "grad_norm": 12.58584181900052, "learning_rate": 4.2035130834155873e-05, "loss": 2.8308, "mean_token_accuracy": 0.41034482717514037, "step": 41735 }, { "epoch": 0.04204097131363431, "grad_norm": 16.45054280482935, "learning_rate": 4.204016679088694e-05, "loss": 2.5882, "mean_token_accuracy": 0.42413792610168455, "step": 41740 }, { "epoch": 0.04204600736673848, "grad_norm": 19.811649460175378, "learning_rate": 4.2045202747618e-05, "loss": 2.8868, "mean_token_accuracy": 0.4, "step": 41745 }, { "epoch": 0.04205104341984266, "grad_norm": 13.869608553108694, "learning_rate": 4.205023870434905e-05, "loss": 2.5202, "mean_token_accuracy": 0.39310344457626345, "step": 41750 }, { "epoch": 0.042056079472946824, "grad_norm": 14.798758324402543, "learning_rate": 4.205527466108011e-05, "loss": 2.6703, "mean_token_accuracy": 0.37586206793785093, "step": 41755 }, { "epoch": 0.042061115526051, "grad_norm": 14.977398058464582, "learning_rate": 4.206031061781117e-05, "loss": 2.4922, "mean_token_accuracy": 0.3793103456497192, "step": 41760 }, { "epoch": 0.04206615157915517, "grad_norm": 14.211552459282368, "learning_rate": 4.2065346574542236e-05, "loss": 2.3848, "mean_token_accuracy": 0.44652147889137267, "step": 41765 }, { "epoch": 0.042071187632259345, "grad_norm": 12.17146546151082, "learning_rate": 4.2070382531273295e-05, "loss": 2.3536, "mean_token_accuracy": 0.43236538767814636, "step": 41770 }, { "epoch": 0.04207622368536352, "grad_norm": 16.863958240925193, "learning_rate": 4.2075418488004354e-05, "loss": 2.7531, "mean_token_accuracy": 0.37586207389831544, "step": 41775 }, { "epoch": 0.04208125973846769, "grad_norm": 13.510037655146801, "learning_rate": 4.2080454444735414e-05, "loss": 2.5447, "mean_token_accuracy": 0.4137930989265442, "step": 41780 }, { "epoch": 0.042086295791571866, "grad_norm": 14.871698933290757, "learning_rate": 4.208549040146647e-05, "loss": 2.8904, "mean_token_accuracy": 0.3793103486299515, "step": 41785 }, { "epoch": 0.04209133184467603, "grad_norm": 12.249409337383172, "learning_rate": 4.209052635819753e-05, "loss": 2.3642, "mean_token_accuracy": 0.3965517282485962, "step": 41790 }, { "epoch": 0.04209636789778021, "grad_norm": 14.484971940424183, "learning_rate": 4.209556231492859e-05, "loss": 2.5277, "mean_token_accuracy": 0.41379311084747317, "step": 41795 }, { "epoch": 0.04210140395088438, "grad_norm": 18.505288290979788, "learning_rate": 4.210059827165965e-05, "loss": 2.8781, "mean_token_accuracy": 0.41724138259887694, "step": 41800 }, { "epoch": 0.042106440003988554, "grad_norm": 10.981973479120942, "learning_rate": 4.210563422839071e-05, "loss": 2.2296, "mean_token_accuracy": 0.4833743929862976, "step": 41805 }, { "epoch": 0.04211147605709273, "grad_norm": 13.128831235684228, "learning_rate": 4.211067018512177e-05, "loss": 2.4139, "mean_token_accuracy": 0.4551724076271057, "step": 41810 }, { "epoch": 0.0421165121101969, "grad_norm": 13.848885356014558, "learning_rate": 4.211570614185283e-05, "loss": 2.3585, "mean_token_accuracy": 0.42758620977401735, "step": 41815 }, { "epoch": 0.042121548163301076, "grad_norm": 15.27973561704685, "learning_rate": 4.2120742098583894e-05, "loss": 2.4664, "mean_token_accuracy": 0.3896551638841629, "step": 41820 }, { "epoch": 0.04212658421640524, "grad_norm": 18.232381575854937, "learning_rate": 4.2125778055314954e-05, "loss": 2.5992, "mean_token_accuracy": 0.4034482717514038, "step": 41825 }, { "epoch": 0.042131620269509416, "grad_norm": 35.92981112914529, "learning_rate": 4.213081401204601e-05, "loss": 3.2475, "mean_token_accuracy": 0.3068965464830399, "step": 41830 }, { "epoch": 0.04213665632261359, "grad_norm": 19.293098380789058, "learning_rate": 4.213584996877707e-05, "loss": 2.7086, "mean_token_accuracy": 0.41379310488700866, "step": 41835 }, { "epoch": 0.042141692375717764, "grad_norm": 12.292907334985212, "learning_rate": 4.2140885925508125e-05, "loss": 3.001, "mean_token_accuracy": 0.3633393824100494, "step": 41840 }, { "epoch": 0.04214672842882194, "grad_norm": 11.603784730699676, "learning_rate": 4.214592188223919e-05, "loss": 2.5251, "mean_token_accuracy": 0.39854809641838074, "step": 41845 }, { "epoch": 0.04215176448192611, "grad_norm": 10.853543406412781, "learning_rate": 4.215095783897025e-05, "loss": 2.4223, "mean_token_accuracy": 0.3827586233615875, "step": 41850 }, { "epoch": 0.042156800535030285, "grad_norm": 12.300983182140644, "learning_rate": 4.215599379570131e-05, "loss": 2.1721, "mean_token_accuracy": 0.4034482717514038, "step": 41855 }, { "epoch": 0.04216183658813445, "grad_norm": 13.37102439527431, "learning_rate": 4.216102975243237e-05, "loss": 2.4762, "mean_token_accuracy": 0.43103447556495667, "step": 41860 }, { "epoch": 0.042166872641238626, "grad_norm": 15.324667298293514, "learning_rate": 4.216606570916343e-05, "loss": 2.6459, "mean_token_accuracy": 0.4068965494632721, "step": 41865 }, { "epoch": 0.0421719086943428, "grad_norm": 37.98088179474751, "learning_rate": 4.2171101665894494e-05, "loss": 2.6729, "mean_token_accuracy": 0.3620689630508423, "step": 41870 }, { "epoch": 0.04217694474744697, "grad_norm": 12.068133269757723, "learning_rate": 4.217613762262555e-05, "loss": 2.5234, "mean_token_accuracy": 0.4344827592372894, "step": 41875 }, { "epoch": 0.04218198080055115, "grad_norm": 15.76145118699871, "learning_rate": 4.2181173579356605e-05, "loss": 2.4281, "mean_token_accuracy": 0.42758620977401735, "step": 41880 }, { "epoch": 0.04218701685365532, "grad_norm": 14.910335477221567, "learning_rate": 4.2186209536087665e-05, "loss": 2.7649, "mean_token_accuracy": 0.3758620709180832, "step": 41885 }, { "epoch": 0.042192052906759495, "grad_norm": 11.81527555509976, "learning_rate": 4.2191245492818724e-05, "loss": 2.5669, "mean_token_accuracy": 0.4103448301553726, "step": 41890 }, { "epoch": 0.04219708895986366, "grad_norm": 12.728966719523346, "learning_rate": 4.219628144954978e-05, "loss": 2.6018, "mean_token_accuracy": 0.3482758581638336, "step": 41895 }, { "epoch": 0.042202125012967835, "grad_norm": 12.883633091968685, "learning_rate": 4.220131740628085e-05, "loss": 2.415, "mean_token_accuracy": 0.42413792610168455, "step": 41900 }, { "epoch": 0.04220716106607201, "grad_norm": 14.008719081821134, "learning_rate": 4.220635336301191e-05, "loss": 2.6938, "mean_token_accuracy": 0.34137930572032926, "step": 41905 }, { "epoch": 0.04221219711917618, "grad_norm": 12.8733060908149, "learning_rate": 4.221138931974297e-05, "loss": 2.4629, "mean_token_accuracy": 0.42068964838981626, "step": 41910 }, { "epoch": 0.042217233172280356, "grad_norm": 14.369915547441158, "learning_rate": 4.221642527647403e-05, "loss": 2.6673, "mean_token_accuracy": 0.3517241358757019, "step": 41915 }, { "epoch": 0.04222226922538453, "grad_norm": 29.377598118090567, "learning_rate": 4.2221461233205086e-05, "loss": 3.3392, "mean_token_accuracy": 0.31034482419490816, "step": 41920 }, { "epoch": 0.042227305278488704, "grad_norm": 13.031121699384238, "learning_rate": 4.2226497189936145e-05, "loss": 2.6579, "mean_token_accuracy": 0.4000000059604645, "step": 41925 }, { "epoch": 0.04223234133159287, "grad_norm": 12.569187069862734, "learning_rate": 4.2231533146667205e-05, "loss": 2.5284, "mean_token_accuracy": 0.4379310250282288, "step": 41930 }, { "epoch": 0.042237377384697045, "grad_norm": 11.713053192207543, "learning_rate": 4.2236569103398264e-05, "loss": 2.4914, "mean_token_accuracy": 0.3620689630508423, "step": 41935 }, { "epoch": 0.04224241343780122, "grad_norm": 11.885212434655756, "learning_rate": 4.224160506012932e-05, "loss": 2.4609, "mean_token_accuracy": 0.3793103456497192, "step": 41940 }, { "epoch": 0.04224744949090539, "grad_norm": 11.623040491163774, "learning_rate": 4.224664101686038e-05, "loss": 2.6083, "mean_token_accuracy": 0.4054446518421173, "step": 41945 }, { "epoch": 0.042252485544009566, "grad_norm": 13.828990567441382, "learning_rate": 4.225167697359145e-05, "loss": 2.178, "mean_token_accuracy": 0.4517241358757019, "step": 41950 }, { "epoch": 0.04225752159711374, "grad_norm": 13.72644657704056, "learning_rate": 4.225671293032251e-05, "loss": 2.8159, "mean_token_accuracy": 0.38275861740112305, "step": 41955 }, { "epoch": 0.04226255765021791, "grad_norm": 15.187757632517355, "learning_rate": 4.226174888705357e-05, "loss": 2.9138, "mean_token_accuracy": 0.34137930870056155, "step": 41960 }, { "epoch": 0.04226759370332208, "grad_norm": 12.413715368462153, "learning_rate": 4.2266784843784626e-05, "loss": 2.4774, "mean_token_accuracy": 0.43793103098869324, "step": 41965 }, { "epoch": 0.042272629756426254, "grad_norm": 13.778074047430437, "learning_rate": 4.2271820800515685e-05, "loss": 2.8406, "mean_token_accuracy": 0.32758620381355286, "step": 41970 }, { "epoch": 0.04227766580953043, "grad_norm": 15.537898966533053, "learning_rate": 4.227685675724674e-05, "loss": 3.0642, "mean_token_accuracy": 0.3565033197402954, "step": 41975 }, { "epoch": 0.0422827018626346, "grad_norm": 13.185269839919933, "learning_rate": 4.2281892713977804e-05, "loss": 2.4481, "mean_token_accuracy": 0.3965517163276672, "step": 41980 }, { "epoch": 0.042287737915738775, "grad_norm": 12.420004477453237, "learning_rate": 4.228692867070886e-05, "loss": 2.2869, "mean_token_accuracy": 0.45862067937850953, "step": 41985 }, { "epoch": 0.04229277396884295, "grad_norm": 17.975204420268977, "learning_rate": 4.229196462743992e-05, "loss": 2.9945, "mean_token_accuracy": 0.38747731447219846, "step": 41990 }, { "epoch": 0.04229781002194712, "grad_norm": 11.952792915868843, "learning_rate": 4.229700058417098e-05, "loss": 2.4427, "mean_token_accuracy": 0.43793101906776427, "step": 41995 }, { "epoch": 0.04230284607505129, "grad_norm": 12.832417725715763, "learning_rate": 4.230203654090204e-05, "loss": 2.4983, "mean_token_accuracy": 0.42758620977401735, "step": 42000 }, { "epoch": 0.04230788212815546, "grad_norm": 14.326387972889306, "learning_rate": 4.230707249763311e-05, "loss": 2.5622, "mean_token_accuracy": 0.34482758343219755, "step": 42005 }, { "epoch": 0.04231291818125964, "grad_norm": 14.617655120901565, "learning_rate": 4.2312108454364166e-05, "loss": 2.4867, "mean_token_accuracy": 0.42758620381355283, "step": 42010 }, { "epoch": 0.04231795423436381, "grad_norm": 13.418183133987174, "learning_rate": 4.231714441109522e-05, "loss": 2.5945, "mean_token_accuracy": 0.38275861740112305, "step": 42015 }, { "epoch": 0.042322990287467985, "grad_norm": 19.639909320233777, "learning_rate": 4.232218036782628e-05, "loss": 2.529, "mean_token_accuracy": 0.34137930870056155, "step": 42020 }, { "epoch": 0.04232802634057216, "grad_norm": 13.374467595890817, "learning_rate": 4.232721632455734e-05, "loss": 2.7504, "mean_token_accuracy": 0.4034482777118683, "step": 42025 }, { "epoch": 0.04233306239367633, "grad_norm": 12.143299335538108, "learning_rate": 4.23322522812884e-05, "loss": 2.1599, "mean_token_accuracy": 0.43980641961097716, "step": 42030 }, { "epoch": 0.0423380984467805, "grad_norm": 17.61451381729175, "learning_rate": 4.233728823801946e-05, "loss": 2.6257, "mean_token_accuracy": 0.3724137932062149, "step": 42035 }, { "epoch": 0.04234313449988467, "grad_norm": 13.847306709784116, "learning_rate": 4.234232419475052e-05, "loss": 2.5287, "mean_token_accuracy": 0.4068965554237366, "step": 42040 }, { "epoch": 0.04234817055298885, "grad_norm": 20.84006487057853, "learning_rate": 4.234736015148158e-05, "loss": 2.6803, "mean_token_accuracy": 0.38620689511299133, "step": 42045 }, { "epoch": 0.04235320660609302, "grad_norm": 12.870407741873784, "learning_rate": 4.235239610821264e-05, "loss": 2.4426, "mean_token_accuracy": 0.37241379022598264, "step": 42050 }, { "epoch": 0.042358242659197194, "grad_norm": 14.138877184780087, "learning_rate": 4.23574320649437e-05, "loss": 3.1251, "mean_token_accuracy": 0.3482758641242981, "step": 42055 }, { "epoch": 0.04236327871230137, "grad_norm": 25.720919805583133, "learning_rate": 4.236246802167476e-05, "loss": 2.9327, "mean_token_accuracy": 0.3980036199092865, "step": 42060 }, { "epoch": 0.04236831476540554, "grad_norm": 9.206785587147353, "learning_rate": 4.236750397840582e-05, "loss": 2.3184, "mean_token_accuracy": 0.43448275327682495, "step": 42065 }, { "epoch": 0.04237335081850971, "grad_norm": 18.975102113100423, "learning_rate": 4.237253993513688e-05, "loss": 2.5569, "mean_token_accuracy": 0.42643678188323975, "step": 42070 }, { "epoch": 0.04237838687161388, "grad_norm": 13.395251209303156, "learning_rate": 4.2377575891867937e-05, "loss": 2.5877, "mean_token_accuracy": 0.3896551728248596, "step": 42075 }, { "epoch": 0.042383422924718056, "grad_norm": 15.599159847855095, "learning_rate": 4.2382611848598996e-05, "loss": 2.6956, "mean_token_accuracy": 0.3931034505367279, "step": 42080 }, { "epoch": 0.04238845897782223, "grad_norm": 17.88762643063267, "learning_rate": 4.238764780533006e-05, "loss": 2.9442, "mean_token_accuracy": 0.37241379022598264, "step": 42085 }, { "epoch": 0.042393495030926404, "grad_norm": 18.754705378775704, "learning_rate": 4.239268376206112e-05, "loss": 3.0249, "mean_token_accuracy": 0.36896551847457887, "step": 42090 }, { "epoch": 0.04239853108403058, "grad_norm": 11.573587665864922, "learning_rate": 4.239771971879218e-05, "loss": 2.5534, "mean_token_accuracy": 0.42413792610168455, "step": 42095 }, { "epoch": 0.04240356713713475, "grad_norm": 18.166801202055836, "learning_rate": 4.240275567552324e-05, "loss": 2.6535, "mean_token_accuracy": 0.44349666833877566, "step": 42100 }, { "epoch": 0.04240860319023892, "grad_norm": 18.041575724305037, "learning_rate": 4.24077916322543e-05, "loss": 2.8118, "mean_token_accuracy": 0.3517241358757019, "step": 42105 }, { "epoch": 0.04241363924334309, "grad_norm": 17.34579171263323, "learning_rate": 4.241282758898536e-05, "loss": 2.5329, "mean_token_accuracy": 0.3931034505367279, "step": 42110 }, { "epoch": 0.042418675296447265, "grad_norm": 11.330988321707263, "learning_rate": 4.241786354571642e-05, "loss": 2.2997, "mean_token_accuracy": 0.4607380449771881, "step": 42115 }, { "epoch": 0.04242371134955144, "grad_norm": 14.075020274599758, "learning_rate": 4.2422899502447477e-05, "loss": 2.7019, "mean_token_accuracy": 0.42758620381355283, "step": 42120 }, { "epoch": 0.04242874740265561, "grad_norm": 15.0719883830294, "learning_rate": 4.2427935459178536e-05, "loss": 2.4078, "mean_token_accuracy": 0.43448275327682495, "step": 42125 }, { "epoch": 0.04243378345575979, "grad_norm": 15.596669861885054, "learning_rate": 4.2432971415909595e-05, "loss": 2.5482, "mean_token_accuracy": 0.3827586233615875, "step": 42130 }, { "epoch": 0.042438819508863954, "grad_norm": 13.45930697461373, "learning_rate": 4.243800737264066e-05, "loss": 2.2291, "mean_token_accuracy": 0.48457350730896, "step": 42135 }, { "epoch": 0.04244385556196813, "grad_norm": 12.928044486619804, "learning_rate": 4.244304332937172e-05, "loss": 2.5144, "mean_token_accuracy": 0.4034482777118683, "step": 42140 }, { "epoch": 0.0424488916150723, "grad_norm": 18.302949803594974, "learning_rate": 4.244807928610278e-05, "loss": 2.348, "mean_token_accuracy": 0.46551724076271056, "step": 42145 }, { "epoch": 0.042453927668176475, "grad_norm": 14.097152360466584, "learning_rate": 4.245311524283383e-05, "loss": 2.5896, "mean_token_accuracy": 0.3599515974521637, "step": 42150 }, { "epoch": 0.04245896372128065, "grad_norm": 15.873745275098289, "learning_rate": 4.245815119956489e-05, "loss": 2.4762, "mean_token_accuracy": 0.4034482777118683, "step": 42155 }, { "epoch": 0.04246399977438482, "grad_norm": 12.642225767181579, "learning_rate": 4.246318715629595e-05, "loss": 2.4846, "mean_token_accuracy": 0.4068965494632721, "step": 42160 }, { "epoch": 0.042469035827488996, "grad_norm": 15.742250200906533, "learning_rate": 4.246822311302702e-05, "loss": 2.6393, "mean_token_accuracy": 0.3931034505367279, "step": 42165 }, { "epoch": 0.04247407188059316, "grad_norm": 14.336653862827957, "learning_rate": 4.2473259069758076e-05, "loss": 2.2442, "mean_token_accuracy": 0.41203871965408323, "step": 42170 }, { "epoch": 0.04247910793369734, "grad_norm": 18.631276554370075, "learning_rate": 4.2478295026489135e-05, "loss": 2.8472, "mean_token_accuracy": 0.3241379290819168, "step": 42175 }, { "epoch": 0.04248414398680151, "grad_norm": 19.177148216566874, "learning_rate": 4.2483330983220194e-05, "loss": 2.5995, "mean_token_accuracy": 0.441379314661026, "step": 42180 }, { "epoch": 0.042489180039905684, "grad_norm": 14.870899194173267, "learning_rate": 4.2488366939951254e-05, "loss": 2.4629, "mean_token_accuracy": 0.39243799448013306, "step": 42185 }, { "epoch": 0.04249421609300986, "grad_norm": 17.319787922211166, "learning_rate": 4.249340289668231e-05, "loss": 3.0309, "mean_token_accuracy": 0.3655172407627106, "step": 42190 }, { "epoch": 0.04249925214611403, "grad_norm": 14.776330039628686, "learning_rate": 4.249843885341337e-05, "loss": 2.6946, "mean_token_accuracy": 0.39310344457626345, "step": 42195 }, { "epoch": 0.042504288199218206, "grad_norm": 11.958051709792558, "learning_rate": 4.250347481014443e-05, "loss": 2.2899, "mean_token_accuracy": 0.4344827711582184, "step": 42200 }, { "epoch": 0.04250932425232237, "grad_norm": 15.111951352392746, "learning_rate": 4.250851076687549e-05, "loss": 2.7625, "mean_token_accuracy": 0.42068964838981626, "step": 42205 }, { "epoch": 0.042514360305426546, "grad_norm": 18.35699183872143, "learning_rate": 4.251354672360655e-05, "loss": 2.7042, "mean_token_accuracy": 0.40834845304489137, "step": 42210 }, { "epoch": 0.04251939635853072, "grad_norm": 14.412213982532519, "learning_rate": 4.2518582680337616e-05, "loss": 2.919, "mean_token_accuracy": 0.3137931048870087, "step": 42215 }, { "epoch": 0.042524432411634894, "grad_norm": 13.924079055611344, "learning_rate": 4.2523618637068675e-05, "loss": 3.1514, "mean_token_accuracy": 0.31379309892654417, "step": 42220 }, { "epoch": 0.04252946846473907, "grad_norm": 17.29722559270806, "learning_rate": 4.2528654593799734e-05, "loss": 2.677, "mean_token_accuracy": 0.44482758045196535, "step": 42225 }, { "epoch": 0.04253450451784324, "grad_norm": 12.559958089901833, "learning_rate": 4.2533690550530794e-05, "loss": 2.3733, "mean_token_accuracy": 0.4137930989265442, "step": 42230 }, { "epoch": 0.042539540570947415, "grad_norm": 13.952628673728361, "learning_rate": 4.253872650726185e-05, "loss": 2.2965, "mean_token_accuracy": 0.4310344815254211, "step": 42235 }, { "epoch": 0.04254457662405158, "grad_norm": 14.904182712094958, "learning_rate": 4.254376246399291e-05, "loss": 2.6364, "mean_token_accuracy": 0.3517241358757019, "step": 42240 }, { "epoch": 0.042549612677155756, "grad_norm": 14.296432038729126, "learning_rate": 4.254879842072397e-05, "loss": 2.5395, "mean_token_accuracy": 0.3931034505367279, "step": 42245 }, { "epoch": 0.04255464873025993, "grad_norm": 15.441271256602201, "learning_rate": 4.255383437745503e-05, "loss": 2.6517, "mean_token_accuracy": 0.38275861740112305, "step": 42250 }, { "epoch": 0.0425596847833641, "grad_norm": 11.879655297842078, "learning_rate": 4.255887033418609e-05, "loss": 2.9341, "mean_token_accuracy": 0.38723533153533934, "step": 42255 }, { "epoch": 0.04256472083646828, "grad_norm": 16.192863005730583, "learning_rate": 4.256390629091715e-05, "loss": 2.5092, "mean_token_accuracy": 0.41379311084747317, "step": 42260 }, { "epoch": 0.04256975688957245, "grad_norm": 15.344551407430263, "learning_rate": 4.256894224764821e-05, "loss": 2.5688, "mean_token_accuracy": 0.3965517163276672, "step": 42265 }, { "epoch": 0.042574792942676624, "grad_norm": 18.041216192121656, "learning_rate": 4.2573978204379275e-05, "loss": 2.6633, "mean_token_accuracy": 0.3793103456497192, "step": 42270 }, { "epoch": 0.04257982899578079, "grad_norm": 13.095420236141694, "learning_rate": 4.2579014161110334e-05, "loss": 2.5455, "mean_token_accuracy": 0.4534785270690918, "step": 42275 }, { "epoch": 0.042584865048884965, "grad_norm": 18.782660568694478, "learning_rate": 4.258405011784139e-05, "loss": 2.8752, "mean_token_accuracy": 0.38620689511299133, "step": 42280 }, { "epoch": 0.04258990110198914, "grad_norm": 14.331615403627207, "learning_rate": 4.2589086074572446e-05, "loss": 2.4836, "mean_token_accuracy": 0.4119177222251892, "step": 42285 }, { "epoch": 0.04259493715509331, "grad_norm": 14.427374397439023, "learning_rate": 4.2594122031303505e-05, "loss": 2.8262, "mean_token_accuracy": 0.39310344457626345, "step": 42290 }, { "epoch": 0.042599973208197486, "grad_norm": 14.580719797050584, "learning_rate": 4.259915798803457e-05, "loss": 2.5059, "mean_token_accuracy": 0.3827586233615875, "step": 42295 }, { "epoch": 0.04260500926130166, "grad_norm": 15.737302262281352, "learning_rate": 4.260419394476563e-05, "loss": 2.3139, "mean_token_accuracy": 0.44337567687034607, "step": 42300 }, { "epoch": 0.042610045314405834, "grad_norm": 12.396772252258224, "learning_rate": 4.260922990149669e-05, "loss": 2.6244, "mean_token_accuracy": 0.38620689511299133, "step": 42305 }, { "epoch": 0.04261508136751, "grad_norm": 13.944666778957275, "learning_rate": 4.261426585822775e-05, "loss": 2.4062, "mean_token_accuracy": 0.41724138259887694, "step": 42310 }, { "epoch": 0.042620117420614174, "grad_norm": 15.515671024732626, "learning_rate": 4.261930181495881e-05, "loss": 2.717, "mean_token_accuracy": 0.3620689630508423, "step": 42315 }, { "epoch": 0.04262515347371835, "grad_norm": 16.774689058237154, "learning_rate": 4.262433777168987e-05, "loss": 2.8777, "mean_token_accuracy": 0.39655172228813174, "step": 42320 }, { "epoch": 0.04263018952682252, "grad_norm": 17.321443930984262, "learning_rate": 4.2629373728420926e-05, "loss": 2.8524, "mean_token_accuracy": 0.34482758343219755, "step": 42325 }, { "epoch": 0.042635225579926696, "grad_norm": 14.750339144809603, "learning_rate": 4.2634409685151986e-05, "loss": 2.3446, "mean_token_accuracy": 0.4192377507686615, "step": 42330 }, { "epoch": 0.04264026163303087, "grad_norm": 13.441199619491481, "learning_rate": 4.2639445641883045e-05, "loss": 2.4216, "mean_token_accuracy": 0.42758620381355283, "step": 42335 }, { "epoch": 0.04264529768613504, "grad_norm": 15.891862524152994, "learning_rate": 4.2644481598614104e-05, "loss": 2.3795, "mean_token_accuracy": 0.42232305407524107, "step": 42340 }, { "epoch": 0.04265033373923921, "grad_norm": 12.948383133287336, "learning_rate": 4.264951755534516e-05, "loss": 2.1559, "mean_token_accuracy": 0.4949788272380829, "step": 42345 }, { "epoch": 0.042655369792343384, "grad_norm": 15.900049266401648, "learning_rate": 4.265455351207623e-05, "loss": 2.42, "mean_token_accuracy": 0.4068965494632721, "step": 42350 }, { "epoch": 0.04266040584544756, "grad_norm": 15.31039837278257, "learning_rate": 4.265958946880729e-05, "loss": 2.3829, "mean_token_accuracy": 0.4413793087005615, "step": 42355 }, { "epoch": 0.04266544189855173, "grad_norm": 14.99418747520708, "learning_rate": 4.266462542553835e-05, "loss": 2.5893, "mean_token_accuracy": 0.4310344815254211, "step": 42360 }, { "epoch": 0.042670477951655905, "grad_norm": 12.95952244195856, "learning_rate": 4.266966138226941e-05, "loss": 2.7455, "mean_token_accuracy": 0.3999999940395355, "step": 42365 }, { "epoch": 0.04267551400476008, "grad_norm": 17.553404897380318, "learning_rate": 4.2674697339000466e-05, "loss": 2.4891, "mean_token_accuracy": 0.3827586233615875, "step": 42370 }, { "epoch": 0.04268055005786425, "grad_norm": 16.51956062951908, "learning_rate": 4.2679733295731526e-05, "loss": 2.4685, "mean_token_accuracy": 0.42413792610168455, "step": 42375 }, { "epoch": 0.04268558611096842, "grad_norm": 14.219170245675066, "learning_rate": 4.2684769252462585e-05, "loss": 2.4356, "mean_token_accuracy": 0.3965517163276672, "step": 42380 }, { "epoch": 0.04269062216407259, "grad_norm": 14.482484529196329, "learning_rate": 4.2689805209193644e-05, "loss": 2.6356, "mean_token_accuracy": 0.3965517282485962, "step": 42385 }, { "epoch": 0.04269565821717677, "grad_norm": 15.67689406125198, "learning_rate": 4.26948411659247e-05, "loss": 2.9787, "mean_token_accuracy": 0.38275861740112305, "step": 42390 }, { "epoch": 0.04270069427028094, "grad_norm": 12.490245603448765, "learning_rate": 4.269987712265576e-05, "loss": 2.4141, "mean_token_accuracy": 0.39655171930789945, "step": 42395 }, { "epoch": 0.042705730323385115, "grad_norm": 15.540813375818972, "learning_rate": 4.270491307938682e-05, "loss": 2.7374, "mean_token_accuracy": 0.3620689630508423, "step": 42400 }, { "epoch": 0.04271076637648929, "grad_norm": 12.979856849694029, "learning_rate": 4.270994903611789e-05, "loss": 2.8293, "mean_token_accuracy": 0.36896551847457887, "step": 42405 }, { "epoch": 0.04271580242959346, "grad_norm": 11.564042584272398, "learning_rate": 4.271498499284895e-05, "loss": 2.2286, "mean_token_accuracy": 0.4482758641242981, "step": 42410 }, { "epoch": 0.04272083848269763, "grad_norm": 14.029106244070876, "learning_rate": 4.272002094958e-05, "loss": 2.4029, "mean_token_accuracy": 0.4103448212146759, "step": 42415 }, { "epoch": 0.0427258745358018, "grad_norm": 18.05882644843383, "learning_rate": 4.272505690631106e-05, "loss": 2.1484, "mean_token_accuracy": 0.4400483965873718, "step": 42420 }, { "epoch": 0.042730910588905976, "grad_norm": 12.345629538275608, "learning_rate": 4.273009286304212e-05, "loss": 2.4144, "mean_token_accuracy": 0.42244404554367065, "step": 42425 }, { "epoch": 0.04273594664201015, "grad_norm": 12.352939394068553, "learning_rate": 4.2735128819773184e-05, "loss": 2.4946, "mean_token_accuracy": 0.4206896543502808, "step": 42430 }, { "epoch": 0.042740982695114324, "grad_norm": 15.708169505741802, "learning_rate": 4.2740164776504243e-05, "loss": 2.8369, "mean_token_accuracy": 0.3744706571102142, "step": 42435 }, { "epoch": 0.0427460187482185, "grad_norm": 11.675010757335446, "learning_rate": 4.27452007332353e-05, "loss": 2.7415, "mean_token_accuracy": 0.35862069129943847, "step": 42440 }, { "epoch": 0.04275105480132267, "grad_norm": 15.58706914867207, "learning_rate": 4.275023668996636e-05, "loss": 2.7103, "mean_token_accuracy": 0.4137930989265442, "step": 42445 }, { "epoch": 0.04275609085442684, "grad_norm": 12.666488494284264, "learning_rate": 4.275527264669742e-05, "loss": 2.6382, "mean_token_accuracy": 0.4034482777118683, "step": 42450 }, { "epoch": 0.04276112690753101, "grad_norm": 13.826522865107606, "learning_rate": 4.276030860342849e-05, "loss": 2.6516, "mean_token_accuracy": 0.38620689511299133, "step": 42455 }, { "epoch": 0.042766162960635186, "grad_norm": 11.654796102732554, "learning_rate": 4.276534456015954e-05, "loss": 2.5875, "mean_token_accuracy": 0.4, "step": 42460 }, { "epoch": 0.04277119901373936, "grad_norm": 13.037544524089572, "learning_rate": 4.27703805168906e-05, "loss": 2.5072, "mean_token_accuracy": 0.41379310488700866, "step": 42465 }, { "epoch": 0.04277623506684353, "grad_norm": 15.854954238688503, "learning_rate": 4.277541647362166e-05, "loss": 2.6881, "mean_token_accuracy": 0.3620689630508423, "step": 42470 }, { "epoch": 0.04278127111994771, "grad_norm": 13.652851304374813, "learning_rate": 4.278045243035272e-05, "loss": 2.3754, "mean_token_accuracy": 0.3999999940395355, "step": 42475 }, { "epoch": 0.04278630717305188, "grad_norm": 18.583763066731468, "learning_rate": 4.278548838708378e-05, "loss": 2.6745, "mean_token_accuracy": 0.4017543882131577, "step": 42480 }, { "epoch": 0.04279134322615605, "grad_norm": 12.364417676963345, "learning_rate": 4.279052434381484e-05, "loss": 2.2651, "mean_token_accuracy": 0.4641863167285919, "step": 42485 }, { "epoch": 0.04279637927926022, "grad_norm": 13.678942247463272, "learning_rate": 4.27955603005459e-05, "loss": 2.3284, "mean_token_accuracy": 0.458620685338974, "step": 42490 }, { "epoch": 0.042801415332364395, "grad_norm": 12.527288771637664, "learning_rate": 4.280059625727696e-05, "loss": 2.6872, "mean_token_accuracy": 0.3379310369491577, "step": 42495 }, { "epoch": 0.04280645138546857, "grad_norm": 14.437391933939349, "learning_rate": 4.280563221400802e-05, "loss": 2.7137, "mean_token_accuracy": 0.4, "step": 42500 }, { "epoch": 0.04281148743857274, "grad_norm": 14.318938536633889, "learning_rate": 4.281066817073908e-05, "loss": 2.6067, "mean_token_accuracy": 0.41379310488700866, "step": 42505 }, { "epoch": 0.04281652349167692, "grad_norm": 14.449078548213425, "learning_rate": 4.281570412747014e-05, "loss": 2.292, "mean_token_accuracy": 0.4034482777118683, "step": 42510 }, { "epoch": 0.04282155954478109, "grad_norm": 13.363664878706478, "learning_rate": 4.28207400842012e-05, "loss": 2.8499, "mean_token_accuracy": 0.3620689630508423, "step": 42515 }, { "epoch": 0.04282659559788526, "grad_norm": 11.886331527812537, "learning_rate": 4.282577604093226e-05, "loss": 2.5383, "mean_token_accuracy": 0.41929823756217954, "step": 42520 }, { "epoch": 0.04283163165098943, "grad_norm": 14.07978327348943, "learning_rate": 4.283081199766332e-05, "loss": 2.5866, "mean_token_accuracy": 0.39655172228813174, "step": 42525 }, { "epoch": 0.042836667704093605, "grad_norm": 11.879863150683494, "learning_rate": 4.2835847954394376e-05, "loss": 2.5158, "mean_token_accuracy": 0.38620689511299133, "step": 42530 }, { "epoch": 0.04284170375719778, "grad_norm": 17.590022392053584, "learning_rate": 4.284088391112544e-05, "loss": 2.5401, "mean_token_accuracy": 0.38965516686439516, "step": 42535 }, { "epoch": 0.04284673981030195, "grad_norm": 12.437440381832182, "learning_rate": 4.28459198678565e-05, "loss": 2.6532, "mean_token_accuracy": 0.41379310488700866, "step": 42540 }, { "epoch": 0.042851775863406126, "grad_norm": 12.192273837636188, "learning_rate": 4.285095582458756e-05, "loss": 2.6076, "mean_token_accuracy": 0.44482758045196535, "step": 42545 }, { "epoch": 0.0428568119165103, "grad_norm": 12.609447648747556, "learning_rate": 4.285599178131861e-05, "loss": 2.5528, "mean_token_accuracy": 0.3793103456497192, "step": 42550 }, { "epoch": 0.04286184796961447, "grad_norm": 11.286364434772523, "learning_rate": 4.286102773804967e-05, "loss": 2.6489, "mean_token_accuracy": 0.4034482777118683, "step": 42555 }, { "epoch": 0.04286688402271864, "grad_norm": 14.555588426265109, "learning_rate": 4.286606369478074e-05, "loss": 2.2227, "mean_token_accuracy": 0.45862069725990295, "step": 42560 }, { "epoch": 0.042871920075822814, "grad_norm": 10.423228708481181, "learning_rate": 4.28710996515118e-05, "loss": 2.4897, "mean_token_accuracy": 0.43920145034790037, "step": 42565 }, { "epoch": 0.04287695612892699, "grad_norm": 15.583529909639854, "learning_rate": 4.287613560824286e-05, "loss": 2.458, "mean_token_accuracy": 0.39310344457626345, "step": 42570 }, { "epoch": 0.04288199218203116, "grad_norm": 14.819796875372754, "learning_rate": 4.2881171564973916e-05, "loss": 2.3799, "mean_token_accuracy": 0.4034482717514038, "step": 42575 }, { "epoch": 0.042887028235135335, "grad_norm": 13.840809874275212, "learning_rate": 4.2886207521704975e-05, "loss": 2.5159, "mean_token_accuracy": 0.4344827592372894, "step": 42580 }, { "epoch": 0.04289206428823951, "grad_norm": 11.397364721717684, "learning_rate": 4.2891243478436035e-05, "loss": 2.4414, "mean_token_accuracy": 0.4344827592372894, "step": 42585 }, { "epoch": 0.042897100341343676, "grad_norm": 16.904424012318987, "learning_rate": 4.2896279435167094e-05, "loss": 2.5979, "mean_token_accuracy": 0.38620689511299133, "step": 42590 }, { "epoch": 0.04290213639444785, "grad_norm": 15.995372441529161, "learning_rate": 4.290131539189815e-05, "loss": 2.5569, "mean_token_accuracy": 0.3965517163276672, "step": 42595 }, { "epoch": 0.042907172447552024, "grad_norm": 14.534087408043131, "learning_rate": 4.290635134862921e-05, "loss": 2.3422, "mean_token_accuracy": 0.44482759237289426, "step": 42600 }, { "epoch": 0.0429122085006562, "grad_norm": 13.180057679764511, "learning_rate": 4.291138730536027e-05, "loss": 2.3241, "mean_token_accuracy": 0.4310344815254211, "step": 42605 }, { "epoch": 0.04291724455376037, "grad_norm": 16.289163593952928, "learning_rate": 4.291642326209133e-05, "loss": 2.9597, "mean_token_accuracy": 0.3517241418361664, "step": 42610 }, { "epoch": 0.042922280606864545, "grad_norm": 14.192912700929499, "learning_rate": 4.29214592188224e-05, "loss": 2.6274, "mean_token_accuracy": 0.38118572235107423, "step": 42615 }, { "epoch": 0.04292731665996872, "grad_norm": 12.08154023126303, "learning_rate": 4.2926495175553456e-05, "loss": 2.3886, "mean_token_accuracy": 0.4310344815254211, "step": 42620 }, { "epoch": 0.042932352713072885, "grad_norm": 14.368803457135082, "learning_rate": 4.2931531132284515e-05, "loss": 2.8807, "mean_token_accuracy": 0.33448275923728943, "step": 42625 }, { "epoch": 0.04293738876617706, "grad_norm": 14.22655187383774, "learning_rate": 4.2936567089015575e-05, "loss": 2.5781, "mean_token_accuracy": 0.3896551728248596, "step": 42630 }, { "epoch": 0.04294242481928123, "grad_norm": 11.834411363633627, "learning_rate": 4.2941603045746634e-05, "loss": 2.224, "mean_token_accuracy": 0.48118571639060975, "step": 42635 }, { "epoch": 0.04294746087238541, "grad_norm": 14.424438503085119, "learning_rate": 4.294663900247769e-05, "loss": 2.2941, "mean_token_accuracy": 0.4310344815254211, "step": 42640 }, { "epoch": 0.04295249692548958, "grad_norm": 14.405130546509609, "learning_rate": 4.295167495920875e-05, "loss": 2.4781, "mean_token_accuracy": 0.43103447556495667, "step": 42645 }, { "epoch": 0.042957532978593754, "grad_norm": 12.447160404945365, "learning_rate": 4.295671091593981e-05, "loss": 2.8076, "mean_token_accuracy": 0.39497882723808286, "step": 42650 }, { "epoch": 0.04296256903169793, "grad_norm": 13.393840471390812, "learning_rate": 4.296174687267087e-05, "loss": 2.5322, "mean_token_accuracy": 0.4206896543502808, "step": 42655 }, { "epoch": 0.042967605084802095, "grad_norm": 11.045596972590426, "learning_rate": 4.296678282940193e-05, "loss": 2.768, "mean_token_accuracy": 0.4034482777118683, "step": 42660 }, { "epoch": 0.04297264113790627, "grad_norm": 15.474887807216168, "learning_rate": 4.297181878613299e-05, "loss": 2.8179, "mean_token_accuracy": 0.34482758641242983, "step": 42665 }, { "epoch": 0.04297767719101044, "grad_norm": 10.401192947622059, "learning_rate": 4.2976854742864055e-05, "loss": 2.3975, "mean_token_accuracy": 0.4172413766384125, "step": 42670 }, { "epoch": 0.042982713244114616, "grad_norm": 12.833537782131634, "learning_rate": 4.2981890699595115e-05, "loss": 2.5405, "mean_token_accuracy": 0.37241379022598264, "step": 42675 }, { "epoch": 0.04298774929721879, "grad_norm": 14.97859743185117, "learning_rate": 4.2986926656326174e-05, "loss": 2.7911, "mean_token_accuracy": 0.4206896543502808, "step": 42680 }, { "epoch": 0.042992785350322964, "grad_norm": 11.945216989507687, "learning_rate": 4.2991962613057226e-05, "loss": 2.7857, "mean_token_accuracy": 0.3655172407627106, "step": 42685 }, { "epoch": 0.04299782140342714, "grad_norm": 17.318628864970286, "learning_rate": 4.2996998569788286e-05, "loss": 2.5193, "mean_token_accuracy": 0.4034482777118683, "step": 42690 }, { "epoch": 0.043002857456531304, "grad_norm": 13.709504266331162, "learning_rate": 4.300203452651935e-05, "loss": 3.0416, "mean_token_accuracy": 0.3862069070339203, "step": 42695 }, { "epoch": 0.04300789350963548, "grad_norm": 15.38949121289357, "learning_rate": 4.300707048325041e-05, "loss": 3.1066, "mean_token_accuracy": 0.3646702915430069, "step": 42700 }, { "epoch": 0.04301292956273965, "grad_norm": 13.078836278604516, "learning_rate": 4.301210643998147e-05, "loss": 2.2775, "mean_token_accuracy": 0.43793103098869324, "step": 42705 }, { "epoch": 0.043017965615843826, "grad_norm": 12.575992435217163, "learning_rate": 4.301714239671253e-05, "loss": 2.8655, "mean_token_accuracy": 0.358620685338974, "step": 42710 }, { "epoch": 0.043023001668948, "grad_norm": 19.974491645020603, "learning_rate": 4.302217835344359e-05, "loss": 2.7685, "mean_token_accuracy": 0.40169388651847837, "step": 42715 }, { "epoch": 0.04302803772205217, "grad_norm": 12.851491495664765, "learning_rate": 4.3027214310174655e-05, "loss": 2.9187, "mean_token_accuracy": 0.3827586233615875, "step": 42720 }, { "epoch": 0.04303307377515635, "grad_norm": 19.585268603036194, "learning_rate": 4.303225026690571e-05, "loss": 2.6213, "mean_token_accuracy": 0.41379311084747317, "step": 42725 }, { "epoch": 0.043038109828260514, "grad_norm": 12.692247164970185, "learning_rate": 4.3037286223636766e-05, "loss": 3.0999, "mean_token_accuracy": 0.32413792610168457, "step": 42730 }, { "epoch": 0.04304314588136469, "grad_norm": 12.932886868488078, "learning_rate": 4.3042322180367826e-05, "loss": 2.446, "mean_token_accuracy": 0.3814881980419159, "step": 42735 }, { "epoch": 0.04304818193446886, "grad_norm": 14.318685162633372, "learning_rate": 4.3047358137098885e-05, "loss": 2.6926, "mean_token_accuracy": 0.38620689511299133, "step": 42740 }, { "epoch": 0.043053217987573035, "grad_norm": 17.429441479973644, "learning_rate": 4.3052394093829944e-05, "loss": 2.8886, "mean_token_accuracy": 0.3655172407627106, "step": 42745 }, { "epoch": 0.04305825404067721, "grad_norm": 15.412231337552774, "learning_rate": 4.305743005056101e-05, "loss": 2.5241, "mean_token_accuracy": 0.42758620381355283, "step": 42750 }, { "epoch": 0.04306329009378138, "grad_norm": 14.665060980235358, "learning_rate": 4.306246600729207e-05, "loss": 2.3398, "mean_token_accuracy": 0.43103448748588563, "step": 42755 }, { "epoch": 0.043068326146885556, "grad_norm": 15.859227202107647, "learning_rate": 4.306750196402313e-05, "loss": 2.631, "mean_token_accuracy": 0.42413792610168455, "step": 42760 }, { "epoch": 0.04307336219998972, "grad_norm": 14.728487050105088, "learning_rate": 4.307253792075419e-05, "loss": 2.4061, "mean_token_accuracy": 0.4172413766384125, "step": 42765 }, { "epoch": 0.0430783982530939, "grad_norm": 14.875068886786549, "learning_rate": 4.307757387748525e-05, "loss": 2.5842, "mean_token_accuracy": 0.42068964838981626, "step": 42770 }, { "epoch": 0.04308343430619807, "grad_norm": 13.31001833252351, "learning_rate": 4.3082609834216307e-05, "loss": 2.3986, "mean_token_accuracy": 0.3827586233615875, "step": 42775 }, { "epoch": 0.043088470359302244, "grad_norm": 10.654826789800902, "learning_rate": 4.3087645790947366e-05, "loss": 2.5992, "mean_token_accuracy": 0.38124621510505674, "step": 42780 }, { "epoch": 0.04309350641240642, "grad_norm": 13.488921091471132, "learning_rate": 4.3092681747678425e-05, "loss": 2.9385, "mean_token_accuracy": 0.33103448450565337, "step": 42785 }, { "epoch": 0.04309854246551059, "grad_norm": 13.752968324755004, "learning_rate": 4.3097717704409484e-05, "loss": 2.8167, "mean_token_accuracy": 0.3482758581638336, "step": 42790 }, { "epoch": 0.043103578518614766, "grad_norm": 14.54340265546484, "learning_rate": 4.3102753661140544e-05, "loss": 2.6557, "mean_token_accuracy": 0.3620689630508423, "step": 42795 }, { "epoch": 0.04310861457171893, "grad_norm": 13.958273588277942, "learning_rate": 4.310778961787161e-05, "loss": 2.8645, "mean_token_accuracy": 0.33793103098869326, "step": 42800 }, { "epoch": 0.043113650624823106, "grad_norm": 15.052724088428423, "learning_rate": 4.311282557460267e-05, "loss": 2.6175, "mean_token_accuracy": 0.43278887271881106, "step": 42805 }, { "epoch": 0.04311868667792728, "grad_norm": 25.49424797218806, "learning_rate": 4.311786153133373e-05, "loss": 2.7239, "mean_token_accuracy": 0.3551724135875702, "step": 42810 }, { "epoch": 0.043123722731031454, "grad_norm": 8.499539185378897, "learning_rate": 4.312289748806479e-05, "loss": 2.3754, "mean_token_accuracy": 0.4206896543502808, "step": 42815 }, { "epoch": 0.04312875878413563, "grad_norm": 12.030569727972235, "learning_rate": 4.312793344479584e-05, "loss": 2.3915, "mean_token_accuracy": 0.4206896543502808, "step": 42820 }, { "epoch": 0.0431337948372398, "grad_norm": 21.33801015877675, "learning_rate": 4.31329694015269e-05, "loss": 2.7866, "mean_token_accuracy": 0.3482758551836014, "step": 42825 }, { "epoch": 0.043138830890343975, "grad_norm": 11.798603932857539, "learning_rate": 4.3138005358257965e-05, "loss": 2.0824, "mean_token_accuracy": 0.46418632566928864, "step": 42830 }, { "epoch": 0.04314386694344814, "grad_norm": 33.40118134418917, "learning_rate": 4.3143041314989024e-05, "loss": 2.6826, "mean_token_accuracy": 0.3655172407627106, "step": 42835 }, { "epoch": 0.043148902996552316, "grad_norm": 12.788676569442492, "learning_rate": 4.3148077271720084e-05, "loss": 2.5732, "mean_token_accuracy": 0.41034482717514037, "step": 42840 }, { "epoch": 0.04315393904965649, "grad_norm": 11.82799888141429, "learning_rate": 4.315311322845114e-05, "loss": 2.5951, "mean_token_accuracy": 0.3793103456497192, "step": 42845 }, { "epoch": 0.04315897510276066, "grad_norm": 14.042403345673172, "learning_rate": 4.31581491851822e-05, "loss": 2.5832, "mean_token_accuracy": 0.42068964838981626, "step": 42850 }, { "epoch": 0.04316401115586484, "grad_norm": 12.082065602165219, "learning_rate": 4.316318514191327e-05, "loss": 2.4043, "mean_token_accuracy": 0.39310344457626345, "step": 42855 }, { "epoch": 0.04316904720896901, "grad_norm": 10.982935288489422, "learning_rate": 4.316822109864432e-05, "loss": 2.494, "mean_token_accuracy": 0.41724138855934145, "step": 42860 }, { "epoch": 0.043174083262073185, "grad_norm": 11.549541624359899, "learning_rate": 4.317325705537538e-05, "loss": 2.8055, "mean_token_accuracy": 0.4, "step": 42865 }, { "epoch": 0.04317911931517735, "grad_norm": 17.95237555377303, "learning_rate": 4.317829301210644e-05, "loss": 2.7343, "mean_token_accuracy": 0.4068965494632721, "step": 42870 }, { "epoch": 0.043184155368281525, "grad_norm": 13.035323024305521, "learning_rate": 4.31833289688375e-05, "loss": 2.4786, "mean_token_accuracy": 0.42068966031074523, "step": 42875 }, { "epoch": 0.0431891914213857, "grad_norm": 10.172569748117262, "learning_rate": 4.3188364925568564e-05, "loss": 2.4248, "mean_token_accuracy": 0.4275862157344818, "step": 42880 }, { "epoch": 0.04319422747448987, "grad_norm": 14.896547723341527, "learning_rate": 4.3193400882299624e-05, "loss": 2.5989, "mean_token_accuracy": 0.42758620381355283, "step": 42885 }, { "epoch": 0.043199263527594046, "grad_norm": 15.817082310922139, "learning_rate": 4.319843683903068e-05, "loss": 2.5206, "mean_token_accuracy": 0.44615850448608396, "step": 42890 }, { "epoch": 0.04320429958069822, "grad_norm": 12.709828779685514, "learning_rate": 4.320347279576174e-05, "loss": 2.8897, "mean_token_accuracy": 0.36551723480224607, "step": 42895 }, { "epoch": 0.043209335633802394, "grad_norm": 14.22070682297591, "learning_rate": 4.32085087524928e-05, "loss": 2.5968, "mean_token_accuracy": 0.3931034505367279, "step": 42900 }, { "epoch": 0.04321437168690656, "grad_norm": 13.051136429145583, "learning_rate": 4.321354470922386e-05, "loss": 2.4117, "mean_token_accuracy": 0.4068965554237366, "step": 42905 }, { "epoch": 0.043219407740010735, "grad_norm": 14.547566634077588, "learning_rate": 4.321858066595492e-05, "loss": 2.5519, "mean_token_accuracy": 0.4172413766384125, "step": 42910 }, { "epoch": 0.04322444379311491, "grad_norm": 13.401343506142018, "learning_rate": 4.322361662268598e-05, "loss": 2.3575, "mean_token_accuracy": 0.42413793206214906, "step": 42915 }, { "epoch": 0.04322947984621908, "grad_norm": 16.086908711077506, "learning_rate": 4.322865257941704e-05, "loss": 2.5245, "mean_token_accuracy": 0.463520872592926, "step": 42920 }, { "epoch": 0.043234515899323256, "grad_norm": 19.539453496341785, "learning_rate": 4.32336885361481e-05, "loss": 3.0739, "mean_token_accuracy": 0.38275861740112305, "step": 42925 }, { "epoch": 0.04323955195242743, "grad_norm": 16.102171462159763, "learning_rate": 4.323872449287916e-05, "loss": 2.6971, "mean_token_accuracy": 0.4, "step": 42930 }, { "epoch": 0.0432445880055316, "grad_norm": 14.604535188777865, "learning_rate": 4.324376044961022e-05, "loss": 2.7085, "mean_token_accuracy": 0.3655172407627106, "step": 42935 }, { "epoch": 0.04324962405863577, "grad_norm": 14.888040688792518, "learning_rate": 4.324879640634128e-05, "loss": 3.0682, "mean_token_accuracy": 0.3793103516101837, "step": 42940 }, { "epoch": 0.043254660111739944, "grad_norm": 45.10352474136455, "learning_rate": 4.325383236307234e-05, "loss": 3.077, "mean_token_accuracy": 0.3758620709180832, "step": 42945 }, { "epoch": 0.04325969616484412, "grad_norm": 13.255845301379836, "learning_rate": 4.3258868319803394e-05, "loss": 2.4893, "mean_token_accuracy": 0.4068965554237366, "step": 42950 }, { "epoch": 0.04326473221794829, "grad_norm": 19.385992686266437, "learning_rate": 4.326390427653445e-05, "loss": 3.1516, "mean_token_accuracy": 0.3517241358757019, "step": 42955 }, { "epoch": 0.043269768271052465, "grad_norm": 15.5630097100151, "learning_rate": 4.326894023326552e-05, "loss": 3.1361, "mean_token_accuracy": 0.32758620381355286, "step": 42960 }, { "epoch": 0.04327480432415664, "grad_norm": 13.429486287678065, "learning_rate": 4.327397618999658e-05, "loss": 2.7853, "mean_token_accuracy": 0.3517241418361664, "step": 42965 }, { "epoch": 0.04327984037726081, "grad_norm": 12.35284586939357, "learning_rate": 4.327901214672764e-05, "loss": 2.5725, "mean_token_accuracy": 0.3896551787853241, "step": 42970 }, { "epoch": 0.04328487643036498, "grad_norm": 11.43572494800055, "learning_rate": 4.32840481034587e-05, "loss": 2.3576, "mean_token_accuracy": 0.3758620619773865, "step": 42975 }, { "epoch": 0.04328991248346915, "grad_norm": 10.997491340570352, "learning_rate": 4.3289084060189756e-05, "loss": 2.4089, "mean_token_accuracy": 0.4465819835662842, "step": 42980 }, { "epoch": 0.04329494853657333, "grad_norm": 12.925946346186857, "learning_rate": 4.329412001692082e-05, "loss": 2.9017, "mean_token_accuracy": 0.36896551251411436, "step": 42985 }, { "epoch": 0.0432999845896775, "grad_norm": 12.694847266546367, "learning_rate": 4.329915597365188e-05, "loss": 2.9158, "mean_token_accuracy": 0.39655172228813174, "step": 42990 }, { "epoch": 0.043305020642781675, "grad_norm": 12.451086891758022, "learning_rate": 4.3304191930382934e-05, "loss": 2.1592, "mean_token_accuracy": 0.41379310488700866, "step": 42995 }, { "epoch": 0.04331005669588585, "grad_norm": 11.098650021708801, "learning_rate": 4.330922788711399e-05, "loss": 2.6512, "mean_token_accuracy": 0.3896551728248596, "step": 43000 }, { "epoch": 0.04331509274899002, "grad_norm": 12.663143956804458, "learning_rate": 4.331426384384505e-05, "loss": 2.3195, "mean_token_accuracy": 0.4413793087005615, "step": 43005 }, { "epoch": 0.04332012880209419, "grad_norm": 13.805755930495046, "learning_rate": 4.331929980057611e-05, "loss": 3.0402, "mean_token_accuracy": 0.3482758641242981, "step": 43010 }, { "epoch": 0.04332516485519836, "grad_norm": 11.348780509726526, "learning_rate": 4.332433575730718e-05, "loss": 2.3997, "mean_token_accuracy": 0.4551724135875702, "step": 43015 }, { "epoch": 0.04333020090830254, "grad_norm": 12.963712840786435, "learning_rate": 4.332937171403824e-05, "loss": 2.4936, "mean_token_accuracy": 0.42758620977401735, "step": 43020 }, { "epoch": 0.04333523696140671, "grad_norm": 18.92054234869347, "learning_rate": 4.3334407670769296e-05, "loss": 2.78, "mean_token_accuracy": 0.3482758581638336, "step": 43025 }, { "epoch": 0.043340273014510884, "grad_norm": 15.808847624735115, "learning_rate": 4.3339443627500356e-05, "loss": 2.3086, "mean_token_accuracy": 0.4034482717514038, "step": 43030 }, { "epoch": 0.04334530906761506, "grad_norm": 16.89370061101859, "learning_rate": 4.3344479584231415e-05, "loss": 2.6609, "mean_token_accuracy": 0.3896551728248596, "step": 43035 }, { "epoch": 0.04335034512071923, "grad_norm": 14.204253711162275, "learning_rate": 4.3349515540962474e-05, "loss": 2.3443, "mean_token_accuracy": 0.42758620977401735, "step": 43040 }, { "epoch": 0.0433553811738234, "grad_norm": 12.960054436857677, "learning_rate": 4.335455149769353e-05, "loss": 2.6112, "mean_token_accuracy": 0.39655172228813174, "step": 43045 }, { "epoch": 0.04336041722692757, "grad_norm": 13.395314174471789, "learning_rate": 4.335958745442459e-05, "loss": 2.8793, "mean_token_accuracy": 0.379310342669487, "step": 43050 }, { "epoch": 0.043365453280031746, "grad_norm": 17.100956758798244, "learning_rate": 4.336462341115565e-05, "loss": 2.5055, "mean_token_accuracy": 0.4103448212146759, "step": 43055 }, { "epoch": 0.04337048933313592, "grad_norm": 18.113587644654412, "learning_rate": 4.336965936788671e-05, "loss": 2.7012, "mean_token_accuracy": 0.3551724076271057, "step": 43060 }, { "epoch": 0.043375525386240094, "grad_norm": 13.882859292640665, "learning_rate": 4.337469532461778e-05, "loss": 2.5601, "mean_token_accuracy": 0.4241379380226135, "step": 43065 }, { "epoch": 0.04338056143934427, "grad_norm": 13.637055373355933, "learning_rate": 4.3379731281348836e-05, "loss": 2.6017, "mean_token_accuracy": 0.4172413766384125, "step": 43070 }, { "epoch": 0.04338559749244844, "grad_norm": 17.09538398175619, "learning_rate": 4.3384767238079896e-05, "loss": 2.7013, "mean_token_accuracy": 0.4034482777118683, "step": 43075 }, { "epoch": 0.04339063354555261, "grad_norm": 12.381146266209566, "learning_rate": 4.3389803194810955e-05, "loss": 2.099, "mean_token_accuracy": 0.4586206912994385, "step": 43080 }, { "epoch": 0.04339566959865678, "grad_norm": 11.697657761051355, "learning_rate": 4.339483915154201e-05, "loss": 2.5589, "mean_token_accuracy": 0.4103448331356049, "step": 43085 }, { "epoch": 0.043400705651760955, "grad_norm": 12.138802313032107, "learning_rate": 4.3399875108273067e-05, "loss": 2.37, "mean_token_accuracy": 0.44827587604522706, "step": 43090 }, { "epoch": 0.04340574170486513, "grad_norm": 13.100388377327807, "learning_rate": 4.340491106500413e-05, "loss": 2.4283, "mean_token_accuracy": 0.4517241358757019, "step": 43095 }, { "epoch": 0.0434107777579693, "grad_norm": 15.422257364256089, "learning_rate": 4.340994702173519e-05, "loss": 2.6166, "mean_token_accuracy": 0.3935960590839386, "step": 43100 }, { "epoch": 0.04341581381107348, "grad_norm": 13.08115260711766, "learning_rate": 4.341498297846625e-05, "loss": 2.6272, "mean_token_accuracy": 0.3551724016666412, "step": 43105 }, { "epoch": 0.04342084986417765, "grad_norm": 15.029613899522984, "learning_rate": 4.342001893519731e-05, "loss": 2.3388, "mean_token_accuracy": 0.42068964838981626, "step": 43110 }, { "epoch": 0.04342588591728182, "grad_norm": 12.77797923526171, "learning_rate": 4.342505489192837e-05, "loss": 2.5419, "mean_token_accuracy": 0.37241379022598264, "step": 43115 }, { "epoch": 0.04343092197038599, "grad_norm": 12.896080787141113, "learning_rate": 4.3430090848659436e-05, "loss": 2.4695, "mean_token_accuracy": 0.42068966627120974, "step": 43120 }, { "epoch": 0.043435958023490165, "grad_norm": 11.597229728794334, "learning_rate": 4.343512680539049e-05, "loss": 2.6755, "mean_token_accuracy": 0.3482758641242981, "step": 43125 }, { "epoch": 0.04344099407659434, "grad_norm": 10.732689666909268, "learning_rate": 4.344016276212155e-05, "loss": 2.4151, "mean_token_accuracy": 0.4379310369491577, "step": 43130 }, { "epoch": 0.04344603012969851, "grad_norm": 14.411309254698672, "learning_rate": 4.3445198718852607e-05, "loss": 2.6971, "mean_token_accuracy": 0.3517241358757019, "step": 43135 }, { "epoch": 0.043451066182802686, "grad_norm": 15.761689482204545, "learning_rate": 4.3450234675583666e-05, "loss": 2.5084, "mean_token_accuracy": 0.43103448748588563, "step": 43140 }, { "epoch": 0.04345610223590686, "grad_norm": 13.926954334746963, "learning_rate": 4.345527063231473e-05, "loss": 2.5789, "mean_token_accuracy": 0.41379310488700866, "step": 43145 }, { "epoch": 0.04346113828901103, "grad_norm": 14.96288766291618, "learning_rate": 4.346030658904579e-05, "loss": 2.7384, "mean_token_accuracy": 0.41034482717514037, "step": 43150 }, { "epoch": 0.0434661743421152, "grad_norm": 15.272037767044255, "learning_rate": 4.346534254577685e-05, "loss": 2.7702, "mean_token_accuracy": 0.3862069010734558, "step": 43155 }, { "epoch": 0.043471210395219374, "grad_norm": 13.094975164110359, "learning_rate": 4.347037850250791e-05, "loss": 2.3679, "mean_token_accuracy": 0.41724138259887694, "step": 43160 }, { "epoch": 0.04347624644832355, "grad_norm": 17.485843776858143, "learning_rate": 4.347541445923897e-05, "loss": 2.7991, "mean_token_accuracy": 0.379310342669487, "step": 43165 }, { "epoch": 0.04348128250142772, "grad_norm": 12.355648942361583, "learning_rate": 4.348045041597003e-05, "loss": 2.3722, "mean_token_accuracy": 0.4467634618282318, "step": 43170 }, { "epoch": 0.043486318554531896, "grad_norm": 15.748488777614266, "learning_rate": 4.348548637270109e-05, "loss": 2.7299, "mean_token_accuracy": 0.35862069129943847, "step": 43175 }, { "epoch": 0.04349135460763607, "grad_norm": 12.174001820518745, "learning_rate": 4.349052232943215e-05, "loss": 2.6204, "mean_token_accuracy": 0.36551724672317504, "step": 43180 }, { "epoch": 0.043496390660740236, "grad_norm": 16.239929322014568, "learning_rate": 4.3495558286163206e-05, "loss": 2.7893, "mean_token_accuracy": 0.3310344785451889, "step": 43185 }, { "epoch": 0.04350142671384441, "grad_norm": 13.033772908938829, "learning_rate": 4.3500594242894265e-05, "loss": 2.6333, "mean_token_accuracy": 0.3999999940395355, "step": 43190 }, { "epoch": 0.043506462766948584, "grad_norm": 19.35030000193822, "learning_rate": 4.3505630199625324e-05, "loss": 2.8275, "mean_token_accuracy": 0.36896551847457887, "step": 43195 }, { "epoch": 0.04351149882005276, "grad_norm": 15.278772099255892, "learning_rate": 4.351066615635639e-05, "loss": 2.5621, "mean_token_accuracy": 0.3517241358757019, "step": 43200 }, { "epoch": 0.04351653487315693, "grad_norm": 12.560360758937867, "learning_rate": 4.351570211308745e-05, "loss": 2.6419, "mean_token_accuracy": 0.4034482777118683, "step": 43205 }, { "epoch": 0.043521570926261105, "grad_norm": 11.615216013448267, "learning_rate": 4.352073806981851e-05, "loss": 1.9393, "mean_token_accuracy": 0.5322660088539124, "step": 43210 }, { "epoch": 0.04352660697936528, "grad_norm": 14.429088496901855, "learning_rate": 4.352577402654957e-05, "loss": 2.667, "mean_token_accuracy": 0.3793103456497192, "step": 43215 }, { "epoch": 0.043531643032469446, "grad_norm": 15.758399557454972, "learning_rate": 4.353080998328062e-05, "loss": 2.6936, "mean_token_accuracy": 0.4206896543502808, "step": 43220 }, { "epoch": 0.04353667908557362, "grad_norm": 13.882410084424185, "learning_rate": 4.353584594001169e-05, "loss": 2.58, "mean_token_accuracy": 0.37931033968925476, "step": 43225 }, { "epoch": 0.04354171513867779, "grad_norm": 13.163756328293877, "learning_rate": 4.3540881896742746e-05, "loss": 2.946, "mean_token_accuracy": 0.32758620083332063, "step": 43230 }, { "epoch": 0.04354675119178197, "grad_norm": 14.964656195605372, "learning_rate": 4.3545917853473805e-05, "loss": 2.7152, "mean_token_accuracy": 0.3896551728248596, "step": 43235 }, { "epoch": 0.04355178724488614, "grad_norm": 13.706497100872635, "learning_rate": 4.3550953810204864e-05, "loss": 2.6414, "mean_token_accuracy": 0.35862069129943847, "step": 43240 }, { "epoch": 0.043556823297990314, "grad_norm": 11.65289548517909, "learning_rate": 4.3555989766935924e-05, "loss": 2.3751, "mean_token_accuracy": 0.42413793206214906, "step": 43245 }, { "epoch": 0.04356185935109449, "grad_norm": 11.271780160795974, "learning_rate": 4.356102572366698e-05, "loss": 2.7489, "mean_token_accuracy": 0.4034482777118683, "step": 43250 }, { "epoch": 0.043566895404198655, "grad_norm": 13.040516500444163, "learning_rate": 4.356606168039805e-05, "loss": 2.6822, "mean_token_accuracy": 0.39092559218406675, "step": 43255 }, { "epoch": 0.04357193145730283, "grad_norm": 14.111004047941716, "learning_rate": 4.35710976371291e-05, "loss": 2.8235, "mean_token_accuracy": 0.36896551549434664, "step": 43260 }, { "epoch": 0.043576967510407, "grad_norm": 14.84088171486115, "learning_rate": 4.357613359386016e-05, "loss": 2.8361, "mean_token_accuracy": 0.33103448450565337, "step": 43265 }, { "epoch": 0.043582003563511176, "grad_norm": 10.913966474422473, "learning_rate": 4.358116955059122e-05, "loss": 2.6872, "mean_token_accuracy": 0.41379311084747317, "step": 43270 }, { "epoch": 0.04358703961661535, "grad_norm": 14.482373630583574, "learning_rate": 4.358620550732228e-05, "loss": 2.2416, "mean_token_accuracy": 0.4344827651977539, "step": 43275 }, { "epoch": 0.043592075669719524, "grad_norm": 15.040060118216331, "learning_rate": 4.3591241464053345e-05, "loss": 2.333, "mean_token_accuracy": 0.4655172348022461, "step": 43280 }, { "epoch": 0.0435971117228237, "grad_norm": 13.172571306772097, "learning_rate": 4.3596277420784405e-05, "loss": 2.3704, "mean_token_accuracy": 0.45517241954803467, "step": 43285 }, { "epoch": 0.043602147775927864, "grad_norm": 12.216863193308022, "learning_rate": 4.3601313377515464e-05, "loss": 2.8283, "mean_token_accuracy": 0.3896551728248596, "step": 43290 }, { "epoch": 0.04360718382903204, "grad_norm": 12.939393252751017, "learning_rate": 4.360634933424652e-05, "loss": 2.6627, "mean_token_accuracy": 0.42413793206214906, "step": 43295 }, { "epoch": 0.04361221988213621, "grad_norm": 13.39822288707477, "learning_rate": 4.361138529097758e-05, "loss": 2.8026, "mean_token_accuracy": 0.41724138259887694, "step": 43300 }, { "epoch": 0.043617255935240386, "grad_norm": 10.955606182190163, "learning_rate": 4.361642124770864e-05, "loss": 2.3853, "mean_token_accuracy": 0.4344827592372894, "step": 43305 }, { "epoch": 0.04362229198834456, "grad_norm": 15.601764807936371, "learning_rate": 4.36214572044397e-05, "loss": 2.5724, "mean_token_accuracy": 0.4344827592372894, "step": 43310 }, { "epoch": 0.04362732804144873, "grad_norm": 13.842995389962418, "learning_rate": 4.362649316117076e-05, "loss": 2.3568, "mean_token_accuracy": 0.41034482717514037, "step": 43315 }, { "epoch": 0.04363236409455291, "grad_norm": 12.125376662289419, "learning_rate": 4.363152911790182e-05, "loss": 2.6468, "mean_token_accuracy": 0.4, "step": 43320 }, { "epoch": 0.043637400147657074, "grad_norm": 13.351707789151023, "learning_rate": 4.363656507463288e-05, "loss": 2.8311, "mean_token_accuracy": 0.3482758641242981, "step": 43325 }, { "epoch": 0.04364243620076125, "grad_norm": 18.631419854504433, "learning_rate": 4.364160103136394e-05, "loss": 2.6929, "mean_token_accuracy": 0.4206896543502808, "step": 43330 }, { "epoch": 0.04364747225386542, "grad_norm": 16.702363948374455, "learning_rate": 4.3646636988095004e-05, "loss": 2.5476, "mean_token_accuracy": 0.4000000059604645, "step": 43335 }, { "epoch": 0.043652508306969595, "grad_norm": 15.71787728715803, "learning_rate": 4.365167294482606e-05, "loss": 2.6224, "mean_token_accuracy": 0.3655172407627106, "step": 43340 }, { "epoch": 0.04365754436007377, "grad_norm": 11.742339375091637, "learning_rate": 4.365670890155712e-05, "loss": 2.6505, "mean_token_accuracy": 0.39140955805778505, "step": 43345 }, { "epoch": 0.04366258041317794, "grad_norm": 13.476821274968112, "learning_rate": 4.366174485828818e-05, "loss": 3.0041, "mean_token_accuracy": 0.31724137663841245, "step": 43350 }, { "epoch": 0.043667616466282116, "grad_norm": 13.662896839534794, "learning_rate": 4.3666780815019234e-05, "loss": 2.8501, "mean_token_accuracy": 0.38620689511299133, "step": 43355 }, { "epoch": 0.04367265251938628, "grad_norm": 12.981283474223575, "learning_rate": 4.36718167717503e-05, "loss": 2.3514, "mean_token_accuracy": 0.4103448212146759, "step": 43360 }, { "epoch": 0.04367768857249046, "grad_norm": 13.775124424511866, "learning_rate": 4.367685272848136e-05, "loss": 2.8109, "mean_token_accuracy": 0.35862069129943847, "step": 43365 }, { "epoch": 0.04368272462559463, "grad_norm": 15.648144139818218, "learning_rate": 4.368188868521242e-05, "loss": 2.873, "mean_token_accuracy": 0.3551724135875702, "step": 43370 }, { "epoch": 0.043687760678698805, "grad_norm": 13.240898273242951, "learning_rate": 4.368692464194348e-05, "loss": 2.5301, "mean_token_accuracy": 0.42068966031074523, "step": 43375 }, { "epoch": 0.04369279673180298, "grad_norm": 12.370873212076045, "learning_rate": 4.369196059867454e-05, "loss": 2.917, "mean_token_accuracy": 0.3655172407627106, "step": 43380 }, { "epoch": 0.04369783278490715, "grad_norm": 14.79108232576774, "learning_rate": 4.36969965554056e-05, "loss": 2.4644, "mean_token_accuracy": 0.42068966031074523, "step": 43385 }, { "epoch": 0.043702868838011326, "grad_norm": 11.096601229503467, "learning_rate": 4.370203251213666e-05, "loss": 2.1809, "mean_token_accuracy": 0.4601330876350403, "step": 43390 }, { "epoch": 0.04370790489111549, "grad_norm": 14.3581420605594, "learning_rate": 4.3707068468867715e-05, "loss": 2.8766, "mean_token_accuracy": 0.36551723480224607, "step": 43395 }, { "epoch": 0.043712940944219666, "grad_norm": 19.607621485693915, "learning_rate": 4.3712104425598774e-05, "loss": 2.4569, "mean_token_accuracy": 0.4034482717514038, "step": 43400 }, { "epoch": 0.04371797699732384, "grad_norm": 15.07783977626869, "learning_rate": 4.371714038232983e-05, "loss": 2.7806, "mean_token_accuracy": 0.3896551787853241, "step": 43405 }, { "epoch": 0.043723013050428014, "grad_norm": 14.881302314728204, "learning_rate": 4.37221763390609e-05, "loss": 2.5533, "mean_token_accuracy": 0.37241379022598264, "step": 43410 }, { "epoch": 0.04372804910353219, "grad_norm": 12.65799892415228, "learning_rate": 4.372721229579196e-05, "loss": 2.7452, "mean_token_accuracy": 0.4, "step": 43415 }, { "epoch": 0.04373308515663636, "grad_norm": 14.571732356950381, "learning_rate": 4.373224825252302e-05, "loss": 2.7242, "mean_token_accuracy": 0.3551724135875702, "step": 43420 }, { "epoch": 0.043738121209740535, "grad_norm": 12.2128842564891, "learning_rate": 4.373728420925408e-05, "loss": 2.8017, "mean_token_accuracy": 0.39310344457626345, "step": 43425 }, { "epoch": 0.0437431572628447, "grad_norm": 11.055695417531172, "learning_rate": 4.3742320165985136e-05, "loss": 2.5375, "mean_token_accuracy": 0.39655172228813174, "step": 43430 }, { "epoch": 0.043748193315948876, "grad_norm": 18.6310118213333, "learning_rate": 4.3747356122716196e-05, "loss": 2.9781, "mean_token_accuracy": 0.39310344457626345, "step": 43435 }, { "epoch": 0.04375322936905305, "grad_norm": 11.854790543483865, "learning_rate": 4.3752392079447255e-05, "loss": 2.6211, "mean_token_accuracy": 0.4068965554237366, "step": 43440 }, { "epoch": 0.04375826542215722, "grad_norm": 10.767799032639793, "learning_rate": 4.3757428036178314e-05, "loss": 2.2343, "mean_token_accuracy": 0.4379310369491577, "step": 43445 }, { "epoch": 0.0437633014752614, "grad_norm": 15.006047101797147, "learning_rate": 4.3762463992909373e-05, "loss": 2.3543, "mean_token_accuracy": 0.4137930989265442, "step": 43450 }, { "epoch": 0.04376833752836557, "grad_norm": 14.393955725821202, "learning_rate": 4.376749994964043e-05, "loss": 2.6447, "mean_token_accuracy": 0.38275861740112305, "step": 43455 }, { "epoch": 0.043773373581469745, "grad_norm": 15.249000876082574, "learning_rate": 4.377253590637149e-05, "loss": 2.7696, "mean_token_accuracy": 0.3965517163276672, "step": 43460 }, { "epoch": 0.04377840963457391, "grad_norm": 15.171234870313565, "learning_rate": 4.377757186310256e-05, "loss": 2.5549, "mean_token_accuracy": 0.38118572235107423, "step": 43465 }, { "epoch": 0.043783445687678085, "grad_norm": 12.43069198694147, "learning_rate": 4.378260781983362e-05, "loss": 2.7453, "mean_token_accuracy": 0.38620689809322356, "step": 43470 }, { "epoch": 0.04378848174078226, "grad_norm": 11.891889770491794, "learning_rate": 4.3787643776564676e-05, "loss": 2.3951, "mean_token_accuracy": 0.49655171632766726, "step": 43475 }, { "epoch": 0.04379351779388643, "grad_norm": 12.501430879955116, "learning_rate": 4.3792679733295736e-05, "loss": 3.0204, "mean_token_accuracy": 0.3620689630508423, "step": 43480 }, { "epoch": 0.04379855384699061, "grad_norm": 17.836937478361897, "learning_rate": 4.379771569002679e-05, "loss": 2.5659, "mean_token_accuracy": 0.4103448331356049, "step": 43485 }, { "epoch": 0.04380358990009478, "grad_norm": 12.23916541204761, "learning_rate": 4.3802751646757854e-05, "loss": 2.5333, "mean_token_accuracy": 0.4121597111225128, "step": 43490 }, { "epoch": 0.043808625953198954, "grad_norm": 12.62429262145911, "learning_rate": 4.3807787603488913e-05, "loss": 2.4096, "mean_token_accuracy": 0.46382336020469667, "step": 43495 }, { "epoch": 0.04381366200630312, "grad_norm": 12.086492287296393, "learning_rate": 4.381282356021997e-05, "loss": 2.4053, "mean_token_accuracy": 0.4206896543502808, "step": 43500 }, { "epoch": 0.043818698059407295, "grad_norm": 21.91128880272995, "learning_rate": 4.381785951695103e-05, "loss": 2.9402, "mean_token_accuracy": 0.33793103098869326, "step": 43505 }, { "epoch": 0.04382373411251147, "grad_norm": 12.286437009550408, "learning_rate": 4.382289547368209e-05, "loss": 2.5313, "mean_token_accuracy": 0.4413793087005615, "step": 43510 }, { "epoch": 0.04382877016561564, "grad_norm": 14.393027724450064, "learning_rate": 4.382793143041315e-05, "loss": 2.7175, "mean_token_accuracy": 0.36551723480224607, "step": 43515 }, { "epoch": 0.043833806218719816, "grad_norm": 16.187501449458118, "learning_rate": 4.3832967387144217e-05, "loss": 2.3068, "mean_token_accuracy": 0.4310344815254211, "step": 43520 }, { "epoch": 0.04383884227182399, "grad_norm": 13.768130229337245, "learning_rate": 4.3838003343875276e-05, "loss": 3.0751, "mean_token_accuracy": 0.3551724076271057, "step": 43525 }, { "epoch": 0.043843878324928164, "grad_norm": 12.121721781124167, "learning_rate": 4.384303930060633e-05, "loss": 2.5278, "mean_token_accuracy": 0.38275861740112305, "step": 43530 }, { "epoch": 0.04384891437803233, "grad_norm": 23.668780022074916, "learning_rate": 4.384807525733739e-05, "loss": 2.7998, "mean_token_accuracy": 0.3983061134815216, "step": 43535 }, { "epoch": 0.043853950431136504, "grad_norm": 14.88753247939582, "learning_rate": 4.385311121406845e-05, "loss": 2.7228, "mean_token_accuracy": 0.38275861740112305, "step": 43540 }, { "epoch": 0.04385898648424068, "grad_norm": 18.19546441923281, "learning_rate": 4.385814717079951e-05, "loss": 2.5797, "mean_token_accuracy": 0.3983061194419861, "step": 43545 }, { "epoch": 0.04386402253734485, "grad_norm": 11.873757750081026, "learning_rate": 4.386318312753057e-05, "loss": 2.4235, "mean_token_accuracy": 0.37586207389831544, "step": 43550 }, { "epoch": 0.043869058590449025, "grad_norm": 15.76499525569811, "learning_rate": 4.386821908426163e-05, "loss": 2.609, "mean_token_accuracy": 0.4621294617652893, "step": 43555 }, { "epoch": 0.0438740946435532, "grad_norm": 11.240455594661785, "learning_rate": 4.387325504099269e-05, "loss": 2.5631, "mean_token_accuracy": 0.37586206793785093, "step": 43560 }, { "epoch": 0.04387913069665737, "grad_norm": 12.671307529285949, "learning_rate": 4.387829099772375e-05, "loss": 2.4368, "mean_token_accuracy": 0.4295825779438019, "step": 43565 }, { "epoch": 0.04388416674976154, "grad_norm": 16.006486189997982, "learning_rate": 4.388332695445481e-05, "loss": 2.7819, "mean_token_accuracy": 0.3724137842655182, "step": 43570 }, { "epoch": 0.043889202802865714, "grad_norm": 12.975606203537602, "learning_rate": 4.388836291118587e-05, "loss": 2.4424, "mean_token_accuracy": 0.44137930274009707, "step": 43575 }, { "epoch": 0.04389423885596989, "grad_norm": 12.676119609449293, "learning_rate": 4.389339886791693e-05, "loss": 2.671, "mean_token_accuracy": 0.42758620977401735, "step": 43580 }, { "epoch": 0.04389927490907406, "grad_norm": 12.11500954564936, "learning_rate": 4.389843482464799e-05, "loss": 2.3207, "mean_token_accuracy": 0.37586206793785093, "step": 43585 }, { "epoch": 0.043904310962178235, "grad_norm": 15.230201051089736, "learning_rate": 4.3903470781379046e-05, "loss": 2.6726, "mean_token_accuracy": 0.38106473088264464, "step": 43590 }, { "epoch": 0.04390934701528241, "grad_norm": 14.656423872220582, "learning_rate": 4.3908506738110105e-05, "loss": 2.7699, "mean_token_accuracy": 0.39999998807907106, "step": 43595 }, { "epoch": 0.04391438306838658, "grad_norm": 15.95194955176313, "learning_rate": 4.391354269484117e-05, "loss": 2.9379, "mean_token_accuracy": 0.3689655065536499, "step": 43600 }, { "epoch": 0.04391941912149075, "grad_norm": 15.390449689590145, "learning_rate": 4.391857865157223e-05, "loss": 2.874, "mean_token_accuracy": 0.3793103456497192, "step": 43605 }, { "epoch": 0.04392445517459492, "grad_norm": 14.94454643075687, "learning_rate": 4.392361460830329e-05, "loss": 2.4643, "mean_token_accuracy": 0.3896551728248596, "step": 43610 }, { "epoch": 0.0439294912276991, "grad_norm": 13.992088897946488, "learning_rate": 4.392865056503435e-05, "loss": 3.043, "mean_token_accuracy": 0.32413792610168457, "step": 43615 }, { "epoch": 0.04393452728080327, "grad_norm": 13.114610155239365, "learning_rate": 4.39336865217654e-05, "loss": 2.7146, "mean_token_accuracy": 0.35517241060733795, "step": 43620 }, { "epoch": 0.043939563333907444, "grad_norm": 10.539665188350874, "learning_rate": 4.393872247849647e-05, "loss": 2.5809, "mean_token_accuracy": 0.42758620977401735, "step": 43625 }, { "epoch": 0.04394459938701162, "grad_norm": 15.518204153169714, "learning_rate": 4.394375843522753e-05, "loss": 2.5938, "mean_token_accuracy": 0.4034482777118683, "step": 43630 }, { "epoch": 0.04394963544011579, "grad_norm": 16.01199382653238, "learning_rate": 4.3948794391958586e-05, "loss": 2.7238, "mean_token_accuracy": 0.4137930989265442, "step": 43635 }, { "epoch": 0.04395467149321996, "grad_norm": 15.086098118816434, "learning_rate": 4.3953830348689645e-05, "loss": 2.7005, "mean_token_accuracy": 0.39310344457626345, "step": 43640 }, { "epoch": 0.04395970754632413, "grad_norm": 14.420266891746307, "learning_rate": 4.3958866305420705e-05, "loss": 2.5537, "mean_token_accuracy": 0.4379310369491577, "step": 43645 }, { "epoch": 0.043964743599428306, "grad_norm": 13.540792469571088, "learning_rate": 4.396390226215177e-05, "loss": 3.1272, "mean_token_accuracy": 0.3551724135875702, "step": 43650 }, { "epoch": 0.04396977965253248, "grad_norm": 16.99566382146657, "learning_rate": 4.396893821888283e-05, "loss": 2.4422, "mean_token_accuracy": 0.43448275327682495, "step": 43655 }, { "epoch": 0.043974815705636654, "grad_norm": 17.44836585646702, "learning_rate": 4.397397417561388e-05, "loss": 2.4529, "mean_token_accuracy": 0.4344827592372894, "step": 43660 }, { "epoch": 0.04397985175874083, "grad_norm": 10.885810091377175, "learning_rate": 4.397901013234494e-05, "loss": 2.6459, "mean_token_accuracy": 0.3910465776920319, "step": 43665 }, { "epoch": 0.043984887811845, "grad_norm": 12.550285853406592, "learning_rate": 4.3984046089076e-05, "loss": 2.6635, "mean_token_accuracy": 0.3517241388559341, "step": 43670 }, { "epoch": 0.04398992386494917, "grad_norm": 9.49613247794277, "learning_rate": 4.398908204580706e-05, "loss": 2.4407, "mean_token_accuracy": 0.3724137872457504, "step": 43675 }, { "epoch": 0.04399495991805334, "grad_norm": 13.125437573356784, "learning_rate": 4.3994118002538126e-05, "loss": 2.7576, "mean_token_accuracy": 0.35862069129943847, "step": 43680 }, { "epoch": 0.043999995971157516, "grad_norm": 12.839872089977552, "learning_rate": 4.3999153959269185e-05, "loss": 2.6956, "mean_token_accuracy": 0.37931033968925476, "step": 43685 }, { "epoch": 0.04400503202426169, "grad_norm": 14.377575481536416, "learning_rate": 4.4004189916000245e-05, "loss": 2.6968, "mean_token_accuracy": 0.39655172228813174, "step": 43690 }, { "epoch": 0.04401006807736586, "grad_norm": 10.88990637775244, "learning_rate": 4.4009225872731304e-05, "loss": 2.3862, "mean_token_accuracy": 0.38275861740112305, "step": 43695 }, { "epoch": 0.04401510413047004, "grad_norm": 13.919210469504824, "learning_rate": 4.401426182946236e-05, "loss": 2.7538, "mean_token_accuracy": 0.3724137991666794, "step": 43700 }, { "epoch": 0.04402014018357421, "grad_norm": 11.150584547296491, "learning_rate": 4.401929778619342e-05, "loss": 2.5478, "mean_token_accuracy": 0.37241379022598264, "step": 43705 }, { "epoch": 0.04402517623667838, "grad_norm": 12.913715939089982, "learning_rate": 4.402433374292448e-05, "loss": 2.5826, "mean_token_accuracy": 0.37586206793785093, "step": 43710 }, { "epoch": 0.04403021228978255, "grad_norm": 11.547651674715615, "learning_rate": 4.402936969965554e-05, "loss": 2.1618, "mean_token_accuracy": 0.43793103098869324, "step": 43715 }, { "epoch": 0.044035248342886725, "grad_norm": 13.768130087692429, "learning_rate": 4.40344056563866e-05, "loss": 2.6029, "mean_token_accuracy": 0.3848154842853546, "step": 43720 }, { "epoch": 0.0440402843959909, "grad_norm": 13.752865387757156, "learning_rate": 4.403944161311766e-05, "loss": 2.3477, "mean_token_accuracy": 0.42758620977401735, "step": 43725 }, { "epoch": 0.04404532044909507, "grad_norm": 16.475875742150453, "learning_rate": 4.4044477569848725e-05, "loss": 3.1165, "mean_token_accuracy": 0.31379309892654417, "step": 43730 }, { "epoch": 0.044050356502199246, "grad_norm": 14.55227356458699, "learning_rate": 4.4049513526579785e-05, "loss": 3.0658, "mean_token_accuracy": 0.4026618242263794, "step": 43735 }, { "epoch": 0.04405539255530342, "grad_norm": 20.005117202031062, "learning_rate": 4.4054549483310844e-05, "loss": 2.6527, "mean_token_accuracy": 0.3517241358757019, "step": 43740 }, { "epoch": 0.04406042860840759, "grad_norm": 14.050212591558072, "learning_rate": 4.40595854400419e-05, "loss": 3.0267, "mean_token_accuracy": 0.3655172407627106, "step": 43745 }, { "epoch": 0.04406546466151176, "grad_norm": 13.67818999601452, "learning_rate": 4.406462139677296e-05, "loss": 2.8563, "mean_token_accuracy": 0.36896551847457887, "step": 43750 }, { "epoch": 0.044070500714615934, "grad_norm": 11.334334092583576, "learning_rate": 4.406965735350402e-05, "loss": 2.4591, "mean_token_accuracy": 0.4257108271121979, "step": 43755 }, { "epoch": 0.04407553676772011, "grad_norm": 13.272564301251222, "learning_rate": 4.407469331023508e-05, "loss": 2.7904, "mean_token_accuracy": 0.4068965554237366, "step": 43760 }, { "epoch": 0.04408057282082428, "grad_norm": 15.710494821795324, "learning_rate": 4.407972926696614e-05, "loss": 2.6368, "mean_token_accuracy": 0.4103448212146759, "step": 43765 }, { "epoch": 0.044085608873928456, "grad_norm": 11.905366655545963, "learning_rate": 4.40847652236972e-05, "loss": 2.2527, "mean_token_accuracy": 0.4586206912994385, "step": 43770 }, { "epoch": 0.04409064492703263, "grad_norm": 15.595386346370512, "learning_rate": 4.408980118042826e-05, "loss": 2.7083, "mean_token_accuracy": 0.4034482777118683, "step": 43775 }, { "epoch": 0.044095680980136796, "grad_norm": 16.178445419278653, "learning_rate": 4.409483713715932e-05, "loss": 2.592, "mean_token_accuracy": 0.42758620381355283, "step": 43780 }, { "epoch": 0.04410071703324097, "grad_norm": 13.205894255430962, "learning_rate": 4.4099873093890384e-05, "loss": 2.4082, "mean_token_accuracy": 0.4068965494632721, "step": 43785 }, { "epoch": 0.044105753086345144, "grad_norm": 11.394460263659385, "learning_rate": 4.410490905062144e-05, "loss": 2.5042, "mean_token_accuracy": 0.4068965494632721, "step": 43790 }, { "epoch": 0.04411078913944932, "grad_norm": 12.675084919974456, "learning_rate": 4.4109945007352496e-05, "loss": 2.6957, "mean_token_accuracy": 0.38275861740112305, "step": 43795 }, { "epoch": 0.04411582519255349, "grad_norm": 17.684637539547733, "learning_rate": 4.4114980964083555e-05, "loss": 2.8493, "mean_token_accuracy": 0.42068966031074523, "step": 43800 }, { "epoch": 0.044120861245657665, "grad_norm": 11.456533280840457, "learning_rate": 4.4120016920814614e-05, "loss": 2.5509, "mean_token_accuracy": 0.37931033968925476, "step": 43805 }, { "epoch": 0.04412589729876184, "grad_norm": 15.016299058846634, "learning_rate": 4.412505287754568e-05, "loss": 2.5862, "mean_token_accuracy": 0.3517241388559341, "step": 43810 }, { "epoch": 0.044130933351866006, "grad_norm": 14.383159694238401, "learning_rate": 4.413008883427674e-05, "loss": 2.4404, "mean_token_accuracy": 0.4034482777118683, "step": 43815 }, { "epoch": 0.04413596940497018, "grad_norm": 12.118681224847318, "learning_rate": 4.41351247910078e-05, "loss": 2.4932, "mean_token_accuracy": 0.3896551728248596, "step": 43820 }, { "epoch": 0.04414100545807435, "grad_norm": 17.133862619974614, "learning_rate": 4.414016074773886e-05, "loss": 2.7807, "mean_token_accuracy": 0.3620689630508423, "step": 43825 }, { "epoch": 0.04414604151117853, "grad_norm": 20.056912466891188, "learning_rate": 4.414519670446992e-05, "loss": 3.4276, "mean_token_accuracy": 0.28620689511299136, "step": 43830 }, { "epoch": 0.0441510775642827, "grad_norm": 18.102861563149474, "learning_rate": 4.4150232661200977e-05, "loss": 2.7318, "mean_token_accuracy": 0.37586205899715425, "step": 43835 }, { "epoch": 0.044156113617386875, "grad_norm": 14.292039411223833, "learning_rate": 4.4155268617932036e-05, "loss": 3.1264, "mean_token_accuracy": 0.3379310339689255, "step": 43840 }, { "epoch": 0.04416114967049105, "grad_norm": 10.579454149875932, "learning_rate": 4.4160304574663095e-05, "loss": 2.1754, "mean_token_accuracy": 0.44313369393348695, "step": 43845 }, { "epoch": 0.044166185723595215, "grad_norm": 15.396866945426872, "learning_rate": 4.4165340531394154e-05, "loss": 2.7405, "mean_token_accuracy": 0.4, "step": 43850 }, { "epoch": 0.04417122177669939, "grad_norm": 15.528453114757294, "learning_rate": 4.4170376488125214e-05, "loss": 2.6528, "mean_token_accuracy": 0.39310343861579894, "step": 43855 }, { "epoch": 0.04417625782980356, "grad_norm": 14.74537839469161, "learning_rate": 4.417541244485627e-05, "loss": 2.8865, "mean_token_accuracy": 0.4206896543502808, "step": 43860 }, { "epoch": 0.044181293882907736, "grad_norm": 13.775705182332455, "learning_rate": 4.418044840158734e-05, "loss": 2.5958, "mean_token_accuracy": 0.37931033968925476, "step": 43865 }, { "epoch": 0.04418632993601191, "grad_norm": 11.376219245554795, "learning_rate": 4.41854843583184e-05, "loss": 2.3761, "mean_token_accuracy": 0.4, "step": 43870 }, { "epoch": 0.044191365989116084, "grad_norm": 20.86183329100325, "learning_rate": 4.419052031504946e-05, "loss": 2.6758, "mean_token_accuracy": 0.3482758641242981, "step": 43875 }, { "epoch": 0.04419640204222026, "grad_norm": 11.68228321000608, "learning_rate": 4.4195556271780517e-05, "loss": 2.5105, "mean_token_accuracy": 0.40689654648303986, "step": 43880 }, { "epoch": 0.044201438095324425, "grad_norm": 13.42922684174359, "learning_rate": 4.4200592228511576e-05, "loss": 2.1671, "mean_token_accuracy": 0.42413793206214906, "step": 43885 }, { "epoch": 0.0442064741484286, "grad_norm": 16.972728813627693, "learning_rate": 4.4205628185242635e-05, "loss": 3.1125, "mean_token_accuracy": 0.3620689630508423, "step": 43890 }, { "epoch": 0.04421151020153277, "grad_norm": 14.81411444748737, "learning_rate": 4.4210664141973694e-05, "loss": 2.5934, "mean_token_accuracy": 0.41034482717514037, "step": 43895 }, { "epoch": 0.044216546254636946, "grad_norm": 15.204932694191843, "learning_rate": 4.4215700098704754e-05, "loss": 2.8579, "mean_token_accuracy": 0.3846340000629425, "step": 43900 }, { "epoch": 0.04422158230774112, "grad_norm": 17.49491941604814, "learning_rate": 4.422073605543581e-05, "loss": 2.8646, "mean_token_accuracy": 0.4034482717514038, "step": 43905 }, { "epoch": 0.04422661836084529, "grad_norm": 10.743589355484843, "learning_rate": 4.422577201216687e-05, "loss": 2.3692, "mean_token_accuracy": 0.42413793206214906, "step": 43910 }, { "epoch": 0.04423165441394947, "grad_norm": 14.414382728486386, "learning_rate": 4.423080796889794e-05, "loss": 2.5724, "mean_token_accuracy": 0.41034482717514037, "step": 43915 }, { "epoch": 0.044236690467053634, "grad_norm": 13.165846234432177, "learning_rate": 4.4235843925629e-05, "loss": 2.6172, "mean_token_accuracy": 0.4310344815254211, "step": 43920 }, { "epoch": 0.04424172652015781, "grad_norm": 14.486240487120567, "learning_rate": 4.424087988236006e-05, "loss": 2.9629, "mean_token_accuracy": 0.4, "step": 43925 }, { "epoch": 0.04424676257326198, "grad_norm": 13.660265930657884, "learning_rate": 4.424591583909111e-05, "loss": 2.632, "mean_token_accuracy": 0.3896551728248596, "step": 43930 }, { "epoch": 0.044251798626366155, "grad_norm": 16.07635783765737, "learning_rate": 4.425095179582217e-05, "loss": 2.8131, "mean_token_accuracy": 0.38965516686439516, "step": 43935 }, { "epoch": 0.04425683467947033, "grad_norm": 14.385083765692428, "learning_rate": 4.425598775255323e-05, "loss": 2.3577, "mean_token_accuracy": 0.42068966031074523, "step": 43940 }, { "epoch": 0.0442618707325745, "grad_norm": 12.208947757770387, "learning_rate": 4.4261023709284294e-05, "loss": 2.6754, "mean_token_accuracy": 0.3448275804519653, "step": 43945 }, { "epoch": 0.04426690678567868, "grad_norm": 10.20887015461642, "learning_rate": 4.426605966601535e-05, "loss": 2.3222, "mean_token_accuracy": 0.4091349124908447, "step": 43950 }, { "epoch": 0.04427194283878284, "grad_norm": 25.587248342384086, "learning_rate": 4.427109562274641e-05, "loss": 2.4235, "mean_token_accuracy": 0.40344828367233276, "step": 43955 }, { "epoch": 0.04427697889188702, "grad_norm": 35.544632456124724, "learning_rate": 4.427613157947747e-05, "loss": 2.7106, "mean_token_accuracy": 0.36733212471008303, "step": 43960 }, { "epoch": 0.04428201494499119, "grad_norm": 13.672025715210397, "learning_rate": 4.428116753620853e-05, "loss": 2.4017, "mean_token_accuracy": 0.3827586203813553, "step": 43965 }, { "epoch": 0.044287050998095365, "grad_norm": 16.313576476635678, "learning_rate": 4.428620349293959e-05, "loss": 2.8721, "mean_token_accuracy": 0.35359951853752136, "step": 43970 }, { "epoch": 0.04429208705119954, "grad_norm": 12.951026491725921, "learning_rate": 4.429123944967065e-05, "loss": 2.2072, "mean_token_accuracy": 0.46436781883239747, "step": 43975 }, { "epoch": 0.04429712310430371, "grad_norm": 12.55905992552368, "learning_rate": 4.429627540640171e-05, "loss": 2.6696, "mean_token_accuracy": 0.4068965494632721, "step": 43980 }, { "epoch": 0.044302159157407886, "grad_norm": 13.247270304542925, "learning_rate": 4.430131136313277e-05, "loss": 3.2904, "mean_token_accuracy": 0.3517241418361664, "step": 43985 }, { "epoch": 0.04430719521051205, "grad_norm": 20.180152137718867, "learning_rate": 4.430634731986383e-05, "loss": 2.9211, "mean_token_accuracy": 0.38777979612350466, "step": 43990 }, { "epoch": 0.04431223126361623, "grad_norm": 11.171229089528058, "learning_rate": 4.431138327659489e-05, "loss": 2.4675, "mean_token_accuracy": 0.39310343861579894, "step": 43995 }, { "epoch": 0.0443172673167204, "grad_norm": 13.244745715143324, "learning_rate": 4.431641923332595e-05, "loss": 2.1049, "mean_token_accuracy": 0.46763460636138915, "step": 44000 }, { "epoch": 0.044322303369824574, "grad_norm": 11.586161596776906, "learning_rate": 4.432145519005701e-05, "loss": 2.5895, "mean_token_accuracy": 0.4347290605306625, "step": 44005 }, { "epoch": 0.04432733942292875, "grad_norm": 12.802390562995457, "learning_rate": 4.432649114678807e-05, "loss": 2.5838, "mean_token_accuracy": 0.37931033968925476, "step": 44010 }, { "epoch": 0.04433237547603292, "grad_norm": 16.577183767493647, "learning_rate": 4.433152710351913e-05, "loss": 2.7776, "mean_token_accuracy": 0.42068964838981626, "step": 44015 }, { "epoch": 0.044337411529137095, "grad_norm": 14.65299066960234, "learning_rate": 4.433656306025018e-05, "loss": 2.5993, "mean_token_accuracy": 0.36896551847457887, "step": 44020 }, { "epoch": 0.04434244758224126, "grad_norm": 17.574626153518555, "learning_rate": 4.434159901698125e-05, "loss": 2.7832, "mean_token_accuracy": 0.3758620619773865, "step": 44025 }, { "epoch": 0.044347483635345436, "grad_norm": 15.266270271923032, "learning_rate": 4.434663497371231e-05, "loss": 2.6148, "mean_token_accuracy": 0.4000000059604645, "step": 44030 }, { "epoch": 0.04435251968844961, "grad_norm": 17.52078813585745, "learning_rate": 4.435167093044337e-05, "loss": 3.1746, "mean_token_accuracy": 0.38620689511299133, "step": 44035 }, { "epoch": 0.044357555741553784, "grad_norm": 13.204293847967051, "learning_rate": 4.4356706887174426e-05, "loss": 2.7203, "mean_token_accuracy": 0.3827586233615875, "step": 44040 }, { "epoch": 0.04436259179465796, "grad_norm": 15.78829097756, "learning_rate": 4.4361742843905485e-05, "loss": 2.7295, "mean_token_accuracy": 0.40508166551589964, "step": 44045 }, { "epoch": 0.04436762784776213, "grad_norm": 11.916087929460101, "learning_rate": 4.436677880063655e-05, "loss": 2.304, "mean_token_accuracy": 0.44827585816383364, "step": 44050 }, { "epoch": 0.044372663900866305, "grad_norm": 13.993047007449642, "learning_rate": 4.437181475736761e-05, "loss": 2.658, "mean_token_accuracy": 0.44137930274009707, "step": 44055 }, { "epoch": 0.04437769995397047, "grad_norm": 12.34778371608988, "learning_rate": 4.437685071409867e-05, "loss": 2.8391, "mean_token_accuracy": 0.3551724135875702, "step": 44060 }, { "epoch": 0.044382736007074645, "grad_norm": 13.896543441911039, "learning_rate": 4.438188667082972e-05, "loss": 2.3494, "mean_token_accuracy": 0.4, "step": 44065 }, { "epoch": 0.04438777206017882, "grad_norm": 11.07907737241743, "learning_rate": 4.438692262756078e-05, "loss": 2.2097, "mean_token_accuracy": 0.476043564081192, "step": 44070 }, { "epoch": 0.04439280811328299, "grad_norm": 12.03353161885049, "learning_rate": 4.439195858429185e-05, "loss": 2.9754, "mean_token_accuracy": 0.3620689660310745, "step": 44075 }, { "epoch": 0.04439784416638717, "grad_norm": 11.057742877871155, "learning_rate": 4.439699454102291e-05, "loss": 2.2957, "mean_token_accuracy": 0.4172413766384125, "step": 44080 }, { "epoch": 0.04440288021949134, "grad_norm": 13.268963383080676, "learning_rate": 4.4402030497753966e-05, "loss": 2.4289, "mean_token_accuracy": 0.37931033968925476, "step": 44085 }, { "epoch": 0.044407916272595514, "grad_norm": 16.029497578611387, "learning_rate": 4.4407066454485026e-05, "loss": 2.9301, "mean_token_accuracy": 0.3275862067937851, "step": 44090 }, { "epoch": 0.04441295232569968, "grad_norm": 13.869820287353509, "learning_rate": 4.4412102411216085e-05, "loss": 2.5593, "mean_token_accuracy": 0.3793103456497192, "step": 44095 }, { "epoch": 0.044417988378803855, "grad_norm": 13.255381435259588, "learning_rate": 4.4417138367947144e-05, "loss": 2.4582, "mean_token_accuracy": 0.4241379380226135, "step": 44100 }, { "epoch": 0.04442302443190803, "grad_norm": 16.74543282962715, "learning_rate": 4.44221743246782e-05, "loss": 2.8708, "mean_token_accuracy": 0.33103448152542114, "step": 44105 }, { "epoch": 0.0444280604850122, "grad_norm": 12.772562394945036, "learning_rate": 4.442721028140926e-05, "loss": 2.3225, "mean_token_accuracy": 0.4689655125141144, "step": 44110 }, { "epoch": 0.044433096538116376, "grad_norm": 11.65763473842032, "learning_rate": 4.443224623814032e-05, "loss": 2.2633, "mean_token_accuracy": 0.46061705946922304, "step": 44115 }, { "epoch": 0.04443813259122055, "grad_norm": 19.894449697057485, "learning_rate": 4.443728219487138e-05, "loss": 2.5711, "mean_token_accuracy": 0.39310345351696013, "step": 44120 }, { "epoch": 0.044443168644324724, "grad_norm": 14.066864522239403, "learning_rate": 4.444231815160244e-05, "loss": 2.513, "mean_token_accuracy": 0.4068965554237366, "step": 44125 }, { "epoch": 0.04444820469742889, "grad_norm": 13.095708790248539, "learning_rate": 4.4447354108333506e-05, "loss": 2.6114, "mean_token_accuracy": 0.3965517282485962, "step": 44130 }, { "epoch": 0.044453240750533064, "grad_norm": 15.791717803668181, "learning_rate": 4.4452390065064566e-05, "loss": 3.0297, "mean_token_accuracy": 0.3862069010734558, "step": 44135 }, { "epoch": 0.04445827680363724, "grad_norm": 16.574606331727026, "learning_rate": 4.4457426021795625e-05, "loss": 2.9818, "mean_token_accuracy": 0.3482758641242981, "step": 44140 }, { "epoch": 0.04446331285674141, "grad_norm": 13.482615368760458, "learning_rate": 4.4462461978526684e-05, "loss": 2.5806, "mean_token_accuracy": 0.4172413766384125, "step": 44145 }, { "epoch": 0.044468348909845586, "grad_norm": 15.556970806392002, "learning_rate": 4.446749793525774e-05, "loss": 2.4947, "mean_token_accuracy": 0.33103448152542114, "step": 44150 }, { "epoch": 0.04447338496294976, "grad_norm": 11.859608417894577, "learning_rate": 4.44725338919888e-05, "loss": 2.9373, "mean_token_accuracy": 0.33448275923728943, "step": 44155 }, { "epoch": 0.04447842101605393, "grad_norm": 13.71762418087473, "learning_rate": 4.447756984871986e-05, "loss": 3.223, "mean_token_accuracy": 0.35862069129943847, "step": 44160 }, { "epoch": 0.0444834570691581, "grad_norm": 12.389245952732699, "learning_rate": 4.448260580545092e-05, "loss": 2.8521, "mean_token_accuracy": 0.37586206793785093, "step": 44165 }, { "epoch": 0.044488493122262274, "grad_norm": 12.575490624298796, "learning_rate": 4.448764176218198e-05, "loss": 2.8181, "mean_token_accuracy": 0.38275861740112305, "step": 44170 }, { "epoch": 0.04449352917536645, "grad_norm": 11.180172218204662, "learning_rate": 4.449267771891304e-05, "loss": 2.2103, "mean_token_accuracy": 0.49999999403953554, "step": 44175 }, { "epoch": 0.04449856522847062, "grad_norm": 12.24252896763768, "learning_rate": 4.4497713675644106e-05, "loss": 2.9076, "mean_token_accuracy": 0.35862068831920624, "step": 44180 }, { "epoch": 0.044503601281574795, "grad_norm": 11.678041234348868, "learning_rate": 4.4502749632375165e-05, "loss": 2.5964, "mean_token_accuracy": 0.36896551251411436, "step": 44185 }, { "epoch": 0.04450863733467897, "grad_norm": 13.456046012696975, "learning_rate": 4.4507785589106224e-05, "loss": 2.6489, "mean_token_accuracy": 0.38275861740112305, "step": 44190 }, { "epoch": 0.04451367338778314, "grad_norm": 10.8990411141468, "learning_rate": 4.451282154583728e-05, "loss": 2.0318, "mean_token_accuracy": 0.47586206793785096, "step": 44195 }, { "epoch": 0.04451870944088731, "grad_norm": 16.139466028786916, "learning_rate": 4.4517857502568336e-05, "loss": 2.8554, "mean_token_accuracy": 0.358620685338974, "step": 44200 }, { "epoch": 0.04452374549399148, "grad_norm": 11.493187539473524, "learning_rate": 4.4522893459299395e-05, "loss": 2.6881, "mean_token_accuracy": 0.42758620381355283, "step": 44205 }, { "epoch": 0.04452878154709566, "grad_norm": 13.015327060511801, "learning_rate": 4.452792941603046e-05, "loss": 2.5566, "mean_token_accuracy": 0.3807622492313385, "step": 44210 }, { "epoch": 0.04453381760019983, "grad_norm": 12.039084357471584, "learning_rate": 4.453296537276152e-05, "loss": 2.1018, "mean_token_accuracy": 0.46551724076271056, "step": 44215 }, { "epoch": 0.044538853653304004, "grad_norm": 12.790894339968776, "learning_rate": 4.453800132949258e-05, "loss": 2.5324, "mean_token_accuracy": 0.40689654350280763, "step": 44220 }, { "epoch": 0.04454388970640818, "grad_norm": 13.900376808674402, "learning_rate": 4.454303728622364e-05, "loss": 2.0906, "mean_token_accuracy": 0.4535390198230743, "step": 44225 }, { "epoch": 0.04454892575951235, "grad_norm": 15.818694148213773, "learning_rate": 4.45480732429547e-05, "loss": 2.4829, "mean_token_accuracy": 0.4089534133672714, "step": 44230 }, { "epoch": 0.04455396181261652, "grad_norm": 14.415968311929939, "learning_rate": 4.455310919968576e-05, "loss": 2.5899, "mean_token_accuracy": 0.3848759740591049, "step": 44235 }, { "epoch": 0.04455899786572069, "grad_norm": 16.9729841896028, "learning_rate": 4.455814515641682e-05, "loss": 2.5431, "mean_token_accuracy": 0.3862068891525269, "step": 44240 }, { "epoch": 0.044564033918824866, "grad_norm": 13.701515115043184, "learning_rate": 4.4563181113147876e-05, "loss": 2.228, "mean_token_accuracy": 0.4413793206214905, "step": 44245 }, { "epoch": 0.04456906997192904, "grad_norm": 12.435988768084208, "learning_rate": 4.4568217069878935e-05, "loss": 2.4747, "mean_token_accuracy": 0.41034482717514037, "step": 44250 }, { "epoch": 0.044574106025033214, "grad_norm": 22.5872541908809, "learning_rate": 4.4573253026609994e-05, "loss": 2.8522, "mean_token_accuracy": 0.40344826579093934, "step": 44255 }, { "epoch": 0.04457914207813739, "grad_norm": 14.338152582830393, "learning_rate": 4.457828898334106e-05, "loss": 2.508, "mean_token_accuracy": 0.4293406009674072, "step": 44260 }, { "epoch": 0.04458417813124156, "grad_norm": 15.278189641122646, "learning_rate": 4.458332494007212e-05, "loss": 3.1292, "mean_token_accuracy": 0.3551724165678024, "step": 44265 }, { "epoch": 0.04458921418434573, "grad_norm": 12.401834703512241, "learning_rate": 4.458836089680318e-05, "loss": 2.4663, "mean_token_accuracy": 0.3910465896129608, "step": 44270 }, { "epoch": 0.0445942502374499, "grad_norm": 12.507967426027502, "learning_rate": 4.459339685353424e-05, "loss": 2.4483, "mean_token_accuracy": 0.38275861740112305, "step": 44275 }, { "epoch": 0.044599286290554076, "grad_norm": 14.02532766013267, "learning_rate": 4.45984328102653e-05, "loss": 2.6396, "mean_token_accuracy": 0.37241379022598264, "step": 44280 }, { "epoch": 0.04460432234365825, "grad_norm": 18.285235598600288, "learning_rate": 4.460346876699636e-05, "loss": 2.4179, "mean_token_accuracy": 0.47241380214691164, "step": 44285 }, { "epoch": 0.04460935839676242, "grad_norm": 15.31588776597003, "learning_rate": 4.4608504723727416e-05, "loss": 2.7084, "mean_token_accuracy": 0.38275861740112305, "step": 44290 }, { "epoch": 0.0446143944498666, "grad_norm": 14.054555029166396, "learning_rate": 4.4613540680458475e-05, "loss": 2.848, "mean_token_accuracy": 0.37241379618644715, "step": 44295 }, { "epoch": 0.04461943050297077, "grad_norm": 12.215086349634712, "learning_rate": 4.4618576637189534e-05, "loss": 2.6303, "mean_token_accuracy": 0.3862068921327591, "step": 44300 }, { "epoch": 0.04462446655607494, "grad_norm": 12.151825269295024, "learning_rate": 4.4623612593920594e-05, "loss": 2.3216, "mean_token_accuracy": 0.43103448748588563, "step": 44305 }, { "epoch": 0.04462950260917911, "grad_norm": 12.337592109693574, "learning_rate": 4.462864855065165e-05, "loss": 2.6531, "mean_token_accuracy": 0.35862069129943847, "step": 44310 }, { "epoch": 0.044634538662283285, "grad_norm": 11.308240696710952, "learning_rate": 4.463368450738272e-05, "loss": 2.4427, "mean_token_accuracy": 0.45172414779663084, "step": 44315 }, { "epoch": 0.04463957471538746, "grad_norm": 15.057823959320642, "learning_rate": 4.463872046411378e-05, "loss": 2.1578, "mean_token_accuracy": 0.5034482836723327, "step": 44320 }, { "epoch": 0.04464461076849163, "grad_norm": 13.22310463344464, "learning_rate": 4.464375642084484e-05, "loss": 2.8505, "mean_token_accuracy": 0.42758620381355283, "step": 44325 }, { "epoch": 0.044649646821595806, "grad_norm": 12.933959932656371, "learning_rate": 4.464879237757589e-05, "loss": 2.4781, "mean_token_accuracy": 0.3862069010734558, "step": 44330 }, { "epoch": 0.04465468287469998, "grad_norm": 14.31492755930774, "learning_rate": 4.465382833430695e-05, "loss": 2.677, "mean_token_accuracy": 0.3793103456497192, "step": 44335 }, { "epoch": 0.04465971892780415, "grad_norm": 18.416663979726813, "learning_rate": 4.4658864291038015e-05, "loss": 2.7629, "mean_token_accuracy": 0.3758620619773865, "step": 44340 }, { "epoch": 0.04466475498090832, "grad_norm": 15.29278948661555, "learning_rate": 4.4663900247769075e-05, "loss": 2.8602, "mean_token_accuracy": 0.37241379618644715, "step": 44345 }, { "epoch": 0.044669791034012495, "grad_norm": 13.182355378266555, "learning_rate": 4.4668936204500134e-05, "loss": 2.2739, "mean_token_accuracy": 0.3999999940395355, "step": 44350 }, { "epoch": 0.04467482708711667, "grad_norm": 12.920442536862305, "learning_rate": 4.467397216123119e-05, "loss": 2.4517, "mean_token_accuracy": 0.382758629322052, "step": 44355 }, { "epoch": 0.04467986314022084, "grad_norm": 14.157576514165662, "learning_rate": 4.467900811796225e-05, "loss": 2.6826, "mean_token_accuracy": 0.37931033968925476, "step": 44360 }, { "epoch": 0.044684899193325016, "grad_norm": 13.451230478782248, "learning_rate": 4.468404407469331e-05, "loss": 2.3212, "mean_token_accuracy": 0.4172413766384125, "step": 44365 }, { "epoch": 0.04468993524642919, "grad_norm": 13.861578518274676, "learning_rate": 4.468908003142437e-05, "loss": 2.8463, "mean_token_accuracy": 0.37241379618644715, "step": 44370 }, { "epoch": 0.044694971299533356, "grad_norm": 12.230623727688428, "learning_rate": 4.469411598815543e-05, "loss": 2.4743, "mean_token_accuracy": 0.4068965554237366, "step": 44375 }, { "epoch": 0.04470000735263753, "grad_norm": 13.447931386339436, "learning_rate": 4.469915194488649e-05, "loss": 2.2186, "mean_token_accuracy": 0.40344826877117157, "step": 44380 }, { "epoch": 0.044705043405741704, "grad_norm": 18.12107992714274, "learning_rate": 4.470418790161755e-05, "loss": 2.498, "mean_token_accuracy": 0.42758620977401735, "step": 44385 }, { "epoch": 0.04471007945884588, "grad_norm": 12.805731503744495, "learning_rate": 4.470922385834861e-05, "loss": 2.6872, "mean_token_accuracy": 0.42413792610168455, "step": 44390 }, { "epoch": 0.04471511551195005, "grad_norm": 13.769819248922563, "learning_rate": 4.4714259815079674e-05, "loss": 2.4897, "mean_token_accuracy": 0.4162129402160645, "step": 44395 }, { "epoch": 0.044720151565054225, "grad_norm": 11.864130916783793, "learning_rate": 4.471929577181073e-05, "loss": 2.7927, "mean_token_accuracy": 0.3793103456497192, "step": 44400 }, { "epoch": 0.0447251876181584, "grad_norm": 13.396209461691038, "learning_rate": 4.472433172854179e-05, "loss": 2.3081, "mean_token_accuracy": 0.4068965494632721, "step": 44405 }, { "epoch": 0.044730223671262566, "grad_norm": 12.709868966267702, "learning_rate": 4.472936768527285e-05, "loss": 2.1781, "mean_token_accuracy": 0.41724138259887694, "step": 44410 }, { "epoch": 0.04473525972436674, "grad_norm": 12.11317895640667, "learning_rate": 4.473440364200391e-05, "loss": 2.7178, "mean_token_accuracy": 0.3517241358757019, "step": 44415 }, { "epoch": 0.04474029577747091, "grad_norm": 15.407549047555106, "learning_rate": 4.473943959873497e-05, "loss": 2.8256, "mean_token_accuracy": 0.3362976431846619, "step": 44420 }, { "epoch": 0.04474533183057509, "grad_norm": 14.130655391230654, "learning_rate": 4.474447555546603e-05, "loss": 2.3911, "mean_token_accuracy": 0.38620689511299133, "step": 44425 }, { "epoch": 0.04475036788367926, "grad_norm": 10.926372087200113, "learning_rate": 4.474951151219709e-05, "loss": 2.6161, "mean_token_accuracy": 0.35862069129943847, "step": 44430 }, { "epoch": 0.044755403936783435, "grad_norm": 12.325945859230604, "learning_rate": 4.475454746892815e-05, "loss": 2.4937, "mean_token_accuracy": 0.38620689511299133, "step": 44435 }, { "epoch": 0.04476043998988761, "grad_norm": 18.215045573944565, "learning_rate": 4.475958342565921e-05, "loss": 2.7619, "mean_token_accuracy": 0.3448275774717331, "step": 44440 }, { "epoch": 0.044765476042991775, "grad_norm": 14.807605192228278, "learning_rate": 4.4764619382390266e-05, "loss": 2.489, "mean_token_accuracy": 0.42068964838981626, "step": 44445 }, { "epoch": 0.04477051209609595, "grad_norm": 13.892652389388157, "learning_rate": 4.476965533912133e-05, "loss": 2.8009, "mean_token_accuracy": 0.3551724135875702, "step": 44450 }, { "epoch": 0.04477554814920012, "grad_norm": 13.361401039907307, "learning_rate": 4.477469129585239e-05, "loss": 2.5935, "mean_token_accuracy": 0.3206896513700485, "step": 44455 }, { "epoch": 0.0447805842023043, "grad_norm": 12.729058355462525, "learning_rate": 4.477972725258345e-05, "loss": 2.5722, "mean_token_accuracy": 0.3551724076271057, "step": 44460 }, { "epoch": 0.04478562025540847, "grad_norm": 11.627597072418135, "learning_rate": 4.4784763209314503e-05, "loss": 2.3534, "mean_token_accuracy": 0.42413793206214906, "step": 44465 }, { "epoch": 0.044790656308512644, "grad_norm": 13.190708742167434, "learning_rate": 4.478979916604556e-05, "loss": 3.0369, "mean_token_accuracy": 0.324137932062149, "step": 44470 }, { "epoch": 0.04479569236161682, "grad_norm": 13.32595141358663, "learning_rate": 4.479483512277663e-05, "loss": 2.4456, "mean_token_accuracy": 0.4172413766384125, "step": 44475 }, { "epoch": 0.044800728414720985, "grad_norm": 15.825311688568338, "learning_rate": 4.479987107950769e-05, "loss": 2.5695, "mean_token_accuracy": 0.4, "step": 44480 }, { "epoch": 0.04480576446782516, "grad_norm": 11.61436552440119, "learning_rate": 4.480490703623875e-05, "loss": 2.3753, "mean_token_accuracy": 0.4482758641242981, "step": 44485 }, { "epoch": 0.04481080052092933, "grad_norm": 13.145620395350704, "learning_rate": 4.4809942992969806e-05, "loss": 2.3909, "mean_token_accuracy": 0.42758620977401735, "step": 44490 }, { "epoch": 0.044815836574033506, "grad_norm": 13.230035323904444, "learning_rate": 4.4814978949700866e-05, "loss": 2.6816, "mean_token_accuracy": 0.3586206823587418, "step": 44495 }, { "epoch": 0.04482087262713768, "grad_norm": 12.534248736435215, "learning_rate": 4.482001490643193e-05, "loss": 2.4213, "mean_token_accuracy": 0.4275862008333206, "step": 44500 }, { "epoch": 0.044825908680241854, "grad_norm": 13.447574471687213, "learning_rate": 4.4825050863162984e-05, "loss": 2.4677, "mean_token_accuracy": 0.3931034505367279, "step": 44505 }, { "epoch": 0.04483094473334603, "grad_norm": 18.968628176314848, "learning_rate": 4.4830086819894043e-05, "loss": 2.9686, "mean_token_accuracy": 0.36206896901130675, "step": 44510 }, { "epoch": 0.044835980786450194, "grad_norm": 16.984272942961983, "learning_rate": 4.48351227766251e-05, "loss": 2.9033, "mean_token_accuracy": 0.33103448152542114, "step": 44515 }, { "epoch": 0.04484101683955437, "grad_norm": 15.346671391467446, "learning_rate": 4.484015873335616e-05, "loss": 2.6786, "mean_token_accuracy": 0.3655172407627106, "step": 44520 }, { "epoch": 0.04484605289265854, "grad_norm": 12.545366494309848, "learning_rate": 4.484519469008722e-05, "loss": 2.9542, "mean_token_accuracy": 0.358620685338974, "step": 44525 }, { "epoch": 0.044851088945762715, "grad_norm": 13.626004997590783, "learning_rate": 4.485023064681829e-05, "loss": 2.5628, "mean_token_accuracy": 0.39473684430122374, "step": 44530 }, { "epoch": 0.04485612499886689, "grad_norm": 13.47694154156956, "learning_rate": 4.4855266603549346e-05, "loss": 2.4378, "mean_token_accuracy": 0.4275862067937851, "step": 44535 }, { "epoch": 0.04486116105197106, "grad_norm": 15.654751327336484, "learning_rate": 4.4860302560280406e-05, "loss": 2.5108, "mean_token_accuracy": 0.4172413766384125, "step": 44540 }, { "epoch": 0.04486619710507524, "grad_norm": 13.82111434068498, "learning_rate": 4.4865338517011465e-05, "loss": 2.4908, "mean_token_accuracy": 0.34137931764125823, "step": 44545 }, { "epoch": 0.044871233158179404, "grad_norm": 12.715819856179923, "learning_rate": 4.4870374473742524e-05, "loss": 2.6238, "mean_token_accuracy": 0.39655172228813174, "step": 44550 }, { "epoch": 0.04487626921128358, "grad_norm": 13.322495642989322, "learning_rate": 4.4875410430473583e-05, "loss": 2.6038, "mean_token_accuracy": 0.4379310369491577, "step": 44555 }, { "epoch": 0.04488130526438775, "grad_norm": 13.986680220984965, "learning_rate": 4.488044638720464e-05, "loss": 2.4617, "mean_token_accuracy": 0.37931033968925476, "step": 44560 }, { "epoch": 0.044886341317491925, "grad_norm": 14.0607427660587, "learning_rate": 4.48854823439357e-05, "loss": 2.4463, "mean_token_accuracy": 0.4103448212146759, "step": 44565 }, { "epoch": 0.0448913773705961, "grad_norm": 13.703988287454036, "learning_rate": 4.489051830066676e-05, "loss": 2.4781, "mean_token_accuracy": 0.40689654350280763, "step": 44570 }, { "epoch": 0.04489641342370027, "grad_norm": 11.035394799873702, "learning_rate": 4.489555425739782e-05, "loss": 2.5606, "mean_token_accuracy": 0.38275861740112305, "step": 44575 }, { "epoch": 0.044901449476804446, "grad_norm": 14.7156874787611, "learning_rate": 4.4900590214128887e-05, "loss": 2.2619, "mean_token_accuracy": 0.39655172228813174, "step": 44580 }, { "epoch": 0.04490648552990861, "grad_norm": 15.624608073456418, "learning_rate": 4.4905626170859946e-05, "loss": 2.4313, "mean_token_accuracy": 0.42758620977401735, "step": 44585 }, { "epoch": 0.04491152158301279, "grad_norm": 21.681843150290984, "learning_rate": 4.4910662127591005e-05, "loss": 2.926, "mean_token_accuracy": 0.3793103456497192, "step": 44590 }, { "epoch": 0.04491655763611696, "grad_norm": 14.843879021398818, "learning_rate": 4.4915698084322064e-05, "loss": 2.8801, "mean_token_accuracy": 0.3310344755649567, "step": 44595 }, { "epoch": 0.044921593689221134, "grad_norm": 17.1733923615299, "learning_rate": 4.492073404105312e-05, "loss": 2.757, "mean_token_accuracy": 0.35995160341262816, "step": 44600 }, { "epoch": 0.04492662974232531, "grad_norm": 13.778073803680591, "learning_rate": 4.492576999778418e-05, "loss": 2.764, "mean_token_accuracy": 0.3827586233615875, "step": 44605 }, { "epoch": 0.04493166579542948, "grad_norm": 12.524776993175706, "learning_rate": 4.493080595451524e-05, "loss": 2.4319, "mean_token_accuracy": 0.4034482717514038, "step": 44610 }, { "epoch": 0.044936701848533656, "grad_norm": 14.821109219181194, "learning_rate": 4.49358419112463e-05, "loss": 2.7008, "mean_token_accuracy": 0.37586206793785093, "step": 44615 }, { "epoch": 0.04494173790163782, "grad_norm": 15.717830315416789, "learning_rate": 4.494087786797736e-05, "loss": 2.4452, "mean_token_accuracy": 0.41724138259887694, "step": 44620 }, { "epoch": 0.044946773954741996, "grad_norm": 14.73696298402566, "learning_rate": 4.494591382470842e-05, "loss": 2.6372, "mean_token_accuracy": 0.35862069129943847, "step": 44625 }, { "epoch": 0.04495181000784617, "grad_norm": 15.84376418463873, "learning_rate": 4.495094978143948e-05, "loss": 2.7592, "mean_token_accuracy": 0.3620689630508423, "step": 44630 }, { "epoch": 0.044956846060950344, "grad_norm": 17.43755312573065, "learning_rate": 4.4955985738170545e-05, "loss": 2.7583, "mean_token_accuracy": 0.4041871964931488, "step": 44635 }, { "epoch": 0.04496188211405452, "grad_norm": 13.475833931571286, "learning_rate": 4.49610216949016e-05, "loss": 2.8714, "mean_token_accuracy": 0.3793103456497192, "step": 44640 }, { "epoch": 0.04496691816715869, "grad_norm": 13.183023464311228, "learning_rate": 4.496605765163266e-05, "loss": 3.1301, "mean_token_accuracy": 0.3482758641242981, "step": 44645 }, { "epoch": 0.044971954220262865, "grad_norm": 14.82176523865926, "learning_rate": 4.4971093608363716e-05, "loss": 2.9793, "mean_token_accuracy": 0.3551724076271057, "step": 44650 }, { "epoch": 0.04497699027336703, "grad_norm": 11.718102289194606, "learning_rate": 4.4976129565094775e-05, "loss": 2.5778, "mean_token_accuracy": 0.4071428537368774, "step": 44655 }, { "epoch": 0.044982026326471206, "grad_norm": 12.904900760854074, "learning_rate": 4.498116552182584e-05, "loss": 2.5993, "mean_token_accuracy": 0.42413793206214906, "step": 44660 }, { "epoch": 0.04498706237957538, "grad_norm": 11.189013646277129, "learning_rate": 4.49862014785569e-05, "loss": 2.3111, "mean_token_accuracy": 0.4482758641242981, "step": 44665 }, { "epoch": 0.04499209843267955, "grad_norm": 12.364641210207758, "learning_rate": 4.499123743528796e-05, "loss": 2.4811, "mean_token_accuracy": 0.4068965494632721, "step": 44670 }, { "epoch": 0.04499713448578373, "grad_norm": 12.248789589564423, "learning_rate": 4.499627339201902e-05, "loss": 2.4579, "mean_token_accuracy": 0.39310343861579894, "step": 44675 }, { "epoch": 0.0450021705388879, "grad_norm": 31.431256882326537, "learning_rate": 4.500130934875008e-05, "loss": 2.8885, "mean_token_accuracy": 0.37586206793785093, "step": 44680 }, { "epoch": 0.045007206591992074, "grad_norm": 10.834743652664642, "learning_rate": 4.500634530548114e-05, "loss": 2.2241, "mean_token_accuracy": 0.4517241418361664, "step": 44685 }, { "epoch": 0.04501224264509624, "grad_norm": 11.518130140139382, "learning_rate": 4.50113812622122e-05, "loss": 2.4581, "mean_token_accuracy": 0.44137930274009707, "step": 44690 }, { "epoch": 0.045017278698200415, "grad_norm": 12.13692751446189, "learning_rate": 4.5016417218943256e-05, "loss": 2.5273, "mean_token_accuracy": 0.38620689511299133, "step": 44695 }, { "epoch": 0.04502231475130459, "grad_norm": 12.572883118270255, "learning_rate": 4.5021453175674315e-05, "loss": 2.7817, "mean_token_accuracy": 0.34137930572032926, "step": 44700 }, { "epoch": 0.04502735080440876, "grad_norm": 12.618903642930425, "learning_rate": 4.5026489132405375e-05, "loss": 2.6376, "mean_token_accuracy": 0.4206896543502808, "step": 44705 }, { "epoch": 0.045032386857512936, "grad_norm": 12.640048671297029, "learning_rate": 4.5031525089136434e-05, "loss": 2.6966, "mean_token_accuracy": 0.34827586114406583, "step": 44710 }, { "epoch": 0.04503742291061711, "grad_norm": 26.540501301522333, "learning_rate": 4.50365610458675e-05, "loss": 2.7657, "mean_token_accuracy": 0.40556563436985016, "step": 44715 }, { "epoch": 0.045042458963721284, "grad_norm": 12.011423733051945, "learning_rate": 4.504159700259856e-05, "loss": 2.4184, "mean_token_accuracy": 0.4172413766384125, "step": 44720 }, { "epoch": 0.04504749501682545, "grad_norm": 15.207243542723162, "learning_rate": 4.504663295932962e-05, "loss": 2.5776, "mean_token_accuracy": 0.3931034505367279, "step": 44725 }, { "epoch": 0.045052531069929624, "grad_norm": 13.995078934737471, "learning_rate": 4.505166891606067e-05, "loss": 2.3076, "mean_token_accuracy": 0.41034482717514037, "step": 44730 }, { "epoch": 0.0450575671230338, "grad_norm": 11.286122428997395, "learning_rate": 4.505670487279173e-05, "loss": 2.5061, "mean_token_accuracy": 0.4034482717514038, "step": 44735 }, { "epoch": 0.04506260317613797, "grad_norm": 14.512346379128926, "learning_rate": 4.5061740829522796e-05, "loss": 2.3961, "mean_token_accuracy": 0.46400484442710876, "step": 44740 }, { "epoch": 0.045067639229242146, "grad_norm": 11.71884258864979, "learning_rate": 4.5066776786253855e-05, "loss": 2.5092, "mean_token_accuracy": 0.43793103098869324, "step": 44745 }, { "epoch": 0.04507267528234632, "grad_norm": 11.66576226404458, "learning_rate": 4.5071812742984915e-05, "loss": 2.6895, "mean_token_accuracy": 0.3655172407627106, "step": 44750 }, { "epoch": 0.04507771133545049, "grad_norm": 12.579058538080561, "learning_rate": 4.5076848699715974e-05, "loss": 2.5134, "mean_token_accuracy": 0.362068971991539, "step": 44755 }, { "epoch": 0.04508274738855466, "grad_norm": 14.046241368844642, "learning_rate": 4.508188465644703e-05, "loss": 2.9575, "mean_token_accuracy": 0.3206896483898163, "step": 44760 }, { "epoch": 0.045087783441658834, "grad_norm": 24.522396476210343, "learning_rate": 4.50869206131781e-05, "loss": 2.5308, "mean_token_accuracy": 0.4068965554237366, "step": 44765 }, { "epoch": 0.04509281949476301, "grad_norm": 14.144320149720668, "learning_rate": 4.509195656990915e-05, "loss": 3.056, "mean_token_accuracy": 0.29605911374092103, "step": 44770 }, { "epoch": 0.04509785554786718, "grad_norm": 10.920331475097846, "learning_rate": 4.509699252664021e-05, "loss": 2.5061, "mean_token_accuracy": 0.4068965494632721, "step": 44775 }, { "epoch": 0.045102891600971355, "grad_norm": 11.72524101044772, "learning_rate": 4.510202848337127e-05, "loss": 2.5745, "mean_token_accuracy": 0.4310344815254211, "step": 44780 }, { "epoch": 0.04510792765407553, "grad_norm": 12.320080506480053, "learning_rate": 4.510706444010233e-05, "loss": 2.8897, "mean_token_accuracy": 0.36896551251411436, "step": 44785 }, { "epoch": 0.0451129637071797, "grad_norm": 15.069534790653307, "learning_rate": 4.511210039683339e-05, "loss": 2.4126, "mean_token_accuracy": 0.4296430706977844, "step": 44790 }, { "epoch": 0.04511799976028387, "grad_norm": 12.988498484701191, "learning_rate": 4.5117136353564455e-05, "loss": 2.3502, "mean_token_accuracy": 0.4551724135875702, "step": 44795 }, { "epoch": 0.04512303581338804, "grad_norm": 15.257683808187931, "learning_rate": 4.5122172310295514e-05, "loss": 2.5055, "mean_token_accuracy": 0.4103448331356049, "step": 44800 }, { "epoch": 0.04512807186649222, "grad_norm": 11.028958388596825, "learning_rate": 4.512720826702657e-05, "loss": 2.6047, "mean_token_accuracy": 0.3793103456497192, "step": 44805 }, { "epoch": 0.04513310791959639, "grad_norm": 12.38096647608136, "learning_rate": 4.513224422375763e-05, "loss": 2.7929, "mean_token_accuracy": 0.38777979016304015, "step": 44810 }, { "epoch": 0.045138143972700565, "grad_norm": 13.084057645359465, "learning_rate": 4.513728018048869e-05, "loss": 2.7383, "mean_token_accuracy": 0.3965517282485962, "step": 44815 }, { "epoch": 0.04514318002580474, "grad_norm": 17.363644535670936, "learning_rate": 4.514231613721975e-05, "loss": 2.6152, "mean_token_accuracy": 0.36896551251411436, "step": 44820 }, { "epoch": 0.04514821607890891, "grad_norm": 14.35910716313129, "learning_rate": 4.514735209395081e-05, "loss": 2.7302, "mean_token_accuracy": 0.42758620381355283, "step": 44825 }, { "epoch": 0.04515325213201308, "grad_norm": 12.787321011253866, "learning_rate": 4.515238805068187e-05, "loss": 2.5357, "mean_token_accuracy": 0.3896551728248596, "step": 44830 }, { "epoch": 0.04515828818511725, "grad_norm": 20.71823753765038, "learning_rate": 4.515742400741293e-05, "loss": 2.5943, "mean_token_accuracy": 0.37241379618644715, "step": 44835 }, { "epoch": 0.045163324238221426, "grad_norm": 13.894358983274286, "learning_rate": 4.516245996414399e-05, "loss": 2.5057, "mean_token_accuracy": 0.43992740511894224, "step": 44840 }, { "epoch": 0.0451683602913256, "grad_norm": 14.753950013805605, "learning_rate": 4.5167495920875054e-05, "loss": 2.6425, "mean_token_accuracy": 0.3724137932062149, "step": 44845 }, { "epoch": 0.045173396344429774, "grad_norm": 17.313524455228322, "learning_rate": 4.517253187760611e-05, "loss": 2.9129, "mean_token_accuracy": 0.3827586233615875, "step": 44850 }, { "epoch": 0.04517843239753395, "grad_norm": 13.52798270682511, "learning_rate": 4.517756783433717e-05, "loss": 2.8442, "mean_token_accuracy": 0.34137930274009703, "step": 44855 }, { "epoch": 0.04518346845063812, "grad_norm": 13.920348650831679, "learning_rate": 4.518260379106823e-05, "loss": 2.3065, "mean_token_accuracy": 0.38275861740112305, "step": 44860 }, { "epoch": 0.04518850450374229, "grad_norm": 11.589635436367812, "learning_rate": 4.5187639747799284e-05, "loss": 2.7843, "mean_token_accuracy": 0.3882032632827759, "step": 44865 }, { "epoch": 0.04519354055684646, "grad_norm": 12.560111397875085, "learning_rate": 4.5192675704530344e-05, "loss": 2.2077, "mean_token_accuracy": 0.44482759237289426, "step": 44870 }, { "epoch": 0.045198576609950636, "grad_norm": 16.236942271857647, "learning_rate": 4.519771166126141e-05, "loss": 2.8092, "mean_token_accuracy": 0.39461584985256193, "step": 44875 }, { "epoch": 0.04520361266305481, "grad_norm": 13.976763920840503, "learning_rate": 4.520274761799247e-05, "loss": 2.7381, "mean_token_accuracy": 0.37586207389831544, "step": 44880 }, { "epoch": 0.04520864871615898, "grad_norm": 15.358232230559201, "learning_rate": 4.520778357472353e-05, "loss": 2.7635, "mean_token_accuracy": 0.3896551728248596, "step": 44885 }, { "epoch": 0.04521368476926316, "grad_norm": 14.11073286749314, "learning_rate": 4.521281953145459e-05, "loss": 2.4965, "mean_token_accuracy": 0.39310343861579894, "step": 44890 }, { "epoch": 0.04521872082236733, "grad_norm": 17.809997309013614, "learning_rate": 4.5217855488185647e-05, "loss": 2.6129, "mean_token_accuracy": 0.4103448212146759, "step": 44895 }, { "epoch": 0.0452237568754715, "grad_norm": 14.196946351947064, "learning_rate": 4.522289144491671e-05, "loss": 2.8313, "mean_token_accuracy": 0.3931034475564957, "step": 44900 }, { "epoch": 0.04522879292857567, "grad_norm": 12.012069827578587, "learning_rate": 4.5227927401647765e-05, "loss": 2.3098, "mean_token_accuracy": 0.44827585220336913, "step": 44905 }, { "epoch": 0.045233828981679845, "grad_norm": 14.604945470332536, "learning_rate": 4.5232963358378824e-05, "loss": 2.4493, "mean_token_accuracy": 0.4206896543502808, "step": 44910 }, { "epoch": 0.04523886503478402, "grad_norm": 13.102857646316075, "learning_rate": 4.5237999315109884e-05, "loss": 2.9991, "mean_token_accuracy": 0.3689655214548111, "step": 44915 }, { "epoch": 0.04524390108788819, "grad_norm": 15.45215553415538, "learning_rate": 4.524303527184094e-05, "loss": 2.4055, "mean_token_accuracy": 0.41379310488700866, "step": 44920 }, { "epoch": 0.04524893714099237, "grad_norm": 10.060139766925662, "learning_rate": 4.524807122857201e-05, "loss": 2.1536, "mean_token_accuracy": 0.4799757957458496, "step": 44925 }, { "epoch": 0.04525397319409654, "grad_norm": 14.271780639408592, "learning_rate": 4.525310718530307e-05, "loss": 2.7418, "mean_token_accuracy": 0.36896551251411436, "step": 44930 }, { "epoch": 0.04525900924720071, "grad_norm": 11.913261194249856, "learning_rate": 4.525814314203413e-05, "loss": 2.737, "mean_token_accuracy": 0.4103448331356049, "step": 44935 }, { "epoch": 0.04526404530030488, "grad_norm": 19.468992862647276, "learning_rate": 4.526317909876519e-05, "loss": 3.4134, "mean_token_accuracy": 0.341379314661026, "step": 44940 }, { "epoch": 0.045269081353409055, "grad_norm": 15.247338262604975, "learning_rate": 4.5268215055496246e-05, "loss": 3.1125, "mean_token_accuracy": 0.3034482687711716, "step": 44945 }, { "epoch": 0.04527411740651323, "grad_norm": 14.71080179980999, "learning_rate": 4.5273251012227305e-05, "loss": 2.7258, "mean_token_accuracy": 0.3517241358757019, "step": 44950 }, { "epoch": 0.0452791534596174, "grad_norm": 11.48049556491395, "learning_rate": 4.5278286968958364e-05, "loss": 2.2934, "mean_token_accuracy": 0.46551724076271056, "step": 44955 }, { "epoch": 0.045284189512721576, "grad_norm": 15.560301098530603, "learning_rate": 4.5283322925689424e-05, "loss": 2.5009, "mean_token_accuracy": 0.3978826284408569, "step": 44960 }, { "epoch": 0.04528922556582575, "grad_norm": 16.093610703512436, "learning_rate": 4.528835888242048e-05, "loss": 2.4346, "mean_token_accuracy": 0.43103448748588563, "step": 44965 }, { "epoch": 0.04529426161892992, "grad_norm": 12.474781193179483, "learning_rate": 4.529339483915154e-05, "loss": 2.8864, "mean_token_accuracy": 0.37586206793785093, "step": 44970 }, { "epoch": 0.04529929767203409, "grad_norm": 11.21095300596419, "learning_rate": 4.52984307958826e-05, "loss": 2.764, "mean_token_accuracy": 0.379310342669487, "step": 44975 }, { "epoch": 0.045304333725138264, "grad_norm": 12.921260807828796, "learning_rate": 4.530346675261367e-05, "loss": 2.5652, "mean_token_accuracy": 0.4, "step": 44980 }, { "epoch": 0.04530936977824244, "grad_norm": 16.150416597561772, "learning_rate": 4.530850270934473e-05, "loss": 2.6838, "mean_token_accuracy": 0.3896551728248596, "step": 44985 }, { "epoch": 0.04531440583134661, "grad_norm": 16.438275675735824, "learning_rate": 4.5313538666075786e-05, "loss": 2.5359, "mean_token_accuracy": 0.4482758641242981, "step": 44990 }, { "epoch": 0.045319441884450785, "grad_norm": 13.23974283340723, "learning_rate": 4.5318574622806845e-05, "loss": 2.7044, "mean_token_accuracy": 0.4191167652606964, "step": 44995 }, { "epoch": 0.04532447793755496, "grad_norm": 14.278162896508482, "learning_rate": 4.53236105795379e-05, "loss": 2.5955, "mean_token_accuracy": 0.39310344457626345, "step": 45000 }, { "epoch": 0.045329513990659126, "grad_norm": 13.279330456357624, "learning_rate": 4.5328646536268964e-05, "loss": 2.7701, "mean_token_accuracy": 0.36896551847457887, "step": 45005 }, { "epoch": 0.0453345500437633, "grad_norm": 12.227205842960329, "learning_rate": 4.533368249300002e-05, "loss": 2.305, "mean_token_accuracy": 0.43793103098869324, "step": 45010 }, { "epoch": 0.045339586096867474, "grad_norm": 15.823759611160018, "learning_rate": 4.533871844973108e-05, "loss": 2.3793, "mean_token_accuracy": 0.40689654350280763, "step": 45015 }, { "epoch": 0.04534462214997165, "grad_norm": 15.277685693081558, "learning_rate": 4.534375440646214e-05, "loss": 2.719, "mean_token_accuracy": 0.3551724135875702, "step": 45020 }, { "epoch": 0.04534965820307582, "grad_norm": 10.948883206537369, "learning_rate": 4.53487903631932e-05, "loss": 2.0965, "mean_token_accuracy": 0.4879007875919342, "step": 45025 }, { "epoch": 0.045354694256179995, "grad_norm": 13.032313888006943, "learning_rate": 4.535382631992427e-05, "loss": 2.8642, "mean_token_accuracy": 0.3448275804519653, "step": 45030 }, { "epoch": 0.04535973030928417, "grad_norm": 12.41164447401999, "learning_rate": 4.5358862276655326e-05, "loss": 2.4806, "mean_token_accuracy": 0.4310344815254211, "step": 45035 }, { "epoch": 0.045364766362388335, "grad_norm": 14.028944506771541, "learning_rate": 4.536389823338638e-05, "loss": 2.7482, "mean_token_accuracy": 0.3807622462511063, "step": 45040 }, { "epoch": 0.04536980241549251, "grad_norm": 9.989692737163047, "learning_rate": 4.536893419011744e-05, "loss": 2.7546, "mean_token_accuracy": 0.41246218085289, "step": 45045 }, { "epoch": 0.04537483846859668, "grad_norm": 11.981570854103037, "learning_rate": 4.53739701468485e-05, "loss": 2.5018, "mean_token_accuracy": 0.42262552976608275, "step": 45050 }, { "epoch": 0.04537987452170086, "grad_norm": 20.789060529480373, "learning_rate": 4.5379006103579556e-05, "loss": 2.7282, "mean_token_accuracy": 0.33448275923728943, "step": 45055 }, { "epoch": 0.04538491057480503, "grad_norm": 13.134537467113612, "learning_rate": 4.538404206031062e-05, "loss": 2.6924, "mean_token_accuracy": 0.4068965494632721, "step": 45060 }, { "epoch": 0.045389946627909204, "grad_norm": 19.99016207285458, "learning_rate": 4.538907801704168e-05, "loss": 2.6623, "mean_token_accuracy": 0.36206896901130675, "step": 45065 }, { "epoch": 0.04539498268101338, "grad_norm": 15.687119097866171, "learning_rate": 4.539411397377274e-05, "loss": 2.4346, "mean_token_accuracy": 0.4172413766384125, "step": 45070 }, { "epoch": 0.045400018734117545, "grad_norm": 10.565059484840424, "learning_rate": 4.53991499305038e-05, "loss": 2.4925, "mean_token_accuracy": 0.39310344457626345, "step": 45075 }, { "epoch": 0.04540505478722172, "grad_norm": 12.40190254307408, "learning_rate": 4.540418588723486e-05, "loss": 2.1745, "mean_token_accuracy": 0.4517241418361664, "step": 45080 }, { "epoch": 0.04541009084032589, "grad_norm": 13.345929675396324, "learning_rate": 4.540922184396592e-05, "loss": 2.4919, "mean_token_accuracy": 0.4137930989265442, "step": 45085 }, { "epoch": 0.045415126893430066, "grad_norm": 14.372914290071753, "learning_rate": 4.541425780069698e-05, "loss": 2.8893, "mean_token_accuracy": 0.38965516686439516, "step": 45090 }, { "epoch": 0.04542016294653424, "grad_norm": 12.979539390233766, "learning_rate": 4.541929375742804e-05, "loss": 2.8954, "mean_token_accuracy": 0.3793103486299515, "step": 45095 }, { "epoch": 0.045425198999638414, "grad_norm": 12.699002533074161, "learning_rate": 4.5424329714159096e-05, "loss": 2.5109, "mean_token_accuracy": 0.4137930989265442, "step": 45100 }, { "epoch": 0.04543023505274259, "grad_norm": 12.721114195962278, "learning_rate": 4.5429365670890156e-05, "loss": 2.7352, "mean_token_accuracy": 0.3896551728248596, "step": 45105 }, { "epoch": 0.045435271105846754, "grad_norm": 14.015904415423396, "learning_rate": 4.543440162762122e-05, "loss": 2.6036, "mean_token_accuracy": 0.3551724135875702, "step": 45110 }, { "epoch": 0.04544030715895093, "grad_norm": 13.133000410533114, "learning_rate": 4.543943758435228e-05, "loss": 2.6331, "mean_token_accuracy": 0.373986679315567, "step": 45115 }, { "epoch": 0.0454453432120551, "grad_norm": 13.387916214373474, "learning_rate": 4.544447354108334e-05, "loss": 2.5296, "mean_token_accuracy": 0.36896551549434664, "step": 45120 }, { "epoch": 0.045450379265159276, "grad_norm": 12.172324705801335, "learning_rate": 4.54495094978144e-05, "loss": 2.5096, "mean_token_accuracy": 0.39655172228813174, "step": 45125 }, { "epoch": 0.04545541531826345, "grad_norm": 11.474715654145777, "learning_rate": 4.545454545454546e-05, "loss": 2.5683, "mean_token_accuracy": 0.4068965494632721, "step": 45130 }, { "epoch": 0.04546045137136762, "grad_norm": 15.418982788824612, "learning_rate": 4.545958141127651e-05, "loss": 2.4908, "mean_token_accuracy": 0.4034482717514038, "step": 45135 }, { "epoch": 0.0454654874244718, "grad_norm": 13.025867252525092, "learning_rate": 4.546461736800758e-05, "loss": 2.5986, "mean_token_accuracy": 0.3704779237508774, "step": 45140 }, { "epoch": 0.045470523477575964, "grad_norm": 13.793031477045323, "learning_rate": 4.5469653324738636e-05, "loss": 2.7253, "mean_token_accuracy": 0.41034482717514037, "step": 45145 }, { "epoch": 0.04547555953068014, "grad_norm": 11.780683624012756, "learning_rate": 4.5474689281469696e-05, "loss": 2.469, "mean_token_accuracy": 0.4310344815254211, "step": 45150 }, { "epoch": 0.04548059558378431, "grad_norm": 10.843711578316588, "learning_rate": 4.5479725238200755e-05, "loss": 2.3262, "mean_token_accuracy": 0.46454930305480957, "step": 45155 }, { "epoch": 0.045485631636888485, "grad_norm": 12.729036297927463, "learning_rate": 4.5484761194931814e-05, "loss": 2.3739, "mean_token_accuracy": 0.4103448212146759, "step": 45160 }, { "epoch": 0.04549066768999266, "grad_norm": 14.583126762727854, "learning_rate": 4.548979715166288e-05, "loss": 2.5018, "mean_token_accuracy": 0.3931034505367279, "step": 45165 }, { "epoch": 0.04549570374309683, "grad_norm": 12.058081638374917, "learning_rate": 4.549483310839394e-05, "loss": 2.4108, "mean_token_accuracy": 0.4206896543502808, "step": 45170 }, { "epoch": 0.045500739796201006, "grad_norm": 30.518044025478805, "learning_rate": 4.549986906512499e-05, "loss": 3.3733, "mean_token_accuracy": 0.3241379290819168, "step": 45175 }, { "epoch": 0.04550577584930517, "grad_norm": 13.809847290105825, "learning_rate": 4.550490502185605e-05, "loss": 2.8638, "mean_token_accuracy": 0.37586206793785093, "step": 45180 }, { "epoch": 0.04551081190240935, "grad_norm": 10.964380395555134, "learning_rate": 4.550994097858711e-05, "loss": 2.83, "mean_token_accuracy": 0.4068965494632721, "step": 45185 }, { "epoch": 0.04551584795551352, "grad_norm": 18.611028361718628, "learning_rate": 4.5514976935318176e-05, "loss": 2.6292, "mean_token_accuracy": 0.43623714447021483, "step": 45190 }, { "epoch": 0.045520884008617694, "grad_norm": 13.053846072940907, "learning_rate": 4.5520012892049236e-05, "loss": 2.3459, "mean_token_accuracy": 0.43103448748588563, "step": 45195 }, { "epoch": 0.04552592006172187, "grad_norm": 11.836688385513353, "learning_rate": 4.5525048848780295e-05, "loss": 2.7525, "mean_token_accuracy": 0.4068965554237366, "step": 45200 }, { "epoch": 0.04553095611482604, "grad_norm": 12.278971542999273, "learning_rate": 4.5530084805511354e-05, "loss": 2.3394, "mean_token_accuracy": 0.44827585816383364, "step": 45205 }, { "epoch": 0.045535992167930216, "grad_norm": 14.22381548008495, "learning_rate": 4.5535120762242413e-05, "loss": 2.9581, "mean_token_accuracy": 0.3379310369491577, "step": 45210 }, { "epoch": 0.04554102822103438, "grad_norm": 14.57102630617339, "learning_rate": 4.554015671897347e-05, "loss": 2.4632, "mean_token_accuracy": 0.43448275327682495, "step": 45215 }, { "epoch": 0.045546064274138556, "grad_norm": 12.507942353808424, "learning_rate": 4.554519267570453e-05, "loss": 2.3297, "mean_token_accuracy": 0.39310344457626345, "step": 45220 }, { "epoch": 0.04555110032724273, "grad_norm": 14.5910206215796, "learning_rate": 4.555022863243559e-05, "loss": 2.4346, "mean_token_accuracy": 0.4103448331356049, "step": 45225 }, { "epoch": 0.045556136380346904, "grad_norm": 11.991073843444786, "learning_rate": 4.555526458916665e-05, "loss": 2.555, "mean_token_accuracy": 0.4379310369491577, "step": 45230 }, { "epoch": 0.04556117243345108, "grad_norm": 15.996963708223808, "learning_rate": 4.556030054589771e-05, "loss": 2.8072, "mean_token_accuracy": 0.3517241388559341, "step": 45235 }, { "epoch": 0.04556620848655525, "grad_norm": 10.909285878807983, "learning_rate": 4.556533650262877e-05, "loss": 2.4091, "mean_token_accuracy": 0.4413793087005615, "step": 45240 }, { "epoch": 0.045571244539659425, "grad_norm": 10.625567947163363, "learning_rate": 4.5570372459359835e-05, "loss": 2.6194, "mean_token_accuracy": 0.4137930989265442, "step": 45245 }, { "epoch": 0.04557628059276359, "grad_norm": 9.362659009210685, "learning_rate": 4.5575408416090894e-05, "loss": 2.662, "mean_token_accuracy": 0.41379310488700866, "step": 45250 }, { "epoch": 0.045581316645867766, "grad_norm": 12.567007236797476, "learning_rate": 4.5580444372821953e-05, "loss": 2.824, "mean_token_accuracy": 0.4000000059604645, "step": 45255 }, { "epoch": 0.04558635269897194, "grad_norm": 16.84349586642506, "learning_rate": 4.558548032955301e-05, "loss": 2.6391, "mean_token_accuracy": 0.39655171930789945, "step": 45260 }, { "epoch": 0.04559138875207611, "grad_norm": 18.160329500038696, "learning_rate": 4.5590516286284065e-05, "loss": 2.916, "mean_token_accuracy": 0.39999998807907106, "step": 45265 }, { "epoch": 0.04559642480518029, "grad_norm": 11.510593555131928, "learning_rate": 4.559555224301513e-05, "loss": 2.5816, "mean_token_accuracy": 0.42413792610168455, "step": 45270 }, { "epoch": 0.04560146085828446, "grad_norm": 11.64209086899516, "learning_rate": 4.560058819974619e-05, "loss": 2.5579, "mean_token_accuracy": 0.39310344457626345, "step": 45275 }, { "epoch": 0.045606496911388635, "grad_norm": 12.3352603329827, "learning_rate": 4.560562415647725e-05, "loss": 2.8205, "mean_token_accuracy": 0.3946763455867767, "step": 45280 }, { "epoch": 0.0456115329644928, "grad_norm": 13.91472397634847, "learning_rate": 4.561066011320831e-05, "loss": 2.8784, "mean_token_accuracy": 0.3620689570903778, "step": 45285 }, { "epoch": 0.045616569017596975, "grad_norm": 12.334802760260308, "learning_rate": 4.561569606993937e-05, "loss": 2.3678, "mean_token_accuracy": 0.42068966031074523, "step": 45290 }, { "epoch": 0.04562160507070115, "grad_norm": 16.51528036705448, "learning_rate": 4.562073202667043e-05, "loss": 2.5628, "mean_token_accuracy": 0.3931034505367279, "step": 45295 }, { "epoch": 0.04562664112380532, "grad_norm": 11.096902800062322, "learning_rate": 4.5625767983401493e-05, "loss": 2.7286, "mean_token_accuracy": 0.38965516686439516, "step": 45300 }, { "epoch": 0.045631677176909496, "grad_norm": 14.76987054674303, "learning_rate": 4.5630803940132546e-05, "loss": 2.817, "mean_token_accuracy": 0.33448275923728943, "step": 45305 }, { "epoch": 0.04563671323001367, "grad_norm": 14.965456859411088, "learning_rate": 4.5635839896863605e-05, "loss": 3.4042, "mean_token_accuracy": 0.31724137961864474, "step": 45310 }, { "epoch": 0.045641749283117844, "grad_norm": 13.525317413286624, "learning_rate": 4.5640875853594664e-05, "loss": 2.8693, "mean_token_accuracy": 0.34827586114406583, "step": 45315 }, { "epoch": 0.04564678533622201, "grad_norm": 14.514024973758806, "learning_rate": 4.5645911810325724e-05, "loss": 2.601, "mean_token_accuracy": 0.3896551728248596, "step": 45320 }, { "epoch": 0.045651821389326185, "grad_norm": 12.470327250810271, "learning_rate": 4.565094776705679e-05, "loss": 2.8991, "mean_token_accuracy": 0.3620689630508423, "step": 45325 }, { "epoch": 0.04565685744243036, "grad_norm": 14.101465357793328, "learning_rate": 4.565598372378785e-05, "loss": 2.4769, "mean_token_accuracy": 0.4034482717514038, "step": 45330 }, { "epoch": 0.04566189349553453, "grad_norm": 16.096599093008184, "learning_rate": 4.566101968051891e-05, "loss": 2.5989, "mean_token_accuracy": 0.38620689511299133, "step": 45335 }, { "epoch": 0.045666929548638706, "grad_norm": 11.885569712547452, "learning_rate": 4.566605563724997e-05, "loss": 2.482, "mean_token_accuracy": 0.3896551728248596, "step": 45340 }, { "epoch": 0.04567196560174288, "grad_norm": 14.410809487306187, "learning_rate": 4.567109159398103e-05, "loss": 2.3762, "mean_token_accuracy": 0.45517241954803467, "step": 45345 }, { "epoch": 0.04567700165484705, "grad_norm": 11.952594174349422, "learning_rate": 4.5676127550712086e-05, "loss": 2.6135, "mean_token_accuracy": 0.4, "step": 45350 }, { "epoch": 0.04568203770795122, "grad_norm": 12.812536415361809, "learning_rate": 4.5681163507443145e-05, "loss": 2.5453, "mean_token_accuracy": 0.41379310488700866, "step": 45355 }, { "epoch": 0.045687073761055394, "grad_norm": 13.202700090104042, "learning_rate": 4.5686199464174205e-05, "loss": 2.376, "mean_token_accuracy": 0.4068965494632721, "step": 45360 }, { "epoch": 0.04569210981415957, "grad_norm": 11.165970616648178, "learning_rate": 4.5691235420905264e-05, "loss": 2.6221, "mean_token_accuracy": 0.4034482717514038, "step": 45365 }, { "epoch": 0.04569714586726374, "grad_norm": 23.302681403952413, "learning_rate": 4.569627137763632e-05, "loss": 2.382, "mean_token_accuracy": 0.41724138259887694, "step": 45370 }, { "epoch": 0.045702181920367915, "grad_norm": 11.15379949212112, "learning_rate": 4.570130733436738e-05, "loss": 2.2873, "mean_token_accuracy": 0.4571687877178192, "step": 45375 }, { "epoch": 0.04570721797347209, "grad_norm": 12.152728765855322, "learning_rate": 4.570634329109845e-05, "loss": 2.3114, "mean_token_accuracy": 0.38965516686439516, "step": 45380 }, { "epoch": 0.04571225402657626, "grad_norm": 15.29930951775352, "learning_rate": 4.571137924782951e-05, "loss": 2.4733, "mean_token_accuracy": 0.39655172228813174, "step": 45385 }, { "epoch": 0.04571729007968043, "grad_norm": 9.648632861139788, "learning_rate": 4.571641520456057e-05, "loss": 2.3832, "mean_token_accuracy": 0.44827587008476255, "step": 45390 }, { "epoch": 0.0457223261327846, "grad_norm": 16.86753401078081, "learning_rate": 4.5721451161291626e-05, "loss": 2.5552, "mean_token_accuracy": 0.3586206793785095, "step": 45395 }, { "epoch": 0.04572736218588878, "grad_norm": 20.935801595693228, "learning_rate": 4.572648711802268e-05, "loss": 3.0217, "mean_token_accuracy": 0.3482758641242981, "step": 45400 }, { "epoch": 0.04573239823899295, "grad_norm": 12.513682665244652, "learning_rate": 4.5731523074753745e-05, "loss": 2.4091, "mean_token_accuracy": 0.422202056646347, "step": 45405 }, { "epoch": 0.045737434292097125, "grad_norm": 17.436336117296158, "learning_rate": 4.5736559031484804e-05, "loss": 3.1206, "mean_token_accuracy": 0.34137930870056155, "step": 45410 }, { "epoch": 0.0457424703452013, "grad_norm": 15.352637817630113, "learning_rate": 4.574159498821586e-05, "loss": 3.086, "mean_token_accuracy": 0.3310344755649567, "step": 45415 }, { "epoch": 0.04574750639830547, "grad_norm": 19.698937123809284, "learning_rate": 4.574663094494692e-05, "loss": 2.5823, "mean_token_accuracy": 0.4068965494632721, "step": 45420 }, { "epoch": 0.04575254245140964, "grad_norm": 15.238568703625944, "learning_rate": 4.575166690167798e-05, "loss": 2.6403, "mean_token_accuracy": 0.3620689630508423, "step": 45425 }, { "epoch": 0.04575757850451381, "grad_norm": 13.403932793058017, "learning_rate": 4.575670285840905e-05, "loss": 2.4071, "mean_token_accuracy": 0.4103448331356049, "step": 45430 }, { "epoch": 0.04576261455761799, "grad_norm": 14.554827372958975, "learning_rate": 4.576173881514011e-05, "loss": 2.6868, "mean_token_accuracy": 0.3999999940395355, "step": 45435 }, { "epoch": 0.04576765061072216, "grad_norm": 11.762121879511273, "learning_rate": 4.576677477187116e-05, "loss": 2.7936, "mean_token_accuracy": 0.4068965494632721, "step": 45440 }, { "epoch": 0.045772686663826334, "grad_norm": 11.798432969573954, "learning_rate": 4.577181072860222e-05, "loss": 2.46, "mean_token_accuracy": 0.4275862157344818, "step": 45445 }, { "epoch": 0.04577772271693051, "grad_norm": 15.66176989713497, "learning_rate": 4.577684668533328e-05, "loss": 2.8727, "mean_token_accuracy": 0.3448275804519653, "step": 45450 }, { "epoch": 0.04578275877003468, "grad_norm": 12.09495091439029, "learning_rate": 4.5781882642064344e-05, "loss": 2.6136, "mean_token_accuracy": 0.3793103456497192, "step": 45455 }, { "epoch": 0.04578779482313885, "grad_norm": 14.498910632951626, "learning_rate": 4.57869185987954e-05, "loss": 2.8005, "mean_token_accuracy": 0.33103448152542114, "step": 45460 }, { "epoch": 0.04579283087624302, "grad_norm": 11.130412154352795, "learning_rate": 4.579195455552646e-05, "loss": 2.2109, "mean_token_accuracy": 0.482758617401123, "step": 45465 }, { "epoch": 0.045797866929347196, "grad_norm": 14.015365356305098, "learning_rate": 4.579699051225752e-05, "loss": 2.8315, "mean_token_accuracy": 0.3931034505367279, "step": 45470 }, { "epoch": 0.04580290298245137, "grad_norm": 12.36931362450068, "learning_rate": 4.580202646898858e-05, "loss": 2.5634, "mean_token_accuracy": 0.3931034505367279, "step": 45475 }, { "epoch": 0.045807939035555544, "grad_norm": 13.486536390372022, "learning_rate": 4.580706242571964e-05, "loss": 2.8581, "mean_token_accuracy": 0.3344827651977539, "step": 45480 }, { "epoch": 0.04581297508865972, "grad_norm": 13.150043921388892, "learning_rate": 4.58120983824507e-05, "loss": 2.5689, "mean_token_accuracy": 0.3793103456497192, "step": 45485 }, { "epoch": 0.04581801114176389, "grad_norm": 12.963822154742406, "learning_rate": 4.581713433918176e-05, "loss": 2.4885, "mean_token_accuracy": 0.4310344815254211, "step": 45490 }, { "epoch": 0.04582304719486806, "grad_norm": 11.205178569388478, "learning_rate": 4.582217029591282e-05, "loss": 2.4859, "mean_token_accuracy": 0.45517241954803467, "step": 45495 }, { "epoch": 0.04582808324797223, "grad_norm": 12.209173521188307, "learning_rate": 4.582720625264388e-05, "loss": 3.1721, "mean_token_accuracy": 0.3724137872457504, "step": 45500 }, { "epoch": 0.045833119301076405, "grad_norm": 12.98699211196934, "learning_rate": 4.5832242209374936e-05, "loss": 2.4983, "mean_token_accuracy": 0.3965517282485962, "step": 45505 }, { "epoch": 0.04583815535418058, "grad_norm": 11.009061285521462, "learning_rate": 4.5837278166106e-05, "loss": 2.1259, "mean_token_accuracy": 0.4551724076271057, "step": 45510 }, { "epoch": 0.04584319140728475, "grad_norm": 11.623508660644774, "learning_rate": 4.584231412283706e-05, "loss": 2.4234, "mean_token_accuracy": 0.3724137932062149, "step": 45515 }, { "epoch": 0.04584822746038893, "grad_norm": 17.628579591246982, "learning_rate": 4.584735007956812e-05, "loss": 2.814, "mean_token_accuracy": 0.4034482717514038, "step": 45520 }, { "epoch": 0.0458532635134931, "grad_norm": 13.992630195178101, "learning_rate": 4.585238603629918e-05, "loss": 2.5112, "mean_token_accuracy": 0.43793103098869324, "step": 45525 }, { "epoch": 0.04585829956659727, "grad_norm": 11.968142943325905, "learning_rate": 4.585742199303024e-05, "loss": 2.3557, "mean_token_accuracy": 0.4379310250282288, "step": 45530 }, { "epoch": 0.04586333561970144, "grad_norm": 15.163929894168575, "learning_rate": 4.58624579497613e-05, "loss": 2.6151, "mean_token_accuracy": 0.3862069010734558, "step": 45535 }, { "epoch": 0.045868371672805615, "grad_norm": 15.187845828103997, "learning_rate": 4.586749390649236e-05, "loss": 2.8985, "mean_token_accuracy": 0.32758620381355286, "step": 45540 }, { "epoch": 0.04587340772590979, "grad_norm": 31.220236831813427, "learning_rate": 4.587252986322342e-05, "loss": 2.7791, "mean_token_accuracy": 0.35172412991523744, "step": 45545 }, { "epoch": 0.04587844377901396, "grad_norm": 16.300278851084947, "learning_rate": 4.5877565819954476e-05, "loss": 3.1288, "mean_token_accuracy": 0.35862069129943847, "step": 45550 }, { "epoch": 0.045883479832118136, "grad_norm": 26.21242639945067, "learning_rate": 4.5882601776685536e-05, "loss": 2.9057, "mean_token_accuracy": 0.35172412991523744, "step": 45555 }, { "epoch": 0.04588851588522231, "grad_norm": 15.053394884701435, "learning_rate": 4.5887637733416595e-05, "loss": 2.3891, "mean_token_accuracy": 0.40689654350280763, "step": 45560 }, { "epoch": 0.04589355193832648, "grad_norm": 13.403645504350676, "learning_rate": 4.589267369014766e-05, "loss": 2.2811, "mean_token_accuracy": 0.47785844206809996, "step": 45565 }, { "epoch": 0.04589858799143065, "grad_norm": 16.16527277649867, "learning_rate": 4.589770964687872e-05, "loss": 2.6712, "mean_token_accuracy": 0.3655172407627106, "step": 45570 }, { "epoch": 0.045903624044534824, "grad_norm": 14.20188482430698, "learning_rate": 4.590274560360977e-05, "loss": 2.5316, "mean_token_accuracy": 0.4172413766384125, "step": 45575 }, { "epoch": 0.045908660097639, "grad_norm": 11.75065649183626, "learning_rate": 4.590778156034083e-05, "loss": 2.2585, "mean_token_accuracy": 0.44137930274009707, "step": 45580 }, { "epoch": 0.04591369615074317, "grad_norm": 16.83404339558262, "learning_rate": 4.591281751707189e-05, "loss": 2.6975, "mean_token_accuracy": 0.4047791838645935, "step": 45585 }, { "epoch": 0.045918732203847346, "grad_norm": 15.595834662706329, "learning_rate": 4.591785347380296e-05, "loss": 2.426, "mean_token_accuracy": 0.39310344457626345, "step": 45590 }, { "epoch": 0.04592376825695152, "grad_norm": 12.95339417384313, "learning_rate": 4.5922889430534017e-05, "loss": 2.3539, "mean_token_accuracy": 0.43103448748588563, "step": 45595 }, { "epoch": 0.045928804310055686, "grad_norm": 14.118091642364684, "learning_rate": 4.5927925387265076e-05, "loss": 2.9484, "mean_token_accuracy": 0.33103448450565337, "step": 45600 }, { "epoch": 0.04593384036315986, "grad_norm": 15.258472165997654, "learning_rate": 4.5932961343996135e-05, "loss": 2.8437, "mean_token_accuracy": 0.41917725205421447, "step": 45605 }, { "epoch": 0.045938876416264034, "grad_norm": 27.655858364668973, "learning_rate": 4.5937997300727194e-05, "loss": 3.2397, "mean_token_accuracy": 0.3620689630508423, "step": 45610 }, { "epoch": 0.04594391246936821, "grad_norm": 12.893115737990815, "learning_rate": 4.5943033257458254e-05, "loss": 2.5015, "mean_token_accuracy": 0.4206896543502808, "step": 45615 }, { "epoch": 0.04594894852247238, "grad_norm": 15.560259802787362, "learning_rate": 4.594806921418931e-05, "loss": 2.6624, "mean_token_accuracy": 0.4068965554237366, "step": 45620 }, { "epoch": 0.045953984575576555, "grad_norm": 13.783219609655367, "learning_rate": 4.595310517092037e-05, "loss": 2.7483, "mean_token_accuracy": 0.3827586233615875, "step": 45625 }, { "epoch": 0.04595902062868073, "grad_norm": 12.126746903240528, "learning_rate": 4.595814112765143e-05, "loss": 2.5374, "mean_token_accuracy": 0.37241379022598264, "step": 45630 }, { "epoch": 0.045964056681784896, "grad_norm": 35.467791217302356, "learning_rate": 4.596317708438249e-05, "loss": 2.7132, "mean_token_accuracy": 0.3689655065536499, "step": 45635 }, { "epoch": 0.04596909273488907, "grad_norm": 15.534479093949958, "learning_rate": 4.596821304111355e-05, "loss": 2.9164, "mean_token_accuracy": 0.32758620381355286, "step": 45640 }, { "epoch": 0.04597412878799324, "grad_norm": 13.767392553279088, "learning_rate": 4.5973248997844616e-05, "loss": 2.7214, "mean_token_accuracy": 0.36896551847457887, "step": 45645 }, { "epoch": 0.04597916484109742, "grad_norm": 12.815546323290299, "learning_rate": 4.5978284954575675e-05, "loss": 2.3025, "mean_token_accuracy": 0.47586206197738645, "step": 45650 }, { "epoch": 0.04598420089420159, "grad_norm": 10.807010621174983, "learning_rate": 4.5983320911306734e-05, "loss": 2.5834, "mean_token_accuracy": 0.3931034505367279, "step": 45655 }, { "epoch": 0.045989236947305764, "grad_norm": 13.438594344234566, "learning_rate": 4.5988356868037794e-05, "loss": 2.5965, "mean_token_accuracy": 0.3793103456497192, "step": 45660 }, { "epoch": 0.04599427300040994, "grad_norm": 18.72233270912934, "learning_rate": 4.599339282476885e-05, "loss": 2.422, "mean_token_accuracy": 0.3896551728248596, "step": 45665 }, { "epoch": 0.045999309053514105, "grad_norm": 11.108895225874964, "learning_rate": 4.599842878149991e-05, "loss": 2.1143, "mean_token_accuracy": 0.42413793206214906, "step": 45670 }, { "epoch": 0.04600434510661828, "grad_norm": 13.162516207680952, "learning_rate": 4.600346473823097e-05, "loss": 2.5998, "mean_token_accuracy": 0.44827585816383364, "step": 45675 }, { "epoch": 0.04600938115972245, "grad_norm": 14.830432889166389, "learning_rate": 4.600850069496203e-05, "loss": 2.1864, "mean_token_accuracy": 0.42068964838981626, "step": 45680 }, { "epoch": 0.046014417212826626, "grad_norm": 18.705022670216028, "learning_rate": 4.601353665169309e-05, "loss": 2.8639, "mean_token_accuracy": 0.334482753276825, "step": 45685 }, { "epoch": 0.0460194532659308, "grad_norm": 13.460681302812048, "learning_rate": 4.601857260842415e-05, "loss": 2.7348, "mean_token_accuracy": 0.3793103516101837, "step": 45690 }, { "epoch": 0.046024489319034974, "grad_norm": 15.797978028441186, "learning_rate": 4.6023608565155215e-05, "loss": 2.5959, "mean_token_accuracy": 0.334482753276825, "step": 45695 }, { "epoch": 0.04602952537213915, "grad_norm": 15.57258454902569, "learning_rate": 4.6028644521886274e-05, "loss": 2.5973, "mean_token_accuracy": 0.3758620619773865, "step": 45700 }, { "epoch": 0.046034561425243314, "grad_norm": 10.419813635174432, "learning_rate": 4.6033680478617334e-05, "loss": 2.3198, "mean_token_accuracy": 0.4310344815254211, "step": 45705 }, { "epoch": 0.04603959747834749, "grad_norm": 11.78540913536877, "learning_rate": 4.6038716435348386e-05, "loss": 2.387, "mean_token_accuracy": 0.4310344815254211, "step": 45710 }, { "epoch": 0.04604463353145166, "grad_norm": 13.189267790984745, "learning_rate": 4.6043752392079445e-05, "loss": 2.5371, "mean_token_accuracy": 0.4137930989265442, "step": 45715 }, { "epoch": 0.046049669584555836, "grad_norm": 17.653667702615536, "learning_rate": 4.6048788348810505e-05, "loss": 2.8459, "mean_token_accuracy": 0.3793103456497192, "step": 45720 }, { "epoch": 0.04605470563766001, "grad_norm": 12.384130938014792, "learning_rate": 4.605382430554157e-05, "loss": 2.6283, "mean_token_accuracy": 0.4034482777118683, "step": 45725 }, { "epoch": 0.04605974169076418, "grad_norm": 11.632733570756454, "learning_rate": 4.605886026227263e-05, "loss": 2.3427, "mean_token_accuracy": 0.4448275864124298, "step": 45730 }, { "epoch": 0.04606477774386836, "grad_norm": 11.743119721745678, "learning_rate": 4.606389621900369e-05, "loss": 2.316, "mean_token_accuracy": 0.44827585816383364, "step": 45735 }, { "epoch": 0.046069813796972524, "grad_norm": 15.854647080869608, "learning_rate": 4.606893217573475e-05, "loss": 2.5813, "mean_token_accuracy": 0.42758620381355283, "step": 45740 }, { "epoch": 0.0460748498500767, "grad_norm": 13.235535356141703, "learning_rate": 4.607396813246581e-05, "loss": 2.4936, "mean_token_accuracy": 0.458620685338974, "step": 45745 }, { "epoch": 0.04607988590318087, "grad_norm": 15.36839971987012, "learning_rate": 4.607900408919687e-05, "loss": 3.2742, "mean_token_accuracy": 0.3482758581638336, "step": 45750 }, { "epoch": 0.046084921956285045, "grad_norm": 14.199701946720843, "learning_rate": 4.6084040045927926e-05, "loss": 2.8233, "mean_token_accuracy": 0.37241379022598264, "step": 45755 }, { "epoch": 0.04608995800938922, "grad_norm": 12.30201440937251, "learning_rate": 4.6089076002658985e-05, "loss": 2.698, "mean_token_accuracy": 0.38275861740112305, "step": 45760 }, { "epoch": 0.04609499406249339, "grad_norm": 14.761749768972521, "learning_rate": 4.6094111959390045e-05, "loss": 2.7797, "mean_token_accuracy": 0.3551724135875702, "step": 45765 }, { "epoch": 0.046100030115597566, "grad_norm": 11.639297506318416, "learning_rate": 4.6099147916121104e-05, "loss": 2.496, "mean_token_accuracy": 0.4103448331356049, "step": 45770 }, { "epoch": 0.04610506616870173, "grad_norm": 13.361179659944613, "learning_rate": 4.610418387285217e-05, "loss": 2.7539, "mean_token_accuracy": 0.379310342669487, "step": 45775 }, { "epoch": 0.04611010222180591, "grad_norm": 14.192818919801738, "learning_rate": 4.610921982958323e-05, "loss": 2.5288, "mean_token_accuracy": 0.40689654350280763, "step": 45780 }, { "epoch": 0.04611513827491008, "grad_norm": 11.533548260031099, "learning_rate": 4.611425578631429e-05, "loss": 2.4879, "mean_token_accuracy": 0.4172413766384125, "step": 45785 }, { "epoch": 0.046120174328014255, "grad_norm": 12.391252335039688, "learning_rate": 4.611929174304535e-05, "loss": 2.4831, "mean_token_accuracy": 0.3655172407627106, "step": 45790 }, { "epoch": 0.04612521038111843, "grad_norm": 12.310805400359408, "learning_rate": 4.612432769977641e-05, "loss": 2.7256, "mean_token_accuracy": 0.41379310488700866, "step": 45795 }, { "epoch": 0.0461302464342226, "grad_norm": 12.882633636133292, "learning_rate": 4.6129363656507466e-05, "loss": 2.4285, "mean_token_accuracy": 0.46896551847457885, "step": 45800 }, { "epoch": 0.04613528248732677, "grad_norm": 12.031589951086408, "learning_rate": 4.6134399613238525e-05, "loss": 2.5765, "mean_token_accuracy": 0.38620689511299133, "step": 45805 }, { "epoch": 0.04614031854043094, "grad_norm": 13.095266145802576, "learning_rate": 4.6139435569969585e-05, "loss": 2.7523, "mean_token_accuracy": 0.358620697259903, "step": 45810 }, { "epoch": 0.046145354593535116, "grad_norm": 14.355599774014687, "learning_rate": 4.6144471526700644e-05, "loss": 2.6419, "mean_token_accuracy": 0.3793103516101837, "step": 45815 }, { "epoch": 0.04615039064663929, "grad_norm": 16.600529842860706, "learning_rate": 4.61495074834317e-05, "loss": 2.7405, "mean_token_accuracy": 0.35662431716918946, "step": 45820 }, { "epoch": 0.046155426699743464, "grad_norm": 10.907384480275216, "learning_rate": 4.615454344016276e-05, "loss": 2.3394, "mean_token_accuracy": 0.41724138259887694, "step": 45825 }, { "epoch": 0.04616046275284764, "grad_norm": 12.220516256346185, "learning_rate": 4.615957939689383e-05, "loss": 2.7312, "mean_token_accuracy": 0.3724137872457504, "step": 45830 }, { "epoch": 0.04616549880595181, "grad_norm": 12.01460540365314, "learning_rate": 4.616461535362489e-05, "loss": 2.401, "mean_token_accuracy": 0.40344828367233276, "step": 45835 }, { "epoch": 0.04617053485905598, "grad_norm": 14.88337025700096, "learning_rate": 4.616965131035594e-05, "loss": 2.6828, "mean_token_accuracy": 0.37241379022598264, "step": 45840 }, { "epoch": 0.04617557091216015, "grad_norm": 12.086693837324553, "learning_rate": 4.6174687267087e-05, "loss": 2.4457, "mean_token_accuracy": 0.4620689690113068, "step": 45845 }, { "epoch": 0.046180606965264326, "grad_norm": 13.645888427371792, "learning_rate": 4.617972322381806e-05, "loss": 2.5342, "mean_token_accuracy": 0.4206896543502808, "step": 45850 }, { "epoch": 0.0461856430183685, "grad_norm": 16.31438302023713, "learning_rate": 4.6184759180549125e-05, "loss": 2.7061, "mean_token_accuracy": 0.4000000059604645, "step": 45855 }, { "epoch": 0.04619067907147267, "grad_norm": 16.258522309892264, "learning_rate": 4.6189795137280184e-05, "loss": 2.8062, "mean_token_accuracy": 0.37931033968925476, "step": 45860 }, { "epoch": 0.04619571512457685, "grad_norm": 13.843857971600505, "learning_rate": 4.619483109401124e-05, "loss": 2.6559, "mean_token_accuracy": 0.3655172407627106, "step": 45865 }, { "epoch": 0.04620075117768102, "grad_norm": 11.820816346128776, "learning_rate": 4.61998670507423e-05, "loss": 2.5112, "mean_token_accuracy": 0.3827586233615875, "step": 45870 }, { "epoch": 0.04620578723078519, "grad_norm": 16.923931552518596, "learning_rate": 4.620490300747336e-05, "loss": 2.4844, "mean_token_accuracy": 0.40508167147636415, "step": 45875 }, { "epoch": 0.04621082328388936, "grad_norm": 13.022211695787268, "learning_rate": 4.620993896420443e-05, "loss": 2.5294, "mean_token_accuracy": 0.4, "step": 45880 }, { "epoch": 0.046215859336993535, "grad_norm": 14.568218355406565, "learning_rate": 4.621497492093548e-05, "loss": 2.5195, "mean_token_accuracy": 0.42068966627120974, "step": 45885 }, { "epoch": 0.04622089539009771, "grad_norm": 15.08965299059806, "learning_rate": 4.622001087766654e-05, "loss": 2.6306, "mean_token_accuracy": 0.38275861740112305, "step": 45890 }, { "epoch": 0.04622593144320188, "grad_norm": 12.658108172795455, "learning_rate": 4.62250468343976e-05, "loss": 2.4904, "mean_token_accuracy": 0.36896551251411436, "step": 45895 }, { "epoch": 0.04623096749630606, "grad_norm": 28.73455812794667, "learning_rate": 4.623008279112866e-05, "loss": 2.4466, "mean_token_accuracy": 0.4620689630508423, "step": 45900 }, { "epoch": 0.04623600354941023, "grad_norm": 13.900968562454844, "learning_rate": 4.623511874785972e-05, "loss": 2.5235, "mean_token_accuracy": 0.4103448331356049, "step": 45905 }, { "epoch": 0.0462410396025144, "grad_norm": 35.17792021822705, "learning_rate": 4.624015470459078e-05, "loss": 2.8049, "mean_token_accuracy": 0.36896551847457887, "step": 45910 }, { "epoch": 0.04624607565561857, "grad_norm": 16.21915073681264, "learning_rate": 4.624519066132184e-05, "loss": 2.4903, "mean_token_accuracy": 0.3999999940395355, "step": 45915 }, { "epoch": 0.046251111708722745, "grad_norm": 14.413071856923924, "learning_rate": 4.62502266180529e-05, "loss": 2.4567, "mean_token_accuracy": 0.42413793206214906, "step": 45920 }, { "epoch": 0.04625614776182692, "grad_norm": 12.162893616460925, "learning_rate": 4.625526257478396e-05, "loss": 2.57, "mean_token_accuracy": 0.38965516090393065, "step": 45925 }, { "epoch": 0.04626118381493109, "grad_norm": 16.089058240117364, "learning_rate": 4.626029853151502e-05, "loss": 2.9962, "mean_token_accuracy": 0.3482758641242981, "step": 45930 }, { "epoch": 0.046266219868035266, "grad_norm": 13.718959636544689, "learning_rate": 4.626533448824608e-05, "loss": 3.0507, "mean_token_accuracy": 0.36551724672317504, "step": 45935 }, { "epoch": 0.04627125592113944, "grad_norm": 44.33563518514151, "learning_rate": 4.627037044497714e-05, "loss": 2.7675, "mean_token_accuracy": 0.3862069010734558, "step": 45940 }, { "epoch": 0.04627629197424361, "grad_norm": 13.861620347278153, "learning_rate": 4.62754064017082e-05, "loss": 2.3097, "mean_token_accuracy": 0.447065943479538, "step": 45945 }, { "epoch": 0.04628132802734778, "grad_norm": 14.521775461305122, "learning_rate": 4.628044235843926e-05, "loss": 2.3223, "mean_token_accuracy": 0.4620689690113068, "step": 45950 }, { "epoch": 0.046286364080451954, "grad_norm": 17.41446443111772, "learning_rate": 4.628547831517032e-05, "loss": 2.4409, "mean_token_accuracy": 0.4562807857990265, "step": 45955 }, { "epoch": 0.04629140013355613, "grad_norm": 14.534815184253757, "learning_rate": 4.629051427190138e-05, "loss": 2.4058, "mean_token_accuracy": 0.4157894730567932, "step": 45960 }, { "epoch": 0.0462964361866603, "grad_norm": 16.439437777946964, "learning_rate": 4.629555022863244e-05, "loss": 2.9946, "mean_token_accuracy": 0.37586207389831544, "step": 45965 }, { "epoch": 0.046301472239764475, "grad_norm": 13.039451096799368, "learning_rate": 4.63005861853635e-05, "loss": 2.5333, "mean_token_accuracy": 0.42068964838981626, "step": 45970 }, { "epoch": 0.04630650829286865, "grad_norm": 12.840304665819563, "learning_rate": 4.6305622142094554e-05, "loss": 2.5294, "mean_token_accuracy": 0.37241379022598264, "step": 45975 }, { "epoch": 0.046311544345972816, "grad_norm": 15.432918138778208, "learning_rate": 4.631065809882561e-05, "loss": 2.7596, "mean_token_accuracy": 0.37241379618644715, "step": 45980 }, { "epoch": 0.04631658039907699, "grad_norm": 15.270511119006484, "learning_rate": 4.631569405555667e-05, "loss": 2.5033, "mean_token_accuracy": 0.3862069010734558, "step": 45985 }, { "epoch": 0.046321616452181164, "grad_norm": 13.167415132014218, "learning_rate": 4.632073001228774e-05, "loss": 2.6223, "mean_token_accuracy": 0.3482758581638336, "step": 45990 }, { "epoch": 0.04632665250528534, "grad_norm": 13.633104861808908, "learning_rate": 4.63257659690188e-05, "loss": 2.7588, "mean_token_accuracy": 0.3620689630508423, "step": 45995 }, { "epoch": 0.04633168855838951, "grad_norm": 11.857464468974793, "learning_rate": 4.633080192574986e-05, "loss": 2.5808, "mean_token_accuracy": 0.40689654350280763, "step": 46000 }, { "epoch": 0.046336724611493685, "grad_norm": 24.237490066125293, "learning_rate": 4.6335837882480916e-05, "loss": 2.6148, "mean_token_accuracy": 0.4344827592372894, "step": 46005 }, { "epoch": 0.04634176066459786, "grad_norm": 14.701628101743385, "learning_rate": 4.6340873839211975e-05, "loss": 2.6068, "mean_token_accuracy": 0.39503931999206543, "step": 46010 }, { "epoch": 0.046346796717702025, "grad_norm": 12.7457082523556, "learning_rate": 4.6345909795943034e-05, "loss": 2.7071, "mean_token_accuracy": 0.37931033968925476, "step": 46015 }, { "epoch": 0.0463518327708062, "grad_norm": 9.676119655095004, "learning_rate": 4.6350945752674094e-05, "loss": 2.6978, "mean_token_accuracy": 0.41966121792793276, "step": 46020 }, { "epoch": 0.04635686882391037, "grad_norm": 22.42029528285458, "learning_rate": 4.635598170940515e-05, "loss": 2.694, "mean_token_accuracy": 0.4206896543502808, "step": 46025 }, { "epoch": 0.04636190487701455, "grad_norm": 12.430331594229502, "learning_rate": 4.636101766613621e-05, "loss": 2.3322, "mean_token_accuracy": 0.4310344815254211, "step": 46030 }, { "epoch": 0.04636694093011872, "grad_norm": 13.531095412882072, "learning_rate": 4.636605362286727e-05, "loss": 2.53, "mean_token_accuracy": 0.38620689511299133, "step": 46035 }, { "epoch": 0.046371976983222894, "grad_norm": 11.199041581310555, "learning_rate": 4.637108957959834e-05, "loss": 2.5986, "mean_token_accuracy": 0.3758620709180832, "step": 46040 }, { "epoch": 0.04637701303632707, "grad_norm": 10.566736499541863, "learning_rate": 4.63761255363294e-05, "loss": 2.2256, "mean_token_accuracy": 0.4329098641872406, "step": 46045 }, { "epoch": 0.046382049089431235, "grad_norm": 15.958288357559006, "learning_rate": 4.6381161493060456e-05, "loss": 2.5156, "mean_token_accuracy": 0.41379310488700866, "step": 46050 }, { "epoch": 0.04638708514253541, "grad_norm": 12.738298865562033, "learning_rate": 4.6386197449791515e-05, "loss": 2.334, "mean_token_accuracy": 0.41034482717514037, "step": 46055 }, { "epoch": 0.04639212119563958, "grad_norm": 18.100403347697107, "learning_rate": 4.6391233406522574e-05, "loss": 2.6332, "mean_token_accuracy": 0.42758620977401735, "step": 46060 }, { "epoch": 0.046397157248743756, "grad_norm": 13.061823492778986, "learning_rate": 4.6396269363253634e-05, "loss": 2.4386, "mean_token_accuracy": 0.38275861740112305, "step": 46065 }, { "epoch": 0.04640219330184793, "grad_norm": 12.597300846567174, "learning_rate": 4.640130531998469e-05, "loss": 2.4289, "mean_token_accuracy": 0.3758620709180832, "step": 46070 }, { "epoch": 0.046407229354952104, "grad_norm": 16.4216660964597, "learning_rate": 4.640634127671575e-05, "loss": 2.7114, "mean_token_accuracy": 0.36896551847457887, "step": 46075 }, { "epoch": 0.04641226540805628, "grad_norm": 15.040774392297783, "learning_rate": 4.641137723344681e-05, "loss": 2.9222, "mean_token_accuracy": 0.4034482717514038, "step": 46080 }, { "epoch": 0.046417301461160444, "grad_norm": 11.536233569369402, "learning_rate": 4.641641319017787e-05, "loss": 2.4877, "mean_token_accuracy": 0.41034482717514037, "step": 46085 }, { "epoch": 0.04642233751426462, "grad_norm": 30.83155244510312, "learning_rate": 4.642144914690893e-05, "loss": 2.6535, "mean_token_accuracy": 0.3965517163276672, "step": 46090 }, { "epoch": 0.04642737356736879, "grad_norm": 14.845675195096591, "learning_rate": 4.6426485103639996e-05, "loss": 2.2555, "mean_token_accuracy": 0.47647783160209656, "step": 46095 }, { "epoch": 0.046432409620472966, "grad_norm": 12.969754348440164, "learning_rate": 4.6431521060371055e-05, "loss": 2.858, "mean_token_accuracy": 0.38275861740112305, "step": 46100 }, { "epoch": 0.04643744567357714, "grad_norm": 13.983398680459782, "learning_rate": 4.6436557017102115e-05, "loss": 2.9348, "mean_token_accuracy": 0.3413793116807938, "step": 46105 }, { "epoch": 0.04644248172668131, "grad_norm": 12.883394252233046, "learning_rate": 4.644159297383317e-05, "loss": 2.2188, "mean_token_accuracy": 0.42758620977401735, "step": 46110 }, { "epoch": 0.04644751777978549, "grad_norm": 15.153527208853713, "learning_rate": 4.6446628930564226e-05, "loss": 2.6293, "mean_token_accuracy": 0.41379310488700866, "step": 46115 }, { "epoch": 0.046452553832889654, "grad_norm": 13.481225231100817, "learning_rate": 4.645166488729529e-05, "loss": 2.6796, "mean_token_accuracy": 0.4344827592372894, "step": 46120 }, { "epoch": 0.04645758988599383, "grad_norm": 15.774491627679929, "learning_rate": 4.645670084402635e-05, "loss": 2.5303, "mean_token_accuracy": 0.42413792610168455, "step": 46125 }, { "epoch": 0.046462625939098, "grad_norm": 11.896931345667403, "learning_rate": 4.646173680075741e-05, "loss": 2.5707, "mean_token_accuracy": 0.42758620977401735, "step": 46130 }, { "epoch": 0.046467661992202175, "grad_norm": 11.005845355316321, "learning_rate": 4.646677275748847e-05, "loss": 2.4982, "mean_token_accuracy": 0.3862068891525269, "step": 46135 }, { "epoch": 0.04647269804530635, "grad_norm": 14.304115405775352, "learning_rate": 4.647180871421953e-05, "loss": 2.4297, "mean_token_accuracy": 0.3827586203813553, "step": 46140 }, { "epoch": 0.04647773409841052, "grad_norm": 10.847422434972705, "learning_rate": 4.647684467095059e-05, "loss": 2.2724, "mean_token_accuracy": 0.46551724672317507, "step": 46145 }, { "epoch": 0.046482770151514696, "grad_norm": 19.815230883292827, "learning_rate": 4.648188062768165e-05, "loss": 2.6843, "mean_token_accuracy": 0.38965517580509185, "step": 46150 }, { "epoch": 0.04648780620461886, "grad_norm": 12.690830671352783, "learning_rate": 4.648691658441271e-05, "loss": 2.4956, "mean_token_accuracy": 0.4172413766384125, "step": 46155 }, { "epoch": 0.04649284225772304, "grad_norm": 13.450250643956371, "learning_rate": 4.6491952541143766e-05, "loss": 2.691, "mean_token_accuracy": 0.3655172407627106, "step": 46160 }, { "epoch": 0.04649787831082721, "grad_norm": 14.628030472441715, "learning_rate": 4.6496988497874826e-05, "loss": 2.7533, "mean_token_accuracy": 0.38421053290367124, "step": 46165 }, { "epoch": 0.046502914363931384, "grad_norm": 12.668612685982131, "learning_rate": 4.6502024454605885e-05, "loss": 2.6825, "mean_token_accuracy": 0.3793103456497192, "step": 46170 }, { "epoch": 0.04650795041703556, "grad_norm": 13.499269566685483, "learning_rate": 4.650706041133695e-05, "loss": 3.0405, "mean_token_accuracy": 0.33793103098869326, "step": 46175 }, { "epoch": 0.04651298647013973, "grad_norm": 15.330818155606407, "learning_rate": 4.651209636806801e-05, "loss": 2.3518, "mean_token_accuracy": 0.4257713258266449, "step": 46180 }, { "epoch": 0.046518022523243906, "grad_norm": 13.69473693507867, "learning_rate": 4.651713232479907e-05, "loss": 3.1926, "mean_token_accuracy": 0.34137930572032926, "step": 46185 }, { "epoch": 0.04652305857634807, "grad_norm": 22.651420678413547, "learning_rate": 4.652216828153013e-05, "loss": 3.2714, "mean_token_accuracy": 0.3034482777118683, "step": 46190 }, { "epoch": 0.046528094629452246, "grad_norm": 11.816002337664438, "learning_rate": 4.652720423826119e-05, "loss": 2.7271, "mean_token_accuracy": 0.41379310488700866, "step": 46195 }, { "epoch": 0.04653313068255642, "grad_norm": 15.415220495216337, "learning_rate": 4.653224019499225e-05, "loss": 2.5223, "mean_token_accuracy": 0.3931034505367279, "step": 46200 }, { "epoch": 0.046538166735660594, "grad_norm": 11.402544626173952, "learning_rate": 4.6537276151723306e-05, "loss": 2.3957, "mean_token_accuracy": 0.42256503105163573, "step": 46205 }, { "epoch": 0.04654320278876477, "grad_norm": 12.322532220079593, "learning_rate": 4.6542312108454366e-05, "loss": 2.1181, "mean_token_accuracy": 0.47797942757606504, "step": 46210 }, { "epoch": 0.04654823884186894, "grad_norm": 12.825588110301824, "learning_rate": 4.6547348065185425e-05, "loss": 2.7777, "mean_token_accuracy": 0.4034482777118683, "step": 46215 }, { "epoch": 0.046553274894973115, "grad_norm": 29.998588780292636, "learning_rate": 4.6552384021916484e-05, "loss": 2.9102, "mean_token_accuracy": 0.3984270989894867, "step": 46220 }, { "epoch": 0.04655831094807728, "grad_norm": 11.784893909019942, "learning_rate": 4.655741997864755e-05, "loss": 2.4031, "mean_token_accuracy": 0.43248639106750486, "step": 46225 }, { "epoch": 0.046563347001181456, "grad_norm": 12.466769828053527, "learning_rate": 4.656245593537861e-05, "loss": 2.5748, "mean_token_accuracy": 0.41609196066856385, "step": 46230 }, { "epoch": 0.04656838305428563, "grad_norm": 12.779649672054347, "learning_rate": 4.656749189210967e-05, "loss": 2.1994, "mean_token_accuracy": 0.45517240166664125, "step": 46235 }, { "epoch": 0.0465734191073898, "grad_norm": 10.789015253044095, "learning_rate": 4.657252784884073e-05, "loss": 2.2572, "mean_token_accuracy": 0.4310344815254211, "step": 46240 }, { "epoch": 0.04657845516049398, "grad_norm": 12.226269920113237, "learning_rate": 4.657756380557178e-05, "loss": 2.5446, "mean_token_accuracy": 0.3758620709180832, "step": 46245 }, { "epoch": 0.04658349121359815, "grad_norm": 11.036590385017957, "learning_rate": 4.658259976230284e-05, "loss": 2.6158, "mean_token_accuracy": 0.3965517282485962, "step": 46250 }, { "epoch": 0.046588527266702325, "grad_norm": 16.40896065968302, "learning_rate": 4.6587635719033906e-05, "loss": 2.3883, "mean_token_accuracy": 0.43103447556495667, "step": 46255 }, { "epoch": 0.04659356331980649, "grad_norm": 13.80523218301721, "learning_rate": 4.6592671675764965e-05, "loss": 3.0293, "mean_token_accuracy": 0.39655172228813174, "step": 46260 }, { "epoch": 0.046598599372910665, "grad_norm": 14.458737260186087, "learning_rate": 4.6597707632496024e-05, "loss": 2.789, "mean_token_accuracy": 0.3379310369491577, "step": 46265 }, { "epoch": 0.04660363542601484, "grad_norm": 11.291111957617563, "learning_rate": 4.6602743589227083e-05, "loss": 2.1188, "mean_token_accuracy": 0.45517240166664125, "step": 46270 }, { "epoch": 0.04660867147911901, "grad_norm": 14.800970571681939, "learning_rate": 4.660777954595814e-05, "loss": 3.069, "mean_token_accuracy": 0.33793102502822875, "step": 46275 }, { "epoch": 0.046613707532223186, "grad_norm": 11.89288989225702, "learning_rate": 4.661281550268921e-05, "loss": 2.8296, "mean_token_accuracy": 0.32068965435028074, "step": 46280 }, { "epoch": 0.04661874358532736, "grad_norm": 11.01402505290828, "learning_rate": 4.661785145942026e-05, "loss": 2.3195, "mean_token_accuracy": 0.4275861978530884, "step": 46285 }, { "epoch": 0.046623779638431534, "grad_norm": 16.729750156321543, "learning_rate": 4.662288741615132e-05, "loss": 2.8058, "mean_token_accuracy": 0.3931034505367279, "step": 46290 }, { "epoch": 0.0466288156915357, "grad_norm": 13.351939971723985, "learning_rate": 4.662792337288238e-05, "loss": 2.6625, "mean_token_accuracy": 0.4068965494632721, "step": 46295 }, { "epoch": 0.046633851744639875, "grad_norm": 11.542197123257866, "learning_rate": 4.663295932961344e-05, "loss": 2.277, "mean_token_accuracy": 0.458620685338974, "step": 46300 }, { "epoch": 0.04663888779774405, "grad_norm": 14.183882761164176, "learning_rate": 4.6637995286344505e-05, "loss": 2.5842, "mean_token_accuracy": 0.36896551549434664, "step": 46305 }, { "epoch": 0.04664392385084822, "grad_norm": 14.025220305500262, "learning_rate": 4.6643031243075564e-05, "loss": 2.7793, "mean_token_accuracy": 0.33103448152542114, "step": 46310 }, { "epoch": 0.046648959903952396, "grad_norm": 12.156555456064357, "learning_rate": 4.6648067199806623e-05, "loss": 2.5299, "mean_token_accuracy": 0.42758620381355283, "step": 46315 }, { "epoch": 0.04665399595705657, "grad_norm": 11.56394372480486, "learning_rate": 4.665310315653768e-05, "loss": 2.4368, "mean_token_accuracy": 0.43448275327682495, "step": 46320 }, { "epoch": 0.04665903201016074, "grad_norm": 11.621710501489801, "learning_rate": 4.665813911326874e-05, "loss": 2.6931, "mean_token_accuracy": 0.38620689511299133, "step": 46325 }, { "epoch": 0.04666406806326491, "grad_norm": 10.122138176410097, "learning_rate": 4.66631750699998e-05, "loss": 2.2622, "mean_token_accuracy": 0.4655172348022461, "step": 46330 }, { "epoch": 0.046669104116369084, "grad_norm": 14.11305958198568, "learning_rate": 4.666821102673086e-05, "loss": 2.7757, "mean_token_accuracy": 0.41379310488700866, "step": 46335 }, { "epoch": 0.04667414016947326, "grad_norm": 13.018365124729732, "learning_rate": 4.667324698346192e-05, "loss": 2.1244, "mean_token_accuracy": 0.5034482836723327, "step": 46340 }, { "epoch": 0.04667917622257743, "grad_norm": 14.468809234865356, "learning_rate": 4.667828294019298e-05, "loss": 2.5634, "mean_token_accuracy": 0.37586206793785093, "step": 46345 }, { "epoch": 0.046684212275681605, "grad_norm": 16.391595665965475, "learning_rate": 4.668331889692404e-05, "loss": 2.5285, "mean_token_accuracy": 0.4122807025909424, "step": 46350 }, { "epoch": 0.04668924832878578, "grad_norm": 13.869131538328594, "learning_rate": 4.66883548536551e-05, "loss": 2.662, "mean_token_accuracy": 0.36551723480224607, "step": 46355 }, { "epoch": 0.04669428438188995, "grad_norm": 13.480983941122094, "learning_rate": 4.6693390810386164e-05, "loss": 2.8522, "mean_token_accuracy": 0.3517241358757019, "step": 46360 }, { "epoch": 0.04669932043499412, "grad_norm": 19.603999857757433, "learning_rate": 4.669842676711722e-05, "loss": 2.8948, "mean_token_accuracy": 0.34482758641242983, "step": 46365 }, { "epoch": 0.04670435648809829, "grad_norm": 15.79953807659748, "learning_rate": 4.670346272384828e-05, "loss": 2.8059, "mean_token_accuracy": 0.36206897497177126, "step": 46370 }, { "epoch": 0.04670939254120247, "grad_norm": 16.76586316731833, "learning_rate": 4.6708498680579335e-05, "loss": 2.4394, "mean_token_accuracy": 0.3980641275644302, "step": 46375 }, { "epoch": 0.04671442859430664, "grad_norm": 11.24548291977423, "learning_rate": 4.6713534637310394e-05, "loss": 2.8442, "mean_token_accuracy": 0.3689655214548111, "step": 46380 }, { "epoch": 0.046719464647410815, "grad_norm": 13.296213634868522, "learning_rate": 4.671857059404146e-05, "loss": 2.7322, "mean_token_accuracy": 0.3551724076271057, "step": 46385 }, { "epoch": 0.04672450070051499, "grad_norm": 11.411439432864737, "learning_rate": 4.672360655077252e-05, "loss": 2.7464, "mean_token_accuracy": 0.34482758641242983, "step": 46390 }, { "epoch": 0.04672953675361916, "grad_norm": 10.975083434080346, "learning_rate": 4.672864250750358e-05, "loss": 2.3412, "mean_token_accuracy": 0.44482758045196535, "step": 46395 }, { "epoch": 0.04673457280672333, "grad_norm": 14.931923755646302, "learning_rate": 4.673367846423464e-05, "loss": 3.081, "mean_token_accuracy": 0.35862069129943847, "step": 46400 }, { "epoch": 0.0467396088598275, "grad_norm": 12.467442786279372, "learning_rate": 4.67387144209657e-05, "loss": 2.7651, "mean_token_accuracy": 0.4068965554237366, "step": 46405 }, { "epoch": 0.04674464491293168, "grad_norm": 18.17146844695907, "learning_rate": 4.6743750377696756e-05, "loss": 2.547, "mean_token_accuracy": 0.42413793206214906, "step": 46410 }, { "epoch": 0.04674968096603585, "grad_norm": 12.625502605888316, "learning_rate": 4.674878633442782e-05, "loss": 2.2155, "mean_token_accuracy": 0.4379310429096222, "step": 46415 }, { "epoch": 0.046754717019140024, "grad_norm": 12.35210482196056, "learning_rate": 4.6753822291158875e-05, "loss": 2.7684, "mean_token_accuracy": 0.37241379618644715, "step": 46420 }, { "epoch": 0.0467597530722442, "grad_norm": 17.11679177558172, "learning_rate": 4.6758858247889934e-05, "loss": 2.8563, "mean_token_accuracy": 0.36896551251411436, "step": 46425 }, { "epoch": 0.04676478912534837, "grad_norm": 11.122507746611133, "learning_rate": 4.676389420462099e-05, "loss": 2.2496, "mean_token_accuracy": 0.4379310369491577, "step": 46430 }, { "epoch": 0.04676982517845254, "grad_norm": 13.852223878838121, "learning_rate": 4.676893016135205e-05, "loss": 2.3647, "mean_token_accuracy": 0.4, "step": 46435 }, { "epoch": 0.04677486123155671, "grad_norm": 16.784735380222674, "learning_rate": 4.677396611808312e-05, "loss": 2.7288, "mean_token_accuracy": 0.36206896007061007, "step": 46440 }, { "epoch": 0.046779897284660886, "grad_norm": 12.197946667123045, "learning_rate": 4.677900207481418e-05, "loss": 2.5614, "mean_token_accuracy": 0.36896551847457887, "step": 46445 }, { "epoch": 0.04678493333776506, "grad_norm": 13.932155933715231, "learning_rate": 4.678403803154524e-05, "loss": 2.0727, "mean_token_accuracy": 0.48275862336158754, "step": 46450 }, { "epoch": 0.046789969390869234, "grad_norm": 13.212000835654651, "learning_rate": 4.6789073988276296e-05, "loss": 2.5825, "mean_token_accuracy": 0.4034482777118683, "step": 46455 }, { "epoch": 0.04679500544397341, "grad_norm": 14.054493040528458, "learning_rate": 4.6794109945007355e-05, "loss": 2.9621, "mean_token_accuracy": 0.39655172228813174, "step": 46460 }, { "epoch": 0.04680004149707758, "grad_norm": 19.154669492394557, "learning_rate": 4.6799145901738415e-05, "loss": 3.1245, "mean_token_accuracy": 0.37586206793785093, "step": 46465 }, { "epoch": 0.04680507755018175, "grad_norm": 20.22559632517103, "learning_rate": 4.6804181858469474e-05, "loss": 3.0609, "mean_token_accuracy": 0.4148820281028748, "step": 46470 }, { "epoch": 0.04681011360328592, "grad_norm": 14.627308769164507, "learning_rate": 4.680921781520053e-05, "loss": 2.5286, "mean_token_accuracy": 0.39147005677223207, "step": 46475 }, { "epoch": 0.046815149656390095, "grad_norm": 14.23085695293475, "learning_rate": 4.681425377193159e-05, "loss": 2.6632, "mean_token_accuracy": 0.34658198058605194, "step": 46480 }, { "epoch": 0.04682018570949427, "grad_norm": 12.34526051469207, "learning_rate": 4.681928972866265e-05, "loss": 2.6481, "mean_token_accuracy": 0.40344828367233276, "step": 46485 }, { "epoch": 0.04682522176259844, "grad_norm": 14.06021847357866, "learning_rate": 4.682432568539371e-05, "loss": 2.6544, "mean_token_accuracy": 0.3862069010734558, "step": 46490 }, { "epoch": 0.04683025781570262, "grad_norm": 13.021144960209002, "learning_rate": 4.682936164212478e-05, "loss": 2.4008, "mean_token_accuracy": 0.40000000298023225, "step": 46495 }, { "epoch": 0.04683529386880679, "grad_norm": 11.192147435076782, "learning_rate": 4.6834397598855836e-05, "loss": 2.6397, "mean_token_accuracy": 0.4068965494632721, "step": 46500 }, { "epoch": 0.04684032992191096, "grad_norm": 11.75850928410697, "learning_rate": 4.6839433555586895e-05, "loss": 2.471, "mean_token_accuracy": 0.4275861978530884, "step": 46505 }, { "epoch": 0.04684536597501513, "grad_norm": 12.914189914686297, "learning_rate": 4.684446951231795e-05, "loss": 2.7455, "mean_token_accuracy": 0.35692680180072783, "step": 46510 }, { "epoch": 0.046850402028119305, "grad_norm": 12.41781632823946, "learning_rate": 4.684950546904901e-05, "loss": 2.3996, "mean_token_accuracy": 0.3931034505367279, "step": 46515 }, { "epoch": 0.04685543808122348, "grad_norm": 14.353762277644186, "learning_rate": 4.685454142578007e-05, "loss": 2.6852, "mean_token_accuracy": 0.37586206793785093, "step": 46520 }, { "epoch": 0.04686047413432765, "grad_norm": 12.600590710543017, "learning_rate": 4.685957738251113e-05, "loss": 2.4563, "mean_token_accuracy": 0.4172413766384125, "step": 46525 }, { "epoch": 0.046865510187431826, "grad_norm": 12.313054187027676, "learning_rate": 4.686461333924219e-05, "loss": 2.6385, "mean_token_accuracy": 0.35862069129943847, "step": 46530 }, { "epoch": 0.046870546240536, "grad_norm": 18.397559587832312, "learning_rate": 4.686964929597325e-05, "loss": 2.518, "mean_token_accuracy": 0.39655172228813174, "step": 46535 }, { "epoch": 0.04687558229364017, "grad_norm": 16.13502934294121, "learning_rate": 4.687468525270431e-05, "loss": 2.4234, "mean_token_accuracy": 0.3655172407627106, "step": 46540 }, { "epoch": 0.04688061834674434, "grad_norm": 11.811237622960931, "learning_rate": 4.6879721209435376e-05, "loss": 2.8467, "mean_token_accuracy": 0.40544464588165285, "step": 46545 }, { "epoch": 0.046885654399848514, "grad_norm": 12.299191706586337, "learning_rate": 4.688475716616643e-05, "loss": 2.8507, "mean_token_accuracy": 0.3934728980064392, "step": 46550 }, { "epoch": 0.04689069045295269, "grad_norm": 12.854228269174623, "learning_rate": 4.688979312289749e-05, "loss": 2.6075, "mean_token_accuracy": 0.41724138259887694, "step": 46555 }, { "epoch": 0.04689572650605686, "grad_norm": 12.157262814784477, "learning_rate": 4.689482907962855e-05, "loss": 2.7778, "mean_token_accuracy": 0.3620689630508423, "step": 46560 }, { "epoch": 0.046900762559161036, "grad_norm": 11.99758696016837, "learning_rate": 4.6899865036359606e-05, "loss": 2.2614, "mean_token_accuracy": 0.4290381133556366, "step": 46565 }, { "epoch": 0.04690579861226521, "grad_norm": 17.606979154400342, "learning_rate": 4.6904900993090666e-05, "loss": 3.135, "mean_token_accuracy": 0.37755595445632933, "step": 46570 }, { "epoch": 0.046910834665369376, "grad_norm": 19.66955528139329, "learning_rate": 4.690993694982173e-05, "loss": 2.6442, "mean_token_accuracy": 0.44295220971107485, "step": 46575 }, { "epoch": 0.04691587071847355, "grad_norm": 18.63968596257936, "learning_rate": 4.691497290655279e-05, "loss": 2.8048, "mean_token_accuracy": 0.3846340000629425, "step": 46580 }, { "epoch": 0.046920906771577724, "grad_norm": 16.46319461788385, "learning_rate": 4.692000886328385e-05, "loss": 2.6738, "mean_token_accuracy": 0.3896551728248596, "step": 46585 }, { "epoch": 0.0469259428246819, "grad_norm": 12.245045734264155, "learning_rate": 4.692504482001491e-05, "loss": 2.5495, "mean_token_accuracy": 0.37241379618644715, "step": 46590 }, { "epoch": 0.04693097887778607, "grad_norm": 12.385526541243276, "learning_rate": 4.693008077674597e-05, "loss": 2.6172, "mean_token_accuracy": 0.35862069129943847, "step": 46595 }, { "epoch": 0.046936014930890245, "grad_norm": 14.888157583386725, "learning_rate": 4.693511673347703e-05, "loss": 2.1672, "mean_token_accuracy": 0.5018148839473724, "step": 46600 }, { "epoch": 0.04694105098399442, "grad_norm": 13.143638307203076, "learning_rate": 4.694015269020809e-05, "loss": 2.6597, "mean_token_accuracy": 0.35862069129943847, "step": 46605 }, { "epoch": 0.046946087037098586, "grad_norm": 13.563188908257573, "learning_rate": 4.6945188646939147e-05, "loss": 2.451, "mean_token_accuracy": 0.4379310369491577, "step": 46610 }, { "epoch": 0.04695112309020276, "grad_norm": 13.18366108581065, "learning_rate": 4.6950224603670206e-05, "loss": 2.2403, "mean_token_accuracy": 0.42758620381355283, "step": 46615 }, { "epoch": 0.04695615914330693, "grad_norm": 15.264158148189173, "learning_rate": 4.6955260560401265e-05, "loss": 2.7083, "mean_token_accuracy": 0.40689654350280763, "step": 46620 }, { "epoch": 0.04696119519641111, "grad_norm": 16.446309693210857, "learning_rate": 4.696029651713233e-05, "loss": 2.4492, "mean_token_accuracy": 0.41379310488700866, "step": 46625 }, { "epoch": 0.04696623124951528, "grad_norm": 13.467528000181634, "learning_rate": 4.696533247386339e-05, "loss": 2.4682, "mean_token_accuracy": 0.39655172228813174, "step": 46630 }, { "epoch": 0.046971267302619454, "grad_norm": 11.150932576995208, "learning_rate": 4.697036843059445e-05, "loss": 2.4734, "mean_token_accuracy": 0.38275861740112305, "step": 46635 }, { "epoch": 0.04697630335572363, "grad_norm": 12.890656340857578, "learning_rate": 4.697540438732551e-05, "loss": 2.5718, "mean_token_accuracy": 0.3878402948379517, "step": 46640 }, { "epoch": 0.046981339408827795, "grad_norm": 12.337165984575114, "learning_rate": 4.698044034405656e-05, "loss": 2.2637, "mean_token_accuracy": 0.4344827592372894, "step": 46645 }, { "epoch": 0.04698637546193197, "grad_norm": 12.105350392593298, "learning_rate": 4.698547630078763e-05, "loss": 2.6075, "mean_token_accuracy": 0.3931034505367279, "step": 46650 }, { "epoch": 0.04699141151503614, "grad_norm": 13.72554146429605, "learning_rate": 4.6990512257518687e-05, "loss": 2.5656, "mean_token_accuracy": 0.41034482717514037, "step": 46655 }, { "epoch": 0.046996447568140316, "grad_norm": 12.644716037836664, "learning_rate": 4.6995548214249746e-05, "loss": 2.8063, "mean_token_accuracy": 0.42413793206214906, "step": 46660 }, { "epoch": 0.04700148362124449, "grad_norm": 12.732374530801586, "learning_rate": 4.7000584170980805e-05, "loss": 2.9251, "mean_token_accuracy": 0.3517241358757019, "step": 46665 }, { "epoch": 0.047006519674348664, "grad_norm": 13.086661091428466, "learning_rate": 4.7005620127711864e-05, "loss": 2.8175, "mean_token_accuracy": 0.35862069129943847, "step": 46670 }, { "epoch": 0.04701155572745284, "grad_norm": 13.954023875466135, "learning_rate": 4.7010656084442924e-05, "loss": 2.5536, "mean_token_accuracy": 0.3758620619773865, "step": 46675 }, { "epoch": 0.047016591780557004, "grad_norm": 14.712314792330533, "learning_rate": 4.701569204117399e-05, "loss": 2.2301, "mean_token_accuracy": 0.4534180283546448, "step": 46680 }, { "epoch": 0.04702162783366118, "grad_norm": 11.260427677037695, "learning_rate": 4.702072799790504e-05, "loss": 2.4246, "mean_token_accuracy": 0.4206896543502808, "step": 46685 }, { "epoch": 0.04702666388676535, "grad_norm": 11.42841780227934, "learning_rate": 4.70257639546361e-05, "loss": 2.5593, "mean_token_accuracy": 0.41034482717514037, "step": 46690 }, { "epoch": 0.047031699939869526, "grad_norm": 13.590858004926789, "learning_rate": 4.703079991136716e-05, "loss": 2.477, "mean_token_accuracy": 0.44482759237289426, "step": 46695 }, { "epoch": 0.0470367359929737, "grad_norm": 14.123956055775693, "learning_rate": 4.703583586809822e-05, "loss": 2.4195, "mean_token_accuracy": 0.43103447556495667, "step": 46700 }, { "epoch": 0.04704177204607787, "grad_norm": 13.566368916941018, "learning_rate": 4.7040871824829286e-05, "loss": 2.7452, "mean_token_accuracy": 0.39310345649719236, "step": 46705 }, { "epoch": 0.04704680809918205, "grad_norm": 12.809560043414287, "learning_rate": 4.7045907781560345e-05, "loss": 2.6583, "mean_token_accuracy": 0.3620689630508423, "step": 46710 }, { "epoch": 0.047051844152286214, "grad_norm": 12.754422749999405, "learning_rate": 4.7050943738291404e-05, "loss": 2.7763, "mean_token_accuracy": 0.4241379380226135, "step": 46715 }, { "epoch": 0.04705688020539039, "grad_norm": 12.232446941077315, "learning_rate": 4.7055979695022464e-05, "loss": 2.7733, "mean_token_accuracy": 0.41724138259887694, "step": 46720 }, { "epoch": 0.04706191625849456, "grad_norm": 11.73740908643985, "learning_rate": 4.706101565175352e-05, "loss": 2.8888, "mean_token_accuracy": 0.3827586144208908, "step": 46725 }, { "epoch": 0.047066952311598735, "grad_norm": 60.690759407080556, "learning_rate": 4.706605160848458e-05, "loss": 2.2582, "mean_token_accuracy": 0.39999999701976774, "step": 46730 }, { "epoch": 0.04707198836470291, "grad_norm": 14.374417901352286, "learning_rate": 4.707108756521564e-05, "loss": 2.7803, "mean_token_accuracy": 0.36896551251411436, "step": 46735 }, { "epoch": 0.04707702441780708, "grad_norm": 14.95692805108952, "learning_rate": 4.70761235219467e-05, "loss": 2.4981, "mean_token_accuracy": 0.4123412013053894, "step": 46740 }, { "epoch": 0.047082060470911256, "grad_norm": 14.37502519106531, "learning_rate": 4.708115947867776e-05, "loss": 2.6742, "mean_token_accuracy": 0.3965517282485962, "step": 46745 }, { "epoch": 0.04708709652401542, "grad_norm": 12.418105731518645, "learning_rate": 4.708619543540882e-05, "loss": 2.7018, "mean_token_accuracy": 0.37931033968925476, "step": 46750 }, { "epoch": 0.0470921325771196, "grad_norm": 16.792852637607865, "learning_rate": 4.709123139213988e-05, "loss": 2.7577, "mean_token_accuracy": 0.3896551728248596, "step": 46755 }, { "epoch": 0.04709716863022377, "grad_norm": 13.830284554879546, "learning_rate": 4.7096267348870944e-05, "loss": 2.8418, "mean_token_accuracy": 0.37241379618644715, "step": 46760 }, { "epoch": 0.047102204683327945, "grad_norm": 15.085139267235832, "learning_rate": 4.7101303305602004e-05, "loss": 2.6337, "mean_token_accuracy": 0.39655172228813174, "step": 46765 }, { "epoch": 0.04710724073643212, "grad_norm": 15.565593191214548, "learning_rate": 4.710633926233306e-05, "loss": 2.8822, "mean_token_accuracy": 0.36896551847457887, "step": 46770 }, { "epoch": 0.04711227678953629, "grad_norm": 10.84614335939368, "learning_rate": 4.711137521906412e-05, "loss": 2.6087, "mean_token_accuracy": 0.39999998807907106, "step": 46775 }, { "epoch": 0.047117312842640466, "grad_norm": 13.971495404225001, "learning_rate": 4.7116411175795175e-05, "loss": 2.6162, "mean_token_accuracy": 0.4310344815254211, "step": 46780 }, { "epoch": 0.04712234889574463, "grad_norm": 12.929242894980991, "learning_rate": 4.712144713252624e-05, "loss": 2.9224, "mean_token_accuracy": 0.358620697259903, "step": 46785 }, { "epoch": 0.047127384948848806, "grad_norm": 13.407495311855628, "learning_rate": 4.71264830892573e-05, "loss": 2.4922, "mean_token_accuracy": 0.3827586233615875, "step": 46790 }, { "epoch": 0.04713242100195298, "grad_norm": 15.943942700876404, "learning_rate": 4.713151904598836e-05, "loss": 2.8936, "mean_token_accuracy": 0.37586206793785093, "step": 46795 }, { "epoch": 0.047137457055057154, "grad_norm": 12.231440047204194, "learning_rate": 4.713655500271942e-05, "loss": 2.0397, "mean_token_accuracy": 0.4689655125141144, "step": 46800 }, { "epoch": 0.04714249310816133, "grad_norm": 14.937864017153382, "learning_rate": 4.714159095945048e-05, "loss": 2.6796, "mean_token_accuracy": 0.3827586233615875, "step": 46805 }, { "epoch": 0.0471475291612655, "grad_norm": 13.834133615584573, "learning_rate": 4.7146626916181544e-05, "loss": 2.7042, "mean_token_accuracy": 0.3517241358757019, "step": 46810 }, { "epoch": 0.047152565214369675, "grad_norm": 14.39259426110685, "learning_rate": 4.71516628729126e-05, "loss": 2.7232, "mean_token_accuracy": 0.4068965494632721, "step": 46815 }, { "epoch": 0.04715760126747384, "grad_norm": 11.260942291022031, "learning_rate": 4.7156698829643655e-05, "loss": 2.5944, "mean_token_accuracy": 0.4, "step": 46820 }, { "epoch": 0.047162637320578016, "grad_norm": 13.613161669997456, "learning_rate": 4.7161734786374715e-05, "loss": 2.6142, "mean_token_accuracy": 0.3965517282485962, "step": 46825 }, { "epoch": 0.04716767337368219, "grad_norm": 12.640162064714843, "learning_rate": 4.7166770743105774e-05, "loss": 2.2717, "mean_token_accuracy": 0.3999999940395355, "step": 46830 }, { "epoch": 0.04717270942678636, "grad_norm": 12.010281180206457, "learning_rate": 4.717180669983683e-05, "loss": 2.6745, "mean_token_accuracy": 0.3827586114406586, "step": 46835 }, { "epoch": 0.04717774547989054, "grad_norm": 12.416691553621817, "learning_rate": 4.71768426565679e-05, "loss": 2.3241, "mean_token_accuracy": 0.4172413766384125, "step": 46840 }, { "epoch": 0.04718278153299471, "grad_norm": 16.200551193551572, "learning_rate": 4.718187861329896e-05, "loss": 3.1475, "mean_token_accuracy": 0.324137932062149, "step": 46845 }, { "epoch": 0.047187817586098885, "grad_norm": 15.200814381879688, "learning_rate": 4.718691457003002e-05, "loss": 2.489, "mean_token_accuracy": 0.4137930989265442, "step": 46850 }, { "epoch": 0.04719285363920305, "grad_norm": 17.46089485339386, "learning_rate": 4.719195052676108e-05, "loss": 2.6945, "mean_token_accuracy": 0.4379310250282288, "step": 46855 }, { "epoch": 0.047197889692307225, "grad_norm": 13.388481821637585, "learning_rate": 4.7196986483492136e-05, "loss": 2.4581, "mean_token_accuracy": 0.41034482717514037, "step": 46860 }, { "epoch": 0.0472029257454114, "grad_norm": 12.377998255077587, "learning_rate": 4.7202022440223196e-05, "loss": 2.7504, "mean_token_accuracy": 0.34482758641242983, "step": 46865 }, { "epoch": 0.04720796179851557, "grad_norm": 13.798328915060818, "learning_rate": 4.7207058396954255e-05, "loss": 2.4786, "mean_token_accuracy": 0.3620689630508423, "step": 46870 }, { "epoch": 0.04721299785161975, "grad_norm": 12.422433265992623, "learning_rate": 4.7212094353685314e-05, "loss": 2.4578, "mean_token_accuracy": 0.42413793206214906, "step": 46875 }, { "epoch": 0.04721803390472392, "grad_norm": 16.313528050023596, "learning_rate": 4.721713031041637e-05, "loss": 2.4113, "mean_token_accuracy": 0.4000000059604645, "step": 46880 }, { "epoch": 0.047223069957828094, "grad_norm": 12.964425729000933, "learning_rate": 4.722216626714743e-05, "loss": 2.1286, "mean_token_accuracy": 0.5034482777118683, "step": 46885 }, { "epoch": 0.04722810601093226, "grad_norm": 12.643866987630767, "learning_rate": 4.72272022238785e-05, "loss": 2.5198, "mean_token_accuracy": 0.4034482777118683, "step": 46890 }, { "epoch": 0.047233142064036435, "grad_norm": 14.114454052056356, "learning_rate": 4.723223818060956e-05, "loss": 2.4455, "mean_token_accuracy": 0.41724138259887694, "step": 46895 }, { "epoch": 0.04723817811714061, "grad_norm": 13.856772837875425, "learning_rate": 4.723727413734062e-05, "loss": 2.842, "mean_token_accuracy": 0.3310344874858856, "step": 46900 }, { "epoch": 0.04724321417024478, "grad_norm": 16.488100501934746, "learning_rate": 4.7242310094071676e-05, "loss": 2.8484, "mean_token_accuracy": 0.33448275923728943, "step": 46905 }, { "epoch": 0.047248250223348956, "grad_norm": 14.951377472169764, "learning_rate": 4.724734605080273e-05, "loss": 2.639, "mean_token_accuracy": 0.37586206793785093, "step": 46910 }, { "epoch": 0.04725328627645313, "grad_norm": 13.946600327990788, "learning_rate": 4.725238200753379e-05, "loss": 2.774, "mean_token_accuracy": 0.3517241358757019, "step": 46915 }, { "epoch": 0.047258322329557303, "grad_norm": 12.66580986770408, "learning_rate": 4.7257417964264854e-05, "loss": 2.6328, "mean_token_accuracy": 0.3707199037075043, "step": 46920 }, { "epoch": 0.04726335838266147, "grad_norm": 18.739721490999987, "learning_rate": 4.726245392099591e-05, "loss": 2.8252, "mean_token_accuracy": 0.3827586233615875, "step": 46925 }, { "epoch": 0.047268394435765644, "grad_norm": 9.900952356996473, "learning_rate": 4.726748987772697e-05, "loss": 2.342, "mean_token_accuracy": 0.43641863465309144, "step": 46930 }, { "epoch": 0.04727343048886982, "grad_norm": 13.52618424610419, "learning_rate": 4.727252583445803e-05, "loss": 2.5749, "mean_token_accuracy": 0.3965517282485962, "step": 46935 }, { "epoch": 0.04727846654197399, "grad_norm": 15.386735576621984, "learning_rate": 4.727756179118909e-05, "loss": 2.6357, "mean_token_accuracy": 0.3517241388559341, "step": 46940 }, { "epoch": 0.047283502595078165, "grad_norm": 15.173831769182488, "learning_rate": 4.728259774792016e-05, "loss": 2.6603, "mean_token_accuracy": 0.44827585816383364, "step": 46945 }, { "epoch": 0.04728853864818234, "grad_norm": 12.301488842972068, "learning_rate": 4.7287633704651216e-05, "loss": 2.5918, "mean_token_accuracy": 0.41379310488700866, "step": 46950 }, { "epoch": 0.04729357470128651, "grad_norm": 11.213050114188206, "learning_rate": 4.729266966138227e-05, "loss": 2.5708, "mean_token_accuracy": 0.37931033670902253, "step": 46955 }, { "epoch": 0.04729861075439068, "grad_norm": 10.193669062110873, "learning_rate": 4.729770561811333e-05, "loss": 2.0124, "mean_token_accuracy": 0.49433496594429016, "step": 46960 }, { "epoch": 0.047303646807494854, "grad_norm": 15.41492968451369, "learning_rate": 4.730274157484439e-05, "loss": 2.4214, "mean_token_accuracy": 0.42746521830558776, "step": 46965 }, { "epoch": 0.04730868286059903, "grad_norm": 11.619858208351674, "learning_rate": 4.730777753157545e-05, "loss": 2.7013, "mean_token_accuracy": 0.358620685338974, "step": 46970 }, { "epoch": 0.0473137189137032, "grad_norm": 12.988284061212138, "learning_rate": 4.731281348830651e-05, "loss": 2.5943, "mean_token_accuracy": 0.3448275804519653, "step": 46975 }, { "epoch": 0.047318754966807375, "grad_norm": 15.049561949664746, "learning_rate": 4.731784944503757e-05, "loss": 2.5473, "mean_token_accuracy": 0.4430127084255219, "step": 46980 }, { "epoch": 0.04732379101991155, "grad_norm": 13.789845117126283, "learning_rate": 4.732288540176863e-05, "loss": 2.4549, "mean_token_accuracy": 0.4379310369491577, "step": 46985 }, { "epoch": 0.04732882707301572, "grad_norm": 12.607336510348475, "learning_rate": 4.732792135849969e-05, "loss": 2.2684, "mean_token_accuracy": 0.42068966031074523, "step": 46990 }, { "epoch": 0.04733386312611989, "grad_norm": 16.378718229894094, "learning_rate": 4.733295731523075e-05, "loss": 2.6601, "mean_token_accuracy": 0.3620689570903778, "step": 46995 }, { "epoch": 0.04733889917922406, "grad_norm": 11.834586150240387, "learning_rate": 4.733799327196181e-05, "loss": 2.5797, "mean_token_accuracy": 0.44482759237289426, "step": 47000 }, { "epoch": 0.04734393523232824, "grad_norm": 40.40144235285299, "learning_rate": 4.734302922869287e-05, "loss": 2.6236, "mean_token_accuracy": 0.37241379022598264, "step": 47005 }, { "epoch": 0.04734897128543241, "grad_norm": 12.747632772993049, "learning_rate": 4.734806518542393e-05, "loss": 2.6733, "mean_token_accuracy": 0.3896551728248596, "step": 47010 }, { "epoch": 0.047354007338536584, "grad_norm": 12.722645031298677, "learning_rate": 4.735310114215499e-05, "loss": 2.5185, "mean_token_accuracy": 0.39310344457626345, "step": 47015 }, { "epoch": 0.04735904339164076, "grad_norm": 12.320600705897778, "learning_rate": 4.7358137098886046e-05, "loss": 2.5554, "mean_token_accuracy": 0.4172413766384125, "step": 47020 }, { "epoch": 0.04736407944474493, "grad_norm": 13.482611681704666, "learning_rate": 4.736317305561711e-05, "loss": 2.3386, "mean_token_accuracy": 0.4206896543502808, "step": 47025 }, { "epoch": 0.0473691154978491, "grad_norm": 20.707935668632405, "learning_rate": 4.736820901234817e-05, "loss": 3.0413, "mean_token_accuracy": 0.3620689630508423, "step": 47030 }, { "epoch": 0.04737415155095327, "grad_norm": 13.179989367674752, "learning_rate": 4.737324496907923e-05, "loss": 2.6283, "mean_token_accuracy": 0.39310344457626345, "step": 47035 }, { "epoch": 0.047379187604057446, "grad_norm": 19.258842126936127, "learning_rate": 4.737828092581029e-05, "loss": 2.2648, "mean_token_accuracy": 0.4448275864124298, "step": 47040 }, { "epoch": 0.04738422365716162, "grad_norm": 13.674688237312816, "learning_rate": 4.738331688254134e-05, "loss": 2.4717, "mean_token_accuracy": 0.441379314661026, "step": 47045 }, { "epoch": 0.047389259710265794, "grad_norm": 13.334375243523906, "learning_rate": 4.738835283927241e-05, "loss": 2.5138, "mean_token_accuracy": 0.3931034505367279, "step": 47050 }, { "epoch": 0.04739429576336997, "grad_norm": 14.679757602587795, "learning_rate": 4.739338879600347e-05, "loss": 2.7493, "mean_token_accuracy": 0.3811252325773239, "step": 47055 }, { "epoch": 0.04739933181647414, "grad_norm": 14.287232077110433, "learning_rate": 4.739842475273453e-05, "loss": 2.7077, "mean_token_accuracy": 0.3827586233615875, "step": 47060 }, { "epoch": 0.04740436786957831, "grad_norm": 15.019999986994842, "learning_rate": 4.7403460709465586e-05, "loss": 2.6414, "mean_token_accuracy": 0.39310343861579894, "step": 47065 }, { "epoch": 0.04740940392268248, "grad_norm": 14.946857670030896, "learning_rate": 4.7408496666196645e-05, "loss": 2.616, "mean_token_accuracy": 0.4034482717514038, "step": 47070 }, { "epoch": 0.047414439975786656, "grad_norm": 12.492954994257694, "learning_rate": 4.741353262292771e-05, "loss": 2.4968, "mean_token_accuracy": 0.3965517282485962, "step": 47075 }, { "epoch": 0.04741947602889083, "grad_norm": 12.572445967030859, "learning_rate": 4.741856857965877e-05, "loss": 2.5183, "mean_token_accuracy": 0.38965516686439516, "step": 47080 }, { "epoch": 0.047424512081995, "grad_norm": 16.469168251728778, "learning_rate": 4.742360453638982e-05, "loss": 2.976, "mean_token_accuracy": 0.320689657330513, "step": 47085 }, { "epoch": 0.04742954813509918, "grad_norm": 17.248364700739028, "learning_rate": 4.742864049312088e-05, "loss": 2.582, "mean_token_accuracy": 0.3793103456497192, "step": 47090 }, { "epoch": 0.04743458418820335, "grad_norm": 12.119347068592662, "learning_rate": 4.743367644985194e-05, "loss": 2.4325, "mean_token_accuracy": 0.36896551847457887, "step": 47095 }, { "epoch": 0.04743962024130752, "grad_norm": 13.830515868610359, "learning_rate": 4.7438712406583e-05, "loss": 2.5288, "mean_token_accuracy": 0.4137930989265442, "step": 47100 }, { "epoch": 0.04744465629441169, "grad_norm": 15.866506651420842, "learning_rate": 4.744374836331407e-05, "loss": 3.1445, "mean_token_accuracy": 0.3777374565601349, "step": 47105 }, { "epoch": 0.047449692347515865, "grad_norm": 11.090754039946026, "learning_rate": 4.7448784320045126e-05, "loss": 2.5315, "mean_token_accuracy": 0.41034482717514037, "step": 47110 }, { "epoch": 0.04745472840062004, "grad_norm": 12.911441257970125, "learning_rate": 4.7453820276776185e-05, "loss": 2.7885, "mean_token_accuracy": 0.3862068891525269, "step": 47115 }, { "epoch": 0.04745976445372421, "grad_norm": 11.608537008981578, "learning_rate": 4.7458856233507245e-05, "loss": 2.3829, "mean_token_accuracy": 0.44827585816383364, "step": 47120 }, { "epoch": 0.047464800506828386, "grad_norm": 15.478812662334786, "learning_rate": 4.7463892190238304e-05, "loss": 2.408, "mean_token_accuracy": 0.4275862157344818, "step": 47125 }, { "epoch": 0.04746983655993256, "grad_norm": 17.38623159682501, "learning_rate": 4.746892814696936e-05, "loss": 2.577, "mean_token_accuracy": 0.37586207389831544, "step": 47130 }, { "epoch": 0.04747487261303673, "grad_norm": 13.793548870040283, "learning_rate": 4.747396410370042e-05, "loss": 2.3443, "mean_token_accuracy": 0.42068966031074523, "step": 47135 }, { "epoch": 0.0474799086661409, "grad_norm": 13.814059382267036, "learning_rate": 4.747900006043148e-05, "loss": 2.5931, "mean_token_accuracy": 0.34827585220336915, "step": 47140 }, { "epoch": 0.047484944719245074, "grad_norm": 17.214119318013594, "learning_rate": 4.748403601716254e-05, "loss": 2.9402, "mean_token_accuracy": 0.36001209616661073, "step": 47145 }, { "epoch": 0.04748998077234925, "grad_norm": 13.960123917790995, "learning_rate": 4.74890719738936e-05, "loss": 2.8189, "mean_token_accuracy": 0.3620689630508423, "step": 47150 }, { "epoch": 0.04749501682545342, "grad_norm": 12.230426437116334, "learning_rate": 4.7494107930624666e-05, "loss": 2.5244, "mean_token_accuracy": 0.4, "step": 47155 }, { "epoch": 0.047500052878557596, "grad_norm": 13.23672519959043, "learning_rate": 4.7499143887355725e-05, "loss": 2.6871, "mean_token_accuracy": 0.3517241358757019, "step": 47160 }, { "epoch": 0.04750508893166177, "grad_norm": 14.976286482473116, "learning_rate": 4.7504179844086785e-05, "loss": 2.8344, "mean_token_accuracy": 0.3655172407627106, "step": 47165 }, { "epoch": 0.047510124984765936, "grad_norm": 16.955313851759026, "learning_rate": 4.7509215800817844e-05, "loss": 2.2786, "mean_token_accuracy": 0.41548699140548706, "step": 47170 }, { "epoch": 0.04751516103787011, "grad_norm": 11.764006226293825, "learning_rate": 4.75142517575489e-05, "loss": 2.4371, "mean_token_accuracy": 0.4607380568981171, "step": 47175 }, { "epoch": 0.047520197090974284, "grad_norm": 15.404410346834458, "learning_rate": 4.7519287714279956e-05, "loss": 2.7769, "mean_token_accuracy": 0.34482758641242983, "step": 47180 }, { "epoch": 0.04752523314407846, "grad_norm": 14.702396660645883, "learning_rate": 4.752432367101102e-05, "loss": 2.6084, "mean_token_accuracy": 0.43448275327682495, "step": 47185 }, { "epoch": 0.04753026919718263, "grad_norm": 16.609223765106606, "learning_rate": 4.752935962774208e-05, "loss": 2.8907, "mean_token_accuracy": 0.3482758641242981, "step": 47190 }, { "epoch": 0.047535305250286805, "grad_norm": 9.668932980178516, "learning_rate": 4.753439558447314e-05, "loss": 2.7194, "mean_token_accuracy": 0.3620689660310745, "step": 47195 }, { "epoch": 0.04754034130339098, "grad_norm": 12.191722932116194, "learning_rate": 4.75394315412042e-05, "loss": 2.6017, "mean_token_accuracy": 0.42068966031074523, "step": 47200 }, { "epoch": 0.047545377356495146, "grad_norm": 13.13096284073397, "learning_rate": 4.754446749793526e-05, "loss": 2.863, "mean_token_accuracy": 0.42758620381355283, "step": 47205 }, { "epoch": 0.04755041340959932, "grad_norm": 13.630264350466247, "learning_rate": 4.7549503454666325e-05, "loss": 3.2923, "mean_token_accuracy": 0.3551724076271057, "step": 47210 }, { "epoch": 0.04755544946270349, "grad_norm": 12.396421987422755, "learning_rate": 4.7554539411397384e-05, "loss": 2.3648, "mean_token_accuracy": 0.4206896543502808, "step": 47215 }, { "epoch": 0.04756048551580767, "grad_norm": 12.462003141399798, "learning_rate": 4.7559575368128436e-05, "loss": 2.2428, "mean_token_accuracy": 0.47241379618644713, "step": 47220 }, { "epoch": 0.04756552156891184, "grad_norm": 13.405108332087131, "learning_rate": 4.7564611324859496e-05, "loss": 2.3946, "mean_token_accuracy": 0.4363581418991089, "step": 47225 }, { "epoch": 0.047570557622016015, "grad_norm": 13.319733269129923, "learning_rate": 4.7569647281590555e-05, "loss": 2.7917, "mean_token_accuracy": 0.41034482717514037, "step": 47230 }, { "epoch": 0.04757559367512019, "grad_norm": 11.016908420645432, "learning_rate": 4.757468323832162e-05, "loss": 2.4707, "mean_token_accuracy": 0.4068965494632721, "step": 47235 }, { "epoch": 0.047580629728224355, "grad_norm": 13.084299254837374, "learning_rate": 4.757971919505268e-05, "loss": 2.2323, "mean_token_accuracy": 0.41724138259887694, "step": 47240 }, { "epoch": 0.04758566578132853, "grad_norm": 15.65328422860084, "learning_rate": 4.758475515178374e-05, "loss": 2.3569, "mean_token_accuracy": 0.441379314661026, "step": 47245 }, { "epoch": 0.0475907018344327, "grad_norm": 11.48186512550104, "learning_rate": 4.75897911085148e-05, "loss": 2.6651, "mean_token_accuracy": 0.42413792610168455, "step": 47250 }, { "epoch": 0.047595737887536876, "grad_norm": 12.575634653402448, "learning_rate": 4.759482706524586e-05, "loss": 2.8765, "mean_token_accuracy": 0.3739261955022812, "step": 47255 }, { "epoch": 0.04760077394064105, "grad_norm": 16.393119883754217, "learning_rate": 4.759986302197692e-05, "loss": 2.6222, "mean_token_accuracy": 0.42413792610168455, "step": 47260 }, { "epoch": 0.047605809993745224, "grad_norm": 16.232805732794667, "learning_rate": 4.7604898978707976e-05, "loss": 2.6278, "mean_token_accuracy": 0.4206896543502808, "step": 47265 }, { "epoch": 0.0476108460468494, "grad_norm": 15.427475213314917, "learning_rate": 4.7609934935439036e-05, "loss": 2.2438, "mean_token_accuracy": 0.4500302493572235, "step": 47270 }, { "epoch": 0.047615882099953565, "grad_norm": 16.608149657952602, "learning_rate": 4.7614970892170095e-05, "loss": 2.9403, "mean_token_accuracy": 0.4275861978530884, "step": 47275 }, { "epoch": 0.04762091815305774, "grad_norm": 11.067185965291648, "learning_rate": 4.7620006848901154e-05, "loss": 2.1803, "mean_token_accuracy": 0.4793103337287903, "step": 47280 }, { "epoch": 0.04762595420616191, "grad_norm": 16.56551691552814, "learning_rate": 4.7625042805632213e-05, "loss": 2.1311, "mean_token_accuracy": 0.4517241358757019, "step": 47285 }, { "epoch": 0.047630990259266086, "grad_norm": 12.862492715300638, "learning_rate": 4.763007876236328e-05, "loss": 2.993, "mean_token_accuracy": 0.3551724135875702, "step": 47290 }, { "epoch": 0.04763602631237026, "grad_norm": 14.403407342882064, "learning_rate": 4.763511471909434e-05, "loss": 2.4562, "mean_token_accuracy": 0.39310344457626345, "step": 47295 }, { "epoch": 0.04764106236547443, "grad_norm": 12.463416738685545, "learning_rate": 4.76401506758254e-05, "loss": 2.9109, "mean_token_accuracy": 0.37241379618644715, "step": 47300 }, { "epoch": 0.04764609841857861, "grad_norm": 14.513384976491107, "learning_rate": 4.764518663255646e-05, "loss": 2.3798, "mean_token_accuracy": 0.4068965494632721, "step": 47305 }, { "epoch": 0.047651134471682774, "grad_norm": 13.910357368564709, "learning_rate": 4.7650222589287516e-05, "loss": 2.5427, "mean_token_accuracy": 0.41034482717514037, "step": 47310 }, { "epoch": 0.04765617052478695, "grad_norm": 11.58458893753858, "learning_rate": 4.7655258546018576e-05, "loss": 2.4199, "mean_token_accuracy": 0.43103447556495667, "step": 47315 }, { "epoch": 0.04766120657789112, "grad_norm": 15.843681974535407, "learning_rate": 4.7660294502749635e-05, "loss": 3.0803, "mean_token_accuracy": 0.3854204475879669, "step": 47320 }, { "epoch": 0.047666242630995295, "grad_norm": 15.403869092909018, "learning_rate": 4.7665330459480694e-05, "loss": 2.5424, "mean_token_accuracy": 0.34137930870056155, "step": 47325 }, { "epoch": 0.04767127868409947, "grad_norm": 13.446810345697902, "learning_rate": 4.7670366416211753e-05, "loss": 3.0083, "mean_token_accuracy": 0.35172412991523744, "step": 47330 }, { "epoch": 0.04767631473720364, "grad_norm": 13.165645274157306, "learning_rate": 4.767540237294281e-05, "loss": 2.8023, "mean_token_accuracy": 0.39655172228813174, "step": 47335 }, { "epoch": 0.04768135079030782, "grad_norm": 17.842386870692916, "learning_rate": 4.768043832967387e-05, "loss": 2.7204, "mean_token_accuracy": 0.4034482777118683, "step": 47340 }, { "epoch": 0.04768638684341198, "grad_norm": 13.060415327063652, "learning_rate": 4.768547428640494e-05, "loss": 2.6878, "mean_token_accuracy": 0.4119177222251892, "step": 47345 }, { "epoch": 0.04769142289651616, "grad_norm": 12.87953628826914, "learning_rate": 4.7690510243136e-05, "loss": 2.3714, "mean_token_accuracy": 0.4586206912994385, "step": 47350 }, { "epoch": 0.04769645894962033, "grad_norm": 14.449027697900455, "learning_rate": 4.769554619986705e-05, "loss": 2.5832, "mean_token_accuracy": 0.3999999940395355, "step": 47355 }, { "epoch": 0.047701495002724505, "grad_norm": 18.8853246323621, "learning_rate": 4.770058215659811e-05, "loss": 3.0338, "mean_token_accuracy": 0.3620689570903778, "step": 47360 }, { "epoch": 0.04770653105582868, "grad_norm": 12.698647948785169, "learning_rate": 4.770561811332917e-05, "loss": 2.7163, "mean_token_accuracy": 0.29655171632766725, "step": 47365 }, { "epoch": 0.04771156710893285, "grad_norm": 13.140378143199367, "learning_rate": 4.7710654070060234e-05, "loss": 2.2909, "mean_token_accuracy": 0.47416818141937256, "step": 47370 }, { "epoch": 0.047716603162037026, "grad_norm": 12.616226513223177, "learning_rate": 4.7715690026791294e-05, "loss": 2.4722, "mean_token_accuracy": 0.4, "step": 47375 }, { "epoch": 0.04772163921514119, "grad_norm": 12.915726412890727, "learning_rate": 4.772072598352235e-05, "loss": 2.7875, "mean_token_accuracy": 0.38275861740112305, "step": 47380 }, { "epoch": 0.04772667526824537, "grad_norm": 14.975059782379262, "learning_rate": 4.772576194025341e-05, "loss": 2.7531, "mean_token_accuracy": 0.3620689630508423, "step": 47385 }, { "epoch": 0.04773171132134954, "grad_norm": 13.527117323171572, "learning_rate": 4.773079789698447e-05, "loss": 2.4961, "mean_token_accuracy": 0.3827586114406586, "step": 47390 }, { "epoch": 0.047736747374453714, "grad_norm": 11.315211655767412, "learning_rate": 4.773583385371553e-05, "loss": 2.2745, "mean_token_accuracy": 0.4137930929660797, "step": 47395 }, { "epoch": 0.04774178342755789, "grad_norm": 11.6311722233183, "learning_rate": 4.774086981044659e-05, "loss": 2.7554, "mean_token_accuracy": 0.3551724135875702, "step": 47400 }, { "epoch": 0.04774681948066206, "grad_norm": 10.895099229524956, "learning_rate": 4.774590576717765e-05, "loss": 2.2494, "mean_token_accuracy": 0.47434966564178466, "step": 47405 }, { "epoch": 0.047751855533766235, "grad_norm": 19.38405650296438, "learning_rate": 4.775094172390871e-05, "loss": 2.6273, "mean_token_accuracy": 0.42746522426605227, "step": 47410 }, { "epoch": 0.0477568915868704, "grad_norm": 24.70952099057512, "learning_rate": 4.775597768063977e-05, "loss": 3.3244, "mean_token_accuracy": 0.32758620381355286, "step": 47415 }, { "epoch": 0.047761927639974576, "grad_norm": 13.586058362918633, "learning_rate": 4.776101363737083e-05, "loss": 2.8287, "mean_token_accuracy": 0.35862069576978683, "step": 47420 }, { "epoch": 0.04776696369307875, "grad_norm": 13.58205968718563, "learning_rate": 4.776604959410189e-05, "loss": 2.7323, "mean_token_accuracy": 0.3620689630508423, "step": 47425 }, { "epoch": 0.047771999746182923, "grad_norm": 12.348345831252326, "learning_rate": 4.777108555083295e-05, "loss": 2.8127, "mean_token_accuracy": 0.3965517163276672, "step": 47430 }, { "epoch": 0.0477770357992871, "grad_norm": 12.917410425314785, "learning_rate": 4.777612150756401e-05, "loss": 2.4543, "mean_token_accuracy": 0.4253694534301758, "step": 47435 }, { "epoch": 0.04778207185239127, "grad_norm": 12.505885661421114, "learning_rate": 4.778115746429507e-05, "loss": 2.5886, "mean_token_accuracy": 0.3931034475564957, "step": 47440 }, { "epoch": 0.047787107905495445, "grad_norm": 14.601574616484228, "learning_rate": 4.778619342102612e-05, "loss": 2.7373, "mean_token_accuracy": 0.36551723778247835, "step": 47445 }, { "epoch": 0.04779214395859961, "grad_norm": 18.531817192679004, "learning_rate": 4.779122937775719e-05, "loss": 2.1943, "mean_token_accuracy": 0.4517241418361664, "step": 47450 }, { "epoch": 0.047797180011703785, "grad_norm": 13.826348788923152, "learning_rate": 4.779626533448825e-05, "loss": 2.6413, "mean_token_accuracy": 0.3862069010734558, "step": 47455 }, { "epoch": 0.04780221606480796, "grad_norm": 17.348186625048797, "learning_rate": 4.780130129121931e-05, "loss": 2.6157, "mean_token_accuracy": 0.3827586233615875, "step": 47460 }, { "epoch": 0.04780725211791213, "grad_norm": 16.59620582006953, "learning_rate": 4.780633724795037e-05, "loss": 2.8049, "mean_token_accuracy": 0.35862069129943847, "step": 47465 }, { "epoch": 0.04781228817101631, "grad_norm": 15.22081984340066, "learning_rate": 4.7811373204681426e-05, "loss": 2.4505, "mean_token_accuracy": 0.4620689690113068, "step": 47470 }, { "epoch": 0.04781732422412048, "grad_norm": 14.499682374824959, "learning_rate": 4.781640916141249e-05, "loss": 2.6593, "mean_token_accuracy": 0.4068965554237366, "step": 47475 }, { "epoch": 0.047822360277224654, "grad_norm": 13.421152455111045, "learning_rate": 4.782144511814355e-05, "loss": 2.4685, "mean_token_accuracy": 0.4261947989463806, "step": 47480 }, { "epoch": 0.04782739633032882, "grad_norm": 14.627054074483992, "learning_rate": 4.782648107487461e-05, "loss": 2.9515, "mean_token_accuracy": 0.36733212918043134, "step": 47485 }, { "epoch": 0.047832432383432995, "grad_norm": 13.4258518114012, "learning_rate": 4.783151703160566e-05, "loss": 2.7383, "mean_token_accuracy": 0.38965516686439516, "step": 47490 }, { "epoch": 0.04783746843653717, "grad_norm": 11.62399116890208, "learning_rate": 4.783655298833672e-05, "loss": 2.4835, "mean_token_accuracy": 0.41034482717514037, "step": 47495 }, { "epoch": 0.04784250448964134, "grad_norm": 15.279447737877808, "learning_rate": 4.784158894506779e-05, "loss": 2.3122, "mean_token_accuracy": 0.458620685338974, "step": 47500 }, { "epoch": 0.047847540542745516, "grad_norm": 12.831894407098844, "learning_rate": 4.784662490179885e-05, "loss": 2.3874, "mean_token_accuracy": 0.4847549915313721, "step": 47505 }, { "epoch": 0.04785257659584969, "grad_norm": 15.883314481769363, "learning_rate": 4.785166085852991e-05, "loss": 2.6389, "mean_token_accuracy": 0.3793103456497192, "step": 47510 }, { "epoch": 0.047857612648953864, "grad_norm": 11.11874174807571, "learning_rate": 4.7856696815260966e-05, "loss": 2.5917, "mean_token_accuracy": 0.4551724135875702, "step": 47515 }, { "epoch": 0.04786264870205803, "grad_norm": 14.691309891794297, "learning_rate": 4.7861732771992025e-05, "loss": 2.4293, "mean_token_accuracy": 0.4275861978530884, "step": 47520 }, { "epoch": 0.047867684755162204, "grad_norm": 17.30344541484884, "learning_rate": 4.7866768728723085e-05, "loss": 2.7887, "mean_token_accuracy": 0.37241379618644715, "step": 47525 }, { "epoch": 0.04787272080826638, "grad_norm": 12.607630269238927, "learning_rate": 4.7871804685454144e-05, "loss": 3.0569, "mean_token_accuracy": 0.36896551251411436, "step": 47530 }, { "epoch": 0.04787775686137055, "grad_norm": 13.57270290120401, "learning_rate": 4.78768406421852e-05, "loss": 2.6538, "mean_token_accuracy": 0.4000000089406967, "step": 47535 }, { "epoch": 0.047882792914474726, "grad_norm": 12.991385905151033, "learning_rate": 4.788187659891626e-05, "loss": 2.3742, "mean_token_accuracy": 0.4326073884963989, "step": 47540 }, { "epoch": 0.0478878289675789, "grad_norm": 29.340508325513145, "learning_rate": 4.788691255564732e-05, "loss": 3.1742, "mean_token_accuracy": 0.3310344874858856, "step": 47545 }, { "epoch": 0.04789286502068307, "grad_norm": 14.63007412444858, "learning_rate": 4.789194851237838e-05, "loss": 2.6056, "mean_token_accuracy": 0.39655172526836396, "step": 47550 }, { "epoch": 0.04789790107378724, "grad_norm": 12.659361250741476, "learning_rate": 4.789698446910945e-05, "loss": 2.4788, "mean_token_accuracy": 0.4448275864124298, "step": 47555 }, { "epoch": 0.047902937126891414, "grad_norm": 13.995823385292335, "learning_rate": 4.7902020425840506e-05, "loss": 2.5905, "mean_token_accuracy": 0.39310344457626345, "step": 47560 }, { "epoch": 0.04790797317999559, "grad_norm": 15.033344555188384, "learning_rate": 4.7907056382571565e-05, "loss": 2.5446, "mean_token_accuracy": 0.35862069129943847, "step": 47565 }, { "epoch": 0.04791300923309976, "grad_norm": 11.007238044020639, "learning_rate": 4.7912092339302625e-05, "loss": 2.085, "mean_token_accuracy": 0.47931034564971925, "step": 47570 }, { "epoch": 0.047918045286203935, "grad_norm": 14.477527488645027, "learning_rate": 4.7917128296033684e-05, "loss": 2.7945, "mean_token_accuracy": 0.3939655214548111, "step": 47575 }, { "epoch": 0.04792308133930811, "grad_norm": 11.854237040539159, "learning_rate": 4.792216425276474e-05, "loss": 2.9349, "mean_token_accuracy": 0.36206896901130675, "step": 47580 }, { "epoch": 0.04792811739241228, "grad_norm": 14.073913820936744, "learning_rate": 4.79272002094958e-05, "loss": 2.4123, "mean_token_accuracy": 0.41379310488700866, "step": 47585 }, { "epoch": 0.04793315344551645, "grad_norm": 14.52593872515165, "learning_rate": 4.793223616622686e-05, "loss": 3.046, "mean_token_accuracy": 0.3551724076271057, "step": 47590 }, { "epoch": 0.04793818949862062, "grad_norm": 36.50331504247366, "learning_rate": 4.793727212295792e-05, "loss": 2.5737, "mean_token_accuracy": 0.3724137932062149, "step": 47595 }, { "epoch": 0.0479432255517248, "grad_norm": 9.945653554107935, "learning_rate": 4.794230807968898e-05, "loss": 2.2149, "mean_token_accuracy": 0.43793103098869324, "step": 47600 }, { "epoch": 0.04794826160482897, "grad_norm": 13.7006631163599, "learning_rate": 4.794734403642004e-05, "loss": 2.8674, "mean_token_accuracy": 0.3724137842655182, "step": 47605 }, { "epoch": 0.047953297657933144, "grad_norm": 12.10631630703267, "learning_rate": 4.7952379993151106e-05, "loss": 2.6803, "mean_token_accuracy": 0.3758620619773865, "step": 47610 }, { "epoch": 0.04795833371103732, "grad_norm": 15.816984517714396, "learning_rate": 4.7957415949882165e-05, "loss": 2.761, "mean_token_accuracy": 0.3689655244350433, "step": 47615 }, { "epoch": 0.04796336976414149, "grad_norm": 12.171990449510867, "learning_rate": 4.796245190661322e-05, "loss": 2.4921, "mean_token_accuracy": 0.3551724076271057, "step": 47620 }, { "epoch": 0.04796840581724566, "grad_norm": 12.216730331880434, "learning_rate": 4.7967487863344276e-05, "loss": 2.4788, "mean_token_accuracy": 0.3827586233615875, "step": 47625 }, { "epoch": 0.04797344187034983, "grad_norm": 14.69145150871009, "learning_rate": 4.7972523820075336e-05, "loss": 2.5772, "mean_token_accuracy": 0.41379311084747317, "step": 47630 }, { "epoch": 0.047978477923454006, "grad_norm": 10.446145548000876, "learning_rate": 4.79775597768064e-05, "loss": 2.4547, "mean_token_accuracy": 0.4344827592372894, "step": 47635 }, { "epoch": 0.04798351397655818, "grad_norm": 13.405524575761477, "learning_rate": 4.798259573353746e-05, "loss": 2.4195, "mean_token_accuracy": 0.3931034505367279, "step": 47640 }, { "epoch": 0.047988550029662354, "grad_norm": 14.946476472046575, "learning_rate": 4.798763169026852e-05, "loss": 2.5484, "mean_token_accuracy": 0.4034482777118683, "step": 47645 }, { "epoch": 0.04799358608276653, "grad_norm": 12.995995782048258, "learning_rate": 4.799266764699958e-05, "loss": 2.96, "mean_token_accuracy": 0.3448275804519653, "step": 47650 }, { "epoch": 0.0479986221358707, "grad_norm": 15.57977090773077, "learning_rate": 4.799770360373064e-05, "loss": 2.6251, "mean_token_accuracy": 0.4068965494632721, "step": 47655 }, { "epoch": 0.04800365818897487, "grad_norm": 12.119121667074367, "learning_rate": 4.80027395604617e-05, "loss": 2.9725, "mean_token_accuracy": 0.35172413289546967, "step": 47660 }, { "epoch": 0.04800869424207904, "grad_norm": 10.204302478008904, "learning_rate": 4.800777551719276e-05, "loss": 2.2999, "mean_token_accuracy": 0.4689655065536499, "step": 47665 }, { "epoch": 0.048013730295183216, "grad_norm": 12.36361003065651, "learning_rate": 4.8012811473923817e-05, "loss": 2.2134, "mean_token_accuracy": 0.4896551728248596, "step": 47670 }, { "epoch": 0.04801876634828739, "grad_norm": 11.708252266211383, "learning_rate": 4.8017847430654876e-05, "loss": 2.2681, "mean_token_accuracy": 0.43165024518966677, "step": 47675 }, { "epoch": 0.04802380240139156, "grad_norm": 15.19515233043819, "learning_rate": 4.8022883387385935e-05, "loss": 2.6984, "mean_token_accuracy": 0.4034482777118683, "step": 47680 }, { "epoch": 0.04802883845449574, "grad_norm": 17.662812120916435, "learning_rate": 4.8027919344116994e-05, "loss": 2.6104, "mean_token_accuracy": 0.39310344159603117, "step": 47685 }, { "epoch": 0.04803387450759991, "grad_norm": 12.970820082913875, "learning_rate": 4.803295530084806e-05, "loss": 2.9455, "mean_token_accuracy": 0.3551724135875702, "step": 47690 }, { "epoch": 0.04803891056070408, "grad_norm": 14.746908378838821, "learning_rate": 4.803799125757912e-05, "loss": 2.2427, "mean_token_accuracy": 0.45517241954803467, "step": 47695 }, { "epoch": 0.04804394661380825, "grad_norm": 12.476280128642035, "learning_rate": 4.804302721431018e-05, "loss": 2.2109, "mean_token_accuracy": 0.4275861978530884, "step": 47700 }, { "epoch": 0.048048982666912425, "grad_norm": 12.479025086352847, "learning_rate": 4.804806317104124e-05, "loss": 2.6034, "mean_token_accuracy": 0.41379311084747317, "step": 47705 }, { "epoch": 0.0480540187200166, "grad_norm": 11.948168911930141, "learning_rate": 4.80530991277723e-05, "loss": 2.4805, "mean_token_accuracy": 0.41554749608039854, "step": 47710 }, { "epoch": 0.04805905477312077, "grad_norm": 28.44692819685419, "learning_rate": 4.8058135084503357e-05, "loss": 2.8268, "mean_token_accuracy": 0.41034482717514037, "step": 47715 }, { "epoch": 0.048064090826224946, "grad_norm": 10.602146776872049, "learning_rate": 4.8063171041234416e-05, "loss": 2.7184, "mean_token_accuracy": 0.37586206793785093, "step": 47720 }, { "epoch": 0.04806912687932912, "grad_norm": 11.99676469080963, "learning_rate": 4.8068206997965475e-05, "loss": 2.6639, "mean_token_accuracy": 0.4172413766384125, "step": 47725 }, { "epoch": 0.04807416293243329, "grad_norm": 13.09380050789166, "learning_rate": 4.8073242954696534e-05, "loss": 2.4048, "mean_token_accuracy": 0.42758620381355283, "step": 47730 }, { "epoch": 0.04807919898553746, "grad_norm": 29.341074537477407, "learning_rate": 4.8078278911427594e-05, "loss": 2.7153, "mean_token_accuracy": 0.4517241418361664, "step": 47735 }, { "epoch": 0.048084235038641635, "grad_norm": 11.00878637611333, "learning_rate": 4.808331486815866e-05, "loss": 2.5656, "mean_token_accuracy": 0.44827585816383364, "step": 47740 }, { "epoch": 0.04808927109174581, "grad_norm": 14.482931390313945, "learning_rate": 4.808835082488972e-05, "loss": 2.7436, "mean_token_accuracy": 0.39655172228813174, "step": 47745 }, { "epoch": 0.04809430714484998, "grad_norm": 15.894927773457258, "learning_rate": 4.809338678162078e-05, "loss": 2.9438, "mean_token_accuracy": 0.3517241418361664, "step": 47750 }, { "epoch": 0.048099343197954156, "grad_norm": 14.022630176455692, "learning_rate": 4.809842273835183e-05, "loss": 2.3362, "mean_token_accuracy": 0.4344827592372894, "step": 47755 }, { "epoch": 0.04810437925105833, "grad_norm": 14.088833958288474, "learning_rate": 4.810345869508289e-05, "loss": 2.8153, "mean_token_accuracy": 0.35862069129943847, "step": 47760 }, { "epoch": 0.048109415304162496, "grad_norm": 13.092229916064932, "learning_rate": 4.810849465181395e-05, "loss": 2.4632, "mean_token_accuracy": 0.4137930989265442, "step": 47765 }, { "epoch": 0.04811445135726667, "grad_norm": 14.637402825964937, "learning_rate": 4.8113530608545015e-05, "loss": 2.7748, "mean_token_accuracy": 0.36551723480224607, "step": 47770 }, { "epoch": 0.048119487410370844, "grad_norm": 12.06310851211309, "learning_rate": 4.8118566565276074e-05, "loss": 2.6243, "mean_token_accuracy": 0.40000000298023225, "step": 47775 }, { "epoch": 0.04812452346347502, "grad_norm": 9.927750803758858, "learning_rate": 4.8123602522007134e-05, "loss": 2.4738, "mean_token_accuracy": 0.4482758641242981, "step": 47780 }, { "epoch": 0.04812955951657919, "grad_norm": 16.537513908344895, "learning_rate": 4.812863847873819e-05, "loss": 2.7005, "mean_token_accuracy": 0.36896551847457887, "step": 47785 }, { "epoch": 0.048134595569683365, "grad_norm": 13.002405366863659, "learning_rate": 4.813367443546925e-05, "loss": 2.6624, "mean_token_accuracy": 0.38802178502082824, "step": 47790 }, { "epoch": 0.04813963162278754, "grad_norm": 13.58569352081164, "learning_rate": 4.813871039220031e-05, "loss": 2.8439, "mean_token_accuracy": 0.358620685338974, "step": 47795 }, { "epoch": 0.048144667675891706, "grad_norm": 13.400893208580715, "learning_rate": 4.814374634893137e-05, "loss": 2.7236, "mean_token_accuracy": 0.42413793206214906, "step": 47800 }, { "epoch": 0.04814970372899588, "grad_norm": 16.68576825066535, "learning_rate": 4.814878230566243e-05, "loss": 2.4215, "mean_token_accuracy": 0.4213054239749908, "step": 47805 }, { "epoch": 0.04815473978210005, "grad_norm": 11.452505316145178, "learning_rate": 4.815381826239349e-05, "loss": 2.5019, "mean_token_accuracy": 0.39655172228813174, "step": 47810 }, { "epoch": 0.04815977583520423, "grad_norm": 12.314096323420344, "learning_rate": 4.815885421912455e-05, "loss": 3.052, "mean_token_accuracy": 0.3827586233615875, "step": 47815 }, { "epoch": 0.0481648118883084, "grad_norm": 16.064525852383042, "learning_rate": 4.8163890175855614e-05, "loss": 3.132, "mean_token_accuracy": 0.3758620619773865, "step": 47820 }, { "epoch": 0.048169847941412575, "grad_norm": 12.803621983478275, "learning_rate": 4.8168926132586674e-05, "loss": 2.7101, "mean_token_accuracy": 0.3793103456497192, "step": 47825 }, { "epoch": 0.04817488399451675, "grad_norm": 13.762935052667222, "learning_rate": 4.817396208931773e-05, "loss": 2.7014, "mean_token_accuracy": 0.3931034505367279, "step": 47830 }, { "epoch": 0.048179920047620915, "grad_norm": 17.21572208679351, "learning_rate": 4.817899804604879e-05, "loss": 2.7142, "mean_token_accuracy": 0.3965517282485962, "step": 47835 }, { "epoch": 0.04818495610072509, "grad_norm": 37.567714233857586, "learning_rate": 4.818403400277985e-05, "loss": 3.2081, "mean_token_accuracy": 0.320689657330513, "step": 47840 }, { "epoch": 0.04818999215382926, "grad_norm": 11.76298402357167, "learning_rate": 4.818906995951091e-05, "loss": 2.7678, "mean_token_accuracy": 0.4137930989265442, "step": 47845 }, { "epoch": 0.04819502820693344, "grad_norm": 16.772717131885997, "learning_rate": 4.819410591624197e-05, "loss": 2.7511, "mean_token_accuracy": 0.324137932062149, "step": 47850 }, { "epoch": 0.04820006426003761, "grad_norm": 12.000196952383575, "learning_rate": 4.819914187297303e-05, "loss": 2.5852, "mean_token_accuracy": 0.37241379022598264, "step": 47855 }, { "epoch": 0.048205100313141784, "grad_norm": 13.62305493359036, "learning_rate": 4.820417782970409e-05, "loss": 3.0249, "mean_token_accuracy": 0.36551723480224607, "step": 47860 }, { "epoch": 0.04821013636624596, "grad_norm": 12.60647579190391, "learning_rate": 4.820921378643515e-05, "loss": 2.822, "mean_token_accuracy": 0.3620689630508423, "step": 47865 }, { "epoch": 0.048215172419350125, "grad_norm": 11.891029156714671, "learning_rate": 4.821424974316621e-05, "loss": 2.5257, "mean_token_accuracy": 0.3931034505367279, "step": 47870 }, { "epoch": 0.0482202084724543, "grad_norm": 12.443899103466219, "learning_rate": 4.821928569989727e-05, "loss": 2.5855, "mean_token_accuracy": 0.3689655244350433, "step": 47875 }, { "epoch": 0.04822524452555847, "grad_norm": 13.616605615737052, "learning_rate": 4.822432165662833e-05, "loss": 2.6064, "mean_token_accuracy": 0.39310344457626345, "step": 47880 }, { "epoch": 0.048230280578662646, "grad_norm": 10.113312093342765, "learning_rate": 4.822935761335939e-05, "loss": 2.122, "mean_token_accuracy": 0.4448275864124298, "step": 47885 }, { "epoch": 0.04823531663176682, "grad_norm": 15.533841247387786, "learning_rate": 4.8234393570090444e-05, "loss": 2.4082, "mean_token_accuracy": 0.458620685338974, "step": 47890 }, { "epoch": 0.048240352684870993, "grad_norm": 16.27341223691513, "learning_rate": 4.82394295268215e-05, "loss": 3.2313, "mean_token_accuracy": 0.32758620381355286, "step": 47895 }, { "epoch": 0.04824538873797517, "grad_norm": 12.815066340522804, "learning_rate": 4.824446548355257e-05, "loss": 2.7286, "mean_token_accuracy": 0.4379310369491577, "step": 47900 }, { "epoch": 0.048250424791079334, "grad_norm": 13.684763976126163, "learning_rate": 4.824950144028363e-05, "loss": 2.5653, "mean_token_accuracy": 0.42952207922935487, "step": 47905 }, { "epoch": 0.04825546084418351, "grad_norm": 13.699913146049308, "learning_rate": 4.825453739701469e-05, "loss": 2.9401, "mean_token_accuracy": 0.36896551251411436, "step": 47910 }, { "epoch": 0.04826049689728768, "grad_norm": 11.042239964822294, "learning_rate": 4.825957335374575e-05, "loss": 2.6792, "mean_token_accuracy": 0.3689655244350433, "step": 47915 }, { "epoch": 0.048265532950391855, "grad_norm": 13.474063338102429, "learning_rate": 4.8264609310476806e-05, "loss": 2.7897, "mean_token_accuracy": 0.38620689809322356, "step": 47920 }, { "epoch": 0.04827056900349603, "grad_norm": 13.83060275475539, "learning_rate": 4.826964526720787e-05, "loss": 2.595, "mean_token_accuracy": 0.37931033968925476, "step": 47925 }, { "epoch": 0.0482756050566002, "grad_norm": 13.622960931163657, "learning_rate": 4.8274681223938925e-05, "loss": 2.6993, "mean_token_accuracy": 0.41034482717514037, "step": 47930 }, { "epoch": 0.04828064110970438, "grad_norm": 16.05431563451574, "learning_rate": 4.8279717180669984e-05, "loss": 2.9244, "mean_token_accuracy": 0.30689655244350433, "step": 47935 }, { "epoch": 0.048285677162808543, "grad_norm": 15.072709148803987, "learning_rate": 4.828475313740104e-05, "loss": 2.3852, "mean_token_accuracy": 0.4399878978729248, "step": 47940 }, { "epoch": 0.04829071321591272, "grad_norm": 12.231847001336279, "learning_rate": 4.82897890941321e-05, "loss": 2.5781, "mean_token_accuracy": 0.4047791838645935, "step": 47945 }, { "epoch": 0.04829574926901689, "grad_norm": 13.723842790085856, "learning_rate": 4.829482505086316e-05, "loss": 2.6655, "mean_token_accuracy": 0.3620689630508423, "step": 47950 }, { "epoch": 0.048300785322121065, "grad_norm": 13.123596574714506, "learning_rate": 4.829986100759423e-05, "loss": 2.7262, "mean_token_accuracy": 0.39655172228813174, "step": 47955 }, { "epoch": 0.04830582137522524, "grad_norm": 16.12078669152484, "learning_rate": 4.830489696432529e-05, "loss": 3.026, "mean_token_accuracy": 0.36896551549434664, "step": 47960 }, { "epoch": 0.04831085742832941, "grad_norm": 14.88394241707456, "learning_rate": 4.8309932921056346e-05, "loss": 2.635, "mean_token_accuracy": 0.4068965494632721, "step": 47965 }, { "epoch": 0.048315893481433586, "grad_norm": 11.852550459829525, "learning_rate": 4.8314968877787406e-05, "loss": 2.836, "mean_token_accuracy": 0.3655172437429428, "step": 47970 }, { "epoch": 0.04832092953453775, "grad_norm": 13.147051097520313, "learning_rate": 4.8320004834518465e-05, "loss": 2.908, "mean_token_accuracy": 0.36896551251411436, "step": 47975 }, { "epoch": 0.04832596558764193, "grad_norm": 10.290878026574449, "learning_rate": 4.8325040791249524e-05, "loss": 2.5569, "mean_token_accuracy": 0.3758620619773865, "step": 47980 }, { "epoch": 0.0483310016407461, "grad_norm": 14.911818746624565, "learning_rate": 4.833007674798058e-05, "loss": 2.621, "mean_token_accuracy": 0.3793103486299515, "step": 47985 }, { "epoch": 0.048336037693850274, "grad_norm": 15.442953974231326, "learning_rate": 4.833511270471164e-05, "loss": 2.8982, "mean_token_accuracy": 0.33793103098869326, "step": 47990 }, { "epoch": 0.04834107374695445, "grad_norm": 11.280672597002635, "learning_rate": 4.83401486614427e-05, "loss": 2.8518, "mean_token_accuracy": 0.39655172228813174, "step": 47995 }, { "epoch": 0.04834610980005862, "grad_norm": 10.7260455224953, "learning_rate": 4.834518461817376e-05, "loss": 2.3238, "mean_token_accuracy": 0.4344827592372894, "step": 48000 }, { "epoch": 0.048351145853162796, "grad_norm": 11.336672068868449, "learning_rate": 4.835022057490483e-05, "loss": 2.6321, "mean_token_accuracy": 0.3896551728248596, "step": 48005 }, { "epoch": 0.04835618190626696, "grad_norm": 11.291506943875316, "learning_rate": 4.8355256531635886e-05, "loss": 2.3768, "mean_token_accuracy": 0.42758620381355283, "step": 48010 }, { "epoch": 0.048361217959371136, "grad_norm": 14.296863471149353, "learning_rate": 4.8360292488366946e-05, "loss": 2.7143, "mean_token_accuracy": 0.35862068831920624, "step": 48015 }, { "epoch": 0.04836625401247531, "grad_norm": 17.845974770416266, "learning_rate": 4.8365328445098005e-05, "loss": 2.3487, "mean_token_accuracy": 0.4551724255084991, "step": 48020 }, { "epoch": 0.048371290065579484, "grad_norm": 17.99301831657516, "learning_rate": 4.837036440182906e-05, "loss": 2.6763, "mean_token_accuracy": 0.42758620977401735, "step": 48025 }, { "epoch": 0.04837632611868366, "grad_norm": 13.65630079561519, "learning_rate": 4.837540035856012e-05, "loss": 2.5703, "mean_token_accuracy": 0.4034482717514038, "step": 48030 }, { "epoch": 0.04838136217178783, "grad_norm": 14.22024069990981, "learning_rate": 4.838043631529118e-05, "loss": 2.4472, "mean_token_accuracy": 0.3862069010734558, "step": 48035 }, { "epoch": 0.048386398224892005, "grad_norm": 12.614602852936184, "learning_rate": 4.838547227202224e-05, "loss": 2.3851, "mean_token_accuracy": 0.3999999940395355, "step": 48040 }, { "epoch": 0.04839143427799617, "grad_norm": 12.225588195256037, "learning_rate": 4.83905082287533e-05, "loss": 2.7198, "mean_token_accuracy": 0.3517241418361664, "step": 48045 }, { "epoch": 0.048396470331100346, "grad_norm": 16.536903920919432, "learning_rate": 4.839554418548436e-05, "loss": 3.0174, "mean_token_accuracy": 0.36896551847457887, "step": 48050 }, { "epoch": 0.04840150638420452, "grad_norm": 11.703344554135915, "learning_rate": 4.840058014221542e-05, "loss": 2.2003, "mean_token_accuracy": 0.4482758641242981, "step": 48055 }, { "epoch": 0.04840654243730869, "grad_norm": 12.199465489470057, "learning_rate": 4.8405616098946486e-05, "loss": 2.508, "mean_token_accuracy": 0.43448275327682495, "step": 48060 }, { "epoch": 0.04841157849041287, "grad_norm": 10.736565903678072, "learning_rate": 4.841065205567754e-05, "loss": 2.7646, "mean_token_accuracy": 0.3793103456497192, "step": 48065 }, { "epoch": 0.04841661454351704, "grad_norm": 11.061564126828895, "learning_rate": 4.84156880124086e-05, "loss": 2.6704, "mean_token_accuracy": 0.34137930870056155, "step": 48070 }, { "epoch": 0.048421650596621214, "grad_norm": 12.965823345460366, "learning_rate": 4.842072396913966e-05, "loss": 2.7181, "mean_token_accuracy": 0.35862069129943847, "step": 48075 }, { "epoch": 0.04842668664972538, "grad_norm": 12.767373568784635, "learning_rate": 4.8425759925870716e-05, "loss": 2.9332, "mean_token_accuracy": 0.4, "step": 48080 }, { "epoch": 0.048431722702829555, "grad_norm": 11.136649343593858, "learning_rate": 4.843079588260178e-05, "loss": 2.3221, "mean_token_accuracy": 0.4620689630508423, "step": 48085 }, { "epoch": 0.04843675875593373, "grad_norm": 13.913160544286793, "learning_rate": 4.843583183933284e-05, "loss": 2.5998, "mean_token_accuracy": 0.3827586233615875, "step": 48090 }, { "epoch": 0.0484417948090379, "grad_norm": 10.664867515244978, "learning_rate": 4.84408677960639e-05, "loss": 2.5048, "mean_token_accuracy": 0.4034482777118683, "step": 48095 }, { "epoch": 0.048446830862142076, "grad_norm": 15.33218767243514, "learning_rate": 4.844590375279496e-05, "loss": 2.4069, "mean_token_accuracy": 0.41379310488700866, "step": 48100 }, { "epoch": 0.04845186691524625, "grad_norm": 11.590038592891378, "learning_rate": 4.845093970952602e-05, "loss": 2.4749, "mean_token_accuracy": 0.3827586233615875, "step": 48105 }, { "epoch": 0.048456902968350424, "grad_norm": 13.522503788548732, "learning_rate": 4.845597566625708e-05, "loss": 2.6779, "mean_token_accuracy": 0.3655172407627106, "step": 48110 }, { "epoch": 0.04846193902145459, "grad_norm": 13.765110744221158, "learning_rate": 4.846101162298814e-05, "loss": 2.4557, "mean_token_accuracy": 0.4068965494632721, "step": 48115 }, { "epoch": 0.048466975074558764, "grad_norm": 9.63216924131321, "learning_rate": 4.84660475797192e-05, "loss": 2.3762, "mean_token_accuracy": 0.44664247035980226, "step": 48120 }, { "epoch": 0.04847201112766294, "grad_norm": 16.91491101255305, "learning_rate": 4.8471083536450256e-05, "loss": 2.5557, "mean_token_accuracy": 0.4068965554237366, "step": 48125 }, { "epoch": 0.04847704718076711, "grad_norm": 12.654562604229833, "learning_rate": 4.8476119493181315e-05, "loss": 2.9432, "mean_token_accuracy": 0.32413792312145234, "step": 48130 }, { "epoch": 0.048482083233871286, "grad_norm": 12.003290721499402, "learning_rate": 4.8481155449912375e-05, "loss": 2.6067, "mean_token_accuracy": 0.37931033968925476, "step": 48135 }, { "epoch": 0.04848711928697546, "grad_norm": 11.59529740484712, "learning_rate": 4.848619140664344e-05, "loss": 2.2355, "mean_token_accuracy": 0.417241370677948, "step": 48140 }, { "epoch": 0.04849215534007963, "grad_norm": 11.644062267615753, "learning_rate": 4.84912273633745e-05, "loss": 2.3597, "mean_token_accuracy": 0.41034482717514037, "step": 48145 }, { "epoch": 0.0484971913931838, "grad_norm": 14.941949996041856, "learning_rate": 4.849626332010556e-05, "loss": 2.563, "mean_token_accuracy": 0.35862068831920624, "step": 48150 }, { "epoch": 0.048502227446287974, "grad_norm": 15.375850917715237, "learning_rate": 4.850129927683661e-05, "loss": 2.9527, "mean_token_accuracy": 0.3620689630508423, "step": 48155 }, { "epoch": 0.04850726349939215, "grad_norm": 13.328263973384196, "learning_rate": 4.850633523356767e-05, "loss": 2.5813, "mean_token_accuracy": 0.3827586233615875, "step": 48160 }, { "epoch": 0.04851229955249632, "grad_norm": 11.742490535354298, "learning_rate": 4.851137119029874e-05, "loss": 2.5646, "mean_token_accuracy": 0.4156079888343811, "step": 48165 }, { "epoch": 0.048517335605600495, "grad_norm": 13.438952936223133, "learning_rate": 4.8516407147029796e-05, "loss": 2.2942, "mean_token_accuracy": 0.4517241358757019, "step": 48170 }, { "epoch": 0.04852237165870467, "grad_norm": 13.674030613944092, "learning_rate": 4.8521443103760855e-05, "loss": 2.8094, "mean_token_accuracy": 0.4034482717514038, "step": 48175 }, { "epoch": 0.04852740771180884, "grad_norm": 12.531782944179996, "learning_rate": 4.8526479060491915e-05, "loss": 2.669, "mean_token_accuracy": 0.39310344457626345, "step": 48180 }, { "epoch": 0.04853244376491301, "grad_norm": 12.922740689455576, "learning_rate": 4.8531515017222974e-05, "loss": 2.6885, "mean_token_accuracy": 0.37931033968925476, "step": 48185 }, { "epoch": 0.04853747981801718, "grad_norm": 14.437195888089473, "learning_rate": 4.853655097395403e-05, "loss": 2.569, "mean_token_accuracy": 0.3931034505367279, "step": 48190 }, { "epoch": 0.04854251587112136, "grad_norm": 13.954715030601593, "learning_rate": 4.854158693068509e-05, "loss": 2.7136, "mean_token_accuracy": 0.3655172407627106, "step": 48195 }, { "epoch": 0.04854755192422553, "grad_norm": 19.736013762155707, "learning_rate": 4.854662288741615e-05, "loss": 2.3335, "mean_token_accuracy": 0.4310344815254211, "step": 48200 }, { "epoch": 0.048552587977329704, "grad_norm": 15.476636621925529, "learning_rate": 4.855165884414721e-05, "loss": 2.9275, "mean_token_accuracy": 0.3551724135875702, "step": 48205 }, { "epoch": 0.04855762403043388, "grad_norm": 15.07431611920237, "learning_rate": 4.855669480087827e-05, "loss": 2.4141, "mean_token_accuracy": 0.37586206793785093, "step": 48210 }, { "epoch": 0.04856266008353805, "grad_norm": 8.953688372231884, "learning_rate": 4.856173075760933e-05, "loss": 2.213, "mean_token_accuracy": 0.4379310369491577, "step": 48215 }, { "epoch": 0.04856769613664222, "grad_norm": 34.78160837929185, "learning_rate": 4.8566766714340395e-05, "loss": 2.7686, "mean_token_accuracy": 0.41379310488700866, "step": 48220 }, { "epoch": 0.04857273218974639, "grad_norm": 24.264707806974492, "learning_rate": 4.8571802671071455e-05, "loss": 2.7362, "mean_token_accuracy": 0.37241379022598264, "step": 48225 }, { "epoch": 0.048577768242850566, "grad_norm": 15.161720206262045, "learning_rate": 4.8576838627802514e-05, "loss": 2.8926, "mean_token_accuracy": 0.3620689630508423, "step": 48230 }, { "epoch": 0.04858280429595474, "grad_norm": 12.990903514245483, "learning_rate": 4.858187458453357e-05, "loss": 2.7298, "mean_token_accuracy": 0.38965516686439516, "step": 48235 }, { "epoch": 0.048587840349058914, "grad_norm": 13.319839303133348, "learning_rate": 4.858691054126463e-05, "loss": 2.7845, "mean_token_accuracy": 0.37241379618644715, "step": 48240 }, { "epoch": 0.04859287640216309, "grad_norm": 15.394204246819257, "learning_rate": 4.859194649799569e-05, "loss": 2.7024, "mean_token_accuracy": 0.3965517282485962, "step": 48245 }, { "epoch": 0.04859791245526726, "grad_norm": 12.367496431187494, "learning_rate": 4.859698245472675e-05, "loss": 2.9963, "mean_token_accuracy": 0.3344827502965927, "step": 48250 }, { "epoch": 0.04860294850837143, "grad_norm": 12.259542837307565, "learning_rate": 4.860201841145781e-05, "loss": 2.3547, "mean_token_accuracy": 0.41034482717514037, "step": 48255 }, { "epoch": 0.0486079845614756, "grad_norm": 12.827897221164488, "learning_rate": 4.860705436818887e-05, "loss": 2.7808, "mean_token_accuracy": 0.3310344874858856, "step": 48260 }, { "epoch": 0.048613020614579776, "grad_norm": 15.185870826588415, "learning_rate": 4.861209032491993e-05, "loss": 2.9822, "mean_token_accuracy": 0.36896551847457887, "step": 48265 }, { "epoch": 0.04861805666768395, "grad_norm": 12.194208729305847, "learning_rate": 4.8617126281650995e-05, "loss": 2.2269, "mean_token_accuracy": 0.43103448748588563, "step": 48270 }, { "epoch": 0.04862309272078812, "grad_norm": 12.668104981739225, "learning_rate": 4.8622162238382054e-05, "loss": 2.5206, "mean_token_accuracy": 0.3551724195480347, "step": 48275 }, { "epoch": 0.0486281287738923, "grad_norm": 13.000552727158809, "learning_rate": 4.862719819511311e-05, "loss": 2.3085, "mean_token_accuracy": 0.4413793087005615, "step": 48280 }, { "epoch": 0.04863316482699647, "grad_norm": 13.0346681751547, "learning_rate": 4.863223415184417e-05, "loss": 2.2602, "mean_token_accuracy": 0.46896551847457885, "step": 48285 }, { "epoch": 0.04863820088010064, "grad_norm": 12.780303088050125, "learning_rate": 4.8637270108575225e-05, "loss": 2.6014, "mean_token_accuracy": 0.34482758641242983, "step": 48290 }, { "epoch": 0.04864323693320481, "grad_norm": 10.94737219672258, "learning_rate": 4.8642306065306284e-05, "loss": 2.2147, "mean_token_accuracy": 0.41724138855934145, "step": 48295 }, { "epoch": 0.048648272986308985, "grad_norm": 12.130738409199719, "learning_rate": 4.864734202203735e-05, "loss": 2.327, "mean_token_accuracy": 0.43793103098869324, "step": 48300 }, { "epoch": 0.04865330903941316, "grad_norm": 14.122267368545838, "learning_rate": 4.865237797876841e-05, "loss": 2.8448, "mean_token_accuracy": 0.39999998807907106, "step": 48305 }, { "epoch": 0.04865834509251733, "grad_norm": 14.716641341102592, "learning_rate": 4.865741393549947e-05, "loss": 3.0089, "mean_token_accuracy": 0.334482753276825, "step": 48310 }, { "epoch": 0.048663381145621507, "grad_norm": 12.116616078984062, "learning_rate": 4.866244989223053e-05, "loss": 2.8515, "mean_token_accuracy": 0.3655172407627106, "step": 48315 }, { "epoch": 0.04866841719872568, "grad_norm": 13.313615856370046, "learning_rate": 4.866748584896159e-05, "loss": 2.9, "mean_token_accuracy": 0.3275861978530884, "step": 48320 }, { "epoch": 0.04867345325182985, "grad_norm": 11.663066221859198, "learning_rate": 4.867252180569265e-05, "loss": 2.4684, "mean_token_accuracy": 0.4034482717514038, "step": 48325 }, { "epoch": 0.04867848930493402, "grad_norm": 13.696278580835385, "learning_rate": 4.8677557762423706e-05, "loss": 2.6287, "mean_token_accuracy": 0.41724138259887694, "step": 48330 }, { "epoch": 0.048683525358038195, "grad_norm": 19.960609175796876, "learning_rate": 4.8682593719154765e-05, "loss": 2.7043, "mean_token_accuracy": 0.3965517282485962, "step": 48335 }, { "epoch": 0.04868856141114237, "grad_norm": 12.417478960739547, "learning_rate": 4.8687629675885824e-05, "loss": 2.5205, "mean_token_accuracy": 0.36896551549434664, "step": 48340 }, { "epoch": 0.04869359746424654, "grad_norm": 13.405255716067808, "learning_rate": 4.8692665632616883e-05, "loss": 3.0405, "mean_token_accuracy": 0.3862068891525269, "step": 48345 }, { "epoch": 0.048698633517350716, "grad_norm": 13.48379818560196, "learning_rate": 4.869770158934795e-05, "loss": 2.5258, "mean_token_accuracy": 0.4310344815254211, "step": 48350 }, { "epoch": 0.04870366957045489, "grad_norm": 12.895154840984414, "learning_rate": 4.870273754607901e-05, "loss": 2.5868, "mean_token_accuracy": 0.41034482717514037, "step": 48355 }, { "epoch": 0.048708705623559057, "grad_norm": 13.294338109326006, "learning_rate": 4.870777350281007e-05, "loss": 2.6717, "mean_token_accuracy": 0.3379310369491577, "step": 48360 }, { "epoch": 0.04871374167666323, "grad_norm": 13.491877913936998, "learning_rate": 4.871280945954113e-05, "loss": 2.3029, "mean_token_accuracy": 0.4482758641242981, "step": 48365 }, { "epoch": 0.048718777729767404, "grad_norm": 13.544388200593719, "learning_rate": 4.8717845416272186e-05, "loss": 2.7479, "mean_token_accuracy": 0.36896551251411436, "step": 48370 }, { "epoch": 0.04872381378287158, "grad_norm": 11.416438468398985, "learning_rate": 4.8722881373003246e-05, "loss": 2.7715, "mean_token_accuracy": 0.3793103456497192, "step": 48375 }, { "epoch": 0.04872884983597575, "grad_norm": 12.188407945788123, "learning_rate": 4.8727917329734305e-05, "loss": 2.5204, "mean_token_accuracy": 0.4084089457988739, "step": 48380 }, { "epoch": 0.048733885889079925, "grad_norm": 12.292729252302372, "learning_rate": 4.8732953286465364e-05, "loss": 2.2203, "mean_token_accuracy": 0.42832512259483335, "step": 48385 }, { "epoch": 0.0487389219421841, "grad_norm": 12.448125703528053, "learning_rate": 4.8737989243196424e-05, "loss": 2.8356, "mean_token_accuracy": 0.3793103456497192, "step": 48390 }, { "epoch": 0.048743957995288266, "grad_norm": 12.621255462003862, "learning_rate": 4.874302519992748e-05, "loss": 2.5078, "mean_token_accuracy": 0.3896551728248596, "step": 48395 }, { "epoch": 0.04874899404839244, "grad_norm": 12.392426273612912, "learning_rate": 4.874806115665854e-05, "loss": 2.5789, "mean_token_accuracy": 0.4223835408687592, "step": 48400 }, { "epoch": 0.048754030101496613, "grad_norm": 13.418593738013328, "learning_rate": 4.875309711338961e-05, "loss": 2.6193, "mean_token_accuracy": 0.3848154813051224, "step": 48405 }, { "epoch": 0.04875906615460079, "grad_norm": 11.680217457884503, "learning_rate": 4.875813307012067e-05, "loss": 2.4637, "mean_token_accuracy": 0.4137930989265442, "step": 48410 }, { "epoch": 0.04876410220770496, "grad_norm": 12.549532962302548, "learning_rate": 4.8763169026851727e-05, "loss": 2.9229, "mean_token_accuracy": 0.3275861978530884, "step": 48415 }, { "epoch": 0.048769138260809135, "grad_norm": 13.239880630759775, "learning_rate": 4.8768204983582786e-05, "loss": 2.6379, "mean_token_accuracy": 0.3931034505367279, "step": 48420 }, { "epoch": 0.04877417431391331, "grad_norm": 14.94931651380599, "learning_rate": 4.877324094031384e-05, "loss": 2.6614, "mean_token_accuracy": 0.39310344457626345, "step": 48425 }, { "epoch": 0.048779210367017475, "grad_norm": 10.35937841170216, "learning_rate": 4.8778276897044904e-05, "loss": 2.6427, "mean_token_accuracy": 0.37241379618644715, "step": 48430 }, { "epoch": 0.04878424642012165, "grad_norm": 15.778365637741251, "learning_rate": 4.8783312853775964e-05, "loss": 2.5496, "mean_token_accuracy": 0.4468239605426788, "step": 48435 }, { "epoch": 0.04878928247322582, "grad_norm": 13.594801836797828, "learning_rate": 4.878834881050702e-05, "loss": 2.8545, "mean_token_accuracy": 0.37586206793785093, "step": 48440 }, { "epoch": 0.04879431852633, "grad_norm": 14.953599458295173, "learning_rate": 4.879338476723808e-05, "loss": 2.6875, "mean_token_accuracy": 0.3965517282485962, "step": 48445 }, { "epoch": 0.04879935457943417, "grad_norm": 10.521815384316806, "learning_rate": 4.879842072396914e-05, "loss": 2.7746, "mean_token_accuracy": 0.3862069010734558, "step": 48450 }, { "epoch": 0.048804390632538344, "grad_norm": 13.824906564932823, "learning_rate": 4.88034566807002e-05, "loss": 2.8319, "mean_token_accuracy": 0.38275861740112305, "step": 48455 }, { "epoch": 0.04880942668564252, "grad_norm": 14.314326851472151, "learning_rate": 4.8808492637431267e-05, "loss": 2.6559, "mean_token_accuracy": 0.38965516686439516, "step": 48460 }, { "epoch": 0.048814462738746685, "grad_norm": 13.06217644717499, "learning_rate": 4.881352859416232e-05, "loss": 2.5418, "mean_token_accuracy": 0.3862069010734558, "step": 48465 }, { "epoch": 0.04881949879185086, "grad_norm": 18.250455079471973, "learning_rate": 4.881856455089338e-05, "loss": 3.0609, "mean_token_accuracy": 0.3655172407627106, "step": 48470 }, { "epoch": 0.04882453484495503, "grad_norm": 13.524749735949026, "learning_rate": 4.882360050762444e-05, "loss": 2.7834, "mean_token_accuracy": 0.34137930572032926, "step": 48475 }, { "epoch": 0.048829570898059206, "grad_norm": 12.571227692156853, "learning_rate": 4.88286364643555e-05, "loss": 2.8615, "mean_token_accuracy": 0.37241379022598264, "step": 48480 }, { "epoch": 0.04883460695116338, "grad_norm": 13.315856263625749, "learning_rate": 4.883367242108656e-05, "loss": 2.5918, "mean_token_accuracy": 0.4068965554237366, "step": 48485 }, { "epoch": 0.048839643004267554, "grad_norm": 15.008201898662728, "learning_rate": 4.883870837781762e-05, "loss": 2.4612, "mean_token_accuracy": 0.4000000059604645, "step": 48490 }, { "epoch": 0.04884467905737173, "grad_norm": 12.232209408897178, "learning_rate": 4.884374433454868e-05, "loss": 2.4763, "mean_token_accuracy": 0.4172413766384125, "step": 48495 }, { "epoch": 0.048849715110475894, "grad_norm": 14.255163908103704, "learning_rate": 4.884878029127974e-05, "loss": 2.2962, "mean_token_accuracy": 0.3793103456497192, "step": 48500 }, { "epoch": 0.04885475116358007, "grad_norm": 13.768624316857094, "learning_rate": 4.88538162480108e-05, "loss": 2.3636, "mean_token_accuracy": 0.42607381343841555, "step": 48505 }, { "epoch": 0.04885978721668424, "grad_norm": 11.985706766565356, "learning_rate": 4.885885220474186e-05, "loss": 2.4073, "mean_token_accuracy": 0.3741681814193726, "step": 48510 }, { "epoch": 0.048864823269788416, "grad_norm": 14.2329004019485, "learning_rate": 4.886388816147292e-05, "loss": 3.1628, "mean_token_accuracy": 0.32758620381355286, "step": 48515 }, { "epoch": 0.04886985932289259, "grad_norm": 11.688248127140819, "learning_rate": 4.886892411820398e-05, "loss": 2.4542, "mean_token_accuracy": 0.40000000298023225, "step": 48520 }, { "epoch": 0.04887489537599676, "grad_norm": 12.424367045858348, "learning_rate": 4.887396007493504e-05, "loss": 2.5759, "mean_token_accuracy": 0.37241379618644715, "step": 48525 }, { "epoch": 0.04887993142910094, "grad_norm": 13.673095229947196, "learning_rate": 4.8878996031666096e-05, "loss": 2.5992, "mean_token_accuracy": 0.33448275923728943, "step": 48530 }, { "epoch": 0.048884967482205104, "grad_norm": 10.333840333112141, "learning_rate": 4.8884031988397155e-05, "loss": 2.4575, "mean_token_accuracy": 0.41379310488700866, "step": 48535 }, { "epoch": 0.04889000353530928, "grad_norm": 13.465473085376816, "learning_rate": 4.888906794512822e-05, "loss": 2.4419, "mean_token_accuracy": 0.41034482717514037, "step": 48540 }, { "epoch": 0.04889503958841345, "grad_norm": 15.7055676822973, "learning_rate": 4.889410390185928e-05, "loss": 3.1634, "mean_token_accuracy": 0.31379310190677645, "step": 48545 }, { "epoch": 0.048900075641517625, "grad_norm": 12.217448120901466, "learning_rate": 4.889913985859034e-05, "loss": 2.6261, "mean_token_accuracy": 0.3931034505367279, "step": 48550 }, { "epoch": 0.0489051116946218, "grad_norm": 13.35142035782145, "learning_rate": 4.89041758153214e-05, "loss": 3.1392, "mean_token_accuracy": 0.34827586114406583, "step": 48555 }, { "epoch": 0.04891014774772597, "grad_norm": 10.733218452025508, "learning_rate": 4.890921177205245e-05, "loss": 2.4871, "mean_token_accuracy": 0.38620689511299133, "step": 48560 }, { "epoch": 0.048915183800830146, "grad_norm": 13.58158465293678, "learning_rate": 4.891424772878352e-05, "loss": 2.3137, "mean_token_accuracy": 0.41724138259887694, "step": 48565 }, { "epoch": 0.04892021985393431, "grad_norm": 10.826982571655027, "learning_rate": 4.891928368551458e-05, "loss": 2.3018, "mean_token_accuracy": 0.4379310369491577, "step": 48570 }, { "epoch": 0.04892525590703849, "grad_norm": 14.165362100668998, "learning_rate": 4.8924319642245636e-05, "loss": 2.6641, "mean_token_accuracy": 0.4034482717514038, "step": 48575 }, { "epoch": 0.04893029196014266, "grad_norm": 12.976475870644922, "learning_rate": 4.8929355598976695e-05, "loss": 3.0219, "mean_token_accuracy": 0.35287356078624726, "step": 48580 }, { "epoch": 0.048935328013246834, "grad_norm": 9.84799880495947, "learning_rate": 4.8934391555707755e-05, "loss": 2.2046, "mean_token_accuracy": 0.42413792610168455, "step": 48585 }, { "epoch": 0.04894036406635101, "grad_norm": 17.9686088498197, "learning_rate": 4.893942751243882e-05, "loss": 3.1188, "mean_token_accuracy": 0.35172412991523744, "step": 48590 }, { "epoch": 0.04894540011945518, "grad_norm": 11.147522915979303, "learning_rate": 4.894446346916988e-05, "loss": 2.7349, "mean_token_accuracy": 0.324137932062149, "step": 48595 }, { "epoch": 0.048950436172559356, "grad_norm": 14.18028834326604, "learning_rate": 4.894949942590093e-05, "loss": 2.6525, "mean_token_accuracy": 0.36551724672317504, "step": 48600 }, { "epoch": 0.04895547222566352, "grad_norm": 13.937273308945302, "learning_rate": 4.895453538263199e-05, "loss": 2.6352, "mean_token_accuracy": 0.40307881832122805, "step": 48605 }, { "epoch": 0.048960508278767696, "grad_norm": 12.602036334793599, "learning_rate": 4.895957133936305e-05, "loss": 2.6794, "mean_token_accuracy": 0.3551724135875702, "step": 48610 }, { "epoch": 0.04896554433187187, "grad_norm": 39.03827948099858, "learning_rate": 4.896460729609411e-05, "loss": 2.7775, "mean_token_accuracy": 0.3827586233615875, "step": 48615 }, { "epoch": 0.048970580384976044, "grad_norm": 11.693095912421027, "learning_rate": 4.8969643252825176e-05, "loss": 2.3885, "mean_token_accuracy": 0.43103448748588563, "step": 48620 }, { "epoch": 0.04897561643808022, "grad_norm": 16.908001976639614, "learning_rate": 4.8974679209556236e-05, "loss": 2.641, "mean_token_accuracy": 0.384694492816925, "step": 48625 }, { "epoch": 0.04898065249118439, "grad_norm": 12.129049762220168, "learning_rate": 4.8979715166287295e-05, "loss": 2.3119, "mean_token_accuracy": 0.4275861978530884, "step": 48630 }, { "epoch": 0.048985688544288565, "grad_norm": 15.713237207363157, "learning_rate": 4.8984751123018354e-05, "loss": 2.5231, "mean_token_accuracy": 0.3965517282485962, "step": 48635 }, { "epoch": 0.04899072459739273, "grad_norm": 15.441070874542985, "learning_rate": 4.898978707974941e-05, "loss": 2.4771, "mean_token_accuracy": 0.36896551847457887, "step": 48640 }, { "epoch": 0.048995760650496906, "grad_norm": 10.696272089903482, "learning_rate": 4.899482303648047e-05, "loss": 2.771, "mean_token_accuracy": 0.3965517163276672, "step": 48645 }, { "epoch": 0.04900079670360108, "grad_norm": 12.335197204442453, "learning_rate": 4.899985899321153e-05, "loss": 2.7099, "mean_token_accuracy": 0.41034482717514037, "step": 48650 }, { "epoch": 0.04900583275670525, "grad_norm": 11.171296736601489, "learning_rate": 4.900489494994259e-05, "loss": 2.2138, "mean_token_accuracy": 0.417241370677948, "step": 48655 }, { "epoch": 0.04901086880980943, "grad_norm": 11.07551264107439, "learning_rate": 4.900993090667365e-05, "loss": 2.6168, "mean_token_accuracy": 0.4603750824928284, "step": 48660 }, { "epoch": 0.0490159048629136, "grad_norm": 11.41291529272933, "learning_rate": 4.901496686340471e-05, "loss": 2.2826, "mean_token_accuracy": 0.42758620381355283, "step": 48665 }, { "epoch": 0.049020940916017774, "grad_norm": 12.35846193357575, "learning_rate": 4.9020002820135776e-05, "loss": 2.7803, "mean_token_accuracy": 0.35862069129943847, "step": 48670 }, { "epoch": 0.04902597696912194, "grad_norm": 13.452914487071011, "learning_rate": 4.9025038776866835e-05, "loss": 2.9799, "mean_token_accuracy": 0.39655172228813174, "step": 48675 }, { "epoch": 0.049031013022226115, "grad_norm": 10.667367877039903, "learning_rate": 4.9030074733597894e-05, "loss": 2.2757, "mean_token_accuracy": 0.44313369393348695, "step": 48680 }, { "epoch": 0.04903604907533029, "grad_norm": 14.034089187491649, "learning_rate": 4.903511069032895e-05, "loss": 2.4509, "mean_token_accuracy": 0.41724138259887694, "step": 48685 }, { "epoch": 0.04904108512843446, "grad_norm": 16.065601693166588, "learning_rate": 4.9040146647060006e-05, "loss": 2.4202, "mean_token_accuracy": 0.36896551847457887, "step": 48690 }, { "epoch": 0.049046121181538636, "grad_norm": 16.291617589586618, "learning_rate": 4.904518260379107e-05, "loss": 2.7196, "mean_token_accuracy": 0.3793103456497192, "step": 48695 }, { "epoch": 0.04905115723464281, "grad_norm": 12.597348660149816, "learning_rate": 4.905021856052213e-05, "loss": 2.7101, "mean_token_accuracy": 0.39310344457626345, "step": 48700 }, { "epoch": 0.049056193287746984, "grad_norm": 13.38408563963703, "learning_rate": 4.905525451725319e-05, "loss": 2.5155, "mean_token_accuracy": 0.43793103098869324, "step": 48705 }, { "epoch": 0.04906122934085115, "grad_norm": 14.05694404360121, "learning_rate": 4.906029047398425e-05, "loss": 2.4763, "mean_token_accuracy": 0.3965517282485962, "step": 48710 }, { "epoch": 0.049066265393955324, "grad_norm": 11.818800615651364, "learning_rate": 4.906532643071531e-05, "loss": 2.4328, "mean_token_accuracy": 0.41724138259887694, "step": 48715 }, { "epoch": 0.0490713014470595, "grad_norm": 16.28295995476592, "learning_rate": 4.907036238744637e-05, "loss": 2.9192, "mean_token_accuracy": 0.32758620381355286, "step": 48720 }, { "epoch": 0.04907633750016367, "grad_norm": 17.12050061095547, "learning_rate": 4.9075398344177434e-05, "loss": 2.6422, "mean_token_accuracy": 0.41724138855934145, "step": 48725 }, { "epoch": 0.049081373553267846, "grad_norm": 12.037829431397777, "learning_rate": 4.9080434300908487e-05, "loss": 2.761, "mean_token_accuracy": 0.3655172407627106, "step": 48730 }, { "epoch": 0.04908640960637202, "grad_norm": 11.299898431584603, "learning_rate": 4.9085470257639546e-05, "loss": 2.7494, "mean_token_accuracy": 0.37241379022598264, "step": 48735 }, { "epoch": 0.04909144565947619, "grad_norm": 14.026421664454766, "learning_rate": 4.9090506214370605e-05, "loss": 2.6895, "mean_token_accuracy": 0.39655172228813174, "step": 48740 }, { "epoch": 0.04909648171258036, "grad_norm": 11.161789309102504, "learning_rate": 4.9095542171101664e-05, "loss": 1.9768, "mean_token_accuracy": 0.47241379618644713, "step": 48745 }, { "epoch": 0.049101517765684534, "grad_norm": 15.256682681278365, "learning_rate": 4.910057812783273e-05, "loss": 2.6676, "mean_token_accuracy": 0.4448275864124298, "step": 48750 }, { "epoch": 0.04910655381878871, "grad_norm": 12.563981049323946, "learning_rate": 4.910561408456379e-05, "loss": 2.5272, "mean_token_accuracy": 0.42758620977401735, "step": 48755 }, { "epoch": 0.04911158987189288, "grad_norm": 13.200370300176669, "learning_rate": 4.911065004129485e-05, "loss": 2.8527, "mean_token_accuracy": 0.37241379022598264, "step": 48760 }, { "epoch": 0.049116625924997055, "grad_norm": 12.040463218394537, "learning_rate": 4.911568599802591e-05, "loss": 2.5456, "mean_token_accuracy": 0.4241379201412201, "step": 48765 }, { "epoch": 0.04912166197810123, "grad_norm": 11.900442597829505, "learning_rate": 4.912072195475697e-05, "loss": 2.562, "mean_token_accuracy": 0.3551724135875702, "step": 48770 }, { "epoch": 0.0491266980312054, "grad_norm": 10.561517018587763, "learning_rate": 4.912575791148803e-05, "loss": 2.6782, "mean_token_accuracy": 0.37241379618644715, "step": 48775 }, { "epoch": 0.04913173408430957, "grad_norm": 14.577239316877046, "learning_rate": 4.9130793868219086e-05, "loss": 2.4158, "mean_token_accuracy": 0.40508166551589964, "step": 48780 }, { "epoch": 0.04913677013741374, "grad_norm": 14.221353782661387, "learning_rate": 4.9135829824950145e-05, "loss": 3.3963, "mean_token_accuracy": 0.2931034505367279, "step": 48785 }, { "epoch": 0.04914180619051792, "grad_norm": 10.390088581283837, "learning_rate": 4.9140865781681204e-05, "loss": 2.7245, "mean_token_accuracy": 0.39655172228813174, "step": 48790 }, { "epoch": 0.04914684224362209, "grad_norm": 10.861266271960327, "learning_rate": 4.9145901738412264e-05, "loss": 2.6406, "mean_token_accuracy": 0.4172413766384125, "step": 48795 }, { "epoch": 0.049151878296726265, "grad_norm": 13.615060606335325, "learning_rate": 4.915093769514332e-05, "loss": 2.8556, "mean_token_accuracy": 0.38965516686439516, "step": 48800 }, { "epoch": 0.04915691434983044, "grad_norm": 14.268334857568226, "learning_rate": 4.915597365187439e-05, "loss": 2.7183, "mean_token_accuracy": 0.3724137932062149, "step": 48805 }, { "epoch": 0.04916195040293461, "grad_norm": 13.416458486468091, "learning_rate": 4.916100960860545e-05, "loss": 3.0008, "mean_token_accuracy": 0.34482758641242983, "step": 48810 }, { "epoch": 0.04916698645603878, "grad_norm": 13.38075914762203, "learning_rate": 4.916604556533651e-05, "loss": 2.9544, "mean_token_accuracy": 0.37241379022598264, "step": 48815 }, { "epoch": 0.04917202250914295, "grad_norm": 13.019920543432152, "learning_rate": 4.917108152206757e-05, "loss": 2.4255, "mean_token_accuracy": 0.42758620977401735, "step": 48820 }, { "epoch": 0.049177058562247127, "grad_norm": 14.57767112428131, "learning_rate": 4.917611747879862e-05, "loss": 2.4531, "mean_token_accuracy": 0.43793103098869324, "step": 48825 }, { "epoch": 0.0491820946153513, "grad_norm": 12.820668338971199, "learning_rate": 4.9181153435529685e-05, "loss": 2.55, "mean_token_accuracy": 0.4, "step": 48830 }, { "epoch": 0.049187130668455474, "grad_norm": 45.18701672184421, "learning_rate": 4.9186189392260744e-05, "loss": 2.4031, "mean_token_accuracy": 0.42068966031074523, "step": 48835 }, { "epoch": 0.04919216672155965, "grad_norm": 11.9466223325721, "learning_rate": 4.9191225348991804e-05, "loss": 2.3265, "mean_token_accuracy": 0.441379314661026, "step": 48840 }, { "epoch": 0.04919720277466382, "grad_norm": 9.560043074615674, "learning_rate": 4.919626130572286e-05, "loss": 2.6376, "mean_token_accuracy": 0.37241379022598264, "step": 48845 }, { "epoch": 0.04920223882776799, "grad_norm": 11.659676178487606, "learning_rate": 4.920129726245392e-05, "loss": 2.7371, "mean_token_accuracy": 0.42758620381355283, "step": 48850 }, { "epoch": 0.04920727488087216, "grad_norm": 40.276321061043596, "learning_rate": 4.920633321918499e-05, "loss": 2.6005, "mean_token_accuracy": 0.4253478586673737, "step": 48855 }, { "epoch": 0.049212310933976336, "grad_norm": 10.408309385179242, "learning_rate": 4.921136917591605e-05, "loss": 2.4425, "mean_token_accuracy": 0.3931034505367279, "step": 48860 }, { "epoch": 0.04921734698708051, "grad_norm": 14.321151651783058, "learning_rate": 4.92164051326471e-05, "loss": 2.3596, "mean_token_accuracy": 0.4482758641242981, "step": 48865 }, { "epoch": 0.04922238304018468, "grad_norm": 13.600447757095006, "learning_rate": 4.922144108937816e-05, "loss": 2.593, "mean_token_accuracy": 0.38275861740112305, "step": 48870 }, { "epoch": 0.04922741909328886, "grad_norm": 12.751175659898864, "learning_rate": 4.922647704610922e-05, "loss": 2.7389, "mean_token_accuracy": 0.3862068891525269, "step": 48875 }, { "epoch": 0.04923245514639303, "grad_norm": 12.11222444077847, "learning_rate": 4.923151300284028e-05, "loss": 2.3035, "mean_token_accuracy": 0.3965517282485962, "step": 48880 }, { "epoch": 0.0492374911994972, "grad_norm": 12.144342147668807, "learning_rate": 4.9236548959571344e-05, "loss": 3.0019, "mean_token_accuracy": 0.39655172228813174, "step": 48885 }, { "epoch": 0.04924252725260137, "grad_norm": 12.14491031470454, "learning_rate": 4.92415849163024e-05, "loss": 2.4509, "mean_token_accuracy": 0.3945553541183472, "step": 48890 }, { "epoch": 0.049247563305705545, "grad_norm": 11.795610286509646, "learning_rate": 4.924662087303346e-05, "loss": 2.584, "mean_token_accuracy": 0.3965517282485962, "step": 48895 }, { "epoch": 0.04925259935880972, "grad_norm": 16.301345551630295, "learning_rate": 4.925165682976452e-05, "loss": 2.7685, "mean_token_accuracy": 0.39310344159603117, "step": 48900 }, { "epoch": 0.04925763541191389, "grad_norm": 15.19144555577583, "learning_rate": 4.925669278649558e-05, "loss": 2.5623, "mean_token_accuracy": 0.42413793206214906, "step": 48905 }, { "epoch": 0.04926267146501807, "grad_norm": 21.835471087718506, "learning_rate": 4.926172874322664e-05, "loss": 2.8131, "mean_token_accuracy": 0.3965517163276672, "step": 48910 }, { "epoch": 0.04926770751812224, "grad_norm": 17.810675516982396, "learning_rate": 4.92667646999577e-05, "loss": 2.6096, "mean_token_accuracy": 0.37241379022598264, "step": 48915 }, { "epoch": 0.04927274357122641, "grad_norm": 12.504608025082877, "learning_rate": 4.927180065668876e-05, "loss": 2.2541, "mean_token_accuracy": 0.42068964838981626, "step": 48920 }, { "epoch": 0.04927777962433058, "grad_norm": 12.89568499758129, "learning_rate": 4.927683661341982e-05, "loss": 2.473, "mean_token_accuracy": 0.4379310250282288, "step": 48925 }, { "epoch": 0.049282815677434755, "grad_norm": 13.055089932535353, "learning_rate": 4.928187257015088e-05, "loss": 2.8171, "mean_token_accuracy": 0.41034482717514037, "step": 48930 }, { "epoch": 0.04928785173053893, "grad_norm": 13.623002976530392, "learning_rate": 4.928690852688194e-05, "loss": 3.1556, "mean_token_accuracy": 0.3413793116807938, "step": 48935 }, { "epoch": 0.0492928877836431, "grad_norm": 12.116065984149035, "learning_rate": 4.9291944483613e-05, "loss": 2.6422, "mean_token_accuracy": 0.3620689630508423, "step": 48940 }, { "epoch": 0.049297923836747276, "grad_norm": 12.169340069722987, "learning_rate": 4.929698044034406e-05, "loss": 2.8414, "mean_token_accuracy": 0.3862068921327591, "step": 48945 }, { "epoch": 0.04930295988985145, "grad_norm": 13.726568890272771, "learning_rate": 4.930201639707512e-05, "loss": 2.2898, "mean_token_accuracy": 0.4448275864124298, "step": 48950 }, { "epoch": 0.04930799594295562, "grad_norm": 14.65710590527926, "learning_rate": 4.930705235380618e-05, "loss": 2.5824, "mean_token_accuracy": 0.4068965554237366, "step": 48955 }, { "epoch": 0.04931303199605979, "grad_norm": 17.17793757785788, "learning_rate": 4.931208831053723e-05, "loss": 3.0452, "mean_token_accuracy": 0.37392619252204895, "step": 48960 }, { "epoch": 0.049318068049163964, "grad_norm": 10.512203447989698, "learning_rate": 4.93171242672683e-05, "loss": 2.4961, "mean_token_accuracy": 0.4103448152542114, "step": 48965 }, { "epoch": 0.04932310410226814, "grad_norm": 13.511176085946092, "learning_rate": 4.932216022399936e-05, "loss": 2.6672, "mean_token_accuracy": 0.4172413766384125, "step": 48970 }, { "epoch": 0.04932814015537231, "grad_norm": 11.07085745044113, "learning_rate": 4.932719618073042e-05, "loss": 2.8035, "mean_token_accuracy": 0.41379310488700866, "step": 48975 }, { "epoch": 0.049333176208476485, "grad_norm": 12.226242046400936, "learning_rate": 4.9332232137461476e-05, "loss": 2.4214, "mean_token_accuracy": 0.43647912740707395, "step": 48980 }, { "epoch": 0.04933821226158066, "grad_norm": 14.929548591129555, "learning_rate": 4.9337268094192536e-05, "loss": 2.9902, "mean_token_accuracy": 0.36551723778247835, "step": 48985 }, { "epoch": 0.049343248314684826, "grad_norm": 13.347567918108984, "learning_rate": 4.93423040509236e-05, "loss": 2.4535, "mean_token_accuracy": 0.4261342972517014, "step": 48990 }, { "epoch": 0.049348284367789, "grad_norm": 18.38620285207596, "learning_rate": 4.934734000765466e-05, "loss": 3.002, "mean_token_accuracy": 0.35862069129943847, "step": 48995 }, { "epoch": 0.049353320420893174, "grad_norm": 12.39865944592867, "learning_rate": 4.935237596438571e-05, "loss": 2.9165, "mean_token_accuracy": 0.33448275923728943, "step": 49000 }, { "epoch": 0.04935835647399735, "grad_norm": 11.939019831500048, "learning_rate": 4.935741192111677e-05, "loss": 2.6814, "mean_token_accuracy": 0.4034482777118683, "step": 49005 }, { "epoch": 0.04936339252710152, "grad_norm": 16.12697505582482, "learning_rate": 4.936244787784783e-05, "loss": 2.8692, "mean_token_accuracy": 0.35517241060733795, "step": 49010 }, { "epoch": 0.049368428580205695, "grad_norm": 12.689607770485951, "learning_rate": 4.93674838345789e-05, "loss": 2.5232, "mean_token_accuracy": 0.4000000059604645, "step": 49015 }, { "epoch": 0.04937346463330987, "grad_norm": 10.886131405544253, "learning_rate": 4.937251979130996e-05, "loss": 2.2615, "mean_token_accuracy": 0.46551724076271056, "step": 49020 }, { "epoch": 0.049378500686414036, "grad_norm": 13.417360271090889, "learning_rate": 4.9377555748041016e-05, "loss": 2.5904, "mean_token_accuracy": 0.42601330280303956, "step": 49025 }, { "epoch": 0.04938353673951821, "grad_norm": 16.915400960677378, "learning_rate": 4.9382591704772076e-05, "loss": 2.6125, "mean_token_accuracy": 0.43793103098869324, "step": 49030 }, { "epoch": 0.04938857279262238, "grad_norm": 12.704193536934127, "learning_rate": 4.9387627661503135e-05, "loss": 2.6771, "mean_token_accuracy": 0.41379310488700866, "step": 49035 }, { "epoch": 0.04939360884572656, "grad_norm": 11.972975676578281, "learning_rate": 4.9392663618234194e-05, "loss": 2.8462, "mean_token_accuracy": 0.3551724076271057, "step": 49040 }, { "epoch": 0.04939864489883073, "grad_norm": 9.949013891257787, "learning_rate": 4.9397699574965253e-05, "loss": 2.4206, "mean_token_accuracy": 0.42413793206214906, "step": 49045 }, { "epoch": 0.049403680951934904, "grad_norm": 11.071053548273435, "learning_rate": 4.940273553169631e-05, "loss": 2.7129, "mean_token_accuracy": 0.37241379618644715, "step": 49050 }, { "epoch": 0.04940871700503908, "grad_norm": 15.965791486524349, "learning_rate": 4.940777148842737e-05, "loss": 2.8587, "mean_token_accuracy": 0.3620689630508423, "step": 49055 }, { "epoch": 0.049413753058143245, "grad_norm": 13.68915150265036, "learning_rate": 4.941280744515843e-05, "loss": 2.3533, "mean_token_accuracy": 0.4241379380226135, "step": 49060 }, { "epoch": 0.04941878911124742, "grad_norm": 18.903890479113123, "learning_rate": 4.941784340188949e-05, "loss": 2.4972, "mean_token_accuracy": 0.3793103337287903, "step": 49065 }, { "epoch": 0.04942382516435159, "grad_norm": 10.80566940095437, "learning_rate": 4.9422879358620556e-05, "loss": 2.2117, "mean_token_accuracy": 0.4890566885471344, "step": 49070 }, { "epoch": 0.049428861217455766, "grad_norm": 11.45007364890809, "learning_rate": 4.9427915315351616e-05, "loss": 2.7231, "mean_token_accuracy": 0.3655172407627106, "step": 49075 }, { "epoch": 0.04943389727055994, "grad_norm": 12.875678811664686, "learning_rate": 4.9432951272082675e-05, "loss": 2.5675, "mean_token_accuracy": 0.41034482717514037, "step": 49080 }, { "epoch": 0.049438933323664114, "grad_norm": 12.504291587564936, "learning_rate": 4.9437987228813734e-05, "loss": 2.7649, "mean_token_accuracy": 0.3793103456497192, "step": 49085 }, { "epoch": 0.04944396937676829, "grad_norm": 17.23653916001431, "learning_rate": 4.9443023185544793e-05, "loss": 2.971, "mean_token_accuracy": 0.35862069129943847, "step": 49090 }, { "epoch": 0.049449005429872454, "grad_norm": 16.302425491255768, "learning_rate": 4.944805914227585e-05, "loss": 2.7943, "mean_token_accuracy": 0.41379310488700866, "step": 49095 }, { "epoch": 0.04945404148297663, "grad_norm": 13.830579276455648, "learning_rate": 4.945309509900691e-05, "loss": 2.7077, "mean_token_accuracy": 0.41034482717514037, "step": 49100 }, { "epoch": 0.0494590775360808, "grad_norm": 9.24250120260068, "learning_rate": 4.945813105573797e-05, "loss": 2.2826, "mean_token_accuracy": 0.4103448212146759, "step": 49105 }, { "epoch": 0.049464113589184976, "grad_norm": 13.30485365600019, "learning_rate": 4.946316701246903e-05, "loss": 2.5806, "mean_token_accuracy": 0.4, "step": 49110 }, { "epoch": 0.04946914964228915, "grad_norm": 13.226046901555446, "learning_rate": 4.946820296920009e-05, "loss": 2.619, "mean_token_accuracy": 0.37586206793785093, "step": 49115 }, { "epoch": 0.04947418569539332, "grad_norm": 10.39212014048255, "learning_rate": 4.9473238925931156e-05, "loss": 2.7365, "mean_token_accuracy": 0.39310344457626345, "step": 49120 }, { "epoch": 0.0494792217484975, "grad_norm": 13.625839027020147, "learning_rate": 4.9478274882662215e-05, "loss": 2.7067, "mean_token_accuracy": 0.3620689630508423, "step": 49125 }, { "epoch": 0.049484257801601664, "grad_norm": 12.841702873957686, "learning_rate": 4.9483310839393274e-05, "loss": 2.5441, "mean_token_accuracy": 0.4344827651977539, "step": 49130 }, { "epoch": 0.04948929385470584, "grad_norm": 12.334404174880499, "learning_rate": 4.948834679612433e-05, "loss": 2.1043, "mean_token_accuracy": 0.47241378426551817, "step": 49135 }, { "epoch": 0.04949432990781001, "grad_norm": 15.09710601585035, "learning_rate": 4.9493382752855386e-05, "loss": 2.9059, "mean_token_accuracy": 0.3379310369491577, "step": 49140 }, { "epoch": 0.049499365960914185, "grad_norm": 13.027265673330593, "learning_rate": 4.9498418709586445e-05, "loss": 2.2881, "mean_token_accuracy": 0.41379310488700866, "step": 49145 }, { "epoch": 0.04950440201401836, "grad_norm": 12.068235801694266, "learning_rate": 4.950345466631751e-05, "loss": 2.9198, "mean_token_accuracy": 0.3551724135875702, "step": 49150 }, { "epoch": 0.04950943806712253, "grad_norm": 10.92990075005881, "learning_rate": 4.950849062304857e-05, "loss": 2.4792, "mean_token_accuracy": 0.4413793087005615, "step": 49155 }, { "epoch": 0.049514474120226706, "grad_norm": 18.296693353236435, "learning_rate": 4.951352657977963e-05, "loss": 3.1022, "mean_token_accuracy": 0.3103448212146759, "step": 49160 }, { "epoch": 0.04951951017333087, "grad_norm": 12.49348015563861, "learning_rate": 4.951856253651069e-05, "loss": 3.0282, "mean_token_accuracy": 0.37241379022598264, "step": 49165 }, { "epoch": 0.04952454622643505, "grad_norm": 12.969289920558253, "learning_rate": 4.952359849324175e-05, "loss": 3.0352, "mean_token_accuracy": 0.3275862127542496, "step": 49170 }, { "epoch": 0.04952958227953922, "grad_norm": 13.008677609167234, "learning_rate": 4.952863444997281e-05, "loss": 3.4216, "mean_token_accuracy": 0.32413792610168457, "step": 49175 }, { "epoch": 0.049534618332643394, "grad_norm": 11.948299790792138, "learning_rate": 4.953367040670387e-05, "loss": 3.4276, "mean_token_accuracy": 0.30689655244350433, "step": 49180 }, { "epoch": 0.04953965438574757, "grad_norm": 13.232044398199198, "learning_rate": 4.9538706363434926e-05, "loss": 2.7483, "mean_token_accuracy": 0.38620689511299133, "step": 49185 }, { "epoch": 0.04954469043885174, "grad_norm": 12.072298961707434, "learning_rate": 4.9543742320165985e-05, "loss": 2.9151, "mean_token_accuracy": 0.3793103337287903, "step": 49190 }, { "epoch": 0.049549726491955916, "grad_norm": 13.992984048405518, "learning_rate": 4.9548778276897045e-05, "loss": 2.8701, "mean_token_accuracy": 0.32068965435028074, "step": 49195 }, { "epoch": 0.04955476254506008, "grad_norm": 16.055490129836002, "learning_rate": 4.955381423362811e-05, "loss": 2.4514, "mean_token_accuracy": 0.42758620977401735, "step": 49200 }, { "epoch": 0.049559798598164256, "grad_norm": 11.903501170180727, "learning_rate": 4.955885019035917e-05, "loss": 2.5416, "mean_token_accuracy": 0.4344827592372894, "step": 49205 }, { "epoch": 0.04956483465126843, "grad_norm": 17.413304193082453, "learning_rate": 4.956388614709023e-05, "loss": 2.8184, "mean_token_accuracy": 0.38620689511299133, "step": 49210 }, { "epoch": 0.049569870704372604, "grad_norm": 13.238645675444548, "learning_rate": 4.956892210382129e-05, "loss": 2.5133, "mean_token_accuracy": 0.43793103098869324, "step": 49215 }, { "epoch": 0.04957490675747678, "grad_norm": 12.327412690427577, "learning_rate": 4.957395806055235e-05, "loss": 2.742, "mean_token_accuracy": 0.38620689511299133, "step": 49220 }, { "epoch": 0.04957994281058095, "grad_norm": 14.263868970130424, "learning_rate": 4.95789940172834e-05, "loss": 2.6779, "mean_token_accuracy": 0.4068965494632721, "step": 49225 }, { "epoch": 0.049584978863685125, "grad_norm": 11.108085205111877, "learning_rate": 4.9584029974014466e-05, "loss": 2.6186, "mean_token_accuracy": 0.39655172228813174, "step": 49230 }, { "epoch": 0.04959001491678929, "grad_norm": 12.901059445419929, "learning_rate": 4.9589065930745525e-05, "loss": 2.6966, "mean_token_accuracy": 0.3689655244350433, "step": 49235 }, { "epoch": 0.049595050969893466, "grad_norm": 12.543300954528458, "learning_rate": 4.9594101887476585e-05, "loss": 2.3733, "mean_token_accuracy": 0.458620685338974, "step": 49240 }, { "epoch": 0.04960008702299764, "grad_norm": 17.400105066019382, "learning_rate": 4.9599137844207644e-05, "loss": 2.2198, "mean_token_accuracy": 0.45402299165725707, "step": 49245 }, { "epoch": 0.04960512307610181, "grad_norm": 13.30602468258391, "learning_rate": 4.96041738009387e-05, "loss": 2.5902, "mean_token_accuracy": 0.39310344457626345, "step": 49250 }, { "epoch": 0.04961015912920599, "grad_norm": 11.124328629447813, "learning_rate": 4.960920975766977e-05, "loss": 2.5845, "mean_token_accuracy": 0.3931034505367279, "step": 49255 }, { "epoch": 0.04961519518231016, "grad_norm": 11.775406834473008, "learning_rate": 4.961424571440083e-05, "loss": 2.2878, "mean_token_accuracy": 0.417241370677948, "step": 49260 }, { "epoch": 0.049620231235414335, "grad_norm": 14.113907397336439, "learning_rate": 4.961928167113188e-05, "loss": 2.7747, "mean_token_accuracy": 0.39310344159603117, "step": 49265 }, { "epoch": 0.0496252672885185, "grad_norm": 13.659875871175261, "learning_rate": 4.962431762786294e-05, "loss": 2.665, "mean_token_accuracy": 0.4000000059604645, "step": 49270 }, { "epoch": 0.049630303341622675, "grad_norm": 14.318838856310839, "learning_rate": 4.9629353584594e-05, "loss": 2.8541, "mean_token_accuracy": 0.4, "step": 49275 }, { "epoch": 0.04963533939472685, "grad_norm": 13.762589800802216, "learning_rate": 4.9634389541325065e-05, "loss": 3.2975, "mean_token_accuracy": 0.33103448450565337, "step": 49280 }, { "epoch": 0.04964037544783102, "grad_norm": 12.148281583557175, "learning_rate": 4.9639425498056125e-05, "loss": 2.8777, "mean_token_accuracy": 0.41379310488700866, "step": 49285 }, { "epoch": 0.049645411500935197, "grad_norm": 12.593167754663654, "learning_rate": 4.9644461454787184e-05, "loss": 2.7955, "mean_token_accuracy": 0.4124621868133545, "step": 49290 }, { "epoch": 0.04965044755403937, "grad_norm": 11.807473589452503, "learning_rate": 4.964949741151824e-05, "loss": 2.8509, "mean_token_accuracy": 0.42413793206214906, "step": 49295 }, { "epoch": 0.049655483607143544, "grad_norm": 14.780928364294947, "learning_rate": 4.96545333682493e-05, "loss": 2.5748, "mean_token_accuracy": 0.43103448748588563, "step": 49300 }, { "epoch": 0.04966051966024771, "grad_norm": 21.3860693383878, "learning_rate": 4.965956932498036e-05, "loss": 2.5254, "mean_token_accuracy": 0.3758620619773865, "step": 49305 }, { "epoch": 0.049665555713351885, "grad_norm": 15.224877263419117, "learning_rate": 4.966460528171142e-05, "loss": 2.3813, "mean_token_accuracy": 0.42758620381355283, "step": 49310 }, { "epoch": 0.04967059176645606, "grad_norm": 11.466907821737939, "learning_rate": 4.966964123844248e-05, "loss": 2.6847, "mean_token_accuracy": 0.3482758641242981, "step": 49315 }, { "epoch": 0.04967562781956023, "grad_norm": 12.635880277284619, "learning_rate": 4.967467719517354e-05, "loss": 2.3469, "mean_token_accuracy": 0.3862069010734558, "step": 49320 }, { "epoch": 0.049680663872664406, "grad_norm": 11.87581265201363, "learning_rate": 4.96797131519046e-05, "loss": 2.327, "mean_token_accuracy": 0.46896551847457885, "step": 49325 }, { "epoch": 0.04968569992576858, "grad_norm": 10.949030175235732, "learning_rate": 4.968474910863566e-05, "loss": 2.5311, "mean_token_accuracy": 0.37241379618644715, "step": 49330 }, { "epoch": 0.04969073597887275, "grad_norm": 14.735402843195, "learning_rate": 4.9689785065366724e-05, "loss": 2.7708, "mean_token_accuracy": 0.37241379022598264, "step": 49335 }, { "epoch": 0.04969577203197692, "grad_norm": 10.362049912778165, "learning_rate": 4.969482102209778e-05, "loss": 2.4387, "mean_token_accuracy": 0.4103448212146759, "step": 49340 }, { "epoch": 0.049700808085081094, "grad_norm": 11.6372504251952, "learning_rate": 4.969985697882884e-05, "loss": 2.3279, "mean_token_accuracy": 0.4344827651977539, "step": 49345 }, { "epoch": 0.04970584413818527, "grad_norm": 13.783295856114794, "learning_rate": 4.97048929355599e-05, "loss": 2.8837, "mean_token_accuracy": 0.37241379022598264, "step": 49350 }, { "epoch": 0.04971088019128944, "grad_norm": 11.834986404930202, "learning_rate": 4.970992889229096e-05, "loss": 2.7457, "mean_token_accuracy": 0.3517241388559341, "step": 49355 }, { "epoch": 0.049715916244393615, "grad_norm": 11.521477246740094, "learning_rate": 4.971496484902202e-05, "loss": 2.5048, "mean_token_accuracy": 0.4172413766384125, "step": 49360 }, { "epoch": 0.04972095229749779, "grad_norm": 11.858652590225981, "learning_rate": 4.972000080575308e-05, "loss": 2.4719, "mean_token_accuracy": 0.43448275327682495, "step": 49365 }, { "epoch": 0.04972598835060196, "grad_norm": 11.373794773558473, "learning_rate": 4.972503676248414e-05, "loss": 2.8187, "mean_token_accuracy": 0.3482758581638336, "step": 49370 }, { "epoch": 0.04973102440370613, "grad_norm": 16.116368217565114, "learning_rate": 4.97300727192152e-05, "loss": 2.3998, "mean_token_accuracy": 0.42413793206214906, "step": 49375 }, { "epoch": 0.0497360604568103, "grad_norm": 12.799520076892202, "learning_rate": 4.973510867594626e-05, "loss": 2.4465, "mean_token_accuracy": 0.4379310250282288, "step": 49380 }, { "epoch": 0.04974109650991448, "grad_norm": 13.742453803553794, "learning_rate": 4.9740144632677316e-05, "loss": 2.6858, "mean_token_accuracy": 0.38475499153137205, "step": 49385 }, { "epoch": 0.04974613256301865, "grad_norm": 9.861523476840427, "learning_rate": 4.974518058940838e-05, "loss": 2.4784, "mean_token_accuracy": 0.4275861978530884, "step": 49390 }, { "epoch": 0.049751168616122825, "grad_norm": 14.802353146211662, "learning_rate": 4.975021654613944e-05, "loss": 2.8429, "mean_token_accuracy": 0.3724137842655182, "step": 49395 }, { "epoch": 0.049756204669227, "grad_norm": 18.86918748812848, "learning_rate": 4.9755252502870494e-05, "loss": 2.8367, "mean_token_accuracy": 0.4223835527896881, "step": 49400 }, { "epoch": 0.04976124072233117, "grad_norm": 11.6322610418858, "learning_rate": 4.9760288459601553e-05, "loss": 2.7996, "mean_token_accuracy": 0.3620689570903778, "step": 49405 }, { "epoch": 0.04976627677543534, "grad_norm": 13.821393909998825, "learning_rate": 4.976532441633261e-05, "loss": 3.0473, "mean_token_accuracy": 0.38620689511299133, "step": 49410 }, { "epoch": 0.04977131282853951, "grad_norm": 13.345087258669924, "learning_rate": 4.977036037306368e-05, "loss": 2.4209, "mean_token_accuracy": 0.39310344457626345, "step": 49415 }, { "epoch": 0.04977634888164369, "grad_norm": 13.137243998874078, "learning_rate": 4.977539632979474e-05, "loss": 2.4648, "mean_token_accuracy": 0.41379310488700866, "step": 49420 }, { "epoch": 0.04978138493474786, "grad_norm": 11.613483538823493, "learning_rate": 4.97804322865258e-05, "loss": 2.621, "mean_token_accuracy": 0.43266788125038147, "step": 49425 }, { "epoch": 0.049786420987852034, "grad_norm": 12.605566542025525, "learning_rate": 4.9785468243256857e-05, "loss": 2.6217, "mean_token_accuracy": 0.3620689570903778, "step": 49430 }, { "epoch": 0.04979145704095621, "grad_norm": 13.598326714699105, "learning_rate": 4.9790504199987916e-05, "loss": 2.6605, "mean_token_accuracy": 0.41379311084747317, "step": 49435 }, { "epoch": 0.04979649309406038, "grad_norm": 14.550143317492957, "learning_rate": 4.9795540156718975e-05, "loss": 2.4002, "mean_token_accuracy": 0.39310344457626345, "step": 49440 }, { "epoch": 0.04980152914716455, "grad_norm": 14.702991348953809, "learning_rate": 4.9800576113450034e-05, "loss": 2.9054, "mean_token_accuracy": 0.36206896901130675, "step": 49445 }, { "epoch": 0.04980656520026872, "grad_norm": 11.302201937499778, "learning_rate": 4.9805612070181094e-05, "loss": 2.6867, "mean_token_accuracy": 0.38620689511299133, "step": 49450 }, { "epoch": 0.049811601253372896, "grad_norm": 13.56973108055161, "learning_rate": 4.981064802691215e-05, "loss": 2.5946, "mean_token_accuracy": 0.3517241388559341, "step": 49455 }, { "epoch": 0.04981663730647707, "grad_norm": 12.842971988297705, "learning_rate": 4.981568398364321e-05, "loss": 2.6623, "mean_token_accuracy": 0.3809437394142151, "step": 49460 }, { "epoch": 0.049821673359581244, "grad_norm": 11.563934427017875, "learning_rate": 4.982071994037427e-05, "loss": 2.8514, "mean_token_accuracy": 0.32413792610168457, "step": 49465 }, { "epoch": 0.04982670941268542, "grad_norm": 12.122643995871975, "learning_rate": 4.982575589710534e-05, "loss": 2.5237, "mean_token_accuracy": 0.3896551728248596, "step": 49470 }, { "epoch": 0.049831745465789584, "grad_norm": 10.304817959472109, "learning_rate": 4.9830791853836397e-05, "loss": 2.7031, "mean_token_accuracy": 0.37241379022598264, "step": 49475 }, { "epoch": 0.04983678151889376, "grad_norm": 11.443792638318499, "learning_rate": 4.9835827810567456e-05, "loss": 2.369, "mean_token_accuracy": 0.3827586233615875, "step": 49480 }, { "epoch": 0.04984181757199793, "grad_norm": 17.651962081814688, "learning_rate": 4.9840863767298515e-05, "loss": 2.8535, "mean_token_accuracy": 0.3655172407627106, "step": 49485 }, { "epoch": 0.049846853625102105, "grad_norm": 14.28581158762603, "learning_rate": 4.9845899724029574e-05, "loss": 2.6115, "mean_token_accuracy": 0.3517241358757019, "step": 49490 }, { "epoch": 0.04985188967820628, "grad_norm": 12.245707832101791, "learning_rate": 4.9850935680760634e-05, "loss": 2.9765, "mean_token_accuracy": 0.33793102502822875, "step": 49495 }, { "epoch": 0.04985692573131045, "grad_norm": 11.338788116320691, "learning_rate": 4.985597163749169e-05, "loss": 2.2332, "mean_token_accuracy": 0.4517241358757019, "step": 49500 }, { "epoch": 0.04986196178441463, "grad_norm": 15.91943713855223, "learning_rate": 4.986100759422275e-05, "loss": 2.5005, "mean_token_accuracy": 0.42758620977401735, "step": 49505 }, { "epoch": 0.049866997837518794, "grad_norm": 15.01142749013905, "learning_rate": 4.986604355095381e-05, "loss": 2.4042, "mean_token_accuracy": 0.42413792610168455, "step": 49510 }, { "epoch": 0.04987203389062297, "grad_norm": 23.31739947772715, "learning_rate": 4.987107950768487e-05, "loss": 2.7926, "mean_token_accuracy": 0.33793103098869326, "step": 49515 }, { "epoch": 0.04987706994372714, "grad_norm": 11.040523663202919, "learning_rate": 4.987611546441594e-05, "loss": 2.4018, "mean_token_accuracy": 0.4172413766384125, "step": 49520 }, { "epoch": 0.049882105996831315, "grad_norm": 13.294381592565193, "learning_rate": 4.9881151421146996e-05, "loss": 2.2171, "mean_token_accuracy": 0.441379314661026, "step": 49525 }, { "epoch": 0.04988714204993549, "grad_norm": 11.646579343031293, "learning_rate": 4.9886187377878055e-05, "loss": 2.6643, "mean_token_accuracy": 0.39655172228813174, "step": 49530 }, { "epoch": 0.04989217810303966, "grad_norm": 13.71315294642325, "learning_rate": 4.989122333460911e-05, "loss": 2.7966, "mean_token_accuracy": 0.3517241418361664, "step": 49535 }, { "epoch": 0.049897214156143836, "grad_norm": 17.97975577463106, "learning_rate": 4.989625929134017e-05, "loss": 2.7515, "mean_token_accuracy": 0.36896551251411436, "step": 49540 }, { "epoch": 0.049902250209248, "grad_norm": 12.268858907224732, "learning_rate": 4.990129524807123e-05, "loss": 2.6348, "mean_token_accuracy": 0.3931034505367279, "step": 49545 }, { "epoch": 0.04990728626235218, "grad_norm": 20.81019688630803, "learning_rate": 4.990633120480229e-05, "loss": 2.8056, "mean_token_accuracy": 0.4103448331356049, "step": 49550 }, { "epoch": 0.04991232231545635, "grad_norm": 13.646494384884486, "learning_rate": 4.991136716153335e-05, "loss": 2.5801, "mean_token_accuracy": 0.38275861740112305, "step": 49555 }, { "epoch": 0.049917358368560524, "grad_norm": 13.258184842553518, "learning_rate": 4.991640311826441e-05, "loss": 2.4923, "mean_token_accuracy": 0.37931033968925476, "step": 49560 }, { "epoch": 0.0499223944216647, "grad_norm": 13.376109319528164, "learning_rate": 4.992143907499547e-05, "loss": 2.8028, "mean_token_accuracy": 0.3448275923728943, "step": 49565 }, { "epoch": 0.04992743047476887, "grad_norm": 12.32846130563162, "learning_rate": 4.992647503172653e-05, "loss": 2.612, "mean_token_accuracy": 0.3482758641242981, "step": 49570 }, { "epoch": 0.049932466527873046, "grad_norm": 11.564618024882458, "learning_rate": 4.993151098845759e-05, "loss": 2.6198, "mean_token_accuracy": 0.4034482777118683, "step": 49575 }, { "epoch": 0.04993750258097721, "grad_norm": 14.824679698883445, "learning_rate": 4.993654694518865e-05, "loss": 2.7361, "mean_token_accuracy": 0.38275861740112305, "step": 49580 }, { "epoch": 0.049942538634081386, "grad_norm": 15.552365673154968, "learning_rate": 4.994158290191971e-05, "loss": 2.876, "mean_token_accuracy": 0.35862069129943847, "step": 49585 }, { "epoch": 0.04994757468718556, "grad_norm": 14.199919668919463, "learning_rate": 4.9946618858650766e-05, "loss": 2.3108, "mean_token_accuracy": 0.482758617401123, "step": 49590 }, { "epoch": 0.049952610740289734, "grad_norm": 14.47673515226275, "learning_rate": 4.9951654815381825e-05, "loss": 2.6364, "mean_token_accuracy": 0.3655172407627106, "step": 49595 }, { "epoch": 0.04995764679339391, "grad_norm": 14.824743793669533, "learning_rate": 4.995669077211289e-05, "loss": 2.5179, "mean_token_accuracy": 0.4117362380027771, "step": 49600 }, { "epoch": 0.04996268284649808, "grad_norm": 12.498952004436685, "learning_rate": 4.996172672884395e-05, "loss": 2.555, "mean_token_accuracy": 0.4159104585647583, "step": 49605 }, { "epoch": 0.049967718899602255, "grad_norm": 13.935762699559254, "learning_rate": 4.996676268557501e-05, "loss": 2.827, "mean_token_accuracy": 0.3517241358757019, "step": 49610 }, { "epoch": 0.04997275495270642, "grad_norm": 14.630953503098178, "learning_rate": 4.997179864230607e-05, "loss": 2.3519, "mean_token_accuracy": 0.36745311319828033, "step": 49615 }, { "epoch": 0.049977791005810596, "grad_norm": 12.230547852391641, "learning_rate": 4.997683459903713e-05, "loss": 2.7427, "mean_token_accuracy": 0.39509981870651245, "step": 49620 }, { "epoch": 0.04998282705891477, "grad_norm": 12.537907041137958, "learning_rate": 4.998187055576819e-05, "loss": 2.8497, "mean_token_accuracy": 0.36206896007061007, "step": 49625 }, { "epoch": 0.04998786311201894, "grad_norm": 11.284924131603896, "learning_rate": 4.998690651249925e-05, "loss": 2.4841, "mean_token_accuracy": 0.403448274731636, "step": 49630 }, { "epoch": 0.04999289916512312, "grad_norm": 11.76655415509917, "learning_rate": 4.9991942469230306e-05, "loss": 2.8672, "mean_token_accuracy": 0.35862069129943847, "step": 49635 }, { "epoch": 0.04999793521822729, "grad_norm": 15.605220600081669, "learning_rate": 4.9996978425961365e-05, "loss": 2.5876, "mean_token_accuracy": 0.3551724076271057, "step": 49640 }, { "epoch": 0.050002971271331464, "grad_norm": 12.006394240076212, "learning_rate": 4.999999999950077e-05, "loss": 2.9691, "mean_token_accuracy": 0.36551724672317504, "step": 49645 }, { "epoch": 0.05000800732443563, "grad_norm": 12.6271173068407, "learning_rate": 4.999999999388436e-05, "loss": 2.4717, "mean_token_accuracy": 0.4310344815254211, "step": 49650 }, { "epoch": 0.050013043377539805, "grad_norm": 12.234909042511857, "learning_rate": 4.999999998202748e-05, "loss": 2.824, "mean_token_accuracy": 0.37586206793785093, "step": 49655 }, { "epoch": 0.05001807943064398, "grad_norm": 44.949610624950466, "learning_rate": 4.999999996393015e-05, "loss": 3.1453, "mean_token_accuracy": 0.27241378724575044, "step": 49660 }, { "epoch": 0.05002311548374815, "grad_norm": 12.025230843353924, "learning_rate": 4.9999999939592364e-05, "loss": 2.5444, "mean_token_accuracy": 0.38275861740112305, "step": 49665 }, { "epoch": 0.050028151536852326, "grad_norm": 11.381206032498792, "learning_rate": 4.999999990901411e-05, "loss": 2.4051, "mean_token_accuracy": 0.3620689630508423, "step": 49670 }, { "epoch": 0.0500331875899565, "grad_norm": 11.788428976912376, "learning_rate": 4.999999987219541e-05, "loss": 2.1203, "mean_token_accuracy": 0.47931034564971925, "step": 49675 }, { "epoch": 0.050038223643060674, "grad_norm": 17.25882583624675, "learning_rate": 4.999999982913624e-05, "loss": 2.377, "mean_token_accuracy": 0.4379310429096222, "step": 49680 }, { "epoch": 0.05004325969616484, "grad_norm": 15.178981141132901, "learning_rate": 4.999999977983662e-05, "loss": 2.5629, "mean_token_accuracy": 0.4172413766384125, "step": 49685 }, { "epoch": 0.050048295749269014, "grad_norm": 15.925549144673484, "learning_rate": 4.9999999724296524e-05, "loss": 2.5396, "mean_token_accuracy": 0.39310344457626345, "step": 49690 }, { "epoch": 0.05005333180237319, "grad_norm": 14.720843721851335, "learning_rate": 4.999999966251598e-05, "loss": 3.1586, "mean_token_accuracy": 0.37586206793785093, "step": 49695 }, { "epoch": 0.05005836785547736, "grad_norm": 10.552678313176491, "learning_rate": 4.9999999594494986e-05, "loss": 2.5198, "mean_token_accuracy": 0.4172413766384125, "step": 49700 }, { "epoch": 0.050063403908581536, "grad_norm": 14.610190061508218, "learning_rate": 4.999999952023352e-05, "loss": 2.7993, "mean_token_accuracy": 0.3310344874858856, "step": 49705 }, { "epoch": 0.05006843996168571, "grad_norm": 13.328116361078306, "learning_rate": 4.99999994397316e-05, "loss": 2.9146, "mean_token_accuracy": 0.3482758581638336, "step": 49710 }, { "epoch": 0.05007347601478988, "grad_norm": 11.356790665066061, "learning_rate": 4.9999999352989224e-05, "loss": 2.5436, "mean_token_accuracy": 0.38620689511299133, "step": 49715 }, { "epoch": 0.05007851206789405, "grad_norm": 12.899454764875982, "learning_rate": 4.9999999260006385e-05, "loss": 2.738, "mean_token_accuracy": 0.42413793206214906, "step": 49720 }, { "epoch": 0.050083548120998224, "grad_norm": 13.51586536994713, "learning_rate": 4.999999916078308e-05, "loss": 2.4499, "mean_token_accuracy": 0.39437386989593504, "step": 49725 }, { "epoch": 0.0500885841741024, "grad_norm": 16.644151180282265, "learning_rate": 4.999999905531934e-05, "loss": 2.7232, "mean_token_accuracy": 0.38620689511299133, "step": 49730 }, { "epoch": 0.05009362022720657, "grad_norm": 13.058655976823317, "learning_rate": 4.9999998943615124e-05, "loss": 2.4483, "mean_token_accuracy": 0.4172413766384125, "step": 49735 }, { "epoch": 0.050098656280310745, "grad_norm": 11.179485730759678, "learning_rate": 4.999999882567045e-05, "loss": 3.2205, "mean_token_accuracy": 0.358620685338974, "step": 49740 }, { "epoch": 0.05010369233341492, "grad_norm": 14.60636381748364, "learning_rate": 4.9999998701485314e-05, "loss": 2.4974, "mean_token_accuracy": 0.3999999940395355, "step": 49745 }, { "epoch": 0.05010872838651909, "grad_norm": 17.494031287034645, "learning_rate": 4.999999857105973e-05, "loss": 3.1808, "mean_token_accuracy": 0.3551724165678024, "step": 49750 }, { "epoch": 0.05011376443962326, "grad_norm": 12.266893588534552, "learning_rate": 4.999999843439368e-05, "loss": 2.7388, "mean_token_accuracy": 0.4137930989265442, "step": 49755 }, { "epoch": 0.05011880049272743, "grad_norm": 9.465675904973079, "learning_rate": 4.9999998291487175e-05, "loss": 2.4497, "mean_token_accuracy": 0.4255293428897858, "step": 49760 }, { "epoch": 0.05012383654583161, "grad_norm": 13.092268195033029, "learning_rate": 4.999999814234021e-05, "loss": 2.2549, "mean_token_accuracy": 0.39310343861579894, "step": 49765 }, { "epoch": 0.05012887259893578, "grad_norm": 12.990682712269443, "learning_rate": 4.999999798695279e-05, "loss": 2.7591, "mean_token_accuracy": 0.37241379022598264, "step": 49770 }, { "epoch": 0.050133908652039955, "grad_norm": 16.466545408775996, "learning_rate": 4.999999782532491e-05, "loss": 2.3737, "mean_token_accuracy": 0.4255898416042328, "step": 49775 }, { "epoch": 0.05013894470514413, "grad_norm": 11.880720595914022, "learning_rate": 4.9999997657456566e-05, "loss": 2.4955, "mean_token_accuracy": 0.4344827592372894, "step": 49780 }, { "epoch": 0.0501439807582483, "grad_norm": 12.650095182831993, "learning_rate": 4.9999997483347765e-05, "loss": 2.4397, "mean_token_accuracy": 0.39818512201309203, "step": 49785 }, { "epoch": 0.05014901681135247, "grad_norm": 12.43159565953127, "learning_rate": 4.9999997302998506e-05, "loss": 2.9674, "mean_token_accuracy": 0.36896551251411436, "step": 49790 }, { "epoch": 0.05015405286445664, "grad_norm": 14.959005423990027, "learning_rate": 4.9999997116408796e-05, "loss": 3.1992, "mean_token_accuracy": 0.2999999910593033, "step": 49795 }, { "epoch": 0.050159088917560817, "grad_norm": 15.552031044538174, "learning_rate": 4.999999692357862e-05, "loss": 2.7995, "mean_token_accuracy": 0.38620689511299133, "step": 49800 }, { "epoch": 0.05016412497066499, "grad_norm": 12.972217035970225, "learning_rate": 4.9999996724507996e-05, "loss": 2.4475, "mean_token_accuracy": 0.3999999940395355, "step": 49805 }, { "epoch": 0.050169161023769164, "grad_norm": 10.596658939758084, "learning_rate": 4.999999651919691e-05, "loss": 2.6885, "mean_token_accuracy": 0.44827585220336913, "step": 49810 }, { "epoch": 0.05017419707687334, "grad_norm": 13.479090681305225, "learning_rate": 4.999999630764536e-05, "loss": 2.5114, "mean_token_accuracy": 0.3655172407627106, "step": 49815 }, { "epoch": 0.05017923312997751, "grad_norm": 13.780359024390073, "learning_rate": 4.9999996089853355e-05, "loss": 2.7229, "mean_token_accuracy": 0.3454930394887924, "step": 49820 }, { "epoch": 0.05018426918308168, "grad_norm": 15.225368901024439, "learning_rate": 4.999999586582089e-05, "loss": 2.7321, "mean_token_accuracy": 0.4068965554237366, "step": 49825 }, { "epoch": 0.05018930523618585, "grad_norm": 11.720377494039846, "learning_rate": 4.999999563554797e-05, "loss": 2.636, "mean_token_accuracy": 0.3517241418361664, "step": 49830 }, { "epoch": 0.050194341289290026, "grad_norm": 12.61486108463667, "learning_rate": 4.9999995399034594e-05, "loss": 2.7784, "mean_token_accuracy": 0.3551724135875702, "step": 49835 }, { "epoch": 0.0501993773423942, "grad_norm": 13.289997714746, "learning_rate": 4.999999515628075e-05, "loss": 2.4773, "mean_token_accuracy": 0.41034482717514037, "step": 49840 }, { "epoch": 0.05020441339549837, "grad_norm": 13.177140455998506, "learning_rate": 4.9999994907286466e-05, "loss": 2.6352, "mean_token_accuracy": 0.3986085891723633, "step": 49845 }, { "epoch": 0.05020944944860255, "grad_norm": 12.348074467666311, "learning_rate": 4.9999994652051715e-05, "loss": 2.426, "mean_token_accuracy": 0.39491832852363584, "step": 49850 }, { "epoch": 0.05021448550170672, "grad_norm": 16.999614863859883, "learning_rate": 4.999999439057651e-05, "loss": 2.6804, "mean_token_accuracy": 0.3827586233615875, "step": 49855 }, { "epoch": 0.05021952155481089, "grad_norm": 13.252310613732684, "learning_rate": 4.999999412286084e-05, "loss": 2.3524, "mean_token_accuracy": 0.4034482777118683, "step": 49860 }, { "epoch": 0.05022455760791506, "grad_norm": 10.72375928360509, "learning_rate": 4.999999384890472e-05, "loss": 2.306, "mean_token_accuracy": 0.45517240166664125, "step": 49865 }, { "epoch": 0.050229593661019235, "grad_norm": 13.797454384393609, "learning_rate": 4.9999993568708136e-05, "loss": 2.6761, "mean_token_accuracy": 0.4068965554237366, "step": 49870 }, { "epoch": 0.05023462971412341, "grad_norm": 11.391078182708428, "learning_rate": 4.9999993282271104e-05, "loss": 2.4039, "mean_token_accuracy": 0.4862069010734558, "step": 49875 }, { "epoch": 0.05023966576722758, "grad_norm": 13.511557320896227, "learning_rate": 4.999999298959361e-05, "loss": 2.5194, "mean_token_accuracy": 0.3862068891525269, "step": 49880 }, { "epoch": 0.05024470182033176, "grad_norm": 11.959401914858365, "learning_rate": 4.999999269067566e-05, "loss": 2.9104, "mean_token_accuracy": 0.41379310488700866, "step": 49885 }, { "epoch": 0.05024973787343593, "grad_norm": 10.454554000475806, "learning_rate": 4.999999238551725e-05, "loss": 2.8246, "mean_token_accuracy": 0.39491833448410035, "step": 49890 }, { "epoch": 0.0502547739265401, "grad_norm": 11.180505264939498, "learning_rate": 4.999999207411838e-05, "loss": 2.7407, "mean_token_accuracy": 0.42758620977401735, "step": 49895 }, { "epoch": 0.05025980997964427, "grad_norm": 16.598709812537702, "learning_rate": 4.999999175647906e-05, "loss": 3.0353, "mean_token_accuracy": 0.3137931048870087, "step": 49900 }, { "epoch": 0.050264846032748445, "grad_norm": 18.39314814150104, "learning_rate": 4.999999143259929e-05, "loss": 2.4632, "mean_token_accuracy": 0.40689654350280763, "step": 49905 }, { "epoch": 0.05026988208585262, "grad_norm": 23.375139055611236, "learning_rate": 4.999999110247905e-05, "loss": 2.7536, "mean_token_accuracy": 0.3896551728248596, "step": 49910 }, { "epoch": 0.05027491813895679, "grad_norm": 12.575790854780092, "learning_rate": 4.999999076611835e-05, "loss": 2.756, "mean_token_accuracy": 0.4034482717514038, "step": 49915 }, { "epoch": 0.050279954192060966, "grad_norm": 11.853179869949356, "learning_rate": 4.999999042351721e-05, "loss": 2.7287, "mean_token_accuracy": 0.36551724672317504, "step": 49920 }, { "epoch": 0.05028499024516514, "grad_norm": 11.314370035179659, "learning_rate": 4.99999900746756e-05, "loss": 2.238, "mean_token_accuracy": 0.42413792610168455, "step": 49925 }, { "epoch": 0.05029002629826931, "grad_norm": 11.80282057662572, "learning_rate": 4.9999989719593546e-05, "loss": 2.8643, "mean_token_accuracy": 0.34482758343219755, "step": 49930 }, { "epoch": 0.05029506235137348, "grad_norm": 12.012195216515446, "learning_rate": 4.999998935827103e-05, "loss": 2.7738, "mean_token_accuracy": 0.36896551251411436, "step": 49935 }, { "epoch": 0.050300098404477654, "grad_norm": 14.042802247376741, "learning_rate": 4.9999988990708054e-05, "loss": 2.174, "mean_token_accuracy": 0.44827587008476255, "step": 49940 }, { "epoch": 0.05030513445758183, "grad_norm": 11.633376202270114, "learning_rate": 4.999998861690462e-05, "loss": 2.3763, "mean_token_accuracy": 0.39655172228813174, "step": 49945 }, { "epoch": 0.050310170510686, "grad_norm": 13.548543139366798, "learning_rate": 4.999998823686074e-05, "loss": 2.7846, "mean_token_accuracy": 0.3000000029802322, "step": 49950 }, { "epoch": 0.050315206563790175, "grad_norm": 16.761012173907588, "learning_rate": 4.9999987850576405e-05, "loss": 2.9079, "mean_token_accuracy": 0.41034482717514037, "step": 49955 }, { "epoch": 0.05032024261689435, "grad_norm": 10.752007701541265, "learning_rate": 4.99999874580516e-05, "loss": 2.3022, "mean_token_accuracy": 0.4551724076271057, "step": 49960 }, { "epoch": 0.050325278669998516, "grad_norm": 13.272690428884117, "learning_rate": 4.999998705928635e-05, "loss": 3.0675, "mean_token_accuracy": 0.40344828367233276, "step": 49965 }, { "epoch": 0.05033031472310269, "grad_norm": 11.456320473853069, "learning_rate": 4.999998665428065e-05, "loss": 2.1809, "mean_token_accuracy": 0.39655172228813174, "step": 49970 }, { "epoch": 0.050335350776206864, "grad_norm": 13.473766954694915, "learning_rate": 4.9999986243034485e-05, "loss": 2.5452, "mean_token_accuracy": 0.37241379022598264, "step": 49975 }, { "epoch": 0.05034038682931104, "grad_norm": 26.80994790393054, "learning_rate": 4.999998582554787e-05, "loss": 2.9404, "mean_token_accuracy": 0.3931034505367279, "step": 49980 }, { "epoch": 0.05034542288241521, "grad_norm": 11.080244502563596, "learning_rate": 4.99999854018208e-05, "loss": 2.5327, "mean_token_accuracy": 0.42758620977401735, "step": 49985 }, { "epoch": 0.050350458935519385, "grad_norm": 17.367509809474647, "learning_rate": 4.999998497185327e-05, "loss": 2.673, "mean_token_accuracy": 0.38965516686439516, "step": 49990 }, { "epoch": 0.05035549498862356, "grad_norm": 11.036298754864657, "learning_rate": 4.99999845356453e-05, "loss": 2.3717, "mean_token_accuracy": 0.4712038695812225, "step": 49995 }, { "epoch": 0.050360531041727725, "grad_norm": 15.158522527044616, "learning_rate": 4.999998409319686e-05, "loss": 2.6372, "mean_token_accuracy": 0.3827586233615875, "step": 50000 }, { "epoch": 0.0503655670948319, "grad_norm": 15.593171681203087, "learning_rate": 4.999998364450796e-05, "loss": 2.6878, "mean_token_accuracy": 0.38620689511299133, "step": 50005 }, { "epoch": 0.05037060314793607, "grad_norm": 13.863212335459519, "learning_rate": 4.999998318957862e-05, "loss": 2.4158, "mean_token_accuracy": 0.42413792610168455, "step": 50010 }, { "epoch": 0.05037563920104025, "grad_norm": 13.006626728251153, "learning_rate": 4.999998272840882e-05, "loss": 2.5361, "mean_token_accuracy": 0.3793103337287903, "step": 50015 }, { "epoch": 0.05038067525414442, "grad_norm": 18.675955816483324, "learning_rate": 4.9999982260998566e-05, "loss": 2.6408, "mean_token_accuracy": 0.41724138259887694, "step": 50020 }, { "epoch": 0.050385711307248594, "grad_norm": 12.305510197988752, "learning_rate": 4.9999981787347865e-05, "loss": 2.3927, "mean_token_accuracy": 0.4000000059604645, "step": 50025 }, { "epoch": 0.05039074736035277, "grad_norm": 15.104214893830951, "learning_rate": 4.99999813074567e-05, "loss": 2.6876, "mean_token_accuracy": 0.36551723480224607, "step": 50030 }, { "epoch": 0.050395783413456935, "grad_norm": 12.659031341136748, "learning_rate": 4.9999980821325085e-05, "loss": 2.4855, "mean_token_accuracy": 0.40689654350280763, "step": 50035 }, { "epoch": 0.05040081946656111, "grad_norm": 13.468400778591251, "learning_rate": 4.9999980328953025e-05, "loss": 2.8903, "mean_token_accuracy": 0.3517241358757019, "step": 50040 }, { "epoch": 0.05040585551966528, "grad_norm": 10.051057832774758, "learning_rate": 4.99999798303405e-05, "loss": 3.0942, "mean_token_accuracy": 0.36206896901130675, "step": 50045 }, { "epoch": 0.050410891572769456, "grad_norm": 11.479981276974836, "learning_rate": 4.999997932548752e-05, "loss": 2.5297, "mean_token_accuracy": 0.3827586203813553, "step": 50050 }, { "epoch": 0.05041592762587363, "grad_norm": 12.604137009557448, "learning_rate": 4.999997881439409e-05, "loss": 2.7703, "mean_token_accuracy": 0.3482758581638336, "step": 50055 }, { "epoch": 0.050420963678977804, "grad_norm": 12.086906228102732, "learning_rate": 4.999997829706021e-05, "loss": 2.6068, "mean_token_accuracy": 0.40344828367233276, "step": 50060 }, { "epoch": 0.05042599973208198, "grad_norm": 10.817874276039062, "learning_rate": 4.9999977773485875e-05, "loss": 2.0977, "mean_token_accuracy": 0.4413793206214905, "step": 50065 }, { "epoch": 0.050431035785186144, "grad_norm": 11.903700462637147, "learning_rate": 4.999997724367109e-05, "loss": 2.5412, "mean_token_accuracy": 0.39655172228813174, "step": 50070 }, { "epoch": 0.05043607183829032, "grad_norm": 11.62487044538741, "learning_rate": 4.9999976707615854e-05, "loss": 2.2405, "mean_token_accuracy": 0.4344827592372894, "step": 50075 }, { "epoch": 0.05044110789139449, "grad_norm": 12.218379749564965, "learning_rate": 4.9999976165320154e-05, "loss": 2.7374, "mean_token_accuracy": 0.35862068831920624, "step": 50080 }, { "epoch": 0.050446143944498666, "grad_norm": 11.55715370364926, "learning_rate": 4.9999975616784016e-05, "loss": 2.9408, "mean_token_accuracy": 0.3827586233615875, "step": 50085 }, { "epoch": 0.05045117999760284, "grad_norm": 13.13891490985889, "learning_rate": 4.9999975062007413e-05, "loss": 2.6482, "mean_token_accuracy": 0.37241379618644715, "step": 50090 }, { "epoch": 0.05045621605070701, "grad_norm": 11.43510281822606, "learning_rate": 4.9999974500990374e-05, "loss": 2.4089, "mean_token_accuracy": 0.4344827651977539, "step": 50095 }, { "epoch": 0.05046125210381119, "grad_norm": 13.43929597670826, "learning_rate": 4.999997393373287e-05, "loss": 2.6215, "mean_token_accuracy": 0.32413792610168457, "step": 50100 }, { "epoch": 0.050466288156915354, "grad_norm": 14.588231280641814, "learning_rate": 4.9999973360234915e-05, "loss": 3.2441, "mean_token_accuracy": 0.40859044194221494, "step": 50105 }, { "epoch": 0.05047132421001953, "grad_norm": 10.787683774273884, "learning_rate": 4.999997278049651e-05, "loss": 2.6974, "mean_token_accuracy": 0.4120387136936188, "step": 50110 }, { "epoch": 0.0504763602631237, "grad_norm": 14.236627571128553, "learning_rate": 4.999997219451766e-05, "loss": 2.4728, "mean_token_accuracy": 0.41034482717514037, "step": 50115 }, { "epoch": 0.050481396316227875, "grad_norm": 13.962810688334864, "learning_rate": 4.999997160229835e-05, "loss": 2.3186, "mean_token_accuracy": 0.4137930989265442, "step": 50120 }, { "epoch": 0.05048643236933205, "grad_norm": 14.015178940218096, "learning_rate": 4.9999971003838594e-05, "loss": 2.7623, "mean_token_accuracy": 0.38620689511299133, "step": 50125 }, { "epoch": 0.05049146842243622, "grad_norm": 17.39032792569419, "learning_rate": 4.999997039913838e-05, "loss": 2.2941, "mean_token_accuracy": 0.47980296015739443, "step": 50130 }, { "epoch": 0.050496504475540396, "grad_norm": 12.643248063820355, "learning_rate": 4.999996978819772e-05, "loss": 2.4715, "mean_token_accuracy": 0.4241379380226135, "step": 50135 }, { "epoch": 0.05050154052864456, "grad_norm": 12.157281732548233, "learning_rate": 4.9999969171016614e-05, "loss": 2.609, "mean_token_accuracy": 0.4034482717514038, "step": 50140 }, { "epoch": 0.05050657658174874, "grad_norm": 14.951114291603682, "learning_rate": 4.999996854759505e-05, "loss": 2.7245, "mean_token_accuracy": 0.341379314661026, "step": 50145 }, { "epoch": 0.05051161263485291, "grad_norm": 39.30511979152037, "learning_rate": 4.999996791793304e-05, "loss": 3.2416, "mean_token_accuracy": 0.3793103456497192, "step": 50150 }, { "epoch": 0.050516648687957084, "grad_norm": 13.275875653103892, "learning_rate": 4.999996728203058e-05, "loss": 2.9485, "mean_token_accuracy": 0.37586206793785093, "step": 50155 }, { "epoch": 0.05052168474106126, "grad_norm": 11.42794887378641, "learning_rate": 4.999996663988767e-05, "loss": 2.569, "mean_token_accuracy": 0.4436176598072052, "step": 50160 }, { "epoch": 0.05052672079416543, "grad_norm": 12.386741195391972, "learning_rate": 4.999996599150431e-05, "loss": 2.4167, "mean_token_accuracy": 0.4084089457988739, "step": 50165 }, { "epoch": 0.050531756847269606, "grad_norm": 11.521676489888533, "learning_rate": 4.99999653368805e-05, "loss": 2.7182, "mean_token_accuracy": 0.3482758581638336, "step": 50170 }, { "epoch": 0.05053679290037377, "grad_norm": 11.16330970164469, "learning_rate": 4.999996467601624e-05, "loss": 2.5253, "mean_token_accuracy": 0.4103448212146759, "step": 50175 }, { "epoch": 0.050541828953477946, "grad_norm": 12.300427211862397, "learning_rate": 4.999996400891153e-05, "loss": 2.933, "mean_token_accuracy": 0.33103448152542114, "step": 50180 }, { "epoch": 0.05054686500658212, "grad_norm": 12.376512949866452, "learning_rate": 4.999996333556638e-05, "loss": 2.5401, "mean_token_accuracy": 0.37241379022598264, "step": 50185 }, { "epoch": 0.050551901059686294, "grad_norm": 12.380421031577722, "learning_rate": 4.9999962655980775e-05, "loss": 2.6882, "mean_token_accuracy": 0.3827586233615875, "step": 50190 }, { "epoch": 0.05055693711279047, "grad_norm": 16.5603308545267, "learning_rate": 4.999996197015472e-05, "loss": 2.6222, "mean_token_accuracy": 0.3724137872457504, "step": 50195 }, { "epoch": 0.05056197316589464, "grad_norm": 12.56249445255557, "learning_rate": 4.999996127808822e-05, "loss": 2.5685, "mean_token_accuracy": 0.4103448212146759, "step": 50200 }, { "epoch": 0.050567009218998815, "grad_norm": 13.676499462900354, "learning_rate": 4.9999960579781265e-05, "loss": 2.4247, "mean_token_accuracy": 0.3931034475564957, "step": 50205 }, { "epoch": 0.05057204527210298, "grad_norm": 14.396825411179483, "learning_rate": 4.999995987523386e-05, "loss": 2.3522, "mean_token_accuracy": 0.42413793206214906, "step": 50210 }, { "epoch": 0.050577081325207156, "grad_norm": 22.056202030539666, "learning_rate": 4.999995916444602e-05, "loss": 3.0558, "mean_token_accuracy": 0.3758620619773865, "step": 50215 }, { "epoch": 0.05058211737831133, "grad_norm": 15.905427465072279, "learning_rate": 4.9999958447417725e-05, "loss": 2.3306, "mean_token_accuracy": 0.4413793087005615, "step": 50220 }, { "epoch": 0.0505871534314155, "grad_norm": 13.437858679416255, "learning_rate": 4.999995772414898e-05, "loss": 2.372, "mean_token_accuracy": 0.4241379380226135, "step": 50225 }, { "epoch": 0.05059218948451968, "grad_norm": 12.349046366925462, "learning_rate": 4.9999956994639804e-05, "loss": 2.3315, "mean_token_accuracy": 0.4257108271121979, "step": 50230 }, { "epoch": 0.05059722553762385, "grad_norm": 11.127226646507648, "learning_rate": 4.9999956258890154e-05, "loss": 2.5167, "mean_token_accuracy": 0.4310344785451889, "step": 50235 }, { "epoch": 0.050602261590728025, "grad_norm": 12.877153680635537, "learning_rate": 4.999995551690008e-05, "loss": 2.6582, "mean_token_accuracy": 0.42758620977401735, "step": 50240 }, { "epoch": 0.05060729764383219, "grad_norm": 12.869601768946875, "learning_rate": 4.999995476866955e-05, "loss": 2.3543, "mean_token_accuracy": 0.4620689630508423, "step": 50245 }, { "epoch": 0.050612333696936365, "grad_norm": 13.482445336901161, "learning_rate": 4.999995401419857e-05, "loss": 2.4901, "mean_token_accuracy": 0.4, "step": 50250 }, { "epoch": 0.05061736975004054, "grad_norm": 15.084508148166748, "learning_rate": 4.999995325348715e-05, "loss": 2.5601, "mean_token_accuracy": 0.4034482777118683, "step": 50255 }, { "epoch": 0.05062240580314471, "grad_norm": 13.386760851950458, "learning_rate": 4.9999952486535275e-05, "loss": 2.7355, "mean_token_accuracy": 0.35517241060733795, "step": 50260 }, { "epoch": 0.050627441856248886, "grad_norm": 14.02379392633141, "learning_rate": 4.999995171334297e-05, "loss": 2.2317, "mean_token_accuracy": 0.42413793206214906, "step": 50265 }, { "epoch": 0.05063247790935306, "grad_norm": 13.244677349032951, "learning_rate": 4.999995093391021e-05, "loss": 2.9401, "mean_token_accuracy": 0.3551724195480347, "step": 50270 }, { "epoch": 0.050637513962457234, "grad_norm": 12.373428618036398, "learning_rate": 4.9999950148237006e-05, "loss": 2.9427, "mean_token_accuracy": 0.3620689630508423, "step": 50275 }, { "epoch": 0.0506425500155614, "grad_norm": 13.8730668054857, "learning_rate": 4.999994935632336e-05, "loss": 2.679, "mean_token_accuracy": 0.3724137842655182, "step": 50280 }, { "epoch": 0.050647586068665575, "grad_norm": 13.272347289290915, "learning_rate": 4.999994855816926e-05, "loss": 2.952, "mean_token_accuracy": 0.36551723480224607, "step": 50285 }, { "epoch": 0.05065262212176975, "grad_norm": 12.233073409564609, "learning_rate": 4.9999947753774726e-05, "loss": 3.2847, "mean_token_accuracy": 0.32413792312145234, "step": 50290 }, { "epoch": 0.05065765817487392, "grad_norm": 15.652301657370083, "learning_rate": 4.999994694313973e-05, "loss": 2.4582, "mean_token_accuracy": 0.4137930929660797, "step": 50295 }, { "epoch": 0.050662694227978096, "grad_norm": 12.639222699245709, "learning_rate": 4.999994612626431e-05, "loss": 2.4381, "mean_token_accuracy": 0.38965516090393065, "step": 50300 }, { "epoch": 0.05066773028108227, "grad_norm": 14.433210586636221, "learning_rate": 4.9999945303148435e-05, "loss": 2.8484, "mean_token_accuracy": 0.3448275774717331, "step": 50305 }, { "epoch": 0.05067276633418644, "grad_norm": 15.76482090872602, "learning_rate": 4.9999944473792124e-05, "loss": 2.3164, "mean_token_accuracy": 0.4034482717514038, "step": 50310 }, { "epoch": 0.05067780238729061, "grad_norm": 18.40720269668647, "learning_rate": 4.999994363819536e-05, "loss": 3.0817, "mean_token_accuracy": 0.33793103098869326, "step": 50315 }, { "epoch": 0.050682838440394784, "grad_norm": 10.704388766496008, "learning_rate": 4.999994279635816e-05, "loss": 2.097, "mean_token_accuracy": 0.45517241954803467, "step": 50320 }, { "epoch": 0.05068787449349896, "grad_norm": 15.269389253078632, "learning_rate": 4.999994194828052e-05, "loss": 2.6285, "mean_token_accuracy": 0.41724138259887694, "step": 50325 }, { "epoch": 0.05069291054660313, "grad_norm": 13.149014843343057, "learning_rate": 4.999994109396243e-05, "loss": 2.5431, "mean_token_accuracy": 0.3793103456497192, "step": 50330 }, { "epoch": 0.050697946599707305, "grad_norm": 18.00027476552083, "learning_rate": 4.9999940233403904e-05, "loss": 2.7242, "mean_token_accuracy": 0.39655172526836396, "step": 50335 }, { "epoch": 0.05070298265281148, "grad_norm": 14.417635393764959, "learning_rate": 4.999993936660493e-05, "loss": 2.2481, "mean_token_accuracy": 0.43793103098869324, "step": 50340 }, { "epoch": 0.05070801870591565, "grad_norm": 14.529208214345644, "learning_rate": 4.999993849356551e-05, "loss": 2.6387, "mean_token_accuracy": 0.39655172228813174, "step": 50345 }, { "epoch": 0.05071305475901982, "grad_norm": 12.498917304320713, "learning_rate": 4.999993761428565e-05, "loss": 2.8343, "mean_token_accuracy": 0.39655172228813174, "step": 50350 }, { "epoch": 0.05071809081212399, "grad_norm": 12.215905953034385, "learning_rate": 4.999993672876535e-05, "loss": 2.3883, "mean_token_accuracy": 0.4482758641242981, "step": 50355 }, { "epoch": 0.05072312686522817, "grad_norm": 13.195156917622159, "learning_rate": 4.999993583700461e-05, "loss": 2.824, "mean_token_accuracy": 0.39655172228813174, "step": 50360 }, { "epoch": 0.05072816291833234, "grad_norm": 14.403178828329962, "learning_rate": 4.9999934939003434e-05, "loss": 3.4793, "mean_token_accuracy": 0.29310344755649564, "step": 50365 }, { "epoch": 0.050733198971436515, "grad_norm": 12.027852798397008, "learning_rate": 4.999993403476181e-05, "loss": 2.9235, "mean_token_accuracy": 0.35862068831920624, "step": 50370 }, { "epoch": 0.05073823502454069, "grad_norm": 15.27328788978254, "learning_rate": 4.999993312427975e-05, "loss": 2.907, "mean_token_accuracy": 0.37586207389831544, "step": 50375 }, { "epoch": 0.05074327107764486, "grad_norm": 12.99541592749973, "learning_rate": 4.999993220755724e-05, "loss": 2.874, "mean_token_accuracy": 0.3448275804519653, "step": 50380 }, { "epoch": 0.05074830713074903, "grad_norm": 13.21460625924436, "learning_rate": 4.9999931284594295e-05, "loss": 2.8509, "mean_token_accuracy": 0.3655172437429428, "step": 50385 }, { "epoch": 0.0507533431838532, "grad_norm": 13.447178401623969, "learning_rate": 4.999993035539092e-05, "loss": 2.5361, "mean_token_accuracy": 0.3931034505367279, "step": 50390 }, { "epoch": 0.05075837923695738, "grad_norm": 11.676902120723726, "learning_rate": 4.999992941994709e-05, "loss": 2.4866, "mean_token_accuracy": 0.3827586233615875, "step": 50395 }, { "epoch": 0.05076341529006155, "grad_norm": 10.326934684992437, "learning_rate": 4.9999928478262834e-05, "loss": 2.3729, "mean_token_accuracy": 0.44482758045196535, "step": 50400 }, { "epoch": 0.050768451343165724, "grad_norm": 11.223476499367806, "learning_rate": 4.999992753033813e-05, "loss": 2.6259, "mean_token_accuracy": 0.42068964838981626, "step": 50405 }, { "epoch": 0.0507734873962699, "grad_norm": 11.533671842982669, "learning_rate": 4.9999926576172986e-05, "loss": 3.0445, "mean_token_accuracy": 0.38275861740112305, "step": 50410 }, { "epoch": 0.05077852344937407, "grad_norm": 11.465328071015533, "learning_rate": 4.999992561576741e-05, "loss": 2.469, "mean_token_accuracy": 0.40344828367233276, "step": 50415 }, { "epoch": 0.05078355950247824, "grad_norm": 11.463208455361164, "learning_rate": 4.9999924649121396e-05, "loss": 2.765, "mean_token_accuracy": 0.3655172407627106, "step": 50420 }, { "epoch": 0.05078859555558241, "grad_norm": 11.151187312782465, "learning_rate": 4.999992367623494e-05, "loss": 2.5221, "mean_token_accuracy": 0.35862068831920624, "step": 50425 }, { "epoch": 0.050793631608686586, "grad_norm": 13.622775208407601, "learning_rate": 4.9999922697108043e-05, "loss": 2.6598, "mean_token_accuracy": 0.37241379022598264, "step": 50430 }, { "epoch": 0.05079866766179076, "grad_norm": 13.678948168777696, "learning_rate": 4.999992171174072e-05, "loss": 2.5965, "mean_token_accuracy": 0.39310345649719236, "step": 50435 }, { "epoch": 0.050803703714894934, "grad_norm": 11.892291183988693, "learning_rate": 4.9999920720132955e-05, "loss": 2.4797, "mean_token_accuracy": 0.41034482717514037, "step": 50440 }, { "epoch": 0.05080873976799911, "grad_norm": 13.233194899553208, "learning_rate": 4.999991972228474e-05, "loss": 2.7149, "mean_token_accuracy": 0.358620685338974, "step": 50445 }, { "epoch": 0.05081377582110328, "grad_norm": 13.85509292141273, "learning_rate": 4.99999187181961e-05, "loss": 2.7106, "mean_token_accuracy": 0.3551724076271057, "step": 50450 }, { "epoch": 0.05081881187420745, "grad_norm": 10.301359830994032, "learning_rate": 4.999991770786702e-05, "loss": 2.4974, "mean_token_accuracy": 0.4241379201412201, "step": 50455 }, { "epoch": 0.05082384792731162, "grad_norm": 15.288932018542642, "learning_rate": 4.999991669129751e-05, "loss": 2.7898, "mean_token_accuracy": 0.3395644336938858, "step": 50460 }, { "epoch": 0.050828883980415795, "grad_norm": 10.501455040792836, "learning_rate": 4.999991566848756e-05, "loss": 2.0661, "mean_token_accuracy": 0.4724137902259827, "step": 50465 }, { "epoch": 0.05083392003351997, "grad_norm": 11.567431115765524, "learning_rate": 4.999991463943717e-05, "loss": 2.8463, "mean_token_accuracy": 0.3517241418361664, "step": 50470 }, { "epoch": 0.05083895608662414, "grad_norm": 18.940026618890897, "learning_rate": 4.9999913604146356e-05, "loss": 2.9963, "mean_token_accuracy": 0.3551724195480347, "step": 50475 }, { "epoch": 0.05084399213972832, "grad_norm": 13.871105979677806, "learning_rate": 4.99999125626151e-05, "loss": 2.7956, "mean_token_accuracy": 0.37586206793785093, "step": 50480 }, { "epoch": 0.05084902819283249, "grad_norm": 11.969604195578384, "learning_rate": 4.999991151484341e-05, "loss": 2.2208, "mean_token_accuracy": 0.4413793087005615, "step": 50485 }, { "epoch": 0.05085406424593666, "grad_norm": 13.7483171347264, "learning_rate": 4.9999910460831286e-05, "loss": 2.7553, "mean_token_accuracy": 0.38620689809322356, "step": 50490 }, { "epoch": 0.05085910029904083, "grad_norm": 13.374470871774966, "learning_rate": 4.999990940057873e-05, "loss": 2.8883, "mean_token_accuracy": 0.38620689511299133, "step": 50495 }, { "epoch": 0.050864136352145005, "grad_norm": 12.051370925671462, "learning_rate": 4.999990833408573e-05, "loss": 2.9269, "mean_token_accuracy": 0.33448275923728943, "step": 50500 }, { "epoch": 0.05086917240524918, "grad_norm": 11.177393694866062, "learning_rate": 4.9999907261352315e-05, "loss": 2.4536, "mean_token_accuracy": 0.44482758045196535, "step": 50505 }, { "epoch": 0.05087420845835335, "grad_norm": 11.83979212110621, "learning_rate": 4.999990618237845e-05, "loss": 2.7094, "mean_token_accuracy": 0.35517241060733795, "step": 50510 }, { "epoch": 0.050879244511457526, "grad_norm": 13.536797330310398, "learning_rate": 4.9999905097164155e-05, "loss": 2.5111, "mean_token_accuracy": 0.3482758581638336, "step": 50515 }, { "epoch": 0.0508842805645617, "grad_norm": 13.854247271581649, "learning_rate": 4.9999904005709435e-05, "loss": 2.3745, "mean_token_accuracy": 0.459482753276825, "step": 50520 }, { "epoch": 0.05088931661766587, "grad_norm": 14.442259604575176, "learning_rate": 4.999990290801427e-05, "loss": 2.9725, "mean_token_accuracy": 0.3840895354747772, "step": 50525 }, { "epoch": 0.05089435267077004, "grad_norm": 14.164961502975885, "learning_rate": 4.9999901804078685e-05, "loss": 2.4713, "mean_token_accuracy": 0.43103448748588563, "step": 50530 }, { "epoch": 0.050899388723874214, "grad_norm": 10.612880208792598, "learning_rate": 4.999990069390267e-05, "loss": 3.0189, "mean_token_accuracy": 0.31034482419490816, "step": 50535 }, { "epoch": 0.05090442477697839, "grad_norm": 13.397037540691723, "learning_rate": 4.999989957748622e-05, "loss": 2.4633, "mean_token_accuracy": 0.43793103098869324, "step": 50540 }, { "epoch": 0.05090946083008256, "grad_norm": 11.751872463325505, "learning_rate": 4.9999898454829334e-05, "loss": 2.5125, "mean_token_accuracy": 0.3862068891525269, "step": 50545 }, { "epoch": 0.050914496883186736, "grad_norm": 12.742807249912271, "learning_rate": 4.999989732593202e-05, "loss": 2.5619, "mean_token_accuracy": 0.36896551847457887, "step": 50550 }, { "epoch": 0.05091953293629091, "grad_norm": 14.85416411347173, "learning_rate": 4.999989619079427e-05, "loss": 2.6726, "mean_token_accuracy": 0.3862069010734558, "step": 50555 }, { "epoch": 0.050924568989395076, "grad_norm": 11.729884494350376, "learning_rate": 4.99998950494161e-05, "loss": 2.746, "mean_token_accuracy": 0.36896551251411436, "step": 50560 }, { "epoch": 0.05092960504249925, "grad_norm": 12.456041971939259, "learning_rate": 4.999989390179749e-05, "loss": 2.6011, "mean_token_accuracy": 0.3931034505367279, "step": 50565 }, { "epoch": 0.050934641095603424, "grad_norm": 13.049428129856947, "learning_rate": 4.999989274793846e-05, "loss": 2.4744, "mean_token_accuracy": 0.4068965554237366, "step": 50570 }, { "epoch": 0.0509396771487076, "grad_norm": 15.172187273750252, "learning_rate": 4.999989158783899e-05, "loss": 2.4209, "mean_token_accuracy": 0.42068966031074523, "step": 50575 }, { "epoch": 0.05094471320181177, "grad_norm": 15.284546724590435, "learning_rate": 4.99998904214991e-05, "loss": 2.7756, "mean_token_accuracy": 0.38620689511299133, "step": 50580 }, { "epoch": 0.050949749254915945, "grad_norm": 11.171683100945389, "learning_rate": 4.9999889248918784e-05, "loss": 2.973, "mean_token_accuracy": 0.3241379350423813, "step": 50585 }, { "epoch": 0.05095478530802012, "grad_norm": 10.318888011024718, "learning_rate": 4.999988807009803e-05, "loss": 2.0035, "mean_token_accuracy": 0.5034482836723327, "step": 50590 }, { "epoch": 0.050959821361124286, "grad_norm": 16.495888368016665, "learning_rate": 4.999988688503685e-05, "loss": 2.938, "mean_token_accuracy": 0.36551724672317504, "step": 50595 }, { "epoch": 0.05096485741422846, "grad_norm": 12.056136201116475, "learning_rate": 4.999988569373526e-05, "loss": 2.6403, "mean_token_accuracy": 0.36896551847457887, "step": 50600 }, { "epoch": 0.05096989346733263, "grad_norm": 11.482882692953941, "learning_rate": 4.999988449619322e-05, "loss": 2.5845, "mean_token_accuracy": 0.41034482717514037, "step": 50605 }, { "epoch": 0.05097492952043681, "grad_norm": 12.372321106372752, "learning_rate": 4.999988329241077e-05, "loss": 2.4334, "mean_token_accuracy": 0.44694494605064394, "step": 50610 }, { "epoch": 0.05097996557354098, "grad_norm": 12.378444336336448, "learning_rate": 4.999988208238788e-05, "loss": 2.8065, "mean_token_accuracy": 0.3724137842655182, "step": 50615 }, { "epoch": 0.050985001626645154, "grad_norm": 12.907399789494304, "learning_rate": 4.999988086612456e-05, "loss": 3.0349, "mean_token_accuracy": 0.35862069129943847, "step": 50620 }, { "epoch": 0.05099003767974933, "grad_norm": 12.695328688508344, "learning_rate": 4.9999879643620836e-05, "loss": 2.9212, "mean_token_accuracy": 0.3655172407627106, "step": 50625 }, { "epoch": 0.050995073732853495, "grad_norm": 11.378454729698843, "learning_rate": 4.999987841487667e-05, "loss": 2.4957, "mean_token_accuracy": 0.4156079888343811, "step": 50630 }, { "epoch": 0.05100010978595767, "grad_norm": 14.206251095541488, "learning_rate": 4.9999877179892084e-05, "loss": 2.9049, "mean_token_accuracy": 0.39310344457626345, "step": 50635 }, { "epoch": 0.05100514583906184, "grad_norm": 16.434382829001443, "learning_rate": 4.9999875938667066e-05, "loss": 2.7233, "mean_token_accuracy": 0.3793103456497192, "step": 50640 }, { "epoch": 0.051010181892166016, "grad_norm": 12.849534715047369, "learning_rate": 4.999987469120164e-05, "loss": 2.9175, "mean_token_accuracy": 0.3827586233615875, "step": 50645 }, { "epoch": 0.05101521794527019, "grad_norm": 16.786675505839124, "learning_rate": 4.999987343749578e-05, "loss": 2.4359, "mean_token_accuracy": 0.39310344159603117, "step": 50650 }, { "epoch": 0.051020253998374364, "grad_norm": 12.34115087509322, "learning_rate": 4.999987217754949e-05, "loss": 2.7907, "mean_token_accuracy": 0.37586206793785093, "step": 50655 }, { "epoch": 0.05102529005147854, "grad_norm": 36.61510853950055, "learning_rate": 4.9999870911362775e-05, "loss": 3.0481, "mean_token_accuracy": 0.3620689630508423, "step": 50660 }, { "epoch": 0.051030326104582704, "grad_norm": 14.500048960266122, "learning_rate": 4.9999869638935645e-05, "loss": 2.6012, "mean_token_accuracy": 0.4053236484527588, "step": 50665 }, { "epoch": 0.05103536215768688, "grad_norm": 15.641473581959717, "learning_rate": 4.999986836026809e-05, "loss": 2.8094, "mean_token_accuracy": 0.4, "step": 50670 }, { "epoch": 0.05104039821079105, "grad_norm": 13.372895943097312, "learning_rate": 4.999986707536012e-05, "loss": 2.4642, "mean_token_accuracy": 0.4000000059604645, "step": 50675 }, { "epoch": 0.051045434263895226, "grad_norm": 11.158541350361103, "learning_rate": 4.999986578421172e-05, "loss": 2.4038, "mean_token_accuracy": 0.44827585816383364, "step": 50680 }, { "epoch": 0.0510504703169994, "grad_norm": 10.452277524536793, "learning_rate": 4.999986448682289e-05, "loss": 2.286, "mean_token_accuracy": 0.4103448331356049, "step": 50685 }, { "epoch": 0.05105550637010357, "grad_norm": 12.707008852328286, "learning_rate": 4.999986318319365e-05, "loss": 2.8223, "mean_token_accuracy": 0.3931034505367279, "step": 50690 }, { "epoch": 0.05106054242320775, "grad_norm": 12.874717018340522, "learning_rate": 4.999986187332399e-05, "loss": 2.4357, "mean_token_accuracy": 0.44827587008476255, "step": 50695 }, { "epoch": 0.051065578476311914, "grad_norm": 11.240808453968851, "learning_rate": 4.999986055721391e-05, "loss": 2.4639, "mean_token_accuracy": 0.45172412395477296, "step": 50700 }, { "epoch": 0.05107061452941609, "grad_norm": 14.117069554688578, "learning_rate": 4.999985923486341e-05, "loss": 2.6966, "mean_token_accuracy": 0.43448275327682495, "step": 50705 }, { "epoch": 0.05107565058252026, "grad_norm": 10.643264149205516, "learning_rate": 4.999985790627248e-05, "loss": 2.3975, "mean_token_accuracy": 0.4396249294281006, "step": 50710 }, { "epoch": 0.051080686635624435, "grad_norm": 12.059514891084545, "learning_rate": 4.999985657144114e-05, "loss": 2.204, "mean_token_accuracy": 0.47241379618644713, "step": 50715 }, { "epoch": 0.05108572268872861, "grad_norm": 13.301422299121736, "learning_rate": 4.999985523036937e-05, "loss": 2.8314, "mean_token_accuracy": 0.38275861740112305, "step": 50720 }, { "epoch": 0.05109075874183278, "grad_norm": 13.176480343357916, "learning_rate": 4.999985388305719e-05, "loss": 2.3718, "mean_token_accuracy": 0.4172413766384125, "step": 50725 }, { "epoch": 0.051095794794936956, "grad_norm": 13.078364829447723, "learning_rate": 4.999985252950459e-05, "loss": 2.8303, "mean_token_accuracy": 0.3793103456497192, "step": 50730 }, { "epoch": 0.05110083084804112, "grad_norm": 11.523671369849534, "learning_rate": 4.999985116971158e-05, "loss": 2.6696, "mean_token_accuracy": 0.4344827592372894, "step": 50735 }, { "epoch": 0.0511058669011453, "grad_norm": 14.418817469698231, "learning_rate": 4.999984980367814e-05, "loss": 2.3964, "mean_token_accuracy": 0.44482758045196535, "step": 50740 }, { "epoch": 0.05111090295424947, "grad_norm": 11.923611585157808, "learning_rate": 4.999984843140429e-05, "loss": 2.4344, "mean_token_accuracy": 0.41379310488700866, "step": 50745 }, { "epoch": 0.051115939007353645, "grad_norm": 11.478710784659865, "learning_rate": 4.9999847052890015e-05, "loss": 2.2794, "mean_token_accuracy": 0.45517241954803467, "step": 50750 }, { "epoch": 0.05112097506045782, "grad_norm": 12.27726143967275, "learning_rate": 4.999984566813533e-05, "loss": 2.8494, "mean_token_accuracy": 0.4032667875289917, "step": 50755 }, { "epoch": 0.05112601111356199, "grad_norm": 26.514712610701736, "learning_rate": 4.9999844277140224e-05, "loss": 2.976, "mean_token_accuracy": 0.3379310339689255, "step": 50760 }, { "epoch": 0.051131047166666166, "grad_norm": 16.6683723281907, "learning_rate": 4.999984287990471e-05, "loss": 2.6059, "mean_token_accuracy": 0.38275861740112305, "step": 50765 }, { "epoch": 0.05113608321977033, "grad_norm": 13.29468259950492, "learning_rate": 4.999984147642878e-05, "loss": 2.8289, "mean_token_accuracy": 0.33793103098869326, "step": 50770 }, { "epoch": 0.051141119272874506, "grad_norm": 10.503863866473427, "learning_rate": 4.999984006671243e-05, "loss": 2.4664, "mean_token_accuracy": 0.3896551728248596, "step": 50775 }, { "epoch": 0.05114615532597868, "grad_norm": 12.258165165755115, "learning_rate": 4.9999838650755667e-05, "loss": 3.0564, "mean_token_accuracy": 0.36551723480224607, "step": 50780 }, { "epoch": 0.051151191379082854, "grad_norm": 11.738062119497405, "learning_rate": 4.999983722855849e-05, "loss": 2.423, "mean_token_accuracy": 0.41379310488700866, "step": 50785 }, { "epoch": 0.05115622743218703, "grad_norm": 12.837625698816778, "learning_rate": 4.99998358001209e-05, "loss": 2.3323, "mean_token_accuracy": 0.42068964838981626, "step": 50790 }, { "epoch": 0.0511612634852912, "grad_norm": 9.94996131355436, "learning_rate": 4.9999834365442894e-05, "loss": 2.5248, "mean_token_accuracy": 0.4517241299152374, "step": 50795 }, { "epoch": 0.051166299538395375, "grad_norm": 12.888870671234065, "learning_rate": 4.999983292452448e-05, "loss": 2.9839, "mean_token_accuracy": 0.32413792610168457, "step": 50800 }, { "epoch": 0.05117133559149954, "grad_norm": 14.406153247881461, "learning_rate": 4.9999831477365645e-05, "loss": 3.2232, "mean_token_accuracy": 0.31724138259887696, "step": 50805 }, { "epoch": 0.051176371644603716, "grad_norm": 14.378786650015185, "learning_rate": 4.99998300239664e-05, "loss": 2.75, "mean_token_accuracy": 0.3655172407627106, "step": 50810 }, { "epoch": 0.05118140769770789, "grad_norm": 19.22735985157965, "learning_rate": 4.999982856432675e-05, "loss": 2.4734, "mean_token_accuracy": 0.41034482717514037, "step": 50815 }, { "epoch": 0.05118644375081206, "grad_norm": 15.522569753235098, "learning_rate": 4.999982709844669e-05, "loss": 2.9803, "mean_token_accuracy": 0.3448275804519653, "step": 50820 }, { "epoch": 0.05119147980391624, "grad_norm": 14.79317805750623, "learning_rate": 4.999982562632621e-05, "loss": 2.6382, "mean_token_accuracy": 0.38965516686439516, "step": 50825 }, { "epoch": 0.05119651585702041, "grad_norm": 12.34551948392326, "learning_rate": 4.999982414796532e-05, "loss": 2.3967, "mean_token_accuracy": 0.41379310488700866, "step": 50830 }, { "epoch": 0.051201551910124585, "grad_norm": 12.961008118984303, "learning_rate": 4.9999822663364026e-05, "loss": 2.5331, "mean_token_accuracy": 0.4068965494632721, "step": 50835 }, { "epoch": 0.05120658796322875, "grad_norm": 13.437188411337072, "learning_rate": 4.9999821172522315e-05, "loss": 2.4934, "mean_token_accuracy": 0.36896551251411436, "step": 50840 }, { "epoch": 0.051211624016332925, "grad_norm": 12.443311128190949, "learning_rate": 4.99998196754402e-05, "loss": 2.5719, "mean_token_accuracy": 0.3551724135875702, "step": 50845 }, { "epoch": 0.0512166600694371, "grad_norm": 14.096214012417848, "learning_rate": 4.9999818172117675e-05, "loss": 2.419, "mean_token_accuracy": 0.41379310488700866, "step": 50850 }, { "epoch": 0.05122169612254127, "grad_norm": 10.609964847332812, "learning_rate": 4.999981666255475e-05, "loss": 2.4425, "mean_token_accuracy": 0.38965516686439516, "step": 50855 }, { "epoch": 0.05122673217564545, "grad_norm": 12.23619203623958, "learning_rate": 4.99998151467514e-05, "loss": 2.2994, "mean_token_accuracy": 0.4275861978530884, "step": 50860 }, { "epoch": 0.05123176822874962, "grad_norm": 12.02649319209877, "learning_rate": 4.999981362470765e-05, "loss": 2.4735, "mean_token_accuracy": 0.43103447556495667, "step": 50865 }, { "epoch": 0.051236804281853794, "grad_norm": 9.680217071372095, "learning_rate": 4.9999812096423495e-05, "loss": 2.2326, "mean_token_accuracy": 0.4517241418361664, "step": 50870 }, { "epoch": 0.05124184033495796, "grad_norm": 15.594835691087551, "learning_rate": 4.999981056189893e-05, "loss": 3.6037, "mean_token_accuracy": 0.29310344755649564, "step": 50875 }, { "epoch": 0.051246876388062135, "grad_norm": 15.3248595892866, "learning_rate": 4.999980902113396e-05, "loss": 2.5861, "mean_token_accuracy": 0.42758620977401735, "step": 50880 }, { "epoch": 0.05125191244116631, "grad_norm": 12.458959127836875, "learning_rate": 4.9999807474128584e-05, "loss": 2.4134, "mean_token_accuracy": 0.39655172228813174, "step": 50885 }, { "epoch": 0.05125694849427048, "grad_norm": 12.11492084612793, "learning_rate": 4.9999805920882805e-05, "loss": 2.6406, "mean_token_accuracy": 0.4, "step": 50890 }, { "epoch": 0.051261984547374656, "grad_norm": 13.71665520899789, "learning_rate": 4.9999804361396615e-05, "loss": 2.7467, "mean_token_accuracy": 0.33103448152542114, "step": 50895 }, { "epoch": 0.05126702060047883, "grad_norm": 13.49433978997008, "learning_rate": 4.999980279567003e-05, "loss": 3.1413, "mean_token_accuracy": 0.33448275923728943, "step": 50900 }, { "epoch": 0.051272056653583004, "grad_norm": 14.392407630712961, "learning_rate": 4.999980122370303e-05, "loss": 2.4145, "mean_token_accuracy": 0.4068965494632721, "step": 50905 }, { "epoch": 0.05127709270668717, "grad_norm": 13.091900175287327, "learning_rate": 4.999979964549564e-05, "loss": 2.5835, "mean_token_accuracy": 0.37931033968925476, "step": 50910 }, { "epoch": 0.051282128759791344, "grad_norm": 14.003193125547206, "learning_rate": 4.999979806104783e-05, "loss": 2.9264, "mean_token_accuracy": 0.32413792312145234, "step": 50915 }, { "epoch": 0.05128716481289552, "grad_norm": 11.263117746217645, "learning_rate": 4.999979647035963e-05, "loss": 2.656, "mean_token_accuracy": 0.38965516686439516, "step": 50920 }, { "epoch": 0.05129220086599969, "grad_norm": 13.250377362367928, "learning_rate": 4.999979487343102e-05, "loss": 2.2998, "mean_token_accuracy": 0.43647912740707395, "step": 50925 }, { "epoch": 0.051297236919103865, "grad_norm": 14.187996994022388, "learning_rate": 4.9999793270262006e-05, "loss": 2.4715, "mean_token_accuracy": 0.3965517282485962, "step": 50930 }, { "epoch": 0.05130227297220804, "grad_norm": 10.238075499048039, "learning_rate": 4.9999791660852595e-05, "loss": 2.4173, "mean_token_accuracy": 0.47241379618644713, "step": 50935 }, { "epoch": 0.05130730902531221, "grad_norm": 11.385759304110067, "learning_rate": 4.999979004520278e-05, "loss": 2.7381, "mean_token_accuracy": 0.3551724076271057, "step": 50940 }, { "epoch": 0.05131234507841638, "grad_norm": 11.001773678247298, "learning_rate": 4.999978842331257e-05, "loss": 2.6548, "mean_token_accuracy": 0.33103448152542114, "step": 50945 }, { "epoch": 0.051317381131520554, "grad_norm": 18.430164746876034, "learning_rate": 4.9999786795181965e-05, "loss": 2.719, "mean_token_accuracy": 0.39310344457626345, "step": 50950 }, { "epoch": 0.05132241718462473, "grad_norm": 13.003007931588273, "learning_rate": 4.999978516081094e-05, "loss": 2.297, "mean_token_accuracy": 0.4620689630508423, "step": 50955 }, { "epoch": 0.0513274532377289, "grad_norm": 16.122748725297082, "learning_rate": 4.9999783520199534e-05, "loss": 2.8609, "mean_token_accuracy": 0.3843920171260834, "step": 50960 }, { "epoch": 0.051332489290833075, "grad_norm": 12.55460038783875, "learning_rate": 4.999978187334772e-05, "loss": 2.6097, "mean_token_accuracy": 0.4000000059604645, "step": 50965 }, { "epoch": 0.05133752534393725, "grad_norm": 12.255336830730991, "learning_rate": 4.999978022025551e-05, "loss": 2.1952, "mean_token_accuracy": 0.4497882604598999, "step": 50970 }, { "epoch": 0.05134256139704142, "grad_norm": 15.490637324884583, "learning_rate": 4.99997785609229e-05, "loss": 2.6487, "mean_token_accuracy": 0.41034482717514037, "step": 50975 }, { "epoch": 0.05134759745014559, "grad_norm": 10.169066129495206, "learning_rate": 4.99997768953499e-05, "loss": 2.8483, "mean_token_accuracy": 0.41724138259887694, "step": 50980 }, { "epoch": 0.05135263350324976, "grad_norm": 15.084322176414775, "learning_rate": 4.9999775223536494e-05, "loss": 2.635, "mean_token_accuracy": 0.4, "step": 50985 }, { "epoch": 0.05135766955635394, "grad_norm": 12.371185090920786, "learning_rate": 4.9999773545482694e-05, "loss": 2.5849, "mean_token_accuracy": 0.3793103456497192, "step": 50990 }, { "epoch": 0.05136270560945811, "grad_norm": 10.646839802882313, "learning_rate": 4.99997718611885e-05, "loss": 2.4614, "mean_token_accuracy": 0.4172413766384125, "step": 50995 }, { "epoch": 0.051367741662562284, "grad_norm": 13.205666166697197, "learning_rate": 4.999977017065391e-05, "loss": 2.5323, "mean_token_accuracy": 0.3896551728248596, "step": 51000 }, { "epoch": 0.05137277771566646, "grad_norm": 12.226788417692278, "learning_rate": 4.999976847387892e-05, "loss": 2.8101, "mean_token_accuracy": 0.37241379618644715, "step": 51005 }, { "epoch": 0.05137781376877063, "grad_norm": 12.031641548340382, "learning_rate": 4.999976677086354e-05, "loss": 2.4819, "mean_token_accuracy": 0.3793103337287903, "step": 51010 }, { "epoch": 0.0513828498218748, "grad_norm": 11.968784098747346, "learning_rate": 4.999976506160777e-05, "loss": 2.8444, "mean_token_accuracy": 0.41034482717514037, "step": 51015 }, { "epoch": 0.05138788587497897, "grad_norm": 11.478885496503556, "learning_rate": 4.9999763346111595e-05, "loss": 2.8047, "mean_token_accuracy": 0.3965517282485962, "step": 51020 }, { "epoch": 0.051392921928083146, "grad_norm": 12.300036949229812, "learning_rate": 4.999976162437504e-05, "loss": 2.6939, "mean_token_accuracy": 0.39310343861579894, "step": 51025 }, { "epoch": 0.05139795798118732, "grad_norm": 11.8263698470749, "learning_rate": 4.999975989639808e-05, "loss": 2.4497, "mean_token_accuracy": 0.42068964838981626, "step": 51030 }, { "epoch": 0.051402994034291494, "grad_norm": 11.737202767093606, "learning_rate": 4.9999758162180735e-05, "loss": 2.6731, "mean_token_accuracy": 0.3896551728248596, "step": 51035 }, { "epoch": 0.05140803008739567, "grad_norm": 11.514363948016042, "learning_rate": 4.9999756421722995e-05, "loss": 2.6212, "mean_token_accuracy": 0.39310344457626345, "step": 51040 }, { "epoch": 0.05141306614049984, "grad_norm": 10.675617574535094, "learning_rate": 4.9999754675024865e-05, "loss": 2.8054, "mean_token_accuracy": 0.35172412991523744, "step": 51045 }, { "epoch": 0.05141810219360401, "grad_norm": 12.256579072647709, "learning_rate": 4.9999752922086345e-05, "loss": 2.5923, "mean_token_accuracy": 0.3862068891525269, "step": 51050 }, { "epoch": 0.05142313824670818, "grad_norm": 12.154667936496908, "learning_rate": 4.999975116290743e-05, "loss": 2.8157, "mean_token_accuracy": 0.4137930989265442, "step": 51055 }, { "epoch": 0.051428174299812356, "grad_norm": 16.363384659472228, "learning_rate": 4.999974939748813e-05, "loss": 2.7208, "mean_token_accuracy": 0.43189655542373656, "step": 51060 }, { "epoch": 0.05143321035291653, "grad_norm": 12.830348100363567, "learning_rate": 4.999974762582843e-05, "loss": 2.8218, "mean_token_accuracy": 0.3862068891525269, "step": 51065 }, { "epoch": 0.0514382464060207, "grad_norm": 11.622833138463445, "learning_rate": 4.9999745847928354e-05, "loss": 2.7577, "mean_token_accuracy": 0.3448275804519653, "step": 51070 }, { "epoch": 0.05144328245912488, "grad_norm": 10.258774237612803, "learning_rate": 4.999974406378789e-05, "loss": 2.2819, "mean_token_accuracy": 0.47428916692733764, "step": 51075 }, { "epoch": 0.05144831851222905, "grad_norm": 12.441505544879986, "learning_rate": 4.999974227340703e-05, "loss": 2.6171, "mean_token_accuracy": 0.39655172228813174, "step": 51080 }, { "epoch": 0.05145335456533322, "grad_norm": 14.365569243100772, "learning_rate": 4.999974047678579e-05, "loss": 2.7089, "mean_token_accuracy": 0.3758620619773865, "step": 51085 }, { "epoch": 0.05145839061843739, "grad_norm": 13.577520838396728, "learning_rate": 4.999973867392415e-05, "loss": 2.9569, "mean_token_accuracy": 0.31724138259887696, "step": 51090 }, { "epoch": 0.051463426671541565, "grad_norm": 17.16753810806592, "learning_rate": 4.999973686482213e-05, "loss": 2.6726, "mean_token_accuracy": 0.37241379022598264, "step": 51095 }, { "epoch": 0.05146846272464574, "grad_norm": 11.49485354134092, "learning_rate": 4.999973504947974e-05, "loss": 2.826, "mean_token_accuracy": 0.3344827562570572, "step": 51100 }, { "epoch": 0.05147349877774991, "grad_norm": 11.053012541885813, "learning_rate": 4.9999733227896944e-05, "loss": 2.2644, "mean_token_accuracy": 0.4344827592372894, "step": 51105 }, { "epoch": 0.051478534830854086, "grad_norm": 11.770033855592528, "learning_rate": 4.999973140007377e-05, "loss": 2.4222, "mean_token_accuracy": 0.4503327250480652, "step": 51110 }, { "epoch": 0.05148357088395826, "grad_norm": 12.51707852906171, "learning_rate": 4.999972956601022e-05, "loss": 2.6553, "mean_token_accuracy": 0.37586206793785093, "step": 51115 }, { "epoch": 0.05148860693706243, "grad_norm": 13.271440631770309, "learning_rate": 4.999972772570627e-05, "loss": 2.6644, "mean_token_accuracy": 0.4103448212146759, "step": 51120 }, { "epoch": 0.0514936429901666, "grad_norm": 11.531979296531613, "learning_rate": 4.999972587916195e-05, "loss": 3.3401, "mean_token_accuracy": 0.31724137961864474, "step": 51125 }, { "epoch": 0.051498679043270774, "grad_norm": 11.839866141934124, "learning_rate": 4.999972402637724e-05, "loss": 2.6027, "mean_token_accuracy": 0.3724137842655182, "step": 51130 }, { "epoch": 0.05150371509637495, "grad_norm": 10.175524918131666, "learning_rate": 4.9999722167352143e-05, "loss": 2.4688, "mean_token_accuracy": 0.3949183315038681, "step": 51135 }, { "epoch": 0.05150875114947912, "grad_norm": 12.9938993814973, "learning_rate": 4.9999720302086674e-05, "loss": 2.4997, "mean_token_accuracy": 0.37586206793785093, "step": 51140 }, { "epoch": 0.051513787202583296, "grad_norm": 14.039919128716551, "learning_rate": 4.999971843058082e-05, "loss": 2.5303, "mean_token_accuracy": 0.41929823756217954, "step": 51145 }, { "epoch": 0.05151882325568747, "grad_norm": 10.985297110573095, "learning_rate": 4.9999716552834585e-05, "loss": 2.7327, "mean_token_accuracy": 0.4, "step": 51150 }, { "epoch": 0.051523859308791636, "grad_norm": 15.03183286569151, "learning_rate": 4.9999714668847966e-05, "loss": 2.5817, "mean_token_accuracy": 0.43103448748588563, "step": 51155 }, { "epoch": 0.05152889536189581, "grad_norm": 10.679756864212496, "learning_rate": 4.999971277862097e-05, "loss": 2.7841, "mean_token_accuracy": 0.4103448212146759, "step": 51160 }, { "epoch": 0.051533931414999984, "grad_norm": 12.764146387174659, "learning_rate": 4.99997108821536e-05, "loss": 2.4377, "mean_token_accuracy": 0.38620689511299133, "step": 51165 }, { "epoch": 0.05153896746810416, "grad_norm": 13.489231022979125, "learning_rate": 4.9999708979445844e-05, "loss": 2.3699, "mean_token_accuracy": 0.40689656138420105, "step": 51170 }, { "epoch": 0.05154400352120833, "grad_norm": 12.3125489594521, "learning_rate": 4.999970707049772e-05, "loss": 2.5374, "mean_token_accuracy": 0.42758620977401735, "step": 51175 }, { "epoch": 0.051549039574312505, "grad_norm": 12.669920988744435, "learning_rate": 4.999970515530921e-05, "loss": 2.1664, "mean_token_accuracy": 0.4862069010734558, "step": 51180 }, { "epoch": 0.05155407562741668, "grad_norm": 12.05119087802671, "learning_rate": 4.999970323388032e-05, "loss": 2.6343, "mean_token_accuracy": 0.36896551847457887, "step": 51185 }, { "epoch": 0.051559111680520846, "grad_norm": 32.00112245250176, "learning_rate": 4.999970130621106e-05, "loss": 3.1162, "mean_token_accuracy": 0.3655172407627106, "step": 51190 }, { "epoch": 0.05156414773362502, "grad_norm": 11.417235798421409, "learning_rate": 4.999969937230142e-05, "loss": 2.3447, "mean_token_accuracy": 0.44827585816383364, "step": 51195 }, { "epoch": 0.05156918378672919, "grad_norm": 13.061022705416535, "learning_rate": 4.9999697432151405e-05, "loss": 2.7388, "mean_token_accuracy": 0.3448275804519653, "step": 51200 }, { "epoch": 0.05157421983983337, "grad_norm": 10.423564866242934, "learning_rate": 4.9999695485761016e-05, "loss": 2.3348, "mean_token_accuracy": 0.4811857223510742, "step": 51205 }, { "epoch": 0.05157925589293754, "grad_norm": 12.289162809274536, "learning_rate": 4.999969353313025e-05, "loss": 2.7429, "mean_token_accuracy": 0.36896551847457887, "step": 51210 }, { "epoch": 0.051584291946041715, "grad_norm": 14.513978196107391, "learning_rate": 4.9999691574259116e-05, "loss": 2.2719, "mean_token_accuracy": 0.4310344815254211, "step": 51215 }, { "epoch": 0.05158932799914589, "grad_norm": 12.759548930126149, "learning_rate": 4.9999689609147605e-05, "loss": 3.0401, "mean_token_accuracy": 0.34482758641242983, "step": 51220 }, { "epoch": 0.051594364052250055, "grad_norm": 14.07693033166377, "learning_rate": 4.9999687637795724e-05, "loss": 2.635, "mean_token_accuracy": 0.37241379618644715, "step": 51225 }, { "epoch": 0.05159940010535423, "grad_norm": 12.769116993783562, "learning_rate": 4.999968566020346e-05, "loss": 2.4998, "mean_token_accuracy": 0.4068965494632721, "step": 51230 }, { "epoch": 0.0516044361584584, "grad_norm": 13.519669918323107, "learning_rate": 4.999968367637083e-05, "loss": 2.5875, "mean_token_accuracy": 0.4103448212146759, "step": 51235 }, { "epoch": 0.051609472211562576, "grad_norm": 13.091166493068675, "learning_rate": 4.999968168629783e-05, "loss": 3.0719, "mean_token_accuracy": 0.37931033968925476, "step": 51240 }, { "epoch": 0.05161450826466675, "grad_norm": 13.276585034756858, "learning_rate": 4.9999679689984464e-05, "loss": 2.7862, "mean_token_accuracy": 0.3620689630508423, "step": 51245 }, { "epoch": 0.051619544317770924, "grad_norm": 13.147118886267071, "learning_rate": 4.999967768743072e-05, "loss": 2.6313, "mean_token_accuracy": 0.37586206793785093, "step": 51250 }, { "epoch": 0.0516245803708751, "grad_norm": 11.7782902842485, "learning_rate": 4.999967567863661e-05, "loss": 2.563, "mean_token_accuracy": 0.3862068891525269, "step": 51255 }, { "epoch": 0.051629616423979265, "grad_norm": 13.650756641481388, "learning_rate": 4.999967366360213e-05, "loss": 2.2632, "mean_token_accuracy": 0.44827587008476255, "step": 51260 }, { "epoch": 0.05163465247708344, "grad_norm": 9.390638167383264, "learning_rate": 4.999967164232729e-05, "loss": 2.4564, "mean_token_accuracy": 0.41034482717514037, "step": 51265 }, { "epoch": 0.05163968853018761, "grad_norm": 11.207402260636966, "learning_rate": 4.999966961481207e-05, "loss": 2.3716, "mean_token_accuracy": 0.39655172228813174, "step": 51270 }, { "epoch": 0.051644724583291786, "grad_norm": 11.66384245821523, "learning_rate": 4.999966758105649e-05, "loss": 3.0666, "mean_token_accuracy": 0.34482758641242983, "step": 51275 }, { "epoch": 0.05164976063639596, "grad_norm": 16.26844994417877, "learning_rate": 4.9999665541060536e-05, "loss": 2.5265, "mean_token_accuracy": 0.3999999940395355, "step": 51280 }, { "epoch": 0.05165479668950013, "grad_norm": 9.416030849993486, "learning_rate": 4.999966349482423e-05, "loss": 2.1766, "mean_token_accuracy": 0.4379310369491577, "step": 51285 }, { "epoch": 0.05165983274260431, "grad_norm": 11.228840046175568, "learning_rate": 4.999966144234754e-05, "loss": 2.7072, "mean_token_accuracy": 0.35862069129943847, "step": 51290 }, { "epoch": 0.051664868795708474, "grad_norm": 17.258905051698523, "learning_rate": 4.99996593836305e-05, "loss": 2.8208, "mean_token_accuracy": 0.4119782209396362, "step": 51295 }, { "epoch": 0.05166990484881265, "grad_norm": 13.075396370550544, "learning_rate": 4.999965731867309e-05, "loss": 2.7675, "mean_token_accuracy": 0.4068965494632721, "step": 51300 }, { "epoch": 0.05167494090191682, "grad_norm": 13.227620589192181, "learning_rate": 4.999965524747531e-05, "loss": 2.5024, "mean_token_accuracy": 0.38106473684310915, "step": 51305 }, { "epoch": 0.051679976955020995, "grad_norm": 13.439786276457877, "learning_rate": 4.9999653170037175e-05, "loss": 2.6135, "mean_token_accuracy": 0.4137930989265442, "step": 51310 }, { "epoch": 0.05168501300812517, "grad_norm": 10.306234272154814, "learning_rate": 4.9999651086358676e-05, "loss": 2.5221, "mean_token_accuracy": 0.4310949832201004, "step": 51315 }, { "epoch": 0.05169004906122934, "grad_norm": 13.783639158989327, "learning_rate": 4.9999648996439814e-05, "loss": 2.668, "mean_token_accuracy": 0.35862069129943847, "step": 51320 }, { "epoch": 0.05169508511433352, "grad_norm": 21.178810627013096, "learning_rate": 4.999964690028059e-05, "loss": 2.7821, "mean_token_accuracy": 0.38275861740112305, "step": 51325 }, { "epoch": 0.05170012116743768, "grad_norm": 11.872769669137597, "learning_rate": 4.999964479788101e-05, "loss": 2.5241, "mean_token_accuracy": 0.4137930989265442, "step": 51330 }, { "epoch": 0.05170515722054186, "grad_norm": 16.811573310792177, "learning_rate": 4.999964268924106e-05, "loss": 2.6429, "mean_token_accuracy": 0.3620689630508423, "step": 51335 }, { "epoch": 0.05171019327364603, "grad_norm": 14.060322519721295, "learning_rate": 4.9999640574360754e-05, "loss": 2.2734, "mean_token_accuracy": 0.4379310369491577, "step": 51340 }, { "epoch": 0.051715229326750205, "grad_norm": 14.041968949191542, "learning_rate": 4.999963845324009e-05, "loss": 2.7971, "mean_token_accuracy": 0.37241379618644715, "step": 51345 }, { "epoch": 0.05172026537985438, "grad_norm": 26.808375329128097, "learning_rate": 4.999963632587906e-05, "loss": 3.2629, "mean_token_accuracy": 0.33793102502822875, "step": 51350 }, { "epoch": 0.05172530143295855, "grad_norm": 12.249344033156914, "learning_rate": 4.9999634192277686e-05, "loss": 2.3921, "mean_token_accuracy": 0.4413793087005615, "step": 51355 }, { "epoch": 0.051730337486062726, "grad_norm": 12.729540291138045, "learning_rate": 4.999963205243594e-05, "loss": 2.7996, "mean_token_accuracy": 0.39655172228813174, "step": 51360 }, { "epoch": 0.05173537353916689, "grad_norm": 13.20535539595901, "learning_rate": 4.999962990635385e-05, "loss": 2.9736, "mean_token_accuracy": 0.3896551728248596, "step": 51365 }, { "epoch": 0.05174040959227107, "grad_norm": 11.84301437140591, "learning_rate": 4.9999627754031395e-05, "loss": 2.3046, "mean_token_accuracy": 0.4413793057203293, "step": 51370 }, { "epoch": 0.05174544564537524, "grad_norm": 11.289490458225915, "learning_rate": 4.99996255954686e-05, "loss": 2.5873, "mean_token_accuracy": 0.38965516686439516, "step": 51375 }, { "epoch": 0.051750481698479414, "grad_norm": 15.591185201798947, "learning_rate": 4.999962343066543e-05, "loss": 2.7343, "mean_token_accuracy": 0.3551724135875702, "step": 51380 }, { "epoch": 0.05175551775158359, "grad_norm": 12.352233481478585, "learning_rate": 4.999962125962191e-05, "loss": 2.5227, "mean_token_accuracy": 0.3793103456497192, "step": 51385 }, { "epoch": 0.05176055380468776, "grad_norm": 14.817630170556203, "learning_rate": 4.999961908233804e-05, "loss": 3.08, "mean_token_accuracy": 0.3551724135875702, "step": 51390 }, { "epoch": 0.051765589857791935, "grad_norm": 11.995264548169345, "learning_rate": 4.999961689881382e-05, "loss": 2.673, "mean_token_accuracy": 0.39655172228813174, "step": 51395 }, { "epoch": 0.0517706259108961, "grad_norm": 11.389041458113113, "learning_rate": 4.9999614709049234e-05, "loss": 2.2819, "mean_token_accuracy": 0.4310344815254211, "step": 51400 }, { "epoch": 0.051775661964000276, "grad_norm": 12.232049000050218, "learning_rate": 4.9999612513044306e-05, "loss": 2.8424, "mean_token_accuracy": 0.3655172437429428, "step": 51405 }, { "epoch": 0.05178069801710445, "grad_norm": 11.672051781631591, "learning_rate": 4.999961031079903e-05, "loss": 2.5749, "mean_token_accuracy": 0.4, "step": 51410 }, { "epoch": 0.051785734070208624, "grad_norm": 12.653184436831527, "learning_rate": 4.99996081023134e-05, "loss": 2.6616, "mean_token_accuracy": 0.40344828367233276, "step": 51415 }, { "epoch": 0.0517907701233128, "grad_norm": 11.76446431990547, "learning_rate": 4.999960588758742e-05, "loss": 2.8634, "mean_token_accuracy": 0.404718691110611, "step": 51420 }, { "epoch": 0.05179580617641697, "grad_norm": 18.54822727521575, "learning_rate": 4.9999603666621085e-05, "loss": 2.7432, "mean_token_accuracy": 0.4310344815254211, "step": 51425 }, { "epoch": 0.051800842229521145, "grad_norm": 15.21484669007347, "learning_rate": 4.9999601439414405e-05, "loss": 2.6596, "mean_token_accuracy": 0.37241379022598264, "step": 51430 }, { "epoch": 0.05180587828262531, "grad_norm": 11.519423853241912, "learning_rate": 4.9999599205967376e-05, "loss": 2.4685, "mean_token_accuracy": 0.37241379618644715, "step": 51435 }, { "epoch": 0.051810914335729485, "grad_norm": 14.557774915715163, "learning_rate": 4.999959696628e-05, "loss": 2.6852, "mean_token_accuracy": 0.39655173420906065, "step": 51440 }, { "epoch": 0.05181595038883366, "grad_norm": 13.602200023616282, "learning_rate": 4.999959472035228e-05, "loss": 2.6285, "mean_token_accuracy": 0.39655172228813174, "step": 51445 }, { "epoch": 0.05182098644193783, "grad_norm": 18.327072905315713, "learning_rate": 4.99995924681842e-05, "loss": 2.4651, "mean_token_accuracy": 0.4034482717514038, "step": 51450 }, { "epoch": 0.05182602249504201, "grad_norm": 15.257442315203056, "learning_rate": 4.999959020977579e-05, "loss": 2.5188, "mean_token_accuracy": 0.43569267392158506, "step": 51455 }, { "epoch": 0.05183105854814618, "grad_norm": 11.769360807657616, "learning_rate": 4.999958794512703e-05, "loss": 2.4121, "mean_token_accuracy": 0.42413793206214906, "step": 51460 }, { "epoch": 0.051836094601250354, "grad_norm": 13.543668896104613, "learning_rate": 4.999958567423793e-05, "loss": 2.6288, "mean_token_accuracy": 0.3931034505367279, "step": 51465 }, { "epoch": 0.05184113065435452, "grad_norm": 13.846616772056356, "learning_rate": 4.999958339710847e-05, "loss": 2.5003, "mean_token_accuracy": 0.3931034356355667, "step": 51470 }, { "epoch": 0.051846166707458695, "grad_norm": 13.021003741549926, "learning_rate": 4.999958111373867e-05, "loss": 2.6672, "mean_token_accuracy": 0.3931034505367279, "step": 51475 }, { "epoch": 0.05185120276056287, "grad_norm": 24.264462849697768, "learning_rate": 4.999957882412855e-05, "loss": 2.6374, "mean_token_accuracy": 0.4186932861804962, "step": 51480 }, { "epoch": 0.05185623881366704, "grad_norm": 11.040854275060669, "learning_rate": 4.999957652827807e-05, "loss": 2.3008, "mean_token_accuracy": 0.44827587008476255, "step": 51485 }, { "epoch": 0.051861274866771216, "grad_norm": 13.086499436466484, "learning_rate": 4.9999574226187244e-05, "loss": 2.3235, "mean_token_accuracy": 0.41034482717514037, "step": 51490 }, { "epoch": 0.05186631091987539, "grad_norm": 15.227021717539586, "learning_rate": 4.999957191785608e-05, "loss": 2.8223, "mean_token_accuracy": 0.38396853804588316, "step": 51495 }, { "epoch": 0.051871346972979564, "grad_norm": 13.044571434700508, "learning_rate": 4.999956960328458e-05, "loss": 2.9737, "mean_token_accuracy": 0.3517241358757019, "step": 51500 }, { "epoch": 0.05187638302608373, "grad_norm": 13.56020772141275, "learning_rate": 4.999956728247274e-05, "loss": 2.955, "mean_token_accuracy": 0.3344827562570572, "step": 51505 }, { "epoch": 0.051881419079187904, "grad_norm": 13.283307949099592, "learning_rate": 4.999956495542057e-05, "loss": 2.6386, "mean_token_accuracy": 0.38275861740112305, "step": 51510 }, { "epoch": 0.05188645513229208, "grad_norm": 15.351473583826742, "learning_rate": 4.9999562622128045e-05, "loss": 2.4528, "mean_token_accuracy": 0.3965517163276672, "step": 51515 }, { "epoch": 0.05189149118539625, "grad_norm": 14.749473682088102, "learning_rate": 4.999956028259519e-05, "loss": 2.6267, "mean_token_accuracy": 0.3448275804519653, "step": 51520 }, { "epoch": 0.051896527238500426, "grad_norm": 11.639286347159493, "learning_rate": 4.9999557936822e-05, "loss": 2.3788, "mean_token_accuracy": 0.4137930989265442, "step": 51525 }, { "epoch": 0.0519015632916046, "grad_norm": 14.391390418917288, "learning_rate": 4.9999555584808465e-05, "loss": 2.8489, "mean_token_accuracy": 0.33793103098869326, "step": 51530 }, { "epoch": 0.05190659934470877, "grad_norm": 13.133728509129215, "learning_rate": 4.99995532265546e-05, "loss": 2.5761, "mean_token_accuracy": 0.37586206793785093, "step": 51535 }, { "epoch": 0.05191163539781294, "grad_norm": 12.494381510844713, "learning_rate": 4.99995508620604e-05, "loss": 2.8219, "mean_token_accuracy": 0.3758620619773865, "step": 51540 }, { "epoch": 0.051916671450917114, "grad_norm": 11.07871394599296, "learning_rate": 4.999954849132588e-05, "loss": 2.907, "mean_token_accuracy": 0.36896551251411436, "step": 51545 }, { "epoch": 0.05192170750402129, "grad_norm": 13.829573751328976, "learning_rate": 4.999954611435101e-05, "loss": 2.7079, "mean_token_accuracy": 0.358620685338974, "step": 51550 }, { "epoch": 0.05192674355712546, "grad_norm": 11.982410234925275, "learning_rate": 4.99995437311358e-05, "loss": 2.5301, "mean_token_accuracy": 0.39655172228813174, "step": 51555 }, { "epoch": 0.051931779610229635, "grad_norm": 12.280582909639634, "learning_rate": 4.999954134168027e-05, "loss": 2.8146, "mean_token_accuracy": 0.39310344457626345, "step": 51560 }, { "epoch": 0.05193681566333381, "grad_norm": 12.607713751972994, "learning_rate": 4.999953894598441e-05, "loss": 2.7687, "mean_token_accuracy": 0.4310344815254211, "step": 51565 }, { "epoch": 0.05194185171643798, "grad_norm": 17.117914948019205, "learning_rate": 4.999953654404821e-05, "loss": 2.938, "mean_token_accuracy": 0.3586206942796707, "step": 51570 }, { "epoch": 0.05194688776954215, "grad_norm": 13.74302062263069, "learning_rate": 4.999953413587169e-05, "loss": 2.9738, "mean_token_accuracy": 0.35862069129943847, "step": 51575 }, { "epoch": 0.05195192382264632, "grad_norm": 12.981803458447178, "learning_rate": 4.999953172145484e-05, "loss": 2.9753, "mean_token_accuracy": 0.324137932062149, "step": 51580 }, { "epoch": 0.0519569598757505, "grad_norm": 15.36083719700262, "learning_rate": 4.999952930079765e-05, "loss": 2.8738, "mean_token_accuracy": 0.3620689630508423, "step": 51585 }, { "epoch": 0.05196199592885467, "grad_norm": 9.99752250040171, "learning_rate": 4.9999526873900133e-05, "loss": 2.3806, "mean_token_accuracy": 0.4103448212146759, "step": 51590 }, { "epoch": 0.051967031981958844, "grad_norm": 19.43790074288775, "learning_rate": 4.999952444076229e-05, "loss": 2.9239, "mean_token_accuracy": 0.3482758581638336, "step": 51595 }, { "epoch": 0.05197206803506302, "grad_norm": 13.717880053116458, "learning_rate": 4.999952200138413e-05, "loss": 2.5889, "mean_token_accuracy": 0.3896551787853241, "step": 51600 }, { "epoch": 0.05197710408816719, "grad_norm": 11.498437928134406, "learning_rate": 4.9999519555765625e-05, "loss": 2.7037, "mean_token_accuracy": 0.3551724165678024, "step": 51605 }, { "epoch": 0.05198214014127136, "grad_norm": 12.341570437492297, "learning_rate": 4.9999517103906813e-05, "loss": 2.8332, "mean_token_accuracy": 0.3620689630508423, "step": 51610 }, { "epoch": 0.05198717619437553, "grad_norm": 14.564671008418772, "learning_rate": 4.9999514645807666e-05, "loss": 2.2651, "mean_token_accuracy": 0.47241379618644713, "step": 51615 }, { "epoch": 0.051992212247479706, "grad_norm": 12.519847970455341, "learning_rate": 4.99995121814682e-05, "loss": 3.4641, "mean_token_accuracy": 0.2758620709180832, "step": 51620 }, { "epoch": 0.05199724830058388, "grad_norm": 11.639487803280984, "learning_rate": 4.9999509710888406e-05, "loss": 2.4807, "mean_token_accuracy": 0.4310344815254211, "step": 51625 }, { "epoch": 0.052002284353688054, "grad_norm": 9.916910592851048, "learning_rate": 4.9999507234068285e-05, "loss": 2.4368, "mean_token_accuracy": 0.44827585816383364, "step": 51630 }, { "epoch": 0.05200732040679223, "grad_norm": 12.906453941158953, "learning_rate": 4.999950475100785e-05, "loss": 3.1005, "mean_token_accuracy": 0.3310344755649567, "step": 51635 }, { "epoch": 0.0520123564598964, "grad_norm": 24.560825195198234, "learning_rate": 4.999950226170709e-05, "loss": 3.3394, "mean_token_accuracy": 0.33448275923728943, "step": 51640 }, { "epoch": 0.05201739251300057, "grad_norm": 10.597612371892525, "learning_rate": 4.9999499766166006e-05, "loss": 2.4425, "mean_token_accuracy": 0.4206896543502808, "step": 51645 }, { "epoch": 0.05202242856610474, "grad_norm": 12.16400483823541, "learning_rate": 4.9999497264384604e-05, "loss": 2.5568, "mean_token_accuracy": 0.3758620619773865, "step": 51650 }, { "epoch": 0.052027464619208916, "grad_norm": 12.369190939519763, "learning_rate": 4.9999494756362886e-05, "loss": 2.7597, "mean_token_accuracy": 0.4172413766384125, "step": 51655 }, { "epoch": 0.05203250067231309, "grad_norm": 13.127422412411152, "learning_rate": 4.999949224210085e-05, "loss": 2.1539, "mean_token_accuracy": 0.4551724076271057, "step": 51660 }, { "epoch": 0.05203753672541726, "grad_norm": 14.268059670233528, "learning_rate": 4.999948972159849e-05, "loss": 2.9776, "mean_token_accuracy": 0.3827586233615875, "step": 51665 }, { "epoch": 0.05204257277852144, "grad_norm": 11.595923127274558, "learning_rate": 4.9999487194855815e-05, "loss": 2.7785, "mean_token_accuracy": 0.403448274731636, "step": 51670 }, { "epoch": 0.05204760883162561, "grad_norm": 12.720134450636516, "learning_rate": 4.999948466187282e-05, "loss": 2.6524, "mean_token_accuracy": 0.36206896901130675, "step": 51675 }, { "epoch": 0.05205264488472978, "grad_norm": 15.636618475390302, "learning_rate": 4.999948212264952e-05, "loss": 2.7185, "mean_token_accuracy": 0.33103448152542114, "step": 51680 }, { "epoch": 0.05205768093783395, "grad_norm": 14.81298550824267, "learning_rate": 4.999947957718589e-05, "loss": 2.7656, "mean_token_accuracy": 0.3655172407627106, "step": 51685 }, { "epoch": 0.052062716990938125, "grad_norm": 10.339322387030133, "learning_rate": 4.999947702548196e-05, "loss": 2.6231, "mean_token_accuracy": 0.39655172228813174, "step": 51690 }, { "epoch": 0.0520677530440423, "grad_norm": 13.347273905295713, "learning_rate": 4.999947446753771e-05, "loss": 2.6441, "mean_token_accuracy": 0.41911675930023196, "step": 51695 }, { "epoch": 0.05207278909714647, "grad_norm": 10.594017643763067, "learning_rate": 4.999947190335314e-05, "loss": 2.4394, "mean_token_accuracy": 0.43793103098869324, "step": 51700 }, { "epoch": 0.052077825150250646, "grad_norm": 12.803844328009665, "learning_rate": 4.999946933292827e-05, "loss": 2.7878, "mean_token_accuracy": 0.38620689511299133, "step": 51705 }, { "epoch": 0.05208286120335482, "grad_norm": 14.976573455990588, "learning_rate": 4.9999466756263075e-05, "loss": 2.3552, "mean_token_accuracy": 0.37241379022598264, "step": 51710 }, { "epoch": 0.05208789725645899, "grad_norm": 12.657586640069539, "learning_rate": 4.9999464173357574e-05, "loss": 2.416, "mean_token_accuracy": 0.43448275327682495, "step": 51715 }, { "epoch": 0.05209293330956316, "grad_norm": 12.644352917246177, "learning_rate": 4.999946158421177e-05, "loss": 2.4875, "mean_token_accuracy": 0.417241370677948, "step": 51720 }, { "epoch": 0.052097969362667335, "grad_norm": 12.32544767388427, "learning_rate": 4.9999458988825646e-05, "loss": 2.7227, "mean_token_accuracy": 0.38620689511299133, "step": 51725 }, { "epoch": 0.05210300541577151, "grad_norm": 12.918179747887688, "learning_rate": 4.999945638719922e-05, "loss": 2.9189, "mean_token_accuracy": 0.32413792610168457, "step": 51730 }, { "epoch": 0.05210804146887568, "grad_norm": 92.34655459463046, "learning_rate": 4.999945377933248e-05, "loss": 2.6129, "mean_token_accuracy": 0.4310344696044922, "step": 51735 }, { "epoch": 0.052113077521979856, "grad_norm": 12.836650729429833, "learning_rate": 4.999945116522544e-05, "loss": 2.2936, "mean_token_accuracy": 0.4862069010734558, "step": 51740 }, { "epoch": 0.05211811357508403, "grad_norm": 11.377132865226105, "learning_rate": 4.9999448544878086e-05, "loss": 2.6369, "mean_token_accuracy": 0.38620689511299133, "step": 51745 }, { "epoch": 0.052123149628188196, "grad_norm": 10.256913076416197, "learning_rate": 4.999944591829043e-05, "loss": 2.3834, "mean_token_accuracy": 0.4344827592372894, "step": 51750 }, { "epoch": 0.05212818568129237, "grad_norm": 11.238740482668224, "learning_rate": 4.999944328546247e-05, "loss": 2.3435, "mean_token_accuracy": 0.45172414779663084, "step": 51755 }, { "epoch": 0.052133221734396544, "grad_norm": 14.771451343348712, "learning_rate": 4.9999440646394204e-05, "loss": 2.466, "mean_token_accuracy": 0.4137930989265442, "step": 51760 }, { "epoch": 0.05213825778750072, "grad_norm": 17.362302224155048, "learning_rate": 4.999943800108564e-05, "loss": 2.2653, "mean_token_accuracy": 0.4801724135875702, "step": 51765 }, { "epoch": 0.05214329384060489, "grad_norm": 14.571726134069939, "learning_rate": 4.999943534953676e-05, "loss": 2.3732, "mean_token_accuracy": 0.4448275864124298, "step": 51770 }, { "epoch": 0.052148329893709065, "grad_norm": 13.157735294888793, "learning_rate": 4.999943269174759e-05, "loss": 2.5321, "mean_token_accuracy": 0.4241379380226135, "step": 51775 }, { "epoch": 0.05215336594681324, "grad_norm": 15.09499073769964, "learning_rate": 4.9999430027718106e-05, "loss": 3.0651, "mean_token_accuracy": 0.3000000029802322, "step": 51780 }, { "epoch": 0.052158401999917406, "grad_norm": 12.929928086677865, "learning_rate": 4.999942735744833e-05, "loss": 2.6868, "mean_token_accuracy": 0.37586206793785093, "step": 51785 }, { "epoch": 0.05216343805302158, "grad_norm": 12.062415224600912, "learning_rate": 4.999942468093825e-05, "loss": 3.1127, "mean_token_accuracy": 0.35517241060733795, "step": 51790 }, { "epoch": 0.05216847410612575, "grad_norm": 12.788421092382398, "learning_rate": 4.999942199818787e-05, "loss": 2.6278, "mean_token_accuracy": 0.42256503701210024, "step": 51795 }, { "epoch": 0.05217351015922993, "grad_norm": 24.8238997749523, "learning_rate": 4.99994193091972e-05, "loss": 2.7558, "mean_token_accuracy": 0.4068965494632721, "step": 51800 }, { "epoch": 0.0521785462123341, "grad_norm": 12.04873849761242, "learning_rate": 4.999941661396622e-05, "loss": 2.7465, "mean_token_accuracy": 0.39655172228813174, "step": 51805 }, { "epoch": 0.052183582265438275, "grad_norm": 15.141144305014784, "learning_rate": 4.999941391249495e-05, "loss": 3.0766, "mean_token_accuracy": 0.36551724672317504, "step": 51810 }, { "epoch": 0.05218861831854245, "grad_norm": 12.924091047932116, "learning_rate": 4.9999411204783386e-05, "loss": 2.5162, "mean_token_accuracy": 0.43103448748588563, "step": 51815 }, { "epoch": 0.052193654371646615, "grad_norm": 11.939436121383284, "learning_rate": 4.999940849083152e-05, "loss": 2.6405, "mean_token_accuracy": 0.3551724076271057, "step": 51820 }, { "epoch": 0.05219869042475079, "grad_norm": 11.766971299794431, "learning_rate": 4.9999405770639366e-05, "loss": 2.4433, "mean_token_accuracy": 0.4206896543502808, "step": 51825 }, { "epoch": 0.05220372647785496, "grad_norm": 13.464807863130709, "learning_rate": 4.9999403044206916e-05, "loss": 2.6405, "mean_token_accuracy": 0.36896551847457887, "step": 51830 }, { "epoch": 0.05220876253095914, "grad_norm": 10.643781827120558, "learning_rate": 4.9999400311534165e-05, "loss": 2.5752, "mean_token_accuracy": 0.44137930274009707, "step": 51835 }, { "epoch": 0.05221379858406331, "grad_norm": 14.622420326068994, "learning_rate": 4.999939757262113e-05, "loss": 2.5617, "mean_token_accuracy": 0.41034482717514037, "step": 51840 }, { "epoch": 0.052218834637167484, "grad_norm": 12.281396744478528, "learning_rate": 4.999939482746779e-05, "loss": 2.7527, "mean_token_accuracy": 0.38620689511299133, "step": 51845 }, { "epoch": 0.05222387069027166, "grad_norm": 13.165128704313105, "learning_rate": 4.9999392076074176e-05, "loss": 2.2695, "mean_token_accuracy": 0.46394434571266174, "step": 51850 }, { "epoch": 0.052228906743375825, "grad_norm": 11.12759346888424, "learning_rate": 4.999938931844026e-05, "loss": 2.5365, "mean_token_accuracy": 0.38965516686439516, "step": 51855 }, { "epoch": 0.05223394279648, "grad_norm": 11.397626474171725, "learning_rate": 4.9999386554566067e-05, "loss": 2.9969, "mean_token_accuracy": 0.3344827562570572, "step": 51860 }, { "epoch": 0.05223897884958417, "grad_norm": 12.487777989322241, "learning_rate": 4.999938378445157e-05, "loss": 2.8486, "mean_token_accuracy": 0.3862069010734558, "step": 51865 }, { "epoch": 0.052244014902688346, "grad_norm": 9.433906355649128, "learning_rate": 4.99993810080968e-05, "loss": 2.3709, "mean_token_accuracy": 0.3896551728248596, "step": 51870 }, { "epoch": 0.05224905095579252, "grad_norm": 11.904287031743506, "learning_rate": 4.9999378225501734e-05, "loss": 2.4249, "mean_token_accuracy": 0.4241379380226135, "step": 51875 }, { "epoch": 0.052254087008896694, "grad_norm": 12.040427949653834, "learning_rate": 4.999937543666638e-05, "loss": 2.3865, "mean_token_accuracy": 0.47241379618644713, "step": 51880 }, { "epoch": 0.05225912306200087, "grad_norm": 13.920480453844364, "learning_rate": 4.999937264159074e-05, "loss": 2.4145, "mean_token_accuracy": 0.3999999940395355, "step": 51885 }, { "epoch": 0.052264159115105034, "grad_norm": 14.586932860755967, "learning_rate": 4.999936984027483e-05, "loss": 2.462, "mean_token_accuracy": 0.3793103456497192, "step": 51890 }, { "epoch": 0.05226919516820921, "grad_norm": 18.092399449270154, "learning_rate": 4.999936703271862e-05, "loss": 2.507, "mean_token_accuracy": 0.40302479863166807, "step": 51895 }, { "epoch": 0.05227423122131338, "grad_norm": 14.177754307940212, "learning_rate": 4.9999364218922126e-05, "loss": 3.1606, "mean_token_accuracy": 0.35172413289546967, "step": 51900 }, { "epoch": 0.052279267274417555, "grad_norm": 10.697912852304334, "learning_rate": 4.999936139888535e-05, "loss": 2.2602, "mean_token_accuracy": 0.4482758641242981, "step": 51905 }, { "epoch": 0.05228430332752173, "grad_norm": 11.28488865369071, "learning_rate": 4.99993585726083e-05, "loss": 2.5272, "mean_token_accuracy": 0.3655172407627106, "step": 51910 }, { "epoch": 0.0522893393806259, "grad_norm": 12.66233430467587, "learning_rate": 4.9999355740090956e-05, "loss": 3.0583, "mean_token_accuracy": 0.358620685338974, "step": 51915 }, { "epoch": 0.05229437543373008, "grad_norm": 10.620397930173764, "learning_rate": 4.9999352901333346e-05, "loss": 2.8388, "mean_token_accuracy": 0.3655172407627106, "step": 51920 }, { "epoch": 0.052299411486834244, "grad_norm": 13.238533620290577, "learning_rate": 4.9999350056335454e-05, "loss": 2.7829, "mean_token_accuracy": 0.38275861740112305, "step": 51925 }, { "epoch": 0.05230444753993842, "grad_norm": 12.28203650063212, "learning_rate": 4.999934720509728e-05, "loss": 2.4589, "mean_token_accuracy": 0.3931034505367279, "step": 51930 }, { "epoch": 0.05230948359304259, "grad_norm": 20.535431642558393, "learning_rate": 4.9999344347618826e-05, "loss": 2.7109, "mean_token_accuracy": 0.41240169405937194, "step": 51935 }, { "epoch": 0.052314519646146765, "grad_norm": 11.985089735323992, "learning_rate": 4.99993414839001e-05, "loss": 2.4698, "mean_token_accuracy": 0.36206896901130675, "step": 51940 }, { "epoch": 0.05231955569925094, "grad_norm": 13.02402898449528, "learning_rate": 4.999933861394109e-05, "loss": 2.7767, "mean_token_accuracy": 0.37931033968925476, "step": 51945 }, { "epoch": 0.05232459175235511, "grad_norm": 11.319896378824003, "learning_rate": 4.9999335737741814e-05, "loss": 2.5596, "mean_token_accuracy": 0.4034482717514038, "step": 51950 }, { "epoch": 0.052329627805459286, "grad_norm": 12.545421124510765, "learning_rate": 4.9999332855302254e-05, "loss": 2.1279, "mean_token_accuracy": 0.47586206793785096, "step": 51955 }, { "epoch": 0.05233466385856345, "grad_norm": 14.406110329255547, "learning_rate": 4.999932996662243e-05, "loss": 3.1266, "mean_token_accuracy": 0.3275862067937851, "step": 51960 }, { "epoch": 0.05233969991166763, "grad_norm": 17.209945065764238, "learning_rate": 4.999932707170233e-05, "loss": 3.1088, "mean_token_accuracy": 0.341379314661026, "step": 51965 }, { "epoch": 0.0523447359647718, "grad_norm": 12.8563753325561, "learning_rate": 4.999932417054196e-05, "loss": 2.3161, "mean_token_accuracy": 0.4620689630508423, "step": 51970 }, { "epoch": 0.052349772017875974, "grad_norm": 9.630827999758104, "learning_rate": 4.999932126314131e-05, "loss": 2.6604, "mean_token_accuracy": 0.37931033968925476, "step": 51975 }, { "epoch": 0.05235480807098015, "grad_norm": 18.231017588421746, "learning_rate": 4.9999318349500394e-05, "loss": 3.3551, "mean_token_accuracy": 0.34137930870056155, "step": 51980 }, { "epoch": 0.05235984412408432, "grad_norm": 11.046257661122137, "learning_rate": 4.9999315429619206e-05, "loss": 2.8109, "mean_token_accuracy": 0.37586205899715425, "step": 51985 }, { "epoch": 0.052364880177188496, "grad_norm": 11.303123043893011, "learning_rate": 4.999931250349775e-05, "loss": 2.6773, "mean_token_accuracy": 0.38275861740112305, "step": 51990 }, { "epoch": 0.05236991623029266, "grad_norm": 13.253435412221885, "learning_rate": 4.999930957113602e-05, "loss": 2.7006, "mean_token_accuracy": 0.3965517282485962, "step": 51995 }, { "epoch": 0.052374952283396836, "grad_norm": 11.63368960927172, "learning_rate": 4.9999306632534034e-05, "loss": 2.4454, "mean_token_accuracy": 0.41724138259887694, "step": 52000 }, { "epoch": 0.05237998833650101, "grad_norm": 12.17144724304306, "learning_rate": 4.999930368769178e-05, "loss": 2.1679, "mean_token_accuracy": 0.48275861144065857, "step": 52005 }, { "epoch": 0.052385024389605184, "grad_norm": 13.787507713672492, "learning_rate": 4.999930073660925e-05, "loss": 2.521, "mean_token_accuracy": 0.42758620977401735, "step": 52010 }, { "epoch": 0.05239006044270936, "grad_norm": 13.54498544070483, "learning_rate": 4.9999297779286465e-05, "loss": 2.3914, "mean_token_accuracy": 0.3931034505367279, "step": 52015 }, { "epoch": 0.05239509649581353, "grad_norm": 13.138131937217258, "learning_rate": 4.999929481572341e-05, "loss": 2.5963, "mean_token_accuracy": 0.38487597107887267, "step": 52020 }, { "epoch": 0.052400132548917705, "grad_norm": 12.979288464583135, "learning_rate": 4.99992918459201e-05, "loss": 2.7456, "mean_token_accuracy": 0.3655172407627106, "step": 52025 }, { "epoch": 0.05240516860202187, "grad_norm": 14.21718228221012, "learning_rate": 4.999928886987651e-05, "loss": 2.6792, "mean_token_accuracy": 0.4206896543502808, "step": 52030 }, { "epoch": 0.052410204655126046, "grad_norm": 12.966998853878286, "learning_rate": 4.999928588759267e-05, "loss": 2.8929, "mean_token_accuracy": 0.3793103456497192, "step": 52035 }, { "epoch": 0.05241524070823022, "grad_norm": 14.839121343102141, "learning_rate": 4.999928289906858e-05, "loss": 2.7162, "mean_token_accuracy": 0.38275861740112305, "step": 52040 }, { "epoch": 0.05242027676133439, "grad_norm": 11.869848406801163, "learning_rate": 4.999927990430421e-05, "loss": 2.6238, "mean_token_accuracy": 0.3862069010734558, "step": 52045 }, { "epoch": 0.05242531281443857, "grad_norm": 11.81959211718063, "learning_rate": 4.99992769032996e-05, "loss": 2.6896, "mean_token_accuracy": 0.3827586233615875, "step": 52050 }, { "epoch": 0.05243034886754274, "grad_norm": 10.656825799457476, "learning_rate": 4.999927389605471e-05, "loss": 2.3044, "mean_token_accuracy": 0.41222020983695984, "step": 52055 }, { "epoch": 0.052435384920646914, "grad_norm": 10.987871883063795, "learning_rate": 4.999927088256958e-05, "loss": 2.8076, "mean_token_accuracy": 0.34827586114406583, "step": 52060 }, { "epoch": 0.05244042097375108, "grad_norm": 10.797993233098772, "learning_rate": 4.9999267862844176e-05, "loss": 2.6516, "mean_token_accuracy": 0.3724137842655182, "step": 52065 }, { "epoch": 0.052445457026855255, "grad_norm": 12.458782090006233, "learning_rate": 4.9999264836878536e-05, "loss": 2.9152, "mean_token_accuracy": 0.3551724135875702, "step": 52070 }, { "epoch": 0.05245049307995943, "grad_norm": 13.23608192693537, "learning_rate": 4.9999261804672634e-05, "loss": 2.4127, "mean_token_accuracy": 0.4068965554237366, "step": 52075 }, { "epoch": 0.0524555291330636, "grad_norm": 11.488794148054623, "learning_rate": 4.999925876622647e-05, "loss": 2.1559, "mean_token_accuracy": 0.4156079888343811, "step": 52080 }, { "epoch": 0.052460565186167776, "grad_norm": 12.090644229971591, "learning_rate": 4.999925572154006e-05, "loss": 2.5503, "mean_token_accuracy": 0.358620685338974, "step": 52085 }, { "epoch": 0.05246560123927195, "grad_norm": 10.342209494298395, "learning_rate": 4.99992526706134e-05, "loss": 2.2552, "mean_token_accuracy": 0.4689655065536499, "step": 52090 }, { "epoch": 0.052470637292376124, "grad_norm": 14.598708871373963, "learning_rate": 4.999924961344648e-05, "loss": 2.7511, "mean_token_accuracy": 0.3999999940395355, "step": 52095 }, { "epoch": 0.05247567334548029, "grad_norm": 14.616971671790006, "learning_rate": 4.9999246550039316e-05, "loss": 2.2227, "mean_token_accuracy": 0.4551724076271057, "step": 52100 }, { "epoch": 0.052480709398584464, "grad_norm": 12.528962196313383, "learning_rate": 4.99992434803919e-05, "loss": 2.862, "mean_token_accuracy": 0.3827586233615875, "step": 52105 }, { "epoch": 0.05248574545168864, "grad_norm": 12.023485513785992, "learning_rate": 4.9999240404504226e-05, "loss": 2.8143, "mean_token_accuracy": 0.3965517282485962, "step": 52110 }, { "epoch": 0.05249078150479281, "grad_norm": 11.761039655052471, "learning_rate": 4.999923732237632e-05, "loss": 2.3418, "mean_token_accuracy": 0.4034482717514038, "step": 52115 }, { "epoch": 0.052495817557896986, "grad_norm": 13.230800703921457, "learning_rate": 4.999923423400816e-05, "loss": 2.5962, "mean_token_accuracy": 0.37931033968925476, "step": 52120 }, { "epoch": 0.05250085361100116, "grad_norm": 14.240433525950861, "learning_rate": 4.999923113939975e-05, "loss": 2.4569, "mean_token_accuracy": 0.41034482717514037, "step": 52125 }, { "epoch": 0.05250588966410533, "grad_norm": 12.457293250188362, "learning_rate": 4.9999228038551096e-05, "loss": 2.1803, "mean_token_accuracy": 0.458620685338974, "step": 52130 }, { "epoch": 0.0525109257172095, "grad_norm": 12.27904909587513, "learning_rate": 4.99992249314622e-05, "loss": 2.7667, "mean_token_accuracy": 0.37931033968925476, "step": 52135 }, { "epoch": 0.052515961770313674, "grad_norm": 11.14996876610598, "learning_rate": 4.9999221818133055e-05, "loss": 2.6647, "mean_token_accuracy": 0.4068965494632721, "step": 52140 }, { "epoch": 0.05252099782341785, "grad_norm": 9.900206047952885, "learning_rate": 4.9999218698563674e-05, "loss": 2.7649, "mean_token_accuracy": 0.3827586233615875, "step": 52145 }, { "epoch": 0.05252603387652202, "grad_norm": 12.207255185333542, "learning_rate": 4.9999215572754045e-05, "loss": 2.646, "mean_token_accuracy": 0.34137930870056155, "step": 52150 }, { "epoch": 0.052531069929626195, "grad_norm": 10.577116839930989, "learning_rate": 4.9999212440704175e-05, "loss": 2.7892, "mean_token_accuracy": 0.3965517282485962, "step": 52155 }, { "epoch": 0.05253610598273037, "grad_norm": 20.21637411673557, "learning_rate": 4.999920930241407e-05, "loss": 2.7213, "mean_token_accuracy": 0.3620689630508423, "step": 52160 }, { "epoch": 0.05254114203583454, "grad_norm": 13.903731067244967, "learning_rate": 4.999920615788371e-05, "loss": 2.7897, "mean_token_accuracy": 0.34482758939266206, "step": 52165 }, { "epoch": 0.05254617808893871, "grad_norm": 18.24469380220512, "learning_rate": 4.999920300711313e-05, "loss": 2.8853, "mean_token_accuracy": 0.4034482717514038, "step": 52170 }, { "epoch": 0.05255121414204288, "grad_norm": 13.533902094835367, "learning_rate": 4.99991998501023e-05, "loss": 2.6875, "mean_token_accuracy": 0.43103447556495667, "step": 52175 }, { "epoch": 0.05255625019514706, "grad_norm": 14.24378219346739, "learning_rate": 4.999919668685124e-05, "loss": 2.6151, "mean_token_accuracy": 0.4137930989265442, "step": 52180 }, { "epoch": 0.05256128624825123, "grad_norm": 10.292270013990894, "learning_rate": 4.9999193517359947e-05, "loss": 2.5914, "mean_token_accuracy": 0.40344828367233276, "step": 52185 }, { "epoch": 0.052566322301355405, "grad_norm": 13.872307093643263, "learning_rate": 4.9999190341628405e-05, "loss": 2.8447, "mean_token_accuracy": 0.4, "step": 52190 }, { "epoch": 0.05257135835445958, "grad_norm": 13.849164232194378, "learning_rate": 4.999918715965664e-05, "loss": 2.4755, "mean_token_accuracy": 0.40689654350280763, "step": 52195 }, { "epoch": 0.05257639440756375, "grad_norm": 11.87651107097205, "learning_rate": 4.999918397144463e-05, "loss": 2.1827, "mean_token_accuracy": 0.4689655125141144, "step": 52200 }, { "epoch": 0.05258143046066792, "grad_norm": 14.367810572842119, "learning_rate": 4.99991807769924e-05, "loss": 2.9821, "mean_token_accuracy": 0.37241379618644715, "step": 52205 }, { "epoch": 0.05258646651377209, "grad_norm": 11.381407012232026, "learning_rate": 4.999917757629993e-05, "loss": 2.5382, "mean_token_accuracy": 0.401875376701355, "step": 52210 }, { "epoch": 0.052591502566876266, "grad_norm": 16.406232845676236, "learning_rate": 4.999917436936723e-05, "loss": 2.4639, "mean_token_accuracy": 0.41379311084747317, "step": 52215 }, { "epoch": 0.05259653861998044, "grad_norm": 15.499543231027019, "learning_rate": 4.9999171156194307e-05, "loss": 2.505, "mean_token_accuracy": 0.40689654350280763, "step": 52220 }, { "epoch": 0.052601574673084614, "grad_norm": 14.858624427498272, "learning_rate": 4.999916793678115e-05, "loss": 2.5045, "mean_token_accuracy": 0.4206896543502808, "step": 52225 }, { "epoch": 0.05260661072618879, "grad_norm": 11.233828437139257, "learning_rate": 4.999916471112777e-05, "loss": 2.5937, "mean_token_accuracy": 0.4034482777118683, "step": 52230 }, { "epoch": 0.05261164677929296, "grad_norm": 11.208375804292299, "learning_rate": 4.999916147923415e-05, "loss": 2.3938, "mean_token_accuracy": 0.43103448748588563, "step": 52235 }, { "epoch": 0.05261668283239713, "grad_norm": 11.254985535819657, "learning_rate": 4.999915824110032e-05, "loss": 2.583, "mean_token_accuracy": 0.4931034445762634, "step": 52240 }, { "epoch": 0.0526217188855013, "grad_norm": 12.674404404396473, "learning_rate": 4.9999154996726256e-05, "loss": 2.453, "mean_token_accuracy": 0.4586206912994385, "step": 52245 }, { "epoch": 0.052626754938605476, "grad_norm": 12.48225736606735, "learning_rate": 4.999915174611197e-05, "loss": 2.9606, "mean_token_accuracy": 0.36551723182201384, "step": 52250 }, { "epoch": 0.05263179099170965, "grad_norm": 15.392282192757758, "learning_rate": 4.999914848925746e-05, "loss": 3.0142, "mean_token_accuracy": 0.37586206793785093, "step": 52255 }, { "epoch": 0.05263682704481382, "grad_norm": 11.248839819767962, "learning_rate": 4.999914522616272e-05, "loss": 2.3361, "mean_token_accuracy": 0.4399878978729248, "step": 52260 }, { "epoch": 0.052641863097918, "grad_norm": 11.771158604649797, "learning_rate": 4.999914195682776e-05, "loss": 2.4946, "mean_token_accuracy": 0.4034482777118683, "step": 52265 }, { "epoch": 0.05264689915102217, "grad_norm": 11.195915050907423, "learning_rate": 4.999913868125259e-05, "loss": 2.8939, "mean_token_accuracy": 0.34482758641242983, "step": 52270 }, { "epoch": 0.05265193520412634, "grad_norm": 12.575116565538005, "learning_rate": 4.9999135399437187e-05, "loss": 2.4399, "mean_token_accuracy": 0.4502117335796356, "step": 52275 }, { "epoch": 0.05265697125723051, "grad_norm": 20.08568004960425, "learning_rate": 4.999913211138157e-05, "loss": 3.2782, "mean_token_accuracy": 0.3429522067308426, "step": 52280 }, { "epoch": 0.052662007310334685, "grad_norm": 12.85778333940458, "learning_rate": 4.9999128817085735e-05, "loss": 2.6607, "mean_token_accuracy": 0.3896551787853241, "step": 52285 }, { "epoch": 0.05266704336343886, "grad_norm": 11.118907725801146, "learning_rate": 4.999912551654968e-05, "loss": 2.7966, "mean_token_accuracy": 0.3896551728248596, "step": 52290 }, { "epoch": 0.05267207941654303, "grad_norm": 9.51625197672453, "learning_rate": 4.999912220977342e-05, "loss": 2.5506, "mean_token_accuracy": 0.3896551728248596, "step": 52295 }, { "epoch": 0.05267711546964721, "grad_norm": 12.048989906041559, "learning_rate": 4.9999118896756935e-05, "loss": 2.7023, "mean_token_accuracy": 0.3965517282485962, "step": 52300 }, { "epoch": 0.05268215152275138, "grad_norm": 14.695106589562384, "learning_rate": 4.999911557750024e-05, "loss": 2.4345, "mean_token_accuracy": 0.4034482777118683, "step": 52305 }, { "epoch": 0.05268718757585555, "grad_norm": 11.999980746598482, "learning_rate": 4.999911225200331e-05, "loss": 2.7438, "mean_token_accuracy": 0.341379314661026, "step": 52310 }, { "epoch": 0.05269222362895972, "grad_norm": 14.935420635563466, "learning_rate": 4.99991089202662e-05, "loss": 2.3546, "mean_token_accuracy": 0.39310344457626345, "step": 52315 }, { "epoch": 0.052697259682063895, "grad_norm": 13.72130692864623, "learning_rate": 4.9999105582288857e-05, "loss": 2.6643, "mean_token_accuracy": 0.36896551251411436, "step": 52320 }, { "epoch": 0.05270229573516807, "grad_norm": 15.733599661062378, "learning_rate": 4.9999102238071315e-05, "loss": 2.6731, "mean_token_accuracy": 0.3896551728248596, "step": 52325 }, { "epoch": 0.05270733178827224, "grad_norm": 21.428603513808433, "learning_rate": 4.999909888761355e-05, "loss": 2.8353, "mean_token_accuracy": 0.37356321811676024, "step": 52330 }, { "epoch": 0.052712367841376416, "grad_norm": 12.056837297684389, "learning_rate": 4.999909553091559e-05, "loss": 2.3452, "mean_token_accuracy": 0.4379310369491577, "step": 52335 }, { "epoch": 0.05271740389448059, "grad_norm": 12.71091112723237, "learning_rate": 4.999909216797742e-05, "loss": 2.4256, "mean_token_accuracy": 0.4137930989265442, "step": 52340 }, { "epoch": 0.05272243994758476, "grad_norm": 15.914802195092928, "learning_rate": 4.999908879879903e-05, "loss": 3.1311, "mean_token_accuracy": 0.3310344874858856, "step": 52345 }, { "epoch": 0.05272747600068893, "grad_norm": 13.982267070269382, "learning_rate": 4.999908542338044e-05, "loss": 3.2666, "mean_token_accuracy": 0.33793103992938994, "step": 52350 }, { "epoch": 0.052732512053793104, "grad_norm": 12.10752493266362, "learning_rate": 4.999908204172165e-05, "loss": 2.8074, "mean_token_accuracy": 0.32758620381355286, "step": 52355 }, { "epoch": 0.05273754810689728, "grad_norm": 14.449191870762254, "learning_rate": 4.999907865382265e-05, "loss": 2.7442, "mean_token_accuracy": 0.3551724046468735, "step": 52360 }, { "epoch": 0.05274258416000145, "grad_norm": 11.512158697732623, "learning_rate": 4.9999075259683455e-05, "loss": 2.6798, "mean_token_accuracy": 0.37586206793785093, "step": 52365 }, { "epoch": 0.052747620213105625, "grad_norm": 10.156931007852432, "learning_rate": 4.999907185930405e-05, "loss": 2.0482, "mean_token_accuracy": 0.4896551609039307, "step": 52370 }, { "epoch": 0.0527526562662098, "grad_norm": 11.729109946496521, "learning_rate": 4.9999068452684446e-05, "loss": 2.418, "mean_token_accuracy": 0.42758620977401735, "step": 52375 }, { "epoch": 0.052757692319313966, "grad_norm": 17.216350448924175, "learning_rate": 4.999906503982464e-05, "loss": 3.0039, "mean_token_accuracy": 0.38620689511299133, "step": 52380 }, { "epoch": 0.05276272837241814, "grad_norm": 13.894546698102543, "learning_rate": 4.9999061620724636e-05, "loss": 2.3688, "mean_token_accuracy": 0.3879007875919342, "step": 52385 }, { "epoch": 0.052767764425522314, "grad_norm": 12.907526872862055, "learning_rate": 4.999905819538443e-05, "loss": 2.1838, "mean_token_accuracy": 0.4655172348022461, "step": 52390 }, { "epoch": 0.05277280047862649, "grad_norm": 12.008549283232462, "learning_rate": 4.9999054763804026e-05, "loss": 3.0417, "mean_token_accuracy": 0.33448275923728943, "step": 52395 }, { "epoch": 0.05277783653173066, "grad_norm": 10.949887694973663, "learning_rate": 4.9999051325983435e-05, "loss": 2.5315, "mean_token_accuracy": 0.4034482717514038, "step": 52400 }, { "epoch": 0.052782872584834835, "grad_norm": 12.196457541091707, "learning_rate": 4.9999047881922636e-05, "loss": 2.6658, "mean_token_accuracy": 0.4068965554237366, "step": 52405 }, { "epoch": 0.05278790863793901, "grad_norm": 11.026794824583384, "learning_rate": 4.999904443162165e-05, "loss": 2.0451, "mean_token_accuracy": 0.47586206793785096, "step": 52410 }, { "epoch": 0.052792944691043175, "grad_norm": 13.76854502876863, "learning_rate": 4.9999040975080466e-05, "loss": 2.5316, "mean_token_accuracy": 0.3620689630508423, "step": 52415 }, { "epoch": 0.05279798074414735, "grad_norm": 18.612129031037366, "learning_rate": 4.999903751229909e-05, "loss": 3.131, "mean_token_accuracy": 0.337931028008461, "step": 52420 }, { "epoch": 0.05280301679725152, "grad_norm": 13.071645329114645, "learning_rate": 4.999903404327752e-05, "loss": 2.3479, "mean_token_accuracy": 0.4586206912994385, "step": 52425 }, { "epoch": 0.0528080528503557, "grad_norm": 12.641279256627136, "learning_rate": 4.9999030568015763e-05, "loss": 2.0556, "mean_token_accuracy": 0.4931034445762634, "step": 52430 }, { "epoch": 0.05281308890345987, "grad_norm": 16.645951299439346, "learning_rate": 4.999902708651381e-05, "loss": 2.8782, "mean_token_accuracy": 0.3827586114406586, "step": 52435 }, { "epoch": 0.052818124956564044, "grad_norm": 14.949139066603236, "learning_rate": 4.999902359877167e-05, "loss": 2.5619, "mean_token_accuracy": 0.38771929740905764, "step": 52440 }, { "epoch": 0.05282316100966822, "grad_norm": 12.577712155506225, "learning_rate": 4.9999020104789355e-05, "loss": 2.3735, "mean_token_accuracy": 0.3931034505367279, "step": 52445 }, { "epoch": 0.052828197062772385, "grad_norm": 10.61630628844499, "learning_rate": 4.999901660456684e-05, "loss": 2.4444, "mean_token_accuracy": 0.4190562665462494, "step": 52450 }, { "epoch": 0.05283323311587656, "grad_norm": 15.0253532284693, "learning_rate": 4.999901309810414e-05, "loss": 3.1995, "mean_token_accuracy": 0.3241379290819168, "step": 52455 }, { "epoch": 0.05283826916898073, "grad_norm": 11.930183669267848, "learning_rate": 4.9999009585401246e-05, "loss": 2.7477, "mean_token_accuracy": 0.34137930870056155, "step": 52460 }, { "epoch": 0.052843305222084906, "grad_norm": 9.665504944820308, "learning_rate": 4.999900606645818e-05, "loss": 2.4115, "mean_token_accuracy": 0.4103448331356049, "step": 52465 }, { "epoch": 0.05284834127518908, "grad_norm": 11.245917775491176, "learning_rate": 4.999900254127493e-05, "loss": 2.6707, "mean_token_accuracy": 0.3586206793785095, "step": 52470 }, { "epoch": 0.052853377328293254, "grad_norm": 12.67937417176609, "learning_rate": 4.99989990098515e-05, "loss": 2.9381, "mean_token_accuracy": 0.3655172407627106, "step": 52475 }, { "epoch": 0.05285841338139743, "grad_norm": 11.88575920668747, "learning_rate": 4.999899547218788e-05, "loss": 2.6887, "mean_token_accuracy": 0.3517241418361664, "step": 52480 }, { "epoch": 0.052863449434501594, "grad_norm": 14.430140644439849, "learning_rate": 4.9998991928284084e-05, "loss": 2.6276, "mean_token_accuracy": 0.38275861740112305, "step": 52485 }, { "epoch": 0.05286848548760577, "grad_norm": 13.935806624669114, "learning_rate": 4.999898837814011e-05, "loss": 3.0041, "mean_token_accuracy": 0.3344827562570572, "step": 52490 }, { "epoch": 0.05287352154070994, "grad_norm": 10.722903919471433, "learning_rate": 4.9998984821755956e-05, "loss": 2.811, "mean_token_accuracy": 0.3878402948379517, "step": 52495 }, { "epoch": 0.052878557593814116, "grad_norm": 11.963375711383776, "learning_rate": 4.999898125913162e-05, "loss": 2.4768, "mean_token_accuracy": 0.42068966031074523, "step": 52500 }, { "epoch": 0.05288359364691829, "grad_norm": 11.471988950483025, "learning_rate": 4.9998977690267116e-05, "loss": 2.6276, "mean_token_accuracy": 0.37241379022598264, "step": 52505 }, { "epoch": 0.05288862970002246, "grad_norm": 12.533240115342801, "learning_rate": 4.999897411516243e-05, "loss": 2.303, "mean_token_accuracy": 0.4530550420284271, "step": 52510 }, { "epoch": 0.05289366575312664, "grad_norm": 10.379311569672394, "learning_rate": 4.9998970533817576e-05, "loss": 2.4079, "mean_token_accuracy": 0.3896551728248596, "step": 52515 }, { "epoch": 0.052898701806230804, "grad_norm": 13.575178704284905, "learning_rate": 4.999896694623255e-05, "loss": 2.8839, "mean_token_accuracy": 0.3275861978530884, "step": 52520 }, { "epoch": 0.05290373785933498, "grad_norm": 10.542823767956984, "learning_rate": 4.9998963352407345e-05, "loss": 2.7316, "mean_token_accuracy": 0.4068965584039688, "step": 52525 }, { "epoch": 0.05290877391243915, "grad_norm": 11.468709330186027, "learning_rate": 4.999895975234197e-05, "loss": 2.5425, "mean_token_accuracy": 0.3827586263418198, "step": 52530 }, { "epoch": 0.052913809965543325, "grad_norm": 10.71005920483588, "learning_rate": 4.999895614603642e-05, "loss": 2.5727, "mean_token_accuracy": 0.4413793206214905, "step": 52535 }, { "epoch": 0.0529188460186475, "grad_norm": 14.776546143806273, "learning_rate": 4.999895253349071e-05, "loss": 2.3197, "mean_token_accuracy": 0.42413793206214906, "step": 52540 }, { "epoch": 0.05292388207175167, "grad_norm": 14.562609889895791, "learning_rate": 4.9998948914704826e-05, "loss": 3.0693, "mean_token_accuracy": 0.3482758641242981, "step": 52545 }, { "epoch": 0.052928918124855846, "grad_norm": 12.498456911660455, "learning_rate": 4.999894528967877e-05, "loss": 2.8945, "mean_token_accuracy": 0.34137930870056155, "step": 52550 }, { "epoch": 0.05293395417796001, "grad_norm": 11.635851674457173, "learning_rate": 4.999894165841256e-05, "loss": 2.886, "mean_token_accuracy": 0.4068965494632721, "step": 52555 }, { "epoch": 0.05293899023106419, "grad_norm": 13.204996711937499, "learning_rate": 4.999893802090618e-05, "loss": 2.4327, "mean_token_accuracy": 0.40689654350280763, "step": 52560 }, { "epoch": 0.05294402628416836, "grad_norm": 10.321895599351686, "learning_rate": 4.999893437715963e-05, "loss": 2.4239, "mean_token_accuracy": 0.3827586233615875, "step": 52565 }, { "epoch": 0.052949062337272534, "grad_norm": 10.560755174824738, "learning_rate": 4.999893072717292e-05, "loss": 2.4879, "mean_token_accuracy": 0.3896551728248596, "step": 52570 }, { "epoch": 0.05295409839037671, "grad_norm": 17.248603148432775, "learning_rate": 4.999892707094605e-05, "loss": 2.1451, "mean_token_accuracy": 0.43242589235305784, "step": 52575 }, { "epoch": 0.05295913444348088, "grad_norm": 12.085934325050895, "learning_rate": 4.999892340847901e-05, "loss": 2.2653, "mean_token_accuracy": 0.4517241418361664, "step": 52580 }, { "epoch": 0.052964170496585056, "grad_norm": 12.128206931118553, "learning_rate": 4.999891973977182e-05, "loss": 2.3303, "mean_token_accuracy": 0.4310344815254211, "step": 52585 }, { "epoch": 0.05296920654968922, "grad_norm": 12.038340814259648, "learning_rate": 4.9998916064824465e-05, "loss": 2.4507, "mean_token_accuracy": 0.4275861978530884, "step": 52590 }, { "epoch": 0.052974242602793396, "grad_norm": 11.18072493193772, "learning_rate": 4.9998912383636956e-05, "loss": 2.5699, "mean_token_accuracy": 0.4393224477767944, "step": 52595 }, { "epoch": 0.05297927865589757, "grad_norm": 11.264856928966092, "learning_rate": 4.999890869620929e-05, "loss": 2.758, "mean_token_accuracy": 0.3655172407627106, "step": 52600 }, { "epoch": 0.052984314709001744, "grad_norm": 12.662740725118304, "learning_rate": 4.9998905002541465e-05, "loss": 2.3017, "mean_token_accuracy": 0.42413793206214906, "step": 52605 }, { "epoch": 0.05298935076210592, "grad_norm": 11.568444933710694, "learning_rate": 4.9998901302633484e-05, "loss": 2.5195, "mean_token_accuracy": 0.39655171930789945, "step": 52610 }, { "epoch": 0.05299438681521009, "grad_norm": 13.604484609869235, "learning_rate": 4.999889759648535e-05, "loss": 2.275, "mean_token_accuracy": 0.4413793087005615, "step": 52615 }, { "epoch": 0.052999422868314265, "grad_norm": 16.32571150369406, "learning_rate": 4.999889388409706e-05, "loss": 3.0466, "mean_token_accuracy": 0.34827586114406583, "step": 52620 }, { "epoch": 0.05300445892141843, "grad_norm": 14.32251835227207, "learning_rate": 4.999889016546862e-05, "loss": 2.6345, "mean_token_accuracy": 0.401875376701355, "step": 52625 }, { "epoch": 0.053009494974522606, "grad_norm": 11.619585681906743, "learning_rate": 4.999888644060003e-05, "loss": 2.5406, "mean_token_accuracy": 0.42413793206214906, "step": 52630 }, { "epoch": 0.05301453102762678, "grad_norm": 10.999043949392975, "learning_rate": 4.999888270949129e-05, "loss": 2.6576, "mean_token_accuracy": 0.4206896543502808, "step": 52635 }, { "epoch": 0.05301956708073095, "grad_norm": 10.797483300813841, "learning_rate": 4.99988789721424e-05, "loss": 2.3416, "mean_token_accuracy": 0.44827585220336913, "step": 52640 }, { "epoch": 0.05302460313383513, "grad_norm": 10.190071336878232, "learning_rate": 4.999887522855336e-05, "loss": 2.3408, "mean_token_accuracy": 0.4379310369491577, "step": 52645 }, { "epoch": 0.0530296391869393, "grad_norm": 13.195678833858894, "learning_rate": 4.999887147872417e-05, "loss": 2.5173, "mean_token_accuracy": 0.3827586114406586, "step": 52650 }, { "epoch": 0.053034675240043475, "grad_norm": 10.293024908823016, "learning_rate": 4.999886772265484e-05, "loss": 2.3027, "mean_token_accuracy": 0.47241379618644713, "step": 52655 }, { "epoch": 0.05303971129314764, "grad_norm": 11.182903712942725, "learning_rate": 4.999886396034537e-05, "loss": 2.5046, "mean_token_accuracy": 0.3827586233615875, "step": 52660 }, { "epoch": 0.053044747346251815, "grad_norm": 11.992129139191514, "learning_rate": 4.999886019179575e-05, "loss": 2.9038, "mean_token_accuracy": 0.3758620709180832, "step": 52665 }, { "epoch": 0.05304978339935599, "grad_norm": 19.11379512584089, "learning_rate": 4.999885641700598e-05, "loss": 2.8824, "mean_token_accuracy": 0.37931033968925476, "step": 52670 }, { "epoch": 0.05305481945246016, "grad_norm": 10.984800346228653, "learning_rate": 4.999885263597608e-05, "loss": 2.2453, "mean_token_accuracy": 0.4379310369491577, "step": 52675 }, { "epoch": 0.053059855505564336, "grad_norm": 14.184798519481314, "learning_rate": 4.999884884870604e-05, "loss": 3.1606, "mean_token_accuracy": 0.3241379290819168, "step": 52680 }, { "epoch": 0.05306489155866851, "grad_norm": 10.616186951384595, "learning_rate": 4.999884505519585e-05, "loss": 2.6455, "mean_token_accuracy": 0.37241379022598264, "step": 52685 }, { "epoch": 0.053069927611772684, "grad_norm": 13.663478059557402, "learning_rate": 4.9998841255445525e-05, "loss": 2.6163, "mean_token_accuracy": 0.4103448212146759, "step": 52690 }, { "epoch": 0.05307496366487685, "grad_norm": 13.970196535234571, "learning_rate": 4.999883744945506e-05, "loss": 2.3919, "mean_token_accuracy": 0.44506956934928893, "step": 52695 }, { "epoch": 0.053079999717981025, "grad_norm": 12.480725699350362, "learning_rate": 4.999883363722447e-05, "loss": 2.775, "mean_token_accuracy": 0.40689654350280763, "step": 52700 }, { "epoch": 0.0530850357710852, "grad_norm": 11.175515791859308, "learning_rate": 4.9998829818753735e-05, "loss": 2.4434, "mean_token_accuracy": 0.4517241358757019, "step": 52705 }, { "epoch": 0.05309007182418937, "grad_norm": 11.595580399340005, "learning_rate": 4.9998825994042876e-05, "loss": 2.5742, "mean_token_accuracy": 0.3931034505367279, "step": 52710 }, { "epoch": 0.053095107877293546, "grad_norm": 12.343190414901912, "learning_rate": 4.999882216309187e-05, "loss": 2.7913, "mean_token_accuracy": 0.3620689630508423, "step": 52715 }, { "epoch": 0.05310014393039772, "grad_norm": 12.346017048299837, "learning_rate": 4.999881832590074e-05, "loss": 2.5219, "mean_token_accuracy": 0.38620689511299133, "step": 52720 }, { "epoch": 0.05310517998350189, "grad_norm": 11.662371357810025, "learning_rate": 4.999881448246947e-05, "loss": 2.2389, "mean_token_accuracy": 0.41034482717514037, "step": 52725 }, { "epoch": 0.05311021603660606, "grad_norm": 19.659439428407314, "learning_rate": 4.9998810632798075e-05, "loss": 2.8296, "mean_token_accuracy": 0.3551724135875702, "step": 52730 }, { "epoch": 0.053115252089710234, "grad_norm": 13.09192498933278, "learning_rate": 4.999880677688656e-05, "loss": 2.0579, "mean_token_accuracy": 0.47931034564971925, "step": 52735 }, { "epoch": 0.05312028814281441, "grad_norm": 11.797746097397859, "learning_rate": 4.999880291473491e-05, "loss": 2.5272, "mean_token_accuracy": 0.3862069010734558, "step": 52740 }, { "epoch": 0.05312532419591858, "grad_norm": 10.6882470671556, "learning_rate": 4.999879904634313e-05, "loss": 2.2672, "mean_token_accuracy": 0.4034482717514038, "step": 52745 }, { "epoch": 0.053130360249022755, "grad_norm": 13.282527204142411, "learning_rate": 4.999879517171122e-05, "loss": 2.2753, "mean_token_accuracy": 0.45517241954803467, "step": 52750 }, { "epoch": 0.05313539630212693, "grad_norm": 13.267004112529877, "learning_rate": 4.999879129083919e-05, "loss": 2.3213, "mean_token_accuracy": 0.40871143341064453, "step": 52755 }, { "epoch": 0.0531404323552311, "grad_norm": 13.23764106515179, "learning_rate": 4.999878740372704e-05, "loss": 2.4392, "mean_token_accuracy": 0.4137930989265442, "step": 52760 }, { "epoch": 0.05314546840833527, "grad_norm": 11.87947414365668, "learning_rate": 4.999878351037476e-05, "loss": 2.3639, "mean_token_accuracy": 0.36896551847457887, "step": 52765 }, { "epoch": 0.05315050446143944, "grad_norm": 11.079826528737392, "learning_rate": 4.999877961078237e-05, "loss": 2.4969, "mean_token_accuracy": 0.41379310488700866, "step": 52770 }, { "epoch": 0.05315554051454362, "grad_norm": 10.443981284297202, "learning_rate": 4.999877570494985e-05, "loss": 2.3254, "mean_token_accuracy": 0.43278887271881106, "step": 52775 }, { "epoch": 0.05316057656764779, "grad_norm": 10.519975025616317, "learning_rate": 4.999877179287722e-05, "loss": 2.7338, "mean_token_accuracy": 0.39704433679580686, "step": 52780 }, { "epoch": 0.053165612620751965, "grad_norm": 14.733345302411507, "learning_rate": 4.999876787456446e-05, "loss": 2.2582, "mean_token_accuracy": 0.4068965554237366, "step": 52785 }, { "epoch": 0.05317064867385614, "grad_norm": 16.245384216423915, "learning_rate": 4.999876395001159e-05, "loss": 2.4911, "mean_token_accuracy": 0.42413793206214906, "step": 52790 }, { "epoch": 0.05317568472696031, "grad_norm": 13.503076131143054, "learning_rate": 4.99987600192186e-05, "loss": 2.9788, "mean_token_accuracy": 0.43448275327682495, "step": 52795 }, { "epoch": 0.05318072078006448, "grad_norm": 12.252898555965514, "learning_rate": 4.99987560821855e-05, "loss": 2.5199, "mean_token_accuracy": 0.38275861740112305, "step": 52800 }, { "epoch": 0.05318575683316865, "grad_norm": 15.842404251043618, "learning_rate": 4.999875213891228e-05, "loss": 2.4493, "mean_token_accuracy": 0.4068965554237366, "step": 52805 }, { "epoch": 0.05319079288627283, "grad_norm": 12.198436866938454, "learning_rate": 4.999874818939895e-05, "loss": 2.723, "mean_token_accuracy": 0.341379314661026, "step": 52810 }, { "epoch": 0.053195828939377, "grad_norm": 12.705963902335885, "learning_rate": 4.9998744233645514e-05, "loss": 2.801, "mean_token_accuracy": 0.3999999940395355, "step": 52815 }, { "epoch": 0.053200864992481174, "grad_norm": 12.579729153343981, "learning_rate": 4.999874027165196e-05, "loss": 2.937, "mean_token_accuracy": 0.38275861740112305, "step": 52820 }, { "epoch": 0.05320590104558535, "grad_norm": 14.678412721580386, "learning_rate": 4.99987363034183e-05, "loss": 2.522, "mean_token_accuracy": 0.4330308556556702, "step": 52825 }, { "epoch": 0.05321093709868952, "grad_norm": 12.258802189162726, "learning_rate": 4.999873232894453e-05, "loss": 2.424, "mean_token_accuracy": 0.4068965554237366, "step": 52830 }, { "epoch": 0.05321597315179369, "grad_norm": 12.142042640151569, "learning_rate": 4.999872834823064e-05, "loss": 2.7172, "mean_token_accuracy": 0.3758620619773865, "step": 52835 }, { "epoch": 0.05322100920489786, "grad_norm": 13.442158739379614, "learning_rate": 4.999872436127667e-05, "loss": 2.7406, "mean_token_accuracy": 0.3827586114406586, "step": 52840 }, { "epoch": 0.053226045258002036, "grad_norm": 12.054948859060215, "learning_rate": 4.999872036808257e-05, "loss": 2.7899, "mean_token_accuracy": 0.36206896901130675, "step": 52845 }, { "epoch": 0.05323108131110621, "grad_norm": 12.795347486395773, "learning_rate": 4.999871636864838e-05, "loss": 2.7907, "mean_token_accuracy": 0.38965516686439516, "step": 52850 }, { "epoch": 0.053236117364210384, "grad_norm": 12.006954291849244, "learning_rate": 4.999871236297409e-05, "loss": 3.0269, "mean_token_accuracy": 0.36896551251411436, "step": 52855 }, { "epoch": 0.05324115341731456, "grad_norm": 12.154761433250204, "learning_rate": 4.9998708351059684e-05, "loss": 2.9575, "mean_token_accuracy": 0.358620685338974, "step": 52860 }, { "epoch": 0.05324618947041873, "grad_norm": 12.09703492091388, "learning_rate": 4.999870433290519e-05, "loss": 2.2414, "mean_token_accuracy": 0.4637023627758026, "step": 52865 }, { "epoch": 0.0532512255235229, "grad_norm": 11.377316730118268, "learning_rate": 4.9998700308510595e-05, "loss": 2.6494, "mean_token_accuracy": 0.39655172228813174, "step": 52870 }, { "epoch": 0.05325626157662707, "grad_norm": 10.769509740795984, "learning_rate": 4.9998696277875896e-05, "loss": 2.4547, "mean_token_accuracy": 0.39310344457626345, "step": 52875 }, { "epoch": 0.053261297629731245, "grad_norm": 12.323636501363245, "learning_rate": 4.99986922410011e-05, "loss": 2.408, "mean_token_accuracy": 0.43103448748588563, "step": 52880 }, { "epoch": 0.05326633368283542, "grad_norm": 14.89145612944545, "learning_rate": 4.999868819788621e-05, "loss": 2.4483, "mean_token_accuracy": 0.4186932921409607, "step": 52885 }, { "epoch": 0.05327136973593959, "grad_norm": 12.980147072098786, "learning_rate": 4.999868414853123e-05, "loss": 2.6049, "mean_token_accuracy": 0.4034482717514038, "step": 52890 }, { "epoch": 0.05327640578904377, "grad_norm": 13.143195734858024, "learning_rate": 4.999868009293615e-05, "loss": 2.4916, "mean_token_accuracy": 0.4431941986083984, "step": 52895 }, { "epoch": 0.05328144184214794, "grad_norm": 11.853857309029957, "learning_rate": 4.999867603110098e-05, "loss": 2.6125, "mean_token_accuracy": 0.3999999940395355, "step": 52900 }, { "epoch": 0.05328647789525211, "grad_norm": 14.364572928581355, "learning_rate": 4.9998671963025714e-05, "loss": 2.7712, "mean_token_accuracy": 0.3517241358757019, "step": 52905 }, { "epoch": 0.05329151394835628, "grad_norm": 10.665113525426142, "learning_rate": 4.999866788871035e-05, "loss": 2.1914, "mean_token_accuracy": 0.42413793206214906, "step": 52910 }, { "epoch": 0.053296550001460455, "grad_norm": 12.847233348713168, "learning_rate": 4.999866380815491e-05, "loss": 2.5993, "mean_token_accuracy": 0.4000000059604645, "step": 52915 }, { "epoch": 0.05330158605456463, "grad_norm": 11.72936127154475, "learning_rate": 4.999865972135938e-05, "loss": 2.278, "mean_token_accuracy": 0.4551724135875702, "step": 52920 }, { "epoch": 0.0533066221076688, "grad_norm": 11.407950980351982, "learning_rate": 4.999865562832376e-05, "loss": 2.569, "mean_token_accuracy": 0.39655172526836396, "step": 52925 }, { "epoch": 0.053311658160772976, "grad_norm": 11.468870440087771, "learning_rate": 4.9998651529048063e-05, "loss": 2.5952, "mean_token_accuracy": 0.39310344457626345, "step": 52930 }, { "epoch": 0.05331669421387715, "grad_norm": 14.596768094651525, "learning_rate": 4.999864742353227e-05, "loss": 2.395, "mean_token_accuracy": 0.44640047550201417, "step": 52935 }, { "epoch": 0.05332173026698132, "grad_norm": 13.683764367482794, "learning_rate": 4.999864331177639e-05, "loss": 2.7397, "mean_token_accuracy": 0.38965517580509185, "step": 52940 }, { "epoch": 0.05332676632008549, "grad_norm": 28.264960451564423, "learning_rate": 4.9998639193780436e-05, "loss": 2.3059, "mean_token_accuracy": 0.4551724135875702, "step": 52945 }, { "epoch": 0.053331802373189664, "grad_norm": 11.721084785281288, "learning_rate": 4.99986350695444e-05, "loss": 2.5699, "mean_token_accuracy": 0.41724138557910917, "step": 52950 }, { "epoch": 0.05333683842629384, "grad_norm": 20.13673538691939, "learning_rate": 4.999863093906828e-05, "loss": 2.4718, "mean_token_accuracy": 0.4068965494632721, "step": 52955 }, { "epoch": 0.05334187447939801, "grad_norm": 13.0804956244583, "learning_rate": 4.9998626802352085e-05, "loss": 2.5879, "mean_token_accuracy": 0.4034482777118683, "step": 52960 }, { "epoch": 0.053346910532502186, "grad_norm": 12.420669520664438, "learning_rate": 4.99986226593958e-05, "loss": 2.7264, "mean_token_accuracy": 0.37241379022598264, "step": 52965 }, { "epoch": 0.05335194658560636, "grad_norm": 12.68211208107712, "learning_rate": 4.9998618510199456e-05, "loss": 2.6559, "mean_token_accuracy": 0.3620689630508423, "step": 52970 }, { "epoch": 0.053356982638710526, "grad_norm": 11.851400456756226, "learning_rate": 4.9998614354763024e-05, "loss": 2.281, "mean_token_accuracy": 0.5103448331356049, "step": 52975 }, { "epoch": 0.0533620186918147, "grad_norm": 8.346634887058913, "learning_rate": 4.999861019308652e-05, "loss": 2.3549, "mean_token_accuracy": 0.4366606056690216, "step": 52980 }, { "epoch": 0.053367054744918874, "grad_norm": 11.624734140259012, "learning_rate": 4.9998606025169946e-05, "loss": 2.914, "mean_token_accuracy": 0.35366001129150393, "step": 52985 }, { "epoch": 0.05337209079802305, "grad_norm": 11.739164407751714, "learning_rate": 4.999860185101329e-05, "loss": 2.6577, "mean_token_accuracy": 0.38620689511299133, "step": 52990 }, { "epoch": 0.05337712685112722, "grad_norm": 12.384144804613255, "learning_rate": 4.9998597670616575e-05, "loss": 2.4268, "mean_token_accuracy": 0.42758620381355283, "step": 52995 }, { "epoch": 0.053382162904231395, "grad_norm": 13.889224771537172, "learning_rate": 4.9998593483979786e-05, "loss": 2.7332, "mean_token_accuracy": 0.36896551251411436, "step": 53000 }, { "epoch": 0.05338719895733557, "grad_norm": 13.496604042900646, "learning_rate": 4.9998589291102926e-05, "loss": 2.4705, "mean_token_accuracy": 0.42068964838981626, "step": 53005 }, { "epoch": 0.053392235010439736, "grad_norm": 35.4073262906832, "learning_rate": 4.9998585091986e-05, "loss": 2.8587, "mean_token_accuracy": 0.40496068000793456, "step": 53010 }, { "epoch": 0.05339727106354391, "grad_norm": 14.719910046514489, "learning_rate": 4.999858088662901e-05, "loss": 3.1237, "mean_token_accuracy": 0.33103448152542114, "step": 53015 }, { "epoch": 0.05340230711664808, "grad_norm": 12.816248183109348, "learning_rate": 4.999857667503195e-05, "loss": 2.3176, "mean_token_accuracy": 0.4709618866443634, "step": 53020 }, { "epoch": 0.05340734316975226, "grad_norm": 13.036208878562023, "learning_rate": 4.9998572457194825e-05, "loss": 2.7488, "mean_token_accuracy": 0.4103448212146759, "step": 53025 }, { "epoch": 0.05341237922285643, "grad_norm": 12.113656357668281, "learning_rate": 4.9998568233117635e-05, "loss": 2.845, "mean_token_accuracy": 0.4034482717514038, "step": 53030 }, { "epoch": 0.053417415275960604, "grad_norm": 10.352034456624176, "learning_rate": 4.9998564002800393e-05, "loss": 2.6079, "mean_token_accuracy": 0.3517241358757019, "step": 53035 }, { "epoch": 0.05342245132906478, "grad_norm": 13.115334892020975, "learning_rate": 4.999855976624309e-05, "loss": 2.2802, "mean_token_accuracy": 0.42413792610168455, "step": 53040 }, { "epoch": 0.053427487382168945, "grad_norm": 11.193886649903126, "learning_rate": 4.999855552344572e-05, "loss": 2.4384, "mean_token_accuracy": 0.4344827651977539, "step": 53045 }, { "epoch": 0.05343252343527312, "grad_norm": 13.379268528543985, "learning_rate": 4.999855127440829e-05, "loss": 2.885, "mean_token_accuracy": 0.37241379022598264, "step": 53050 }, { "epoch": 0.05343755948837729, "grad_norm": 13.16404786891074, "learning_rate": 4.999854701913081e-05, "loss": 2.7824, "mean_token_accuracy": 0.4000000059604645, "step": 53055 }, { "epoch": 0.053442595541481466, "grad_norm": 13.50443575715912, "learning_rate": 4.999854275761327e-05, "loss": 2.8787, "mean_token_accuracy": 0.32413792610168457, "step": 53060 }, { "epoch": 0.05344763159458564, "grad_norm": 10.553667173306518, "learning_rate": 4.999853848985567e-05, "loss": 2.3649, "mean_token_accuracy": 0.4261947929859161, "step": 53065 }, { "epoch": 0.053452667647689814, "grad_norm": 9.85019786647655, "learning_rate": 4.999853421585803e-05, "loss": 2.8954, "mean_token_accuracy": 0.3310344755649567, "step": 53070 }, { "epoch": 0.05345770370079399, "grad_norm": 13.154996558988962, "learning_rate": 4.9998529935620334e-05, "loss": 2.7391, "mean_token_accuracy": 0.3482758581638336, "step": 53075 }, { "epoch": 0.053462739753898154, "grad_norm": 10.067660974458168, "learning_rate": 4.999852564914258e-05, "loss": 2.4131, "mean_token_accuracy": 0.37241379618644715, "step": 53080 }, { "epoch": 0.05346777580700233, "grad_norm": 12.099311175556132, "learning_rate": 4.999852135642478e-05, "loss": 2.6574, "mean_token_accuracy": 0.4068965494632721, "step": 53085 }, { "epoch": 0.0534728118601065, "grad_norm": 13.572079937624716, "learning_rate": 4.999851705746693e-05, "loss": 3.1079, "mean_token_accuracy": 0.3999999940395355, "step": 53090 }, { "epoch": 0.053477847913210676, "grad_norm": 10.590901101458197, "learning_rate": 4.9998512752269034e-05, "loss": 2.3247, "mean_token_accuracy": 0.4551724135875702, "step": 53095 }, { "epoch": 0.05348288396631485, "grad_norm": 17.19284234830098, "learning_rate": 4.999850844083109e-05, "loss": 2.7113, "mean_token_accuracy": 0.3862068891525269, "step": 53100 }, { "epoch": 0.05348792001941902, "grad_norm": 18.657905664139136, "learning_rate": 4.99985041231531e-05, "loss": 2.2172, "mean_token_accuracy": 0.4517241358757019, "step": 53105 }, { "epoch": 0.0534929560725232, "grad_norm": 13.933958320769486, "learning_rate": 4.9998499799235064e-05, "loss": 2.4311, "mean_token_accuracy": 0.4000000059604645, "step": 53110 }, { "epoch": 0.053497992125627364, "grad_norm": 13.80928778502695, "learning_rate": 4.9998495469076985e-05, "loss": 2.6719, "mean_token_accuracy": 0.4162734389305115, "step": 53115 }, { "epoch": 0.05350302817873154, "grad_norm": 12.713315118406596, "learning_rate": 4.9998491132678876e-05, "loss": 2.4797, "mean_token_accuracy": 0.4150635123252869, "step": 53120 }, { "epoch": 0.05350806423183571, "grad_norm": 14.563944890132191, "learning_rate": 4.9998486790040716e-05, "loss": 1.9744, "mean_token_accuracy": 0.43448275327682495, "step": 53125 }, { "epoch": 0.053513100284939885, "grad_norm": 13.578007760895447, "learning_rate": 4.999848244116252e-05, "loss": 2.7582, "mean_token_accuracy": 0.36551723480224607, "step": 53130 }, { "epoch": 0.05351813633804406, "grad_norm": 14.180388287283959, "learning_rate": 4.999847808604428e-05, "loss": 2.8588, "mean_token_accuracy": 0.42758620381355283, "step": 53135 }, { "epoch": 0.05352317239114823, "grad_norm": 16.675086573544075, "learning_rate": 4.999847372468601e-05, "loss": 2.6974, "mean_token_accuracy": 0.4137930989265442, "step": 53140 }, { "epoch": 0.0535282084442524, "grad_norm": 12.87754162991758, "learning_rate": 4.99984693570877e-05, "loss": 2.7523, "mean_token_accuracy": 0.32758620381355286, "step": 53145 }, { "epoch": 0.05353324449735657, "grad_norm": 12.1817153582392, "learning_rate": 4.999846498324936e-05, "loss": 2.7332, "mean_token_accuracy": 0.4310344815254211, "step": 53150 }, { "epoch": 0.05353828055046075, "grad_norm": 12.117218534786808, "learning_rate": 4.999846060317098e-05, "loss": 2.4481, "mean_token_accuracy": 0.4482758641242981, "step": 53155 }, { "epoch": 0.05354331660356492, "grad_norm": 13.28273842369201, "learning_rate": 4.9998456216852576e-05, "loss": 2.6973, "mean_token_accuracy": 0.3793103456497192, "step": 53160 }, { "epoch": 0.053548352656669095, "grad_norm": 11.52469420988639, "learning_rate": 4.999845182429414e-05, "loss": 2.5191, "mean_token_accuracy": 0.4379310429096222, "step": 53165 }, { "epoch": 0.05355338870977327, "grad_norm": 13.910088770943712, "learning_rate": 4.999844742549567e-05, "loss": 2.8339, "mean_token_accuracy": 0.3620689570903778, "step": 53170 }, { "epoch": 0.05355842476287744, "grad_norm": 11.01092415936324, "learning_rate": 4.999844302045718e-05, "loss": 2.6481, "mean_token_accuracy": 0.4190562665462494, "step": 53175 }, { "epoch": 0.05356346081598161, "grad_norm": 17.993466375526516, "learning_rate": 4.999843860917865e-05, "loss": 2.5487, "mean_token_accuracy": 0.4206896543502808, "step": 53180 }, { "epoch": 0.05356849686908578, "grad_norm": 9.38016912278869, "learning_rate": 4.99984341916601e-05, "loss": 2.368, "mean_token_accuracy": 0.4344827592372894, "step": 53185 }, { "epoch": 0.053573532922189956, "grad_norm": 21.656518136485545, "learning_rate": 4.999842976790153e-05, "loss": 2.7492, "mean_token_accuracy": 0.3655172407627106, "step": 53190 }, { "epoch": 0.05357856897529413, "grad_norm": 15.43732532456617, "learning_rate": 4.9998425337902924e-05, "loss": 2.351, "mean_token_accuracy": 0.4172413766384125, "step": 53195 }, { "epoch": 0.053583605028398304, "grad_norm": 14.825546692394683, "learning_rate": 4.999842090166431e-05, "loss": 2.4702, "mean_token_accuracy": 0.39310344457626345, "step": 53200 }, { "epoch": 0.05358864108150248, "grad_norm": 11.970549585530728, "learning_rate": 4.999841645918567e-05, "loss": 2.2551, "mean_token_accuracy": 0.43793103098869324, "step": 53205 }, { "epoch": 0.05359367713460665, "grad_norm": 11.785975047199853, "learning_rate": 4.999841201046701e-05, "loss": 2.3459, "mean_token_accuracy": 0.4517241358757019, "step": 53210 }, { "epoch": 0.05359871318771082, "grad_norm": 12.363590388557657, "learning_rate": 4.999840755550833e-05, "loss": 2.5401, "mean_token_accuracy": 0.42020567655563357, "step": 53215 }, { "epoch": 0.05360374924081499, "grad_norm": 13.899363203093284, "learning_rate": 4.9998403094309645e-05, "loss": 2.8381, "mean_token_accuracy": 0.43793103098869324, "step": 53220 }, { "epoch": 0.053608785293919166, "grad_norm": 9.840183718434782, "learning_rate": 4.9998398626870924e-05, "loss": 2.8497, "mean_token_accuracy": 0.34482758939266206, "step": 53225 }, { "epoch": 0.05361382134702334, "grad_norm": 9.89270097489401, "learning_rate": 4.99983941531922e-05, "loss": 2.5249, "mean_token_accuracy": 0.4, "step": 53230 }, { "epoch": 0.05361885740012751, "grad_norm": 13.084982610131592, "learning_rate": 4.999838967327346e-05, "loss": 2.928, "mean_token_accuracy": 0.38965516686439516, "step": 53235 }, { "epoch": 0.05362389345323169, "grad_norm": 12.705022880064721, "learning_rate": 4.9998385187114706e-05, "loss": 3.036, "mean_token_accuracy": 0.4068965494632721, "step": 53240 }, { "epoch": 0.05362892950633586, "grad_norm": 12.623017729649321, "learning_rate": 4.999838069471594e-05, "loss": 2.5145, "mean_token_accuracy": 0.43103448748588563, "step": 53245 }, { "epoch": 0.05363396555944003, "grad_norm": 11.525889629523258, "learning_rate": 4.999837619607717e-05, "loss": 2.5071, "mean_token_accuracy": 0.42970356345176697, "step": 53250 }, { "epoch": 0.0536390016125442, "grad_norm": 10.796276764118186, "learning_rate": 4.999837169119839e-05, "loss": 2.0471, "mean_token_accuracy": 0.47785844206809996, "step": 53255 }, { "epoch": 0.053644037665648375, "grad_norm": 10.964210108293592, "learning_rate": 4.99983671800796e-05, "loss": 2.3864, "mean_token_accuracy": 0.41379310488700866, "step": 53260 }, { "epoch": 0.05364907371875255, "grad_norm": 12.254445670335736, "learning_rate": 4.99983626627208e-05, "loss": 2.3101, "mean_token_accuracy": 0.4659407138824463, "step": 53265 }, { "epoch": 0.05365410977185672, "grad_norm": 13.475788553071837, "learning_rate": 4.9998358139122e-05, "loss": 2.6494, "mean_token_accuracy": 0.4157289803028107, "step": 53270 }, { "epoch": 0.0536591458249609, "grad_norm": 13.377262777791412, "learning_rate": 4.9998353609283196e-05, "loss": 2.8284, "mean_token_accuracy": 0.38620689511299133, "step": 53275 }, { "epoch": 0.05366418187806507, "grad_norm": 12.806046963963635, "learning_rate": 4.9998349073204395e-05, "loss": 2.9124, "mean_token_accuracy": 0.36339987218379977, "step": 53280 }, { "epoch": 0.05366921793116924, "grad_norm": 10.830858858988913, "learning_rate": 4.999834453088559e-05, "loss": 2.6208, "mean_token_accuracy": 0.3848759770393372, "step": 53285 }, { "epoch": 0.05367425398427341, "grad_norm": 12.410675058286438, "learning_rate": 4.999833998232678e-05, "loss": 2.7488, "mean_token_accuracy": 0.36206896901130675, "step": 53290 }, { "epoch": 0.053679290037377585, "grad_norm": 10.310156045379767, "learning_rate": 4.999833542752797e-05, "loss": 2.4567, "mean_token_accuracy": 0.4068965494632721, "step": 53295 }, { "epoch": 0.05368432609048176, "grad_norm": 18.25908334129646, "learning_rate": 4.999833086648917e-05, "loss": 3.0179, "mean_token_accuracy": 0.3793103456497192, "step": 53300 }, { "epoch": 0.05368936214358593, "grad_norm": 12.350485789078926, "learning_rate": 4.999832629921037e-05, "loss": 2.6429, "mean_token_accuracy": 0.42758620381355283, "step": 53305 }, { "epoch": 0.053694398196690106, "grad_norm": 12.9789388557274, "learning_rate": 4.999832172569158e-05, "loss": 2.4805, "mean_token_accuracy": 0.41724138259887694, "step": 53310 }, { "epoch": 0.05369943424979428, "grad_norm": 13.202807711300615, "learning_rate": 4.999831714593279e-05, "loss": 2.4868, "mean_token_accuracy": 0.40344826877117157, "step": 53315 }, { "epoch": 0.05370447030289845, "grad_norm": 11.860060841812048, "learning_rate": 4.9998312559934015e-05, "loss": 2.7827, "mean_token_accuracy": 0.33103448450565337, "step": 53320 }, { "epoch": 0.05370950635600262, "grad_norm": 15.048911938393122, "learning_rate": 4.999830796769525e-05, "loss": 2.4595, "mean_token_accuracy": 0.42413792610168455, "step": 53325 }, { "epoch": 0.053714542409106794, "grad_norm": 11.783162414885554, "learning_rate": 4.9998303369216485e-05, "loss": 2.7389, "mean_token_accuracy": 0.3551724076271057, "step": 53330 }, { "epoch": 0.05371957846221097, "grad_norm": 10.613639827264706, "learning_rate": 4.9998298764497745e-05, "loss": 2.432, "mean_token_accuracy": 0.42413793206214906, "step": 53335 }, { "epoch": 0.05372461451531514, "grad_norm": 14.11402585171257, "learning_rate": 4.9998294153539e-05, "loss": 2.6934, "mean_token_accuracy": 0.37586206793785093, "step": 53340 }, { "epoch": 0.053729650568419315, "grad_norm": 10.322053677971374, "learning_rate": 4.999828953634028e-05, "loss": 2.1674, "mean_token_accuracy": 0.5034482836723327, "step": 53345 }, { "epoch": 0.05373468662152349, "grad_norm": 12.974255550693718, "learning_rate": 4.999828491290158e-05, "loss": 2.4477, "mean_token_accuracy": 0.42758620977401735, "step": 53350 }, { "epoch": 0.053739722674627656, "grad_norm": 12.831179856913947, "learning_rate": 4.999828028322289e-05, "loss": 2.5955, "mean_token_accuracy": 0.4, "step": 53355 }, { "epoch": 0.05374475872773183, "grad_norm": 12.139266041067364, "learning_rate": 4.999827564730422e-05, "loss": 2.1915, "mean_token_accuracy": 0.47931033968925474, "step": 53360 }, { "epoch": 0.053749794780836004, "grad_norm": 15.756750168391394, "learning_rate": 4.999827100514556e-05, "loss": 2.9547, "mean_token_accuracy": 0.3517241358757019, "step": 53365 }, { "epoch": 0.05375483083394018, "grad_norm": 13.123215494186805, "learning_rate": 4.999826635674694e-05, "loss": 2.8284, "mean_token_accuracy": 0.38620689511299133, "step": 53370 }, { "epoch": 0.05375986688704435, "grad_norm": 13.300777459230037, "learning_rate": 4.9998261702108326e-05, "loss": 2.8751, "mean_token_accuracy": 0.33793103098869326, "step": 53375 }, { "epoch": 0.053764902940148525, "grad_norm": 10.943921861277873, "learning_rate": 4.999825704122974e-05, "loss": 2.8933, "mean_token_accuracy": 0.33206291794776915, "step": 53380 }, { "epoch": 0.0537699389932527, "grad_norm": 13.350706019397954, "learning_rate": 4.999825237411118e-05, "loss": 2.9381, "mean_token_accuracy": 0.3482758641242981, "step": 53385 }, { "epoch": 0.053774975046356865, "grad_norm": 12.780375947019301, "learning_rate": 4.999824770075265e-05, "loss": 2.8705, "mean_token_accuracy": 0.3827586203813553, "step": 53390 }, { "epoch": 0.05378001109946104, "grad_norm": 16.802684385443204, "learning_rate": 4.999824302115414e-05, "loss": 2.7176, "mean_token_accuracy": 0.36551723480224607, "step": 53395 }, { "epoch": 0.05378504715256521, "grad_norm": 13.92322779619066, "learning_rate": 4.9998238335315654e-05, "loss": 2.5725, "mean_token_accuracy": 0.4034482777118683, "step": 53400 }, { "epoch": 0.05379008320566939, "grad_norm": 15.014922166388505, "learning_rate": 4.999823364323721e-05, "loss": 2.8755, "mean_token_accuracy": 0.33793103098869326, "step": 53405 }, { "epoch": 0.05379511925877356, "grad_norm": 14.2573605523194, "learning_rate": 4.999822894491879e-05, "loss": 2.3816, "mean_token_accuracy": 0.35862069129943847, "step": 53410 }, { "epoch": 0.053800155311877734, "grad_norm": 12.532273714394055, "learning_rate": 4.9998224240360396e-05, "loss": 2.6323, "mean_token_accuracy": 0.33103448152542114, "step": 53415 }, { "epoch": 0.05380519136498191, "grad_norm": 13.988515040582149, "learning_rate": 4.999821952956205e-05, "loss": 2.7388, "mean_token_accuracy": 0.3805807590484619, "step": 53420 }, { "epoch": 0.053810227418086075, "grad_norm": 13.682906345152427, "learning_rate": 4.999821481252373e-05, "loss": 2.4726, "mean_token_accuracy": 0.42758620977401735, "step": 53425 }, { "epoch": 0.05381526347119025, "grad_norm": 14.301343233340672, "learning_rate": 4.999821008924545e-05, "loss": 3.2137, "mean_token_accuracy": 0.3188747763633728, "step": 53430 }, { "epoch": 0.05382029952429442, "grad_norm": 18.289178843537798, "learning_rate": 4.99982053597272e-05, "loss": 2.8926, "mean_token_accuracy": 0.3482758641242981, "step": 53435 }, { "epoch": 0.053825335577398596, "grad_norm": 11.132204877060822, "learning_rate": 4.9998200623969e-05, "loss": 2.2486, "mean_token_accuracy": 0.4551724135875702, "step": 53440 }, { "epoch": 0.05383037163050277, "grad_norm": 11.701772598762922, "learning_rate": 4.999819588197083e-05, "loss": 2.4288, "mean_token_accuracy": 0.38275861740112305, "step": 53445 }, { "epoch": 0.053835407683606944, "grad_norm": 11.391821246140948, "learning_rate": 4.99981911337327e-05, "loss": 2.6397, "mean_token_accuracy": 0.4080459773540497, "step": 53450 }, { "epoch": 0.05384044373671112, "grad_norm": 11.69322178028498, "learning_rate": 4.9998186379254625e-05, "loss": 2.5769, "mean_token_accuracy": 0.42413792610168455, "step": 53455 }, { "epoch": 0.053845479789815284, "grad_norm": 14.991644009548025, "learning_rate": 4.999818161853659e-05, "loss": 2.9003, "mean_token_accuracy": 0.3655172407627106, "step": 53460 }, { "epoch": 0.05385051584291946, "grad_norm": 13.222980218432527, "learning_rate": 4.999817685157859e-05, "loss": 2.7091, "mean_token_accuracy": 0.3931034505367279, "step": 53465 }, { "epoch": 0.05385555189602363, "grad_norm": 13.96982971994947, "learning_rate": 4.999817207838065e-05, "loss": 2.4253, "mean_token_accuracy": 0.4068965494632721, "step": 53470 }, { "epoch": 0.053860587949127806, "grad_norm": 18.978969604747643, "learning_rate": 4.999816729894275e-05, "loss": 2.4091, "mean_token_accuracy": 0.37586206793785093, "step": 53475 }, { "epoch": 0.05386562400223198, "grad_norm": 17.76736945611813, "learning_rate": 4.999816251326491e-05, "loss": 2.3444, "mean_token_accuracy": 0.44289171099662783, "step": 53480 }, { "epoch": 0.05387066005533615, "grad_norm": 20.433569378202648, "learning_rate": 4.99981577213471e-05, "loss": 2.9264, "mean_token_accuracy": 0.358620685338974, "step": 53485 }, { "epoch": 0.05387569610844033, "grad_norm": 21.83505482978997, "learning_rate": 4.999815292318936e-05, "loss": 2.5636, "mean_token_accuracy": 0.4034482777118683, "step": 53490 }, { "epoch": 0.053880732161544494, "grad_norm": 13.616043057728692, "learning_rate": 4.999814811879167e-05, "loss": 2.7593, "mean_token_accuracy": 0.3931034505367279, "step": 53495 }, { "epoch": 0.05388576821464867, "grad_norm": 12.561869388210585, "learning_rate": 4.999814330815403e-05, "loss": 2.4966, "mean_token_accuracy": 0.4413793087005615, "step": 53500 }, { "epoch": 0.05389080426775284, "grad_norm": 10.028205253702849, "learning_rate": 4.9998138491276455e-05, "loss": 2.4312, "mean_token_accuracy": 0.4122807025909424, "step": 53505 }, { "epoch": 0.053895840320857015, "grad_norm": 13.929207773274339, "learning_rate": 4.999813366815893e-05, "loss": 2.5925, "mean_token_accuracy": 0.38965516686439516, "step": 53510 }, { "epoch": 0.05390087637396119, "grad_norm": 12.343186518783545, "learning_rate": 4.999812883880146e-05, "loss": 2.4939, "mean_token_accuracy": 0.417241370677948, "step": 53515 }, { "epoch": 0.05390591242706536, "grad_norm": 13.028492009216768, "learning_rate": 4.999812400320406e-05, "loss": 3.0868, "mean_token_accuracy": 0.31379310190677645, "step": 53520 }, { "epoch": 0.053910948480169536, "grad_norm": 13.338966609837291, "learning_rate": 4.999811916136671e-05, "loss": 2.2548, "mean_token_accuracy": 0.42758620977401735, "step": 53525 }, { "epoch": 0.0539159845332737, "grad_norm": 13.113855228020165, "learning_rate": 4.999811431328944e-05, "loss": 3.0098, "mean_token_accuracy": 0.358620685338974, "step": 53530 }, { "epoch": 0.05392102058637788, "grad_norm": 14.29777802306572, "learning_rate": 4.999810945897221e-05, "loss": 2.2996, "mean_token_accuracy": 0.458620685338974, "step": 53535 }, { "epoch": 0.05392605663948205, "grad_norm": 21.073877416446035, "learning_rate": 4.9998104598415065e-05, "loss": 2.714, "mean_token_accuracy": 0.3965517163276672, "step": 53540 }, { "epoch": 0.053931092692586224, "grad_norm": 10.880882504412943, "learning_rate": 4.9998099731617975e-05, "loss": 3.044, "mean_token_accuracy": 0.4068965494632721, "step": 53545 }, { "epoch": 0.0539361287456904, "grad_norm": 10.185084705512226, "learning_rate": 4.9998094858580956e-05, "loss": 2.5614, "mean_token_accuracy": 0.43103448748588563, "step": 53550 }, { "epoch": 0.05394116479879457, "grad_norm": 12.025156945931665, "learning_rate": 4.999808997930402e-05, "loss": 2.6493, "mean_token_accuracy": 0.4000000059604645, "step": 53555 }, { "epoch": 0.053946200851898746, "grad_norm": 10.184677405639917, "learning_rate": 4.999808509378713e-05, "loss": 2.4094, "mean_token_accuracy": 0.43793103098869324, "step": 53560 }, { "epoch": 0.05395123690500291, "grad_norm": 11.609287808930308, "learning_rate": 4.999808020203033e-05, "loss": 2.6424, "mean_token_accuracy": 0.3827586233615875, "step": 53565 }, { "epoch": 0.053956272958107086, "grad_norm": 16.695190736609316, "learning_rate": 4.99980753040336e-05, "loss": 2.6145, "mean_token_accuracy": 0.36206896901130675, "step": 53570 }, { "epoch": 0.05396130901121126, "grad_norm": 15.110250873955165, "learning_rate": 4.999807039979695e-05, "loss": 3.053, "mean_token_accuracy": 0.3620689630508423, "step": 53575 }, { "epoch": 0.053966345064315434, "grad_norm": 14.193496698538517, "learning_rate": 4.9998065489320365e-05, "loss": 2.363, "mean_token_accuracy": 0.39655172228813174, "step": 53580 }, { "epoch": 0.05397138111741961, "grad_norm": 11.129491206058479, "learning_rate": 4.999806057260387e-05, "loss": 2.3486, "mean_token_accuracy": 0.4482758641242981, "step": 53585 }, { "epoch": 0.05397641717052378, "grad_norm": 11.943943954695788, "learning_rate": 4.9998055649647445e-05, "loss": 2.5825, "mean_token_accuracy": 0.3862069010734558, "step": 53590 }, { "epoch": 0.053981453223627955, "grad_norm": 12.238307474477383, "learning_rate": 4.99980507204511e-05, "loss": 2.2918, "mean_token_accuracy": 0.458620685338974, "step": 53595 }, { "epoch": 0.05398648927673212, "grad_norm": 12.154455113336633, "learning_rate": 4.999804578501484e-05, "loss": 2.9044, "mean_token_accuracy": 0.37586206793785093, "step": 53600 }, { "epoch": 0.053991525329836296, "grad_norm": 12.80897333719012, "learning_rate": 4.9998040843338665e-05, "loss": 2.4185, "mean_token_accuracy": 0.41034482717514037, "step": 53605 }, { "epoch": 0.05399656138294047, "grad_norm": 12.236871033425325, "learning_rate": 4.999803589542257e-05, "loss": 2.4977, "mean_token_accuracy": 0.4310344815254211, "step": 53610 }, { "epoch": 0.05400159743604464, "grad_norm": 12.470547130273014, "learning_rate": 4.999803094126656e-05, "loss": 2.8487, "mean_token_accuracy": 0.37241379618644715, "step": 53615 }, { "epoch": 0.05400663348914882, "grad_norm": 12.997722904682266, "learning_rate": 4.9998025980870644e-05, "loss": 2.5829, "mean_token_accuracy": 0.39655172228813174, "step": 53620 }, { "epoch": 0.05401166954225299, "grad_norm": 11.080540512830911, "learning_rate": 4.999802101423481e-05, "loss": 2.1365, "mean_token_accuracy": 0.4344827592372894, "step": 53625 }, { "epoch": 0.054016705595357165, "grad_norm": 11.36495288910569, "learning_rate": 4.999801604135907e-05, "loss": 2.4176, "mean_token_accuracy": 0.4137930989265442, "step": 53630 }, { "epoch": 0.05402174164846133, "grad_norm": 13.563556905218828, "learning_rate": 4.999801106224342e-05, "loss": 2.3597, "mean_token_accuracy": 0.4532970428466797, "step": 53635 }, { "epoch": 0.054026777701565505, "grad_norm": 14.694521090131635, "learning_rate": 4.999800607688786e-05, "loss": 3.0304, "mean_token_accuracy": 0.36370235979557036, "step": 53640 }, { "epoch": 0.05403181375466968, "grad_norm": 12.406892401175796, "learning_rate": 4.99980010852924e-05, "loss": 2.6515, "mean_token_accuracy": 0.38275861740112305, "step": 53645 }, { "epoch": 0.05403684980777385, "grad_norm": 12.304064391575528, "learning_rate": 4.9997996087457026e-05, "loss": 2.9965, "mean_token_accuracy": 0.3068965539336205, "step": 53650 }, { "epoch": 0.054041885860878026, "grad_norm": 10.745659492088434, "learning_rate": 4.9997991083381754e-05, "loss": 2.5702, "mean_token_accuracy": 0.3999999940395355, "step": 53655 }, { "epoch": 0.0540469219139822, "grad_norm": 13.345864484364192, "learning_rate": 4.999798607306658e-05, "loss": 2.7008, "mean_token_accuracy": 0.4034482777118683, "step": 53660 }, { "epoch": 0.054051957967086374, "grad_norm": 13.730207462407472, "learning_rate": 4.999798105651151e-05, "loss": 2.3414, "mean_token_accuracy": 0.4413793087005615, "step": 53665 }, { "epoch": 0.05405699402019054, "grad_norm": 12.363216610526056, "learning_rate": 4.999797603371654e-05, "loss": 2.7455, "mean_token_accuracy": 0.3810042321681976, "step": 53670 }, { "epoch": 0.054062030073294715, "grad_norm": 11.444101484058908, "learning_rate": 4.9997971004681665e-05, "loss": 2.4455, "mean_token_accuracy": 0.4137930989265442, "step": 53675 }, { "epoch": 0.05406706612639889, "grad_norm": 10.474391557150467, "learning_rate": 4.9997965969406896e-05, "loss": 2.3744, "mean_token_accuracy": 0.42413792610168455, "step": 53680 }, { "epoch": 0.05407210217950306, "grad_norm": 14.077640673046057, "learning_rate": 4.999796092789224e-05, "loss": 3.4186, "mean_token_accuracy": 0.3241379350423813, "step": 53685 }, { "epoch": 0.054077138232607236, "grad_norm": 17.160786387542075, "learning_rate": 4.999795588013769e-05, "loss": 2.5364, "mean_token_accuracy": 0.41034482717514037, "step": 53690 }, { "epoch": 0.05408217428571141, "grad_norm": 10.766147743582179, "learning_rate": 4.999795082614324e-05, "loss": 2.6038, "mean_token_accuracy": 0.3896551698446274, "step": 53695 }, { "epoch": 0.05408721033881558, "grad_norm": 14.677747092803214, "learning_rate": 4.9997945765908896e-05, "loss": 2.5502, "mean_token_accuracy": 0.42068964838981626, "step": 53700 }, { "epoch": 0.05409224639191975, "grad_norm": 12.769271928328553, "learning_rate": 4.999794069943467e-05, "loss": 2.7477, "mean_token_accuracy": 0.3793103456497192, "step": 53705 }, { "epoch": 0.054097282445023924, "grad_norm": 12.400346931510436, "learning_rate": 4.999793562672056e-05, "loss": 2.6056, "mean_token_accuracy": 0.3793103456497192, "step": 53710 }, { "epoch": 0.0541023184981281, "grad_norm": 11.287944249081166, "learning_rate": 4.999793054776656e-05, "loss": 2.9649, "mean_token_accuracy": 0.324137932062149, "step": 53715 }, { "epoch": 0.05410735455123227, "grad_norm": 10.969953116591077, "learning_rate": 4.9997925462572674e-05, "loss": 3.4667, "mean_token_accuracy": 0.34482758343219755, "step": 53720 }, { "epoch": 0.054112390604336445, "grad_norm": 13.054006170367336, "learning_rate": 4.999792037113891e-05, "loss": 3.0844, "mean_token_accuracy": 0.3551724135875702, "step": 53725 }, { "epoch": 0.05411742665744062, "grad_norm": 16.037814288865174, "learning_rate": 4.999791527346525e-05, "loss": 2.4932, "mean_token_accuracy": 0.3931034505367279, "step": 53730 }, { "epoch": 0.05412246271054479, "grad_norm": 17.213810627283163, "learning_rate": 4.999791016955172e-05, "loss": 2.4509, "mean_token_accuracy": 0.3862069010734558, "step": 53735 }, { "epoch": 0.05412749876364896, "grad_norm": 11.716465574290336, "learning_rate": 4.999790505939831e-05, "loss": 2.2793, "mean_token_accuracy": 0.45517241954803467, "step": 53740 }, { "epoch": 0.05413253481675313, "grad_norm": 12.003322282470714, "learning_rate": 4.999789994300502e-05, "loss": 2.5329, "mean_token_accuracy": 0.41034482717514037, "step": 53745 }, { "epoch": 0.05413757086985731, "grad_norm": 11.703077591036108, "learning_rate": 4.9997894820371855e-05, "loss": 2.6602, "mean_token_accuracy": 0.4053841531276703, "step": 53750 }, { "epoch": 0.05414260692296148, "grad_norm": 12.497987994517377, "learning_rate": 4.9997889691498814e-05, "loss": 2.8614, "mean_token_accuracy": 0.36896551251411436, "step": 53755 }, { "epoch": 0.054147642976065655, "grad_norm": 14.65336341702027, "learning_rate": 4.99978845563859e-05, "loss": 2.9145, "mean_token_accuracy": 0.34137930572032926, "step": 53760 }, { "epoch": 0.05415267902916983, "grad_norm": 10.946588020110655, "learning_rate": 4.9997879415033114e-05, "loss": 2.2789, "mean_token_accuracy": 0.4504537105560303, "step": 53765 }, { "epoch": 0.054157715082274, "grad_norm": 10.62388792977892, "learning_rate": 4.9997874267440455e-05, "loss": 2.6099, "mean_token_accuracy": 0.34137930870056155, "step": 53770 }, { "epoch": 0.05416275113537817, "grad_norm": 13.3845489452758, "learning_rate": 4.9997869113607935e-05, "loss": 2.6704, "mean_token_accuracy": 0.36551723480224607, "step": 53775 }, { "epoch": 0.05416778718848234, "grad_norm": 12.321608355058734, "learning_rate": 4.999786395353553e-05, "loss": 2.3503, "mean_token_accuracy": 0.3827586233615875, "step": 53780 }, { "epoch": 0.05417282324158652, "grad_norm": 15.758723657304026, "learning_rate": 4.9997858787223276e-05, "loss": 2.5535, "mean_token_accuracy": 0.36551723778247835, "step": 53785 }, { "epoch": 0.05417785929469069, "grad_norm": 12.708130712218528, "learning_rate": 4.999785361467115e-05, "loss": 2.842, "mean_token_accuracy": 0.41203871965408323, "step": 53790 }, { "epoch": 0.054182895347794864, "grad_norm": 11.452532087921881, "learning_rate": 4.999784843587916e-05, "loss": 1.9419, "mean_token_accuracy": 0.5185960710048676, "step": 53795 }, { "epoch": 0.05418793140089904, "grad_norm": 13.008851026016064, "learning_rate": 4.999784325084731e-05, "loss": 2.6844, "mean_token_accuracy": 0.39310344457626345, "step": 53800 }, { "epoch": 0.05419296745400321, "grad_norm": 11.156132799423109, "learning_rate": 4.9997838059575596e-05, "loss": 2.3681, "mean_token_accuracy": 0.447126442193985, "step": 53805 }, { "epoch": 0.05419800350710738, "grad_norm": 14.593307415040004, "learning_rate": 4.999783286206402e-05, "loss": 2.9648, "mean_token_accuracy": 0.37586206793785093, "step": 53810 }, { "epoch": 0.05420303956021155, "grad_norm": 10.3345314067087, "learning_rate": 4.999782765831259e-05, "loss": 2.7223, "mean_token_accuracy": 0.3896551698446274, "step": 53815 }, { "epoch": 0.054208075613315726, "grad_norm": 12.376332067036385, "learning_rate": 4.999782244832131e-05, "loss": 2.5899, "mean_token_accuracy": 0.38620689511299133, "step": 53820 }, { "epoch": 0.0542131116664199, "grad_norm": 10.452128327487172, "learning_rate": 4.999781723209016e-05, "loss": 2.2664, "mean_token_accuracy": 0.4034482717514038, "step": 53825 }, { "epoch": 0.054218147719524074, "grad_norm": 18.391573364387483, "learning_rate": 4.9997812009619174e-05, "loss": 3.0707, "mean_token_accuracy": 0.358620685338974, "step": 53830 }, { "epoch": 0.05422318377262825, "grad_norm": 14.890771930466618, "learning_rate": 4.9997806780908325e-05, "loss": 2.9583, "mean_token_accuracy": 0.33448276221752166, "step": 53835 }, { "epoch": 0.05422821982573242, "grad_norm": 11.494590459401621, "learning_rate": 4.999780154595763e-05, "loss": 2.688, "mean_token_accuracy": 0.4, "step": 53840 }, { "epoch": 0.05423325587883659, "grad_norm": 13.91047496171786, "learning_rate": 4.999779630476708e-05, "loss": 2.6772, "mean_token_accuracy": 0.4379310369491577, "step": 53845 }, { "epoch": 0.05423829193194076, "grad_norm": 14.142156072605141, "learning_rate": 4.999779105733669e-05, "loss": 2.8989, "mean_token_accuracy": 0.3620689630508423, "step": 53850 }, { "epoch": 0.054243327985044935, "grad_norm": 12.225168186552901, "learning_rate": 4.999778580366644e-05, "loss": 2.4245, "mean_token_accuracy": 0.40689654350280763, "step": 53855 }, { "epoch": 0.05424836403814911, "grad_norm": 14.784843511490099, "learning_rate": 4.999778054375636e-05, "loss": 2.4102, "mean_token_accuracy": 0.4310344815254211, "step": 53860 }, { "epoch": 0.05425340009125328, "grad_norm": 13.193032349691608, "learning_rate": 4.999777527760643e-05, "loss": 2.7318, "mean_token_accuracy": 0.37586207389831544, "step": 53865 }, { "epoch": 0.05425843614435746, "grad_norm": 10.308914678114846, "learning_rate": 4.999777000521666e-05, "loss": 2.536, "mean_token_accuracy": 0.4206896543502808, "step": 53870 }, { "epoch": 0.05426347219746163, "grad_norm": 17.11432377423662, "learning_rate": 4.999776472658705e-05, "loss": 2.3365, "mean_token_accuracy": 0.44137930274009707, "step": 53875 }, { "epoch": 0.0542685082505658, "grad_norm": 9.408458829399539, "learning_rate": 4.99977594417176e-05, "loss": 2.6308, "mean_token_accuracy": 0.3902709364891052, "step": 53880 }, { "epoch": 0.05427354430366997, "grad_norm": 19.909679161899852, "learning_rate": 4.999775415060832e-05, "loss": 3.726, "mean_token_accuracy": 0.29655172526836393, "step": 53885 }, { "epoch": 0.054278580356774145, "grad_norm": 10.492820950117368, "learning_rate": 4.9997748853259184e-05, "loss": 2.4399, "mean_token_accuracy": 0.4000000059604645, "step": 53890 }, { "epoch": 0.05428361640987832, "grad_norm": 10.887721613014627, "learning_rate": 4.999774354967023e-05, "loss": 2.0946, "mean_token_accuracy": 0.4620689690113068, "step": 53895 }, { "epoch": 0.05428865246298249, "grad_norm": 9.215919668736971, "learning_rate": 4.999773823984144e-05, "loss": 2.3564, "mean_token_accuracy": 0.4413793087005615, "step": 53900 }, { "epoch": 0.054293688516086666, "grad_norm": 16.040806172194344, "learning_rate": 4.9997732923772816e-05, "loss": 3.0961, "mean_token_accuracy": 0.379310342669487, "step": 53905 }, { "epoch": 0.05429872456919084, "grad_norm": 10.386997877987103, "learning_rate": 4.999772760146436e-05, "loss": 2.4712, "mean_token_accuracy": 0.37241379618644715, "step": 53910 }, { "epoch": 0.05430376062229501, "grad_norm": 10.771292438339708, "learning_rate": 4.999772227291608e-05, "loss": 2.2725, "mean_token_accuracy": 0.4482758641242981, "step": 53915 }, { "epoch": 0.05430879667539918, "grad_norm": 11.587152762044953, "learning_rate": 4.999771693812797e-05, "loss": 2.5614, "mean_token_accuracy": 0.3413792967796326, "step": 53920 }, { "epoch": 0.054313832728503354, "grad_norm": 10.417644101112003, "learning_rate": 4.999771159710004e-05, "loss": 2.5876, "mean_token_accuracy": 0.39818512201309203, "step": 53925 }, { "epoch": 0.05431886878160753, "grad_norm": 13.604045167660464, "learning_rate": 4.999770624983228e-05, "loss": 2.5467, "mean_token_accuracy": 0.4068965494632721, "step": 53930 }, { "epoch": 0.0543239048347117, "grad_norm": 11.767215749978025, "learning_rate": 4.999770089632469e-05, "loss": 2.3774, "mean_token_accuracy": 0.3965517163276672, "step": 53935 }, { "epoch": 0.054328940887815876, "grad_norm": 10.601196216208526, "learning_rate": 4.999769553657729e-05, "loss": 2.4349, "mean_token_accuracy": 0.36551723480224607, "step": 53940 }, { "epoch": 0.05433397694092005, "grad_norm": 15.275302812283423, "learning_rate": 4.9997690170590074e-05, "loss": 3.6193, "mean_token_accuracy": 0.27931034564971924, "step": 53945 }, { "epoch": 0.054339012994024216, "grad_norm": 13.18380812645147, "learning_rate": 4.9997684798363024e-05, "loss": 2.8234, "mean_token_accuracy": 0.39655172228813174, "step": 53950 }, { "epoch": 0.05434404904712839, "grad_norm": 11.211606771693553, "learning_rate": 4.999767941989617e-05, "loss": 2.5606, "mean_token_accuracy": 0.3620689630508423, "step": 53955 }, { "epoch": 0.054349085100232564, "grad_norm": 13.057766226252904, "learning_rate": 4.9997674035189496e-05, "loss": 2.8529, "mean_token_accuracy": 0.3551724076271057, "step": 53960 }, { "epoch": 0.05435412115333674, "grad_norm": 11.985242858576337, "learning_rate": 4.9997668644243004e-05, "loss": 2.8048, "mean_token_accuracy": 0.33103448152542114, "step": 53965 }, { "epoch": 0.05435915720644091, "grad_norm": 12.872207299026323, "learning_rate": 4.999766324705671e-05, "loss": 2.6505, "mean_token_accuracy": 0.39655172228813174, "step": 53970 }, { "epoch": 0.054364193259545085, "grad_norm": 12.934939352560217, "learning_rate": 4.99976578436306e-05, "loss": 2.6791, "mean_token_accuracy": 0.3862068891525269, "step": 53975 }, { "epoch": 0.05436922931264926, "grad_norm": 13.198426602591098, "learning_rate": 4.999765243396468e-05, "loss": 2.1877, "mean_token_accuracy": 0.4620689630508423, "step": 53980 }, { "epoch": 0.054374265365753426, "grad_norm": 13.486393585761201, "learning_rate": 4.999764701805895e-05, "loss": 2.9929, "mean_token_accuracy": 0.324137932062149, "step": 53985 }, { "epoch": 0.0543793014188576, "grad_norm": 14.76517937377239, "learning_rate": 4.999764159591342e-05, "loss": 3.0706, "mean_token_accuracy": 0.36896551251411436, "step": 53990 }, { "epoch": 0.05438433747196177, "grad_norm": 9.428751123244538, "learning_rate": 4.9997636167528076e-05, "loss": 2.6947, "mean_token_accuracy": 0.41034482717514037, "step": 53995 }, { "epoch": 0.05438937352506595, "grad_norm": 12.285506809661081, "learning_rate": 4.999763073290294e-05, "loss": 2.5627, "mean_token_accuracy": 0.3931034505367279, "step": 54000 }, { "epoch": 0.05439440957817012, "grad_norm": 10.736808833239031, "learning_rate": 4.9997625292038e-05, "loss": 2.2772, "mean_token_accuracy": 0.39310344457626345, "step": 54005 }, { "epoch": 0.054399445631274294, "grad_norm": 13.27385560071303, "learning_rate": 4.999761984493325e-05, "loss": 2.3316, "mean_token_accuracy": 0.42413793206214906, "step": 54010 }, { "epoch": 0.05440448168437847, "grad_norm": 14.079620012529094, "learning_rate": 4.999761439158871e-05, "loss": 2.2749, "mean_token_accuracy": 0.4413793087005615, "step": 54015 }, { "epoch": 0.054409517737482635, "grad_norm": 15.80187386483194, "learning_rate": 4.999760893200437e-05, "loss": 2.748, "mean_token_accuracy": 0.362068971991539, "step": 54020 }, { "epoch": 0.05441455379058681, "grad_norm": 13.297376669183109, "learning_rate": 4.9997603466180234e-05, "loss": 3.0282, "mean_token_accuracy": 0.38275861740112305, "step": 54025 }, { "epoch": 0.05441958984369098, "grad_norm": 14.885002948421775, "learning_rate": 4.9997597994116304e-05, "loss": 2.5697, "mean_token_accuracy": 0.3655172437429428, "step": 54030 }, { "epoch": 0.054424625896795156, "grad_norm": 14.516186343224915, "learning_rate": 4.9997592515812574e-05, "loss": 2.4575, "mean_token_accuracy": 0.37586207389831544, "step": 54035 }, { "epoch": 0.05442966194989933, "grad_norm": 12.207578521600366, "learning_rate": 4.999758703126906e-05, "loss": 2.544, "mean_token_accuracy": 0.37931033968925476, "step": 54040 }, { "epoch": 0.054434698003003504, "grad_norm": 12.319141839508259, "learning_rate": 4.999758154048576e-05, "loss": 2.3456, "mean_token_accuracy": 0.43103447556495667, "step": 54045 }, { "epoch": 0.05443973405610768, "grad_norm": 17.911899431205587, "learning_rate": 4.9997576043462675e-05, "loss": 2.6841, "mean_token_accuracy": 0.403448274731636, "step": 54050 }, { "epoch": 0.054444770109211844, "grad_norm": 16.376945136733447, "learning_rate": 4.999757054019979e-05, "loss": 2.9254, "mean_token_accuracy": 0.3551724135875702, "step": 54055 }, { "epoch": 0.05444980616231602, "grad_norm": 11.122444459641732, "learning_rate": 4.999756503069713e-05, "loss": 2.3591, "mean_token_accuracy": 0.4398064136505127, "step": 54060 }, { "epoch": 0.05445484221542019, "grad_norm": 10.914612837861824, "learning_rate": 4.999755951495468e-05, "loss": 2.4446, "mean_token_accuracy": 0.43793103098869324, "step": 54065 }, { "epoch": 0.054459878268524366, "grad_norm": 16.312173981981157, "learning_rate": 4.999755399297245e-05, "loss": 2.6329, "mean_token_accuracy": 0.4068965554237366, "step": 54070 }, { "epoch": 0.05446491432162854, "grad_norm": 10.776177989232732, "learning_rate": 4.999754846475044e-05, "loss": 2.0429, "mean_token_accuracy": 0.46739262342453003, "step": 54075 }, { "epoch": 0.05446995037473271, "grad_norm": 10.042907189674924, "learning_rate": 4.999754293028865e-05, "loss": 2.4901, "mean_token_accuracy": 0.4344827592372894, "step": 54080 }, { "epoch": 0.05447498642783689, "grad_norm": 13.37657953275907, "learning_rate": 4.999753738958708e-05, "loss": 2.5044, "mean_token_accuracy": 0.43793103098869324, "step": 54085 }, { "epoch": 0.054480022480941054, "grad_norm": 11.733893965659687, "learning_rate": 4.9997531842645745e-05, "loss": 2.5528, "mean_token_accuracy": 0.38808228075504303, "step": 54090 }, { "epoch": 0.05448505853404523, "grad_norm": 10.895627029035035, "learning_rate": 4.999752628946463e-05, "loss": 2.5496, "mean_token_accuracy": 0.39655172228813174, "step": 54095 }, { "epoch": 0.0544900945871494, "grad_norm": 12.456552738200319, "learning_rate": 4.9997520730043736e-05, "loss": 3.1742, "mean_token_accuracy": 0.36896551251411436, "step": 54100 }, { "epoch": 0.054495130640253575, "grad_norm": 11.188090946774155, "learning_rate": 4.999751516438307e-05, "loss": 2.6878, "mean_token_accuracy": 0.37931033968925476, "step": 54105 }, { "epoch": 0.05450016669335775, "grad_norm": 11.231086997805155, "learning_rate": 4.999750959248264e-05, "loss": 2.6557, "mean_token_accuracy": 0.3896551728248596, "step": 54110 }, { "epoch": 0.05450520274646192, "grad_norm": 15.856141130023543, "learning_rate": 4.9997504014342444e-05, "loss": 2.4951, "mean_token_accuracy": 0.37586206793785093, "step": 54115 }, { "epoch": 0.054510238799566096, "grad_norm": 12.261658078039867, "learning_rate": 4.999749842996249e-05, "loss": 2.5281, "mean_token_accuracy": 0.3793103456497192, "step": 54120 }, { "epoch": 0.05451527485267026, "grad_norm": 21.43156017662129, "learning_rate": 4.9997492839342744e-05, "loss": 2.9841, "mean_token_accuracy": 0.3827586114406586, "step": 54125 }, { "epoch": 0.05452031090577444, "grad_norm": 12.239773363389329, "learning_rate": 4.999748724248326e-05, "loss": 2.6943, "mean_token_accuracy": 0.3551724135875702, "step": 54130 }, { "epoch": 0.05452534695887861, "grad_norm": 14.604508286717447, "learning_rate": 4.9997481639384e-05, "loss": 2.4466, "mean_token_accuracy": 0.3862069010734558, "step": 54135 }, { "epoch": 0.054530383011982785, "grad_norm": 10.991348896845013, "learning_rate": 4.999747603004499e-05, "loss": 2.7659, "mean_token_accuracy": 0.3931034505367279, "step": 54140 }, { "epoch": 0.05453541906508696, "grad_norm": 9.415641213471579, "learning_rate": 4.999747041446621e-05, "loss": 2.4321, "mean_token_accuracy": 0.44664246439933775, "step": 54145 }, { "epoch": 0.05454045511819113, "grad_norm": 8.548283421150996, "learning_rate": 4.999746479264768e-05, "loss": 2.4075, "mean_token_accuracy": 0.47428917288780215, "step": 54150 }, { "epoch": 0.054545491171295306, "grad_norm": 11.059116112859952, "learning_rate": 4.999745916458939e-05, "loss": 2.8004, "mean_token_accuracy": 0.39655172228813174, "step": 54155 }, { "epoch": 0.05455052722439947, "grad_norm": 12.248777925373007, "learning_rate": 4.999745353029134e-05, "loss": 2.5656, "mean_token_accuracy": 0.4103448331356049, "step": 54160 }, { "epoch": 0.054555563277503646, "grad_norm": 11.075679094109898, "learning_rate": 4.999744788975355e-05, "loss": 2.6393, "mean_token_accuracy": 0.3793103456497192, "step": 54165 }, { "epoch": 0.05456059933060782, "grad_norm": 16.17636583863506, "learning_rate": 4.9997442242976e-05, "loss": 2.7734, "mean_token_accuracy": 0.4034482777118683, "step": 54170 }, { "epoch": 0.054565635383711994, "grad_norm": 12.715526057272394, "learning_rate": 4.999743658995871e-05, "loss": 2.5157, "mean_token_accuracy": 0.4497277677059174, "step": 54175 }, { "epoch": 0.05457067143681617, "grad_norm": 13.87766693780733, "learning_rate": 4.999743093070167e-05, "loss": 2.7745, "mean_token_accuracy": 0.3827586233615875, "step": 54180 }, { "epoch": 0.05457570748992034, "grad_norm": 17.41464747954218, "learning_rate": 4.9997425265204876e-05, "loss": 2.795, "mean_token_accuracy": 0.38717483878135683, "step": 54185 }, { "epoch": 0.054580743543024515, "grad_norm": 11.779048531960992, "learning_rate": 4.999741959346834e-05, "loss": 2.5468, "mean_token_accuracy": 0.4034482717514038, "step": 54190 }, { "epoch": 0.05458577959612868, "grad_norm": 13.465500772077153, "learning_rate": 4.999741391549206e-05, "loss": 2.7821, "mean_token_accuracy": 0.38620689511299133, "step": 54195 }, { "epoch": 0.054590815649232856, "grad_norm": 13.776070959656394, "learning_rate": 4.9997408231276036e-05, "loss": 2.793, "mean_token_accuracy": 0.4103448212146759, "step": 54200 }, { "epoch": 0.05459585170233703, "grad_norm": 10.374760641388725, "learning_rate": 4.999740254082028e-05, "loss": 2.6205, "mean_token_accuracy": 0.4, "step": 54205 }, { "epoch": 0.0546008877554412, "grad_norm": 10.685216006201408, "learning_rate": 4.9997396844124784e-05, "loss": 3.007, "mean_token_accuracy": 0.37241379022598264, "step": 54210 }, { "epoch": 0.05460592380854538, "grad_norm": 14.334845886791479, "learning_rate": 4.999739114118954e-05, "loss": 2.425, "mean_token_accuracy": 0.3931034505367279, "step": 54215 }, { "epoch": 0.05461095986164955, "grad_norm": 11.875115785044175, "learning_rate": 4.9997385432014575e-05, "loss": 2.7835, "mean_token_accuracy": 0.3896551728248596, "step": 54220 }, { "epoch": 0.054615995914753725, "grad_norm": 12.754674301187459, "learning_rate": 4.999737971659987e-05, "loss": 2.5647, "mean_token_accuracy": 0.41034482717514037, "step": 54225 }, { "epoch": 0.05462103196785789, "grad_norm": 15.23874272656464, "learning_rate": 4.999737399494543e-05, "loss": 2.7344, "mean_token_accuracy": 0.38965517580509185, "step": 54230 }, { "epoch": 0.054626068020962065, "grad_norm": 36.901815142983025, "learning_rate": 4.999736826705126e-05, "loss": 2.0797, "mean_token_accuracy": 0.4344827592372894, "step": 54235 }, { "epoch": 0.05463110407406624, "grad_norm": 13.430485074266526, "learning_rate": 4.999736253291737e-05, "loss": 2.5209, "mean_token_accuracy": 0.4, "step": 54240 }, { "epoch": 0.05463614012717041, "grad_norm": 17.49833231313112, "learning_rate": 4.999735679254374e-05, "loss": 2.6312, "mean_token_accuracy": 0.3931034505367279, "step": 54245 }, { "epoch": 0.05464117618027459, "grad_norm": 13.262399028389577, "learning_rate": 4.999735104593039e-05, "loss": 2.6093, "mean_token_accuracy": 0.3551724076271057, "step": 54250 }, { "epoch": 0.05464621223337876, "grad_norm": 20.89465837522596, "learning_rate": 4.9997345293077316e-05, "loss": 2.4356, "mean_token_accuracy": 0.3946158468723297, "step": 54255 }, { "epoch": 0.054651248286482934, "grad_norm": 10.647909151783944, "learning_rate": 4.999733953398452e-05, "loss": 2.4035, "mean_token_accuracy": 0.38620689511299133, "step": 54260 }, { "epoch": 0.0546562843395871, "grad_norm": 12.837324155958994, "learning_rate": 4.999733376865201e-05, "loss": 2.6298, "mean_token_accuracy": 0.3931034475564957, "step": 54265 }, { "epoch": 0.054661320392691275, "grad_norm": 13.924427303998854, "learning_rate": 4.999732799707977e-05, "loss": 2.4358, "mean_token_accuracy": 0.3620689660310745, "step": 54270 }, { "epoch": 0.05466635644579545, "grad_norm": 11.49985932214395, "learning_rate": 4.999732221926781e-05, "loss": 2.495, "mean_token_accuracy": 0.42274651527404783, "step": 54275 }, { "epoch": 0.05467139249889962, "grad_norm": 13.135449276344067, "learning_rate": 4.999731643521614e-05, "loss": 2.5321, "mean_token_accuracy": 0.3724137842655182, "step": 54280 }, { "epoch": 0.054676428552003796, "grad_norm": 11.62245100960335, "learning_rate": 4.999731064492476e-05, "loss": 2.2225, "mean_token_accuracy": 0.4620689630508423, "step": 54285 }, { "epoch": 0.05468146460510797, "grad_norm": 14.027978017442027, "learning_rate": 4.9997304848393656e-05, "loss": 2.6396, "mean_token_accuracy": 0.41379310488700866, "step": 54290 }, { "epoch": 0.054686500658212144, "grad_norm": 27.80286178382515, "learning_rate": 4.999729904562285e-05, "loss": 2.7357, "mean_token_accuracy": 0.37931033968925476, "step": 54295 }, { "epoch": 0.05469153671131631, "grad_norm": 11.32676543320186, "learning_rate": 4.999729323661232e-05, "loss": 2.1157, "mean_token_accuracy": 0.4689655125141144, "step": 54300 }, { "epoch": 0.054696572764420484, "grad_norm": 12.728543843048245, "learning_rate": 4.999728742136209e-05, "loss": 2.599, "mean_token_accuracy": 0.3620689630508423, "step": 54305 }, { "epoch": 0.05470160881752466, "grad_norm": 14.131418345484875, "learning_rate": 4.999728159987217e-05, "loss": 2.5245, "mean_token_accuracy": 0.3965517282485962, "step": 54310 }, { "epoch": 0.05470664487062883, "grad_norm": 13.708912455739048, "learning_rate": 4.9997275772142524e-05, "loss": 2.4708, "mean_token_accuracy": 0.38620689511299133, "step": 54315 }, { "epoch": 0.054711680923733005, "grad_norm": 13.183366545925505, "learning_rate": 4.9997269938173176e-05, "loss": 2.6331, "mean_token_accuracy": 0.33793102502822875, "step": 54320 }, { "epoch": 0.05471671697683718, "grad_norm": 12.059306886787619, "learning_rate": 4.999726409796413e-05, "loss": 2.5516, "mean_token_accuracy": 0.39310344457626345, "step": 54325 }, { "epoch": 0.05472175302994135, "grad_norm": 11.62830121580452, "learning_rate": 4.999725825151538e-05, "loss": 2.769, "mean_token_accuracy": 0.41379310488700866, "step": 54330 }, { "epoch": 0.05472678908304552, "grad_norm": 18.20200037305479, "learning_rate": 4.9997252398826946e-05, "loss": 2.7932, "mean_token_accuracy": 0.3827586203813553, "step": 54335 }, { "epoch": 0.054731825136149694, "grad_norm": 12.865533348690079, "learning_rate": 4.99972465398988e-05, "loss": 2.7858, "mean_token_accuracy": 0.37241379618644715, "step": 54340 }, { "epoch": 0.05473686118925387, "grad_norm": 13.1580197447787, "learning_rate": 4.999724067473097e-05, "loss": 2.5233, "mean_token_accuracy": 0.37931033968925476, "step": 54345 }, { "epoch": 0.05474189724235804, "grad_norm": 14.16512295381298, "learning_rate": 4.999723480332344e-05, "loss": 2.5528, "mean_token_accuracy": 0.4241379380226135, "step": 54350 }, { "epoch": 0.054746933295462215, "grad_norm": 12.154094385131897, "learning_rate": 4.999722892567622e-05, "loss": 2.4514, "mean_token_accuracy": 0.4482758641242981, "step": 54355 }, { "epoch": 0.05475196934856639, "grad_norm": 13.671438246452615, "learning_rate": 4.99972230417893e-05, "loss": 2.2815, "mean_token_accuracy": 0.4310344815254211, "step": 54360 }, { "epoch": 0.05475700540167056, "grad_norm": 12.743191833251798, "learning_rate": 4.999721715166271e-05, "loss": 2.7251, "mean_token_accuracy": 0.3896551728248596, "step": 54365 }, { "epoch": 0.05476204145477473, "grad_norm": 10.254736494907352, "learning_rate": 4.9997211255296414e-05, "loss": 2.7059, "mean_token_accuracy": 0.41379310488700866, "step": 54370 }, { "epoch": 0.0547670775078789, "grad_norm": 11.314195982705172, "learning_rate": 4.999720535269045e-05, "loss": 2.5785, "mean_token_accuracy": 0.37241379022598264, "step": 54375 }, { "epoch": 0.05477211356098308, "grad_norm": 10.25062799256725, "learning_rate": 4.9997199443844783e-05, "loss": 2.5568, "mean_token_accuracy": 0.4379310369491577, "step": 54380 }, { "epoch": 0.05477714961408725, "grad_norm": 11.277378520683582, "learning_rate": 4.999719352875945e-05, "loss": 2.4438, "mean_token_accuracy": 0.43103447556495667, "step": 54385 }, { "epoch": 0.054782185667191424, "grad_norm": 9.629413550103756, "learning_rate": 4.9997187607434426e-05, "loss": 2.1146, "mean_token_accuracy": 0.4605565667152405, "step": 54390 }, { "epoch": 0.0547872217202956, "grad_norm": 10.977584866380166, "learning_rate": 4.9997181679869726e-05, "loss": 2.3172, "mean_token_accuracy": 0.43103447556495667, "step": 54395 }, { "epoch": 0.05479225777339977, "grad_norm": 10.66862273991511, "learning_rate": 4.9997175746065354e-05, "loss": 2.2606, "mean_token_accuracy": 0.44482758045196535, "step": 54400 }, { "epoch": 0.05479729382650394, "grad_norm": 16.692960050717513, "learning_rate": 4.99971698060213e-05, "loss": 2.6912, "mean_token_accuracy": 0.3931034505367279, "step": 54405 }, { "epoch": 0.05480232987960811, "grad_norm": 11.945205236638019, "learning_rate": 4.999716385973757e-05, "loss": 2.6448, "mean_token_accuracy": 0.4344827592372894, "step": 54410 }, { "epoch": 0.054807365932712286, "grad_norm": 10.624755482170864, "learning_rate": 4.9997157907214176e-05, "loss": 2.6628, "mean_token_accuracy": 0.41034482717514037, "step": 54415 }, { "epoch": 0.05481240198581646, "grad_norm": 13.99169543678255, "learning_rate": 4.9997151948451105e-05, "loss": 2.3363, "mean_token_accuracy": 0.458620685338974, "step": 54420 }, { "epoch": 0.054817438038920634, "grad_norm": 10.321447297620917, "learning_rate": 4.9997145983448375e-05, "loss": 2.7296, "mean_token_accuracy": 0.3793103456497192, "step": 54425 }, { "epoch": 0.05482247409202481, "grad_norm": 12.560805386784308, "learning_rate": 4.999714001220596e-05, "loss": 2.2502, "mean_token_accuracy": 0.4689655125141144, "step": 54430 }, { "epoch": 0.05482751014512898, "grad_norm": 11.622107835143565, "learning_rate": 4.9997134034723895e-05, "loss": 2.1959, "mean_token_accuracy": 0.46551724076271056, "step": 54435 }, { "epoch": 0.05483254619823315, "grad_norm": 11.088411785650322, "learning_rate": 4.999712805100216e-05, "loss": 2.7162, "mean_token_accuracy": 0.37241379022598264, "step": 54440 }, { "epoch": 0.05483758225133732, "grad_norm": 10.397474582694896, "learning_rate": 4.999712206104076e-05, "loss": 2.6024, "mean_token_accuracy": 0.3379310369491577, "step": 54445 }, { "epoch": 0.054842618304441496, "grad_norm": 11.75477049590005, "learning_rate": 4.99971160648397e-05, "loss": 2.8731, "mean_token_accuracy": 0.38790078163146974, "step": 54450 }, { "epoch": 0.05484765435754567, "grad_norm": 10.903901256255777, "learning_rate": 4.999711006239899e-05, "loss": 2.2529, "mean_token_accuracy": 0.46551724076271056, "step": 54455 }, { "epoch": 0.05485269041064984, "grad_norm": 9.920129346600094, "learning_rate": 4.999710405371861e-05, "loss": 2.262, "mean_token_accuracy": 0.4172413766384125, "step": 54460 }, { "epoch": 0.05485772646375402, "grad_norm": 10.223240500440891, "learning_rate": 4.9997098038798585e-05, "loss": 2.5247, "mean_token_accuracy": 0.38620689511299133, "step": 54465 }, { "epoch": 0.05486276251685819, "grad_norm": 10.59049060922838, "learning_rate": 4.99970920176389e-05, "loss": 2.1664, "mean_token_accuracy": 0.4448275864124298, "step": 54470 }, { "epoch": 0.05486779856996236, "grad_norm": 11.845171339417996, "learning_rate": 4.999708599023956e-05, "loss": 3.2442, "mean_token_accuracy": 0.35862069129943847, "step": 54475 }, { "epoch": 0.05487283462306653, "grad_norm": 11.74041524184736, "learning_rate": 4.9997079956600574e-05, "loss": 2.3933, "mean_token_accuracy": 0.4103448212146759, "step": 54480 }, { "epoch": 0.054877870676170705, "grad_norm": 10.621372110218356, "learning_rate": 4.9997073916721935e-05, "loss": 2.3266, "mean_token_accuracy": 0.4517241299152374, "step": 54485 }, { "epoch": 0.05488290672927488, "grad_norm": 10.814890275601295, "learning_rate": 4.999706787060365e-05, "loss": 2.2058, "mean_token_accuracy": 0.47586207985877993, "step": 54490 }, { "epoch": 0.05488794278237905, "grad_norm": 11.583624713048449, "learning_rate": 4.999706181824573e-05, "loss": 2.0823, "mean_token_accuracy": 0.456745320558548, "step": 54495 }, { "epoch": 0.054892978835483226, "grad_norm": 11.461569199699918, "learning_rate": 4.9997055759648156e-05, "loss": 2.6524, "mean_token_accuracy": 0.3841500341892242, "step": 54500 }, { "epoch": 0.0548980148885874, "grad_norm": 12.291886836126451, "learning_rate": 4.999704969481094e-05, "loss": 2.423, "mean_token_accuracy": 0.43103447556495667, "step": 54505 }, { "epoch": 0.05490305094169157, "grad_norm": 11.139050865953255, "learning_rate": 4.999704362373408e-05, "loss": 2.4622, "mean_token_accuracy": 0.4724137902259827, "step": 54510 }, { "epoch": 0.05490808699479574, "grad_norm": 11.020074725867492, "learning_rate": 4.9997037546417584e-05, "loss": 2.504, "mean_token_accuracy": 0.41379310488700866, "step": 54515 }, { "epoch": 0.054913123047899914, "grad_norm": 11.170791713838243, "learning_rate": 4.9997031462861446e-05, "loss": 2.373, "mean_token_accuracy": 0.43793103098869324, "step": 54520 }, { "epoch": 0.05491815910100409, "grad_norm": 13.68047590302003, "learning_rate": 4.9997025373065685e-05, "loss": 2.471, "mean_token_accuracy": 0.4310344815254211, "step": 54525 }, { "epoch": 0.05492319515410826, "grad_norm": 13.040297655525265, "learning_rate": 4.999701927703028e-05, "loss": 2.6178, "mean_token_accuracy": 0.39310344457626345, "step": 54530 }, { "epoch": 0.054928231207212436, "grad_norm": 10.90090243179482, "learning_rate": 4.999701317475525e-05, "loss": 2.6843, "mean_token_accuracy": 0.4172413766384125, "step": 54535 }, { "epoch": 0.05493326726031661, "grad_norm": 13.825858356836811, "learning_rate": 4.999700706624058e-05, "loss": 2.5872, "mean_token_accuracy": 0.3862068891525269, "step": 54540 }, { "epoch": 0.054938303313420776, "grad_norm": 14.077127569330168, "learning_rate": 4.999700095148629e-05, "loss": 2.4339, "mean_token_accuracy": 0.45862069725990295, "step": 54545 }, { "epoch": 0.05494333936652495, "grad_norm": 10.849000917150184, "learning_rate": 4.9996994830492365e-05, "loss": 2.3763, "mean_token_accuracy": 0.3931034505367279, "step": 54550 }, { "epoch": 0.054948375419629124, "grad_norm": 13.451222479195048, "learning_rate": 4.999698870325882e-05, "loss": 2.6759, "mean_token_accuracy": 0.3620689630508423, "step": 54555 }, { "epoch": 0.0549534114727333, "grad_norm": 9.459283196923552, "learning_rate": 4.9996982569785647e-05, "loss": 2.4764, "mean_token_accuracy": 0.4206896543502808, "step": 54560 }, { "epoch": 0.05495844752583747, "grad_norm": 16.03525928253831, "learning_rate": 4.999697643007286e-05, "loss": 2.8863, "mean_token_accuracy": 0.39655172228813174, "step": 54565 }, { "epoch": 0.054963483578941645, "grad_norm": 13.0083486471545, "learning_rate": 4.9996970284120445e-05, "loss": 2.797, "mean_token_accuracy": 0.341379314661026, "step": 54570 }, { "epoch": 0.05496851963204582, "grad_norm": 10.228530865355939, "learning_rate": 4.9996964131928415e-05, "loss": 2.3807, "mean_token_accuracy": 0.41440887451171876, "step": 54575 }, { "epoch": 0.054973555685149986, "grad_norm": 15.406325236978802, "learning_rate": 4.999695797349677e-05, "loss": 2.4131, "mean_token_accuracy": 0.40689654350280763, "step": 54580 }, { "epoch": 0.05497859173825416, "grad_norm": 11.910937457865513, "learning_rate": 4.99969518088255e-05, "loss": 2.3949, "mean_token_accuracy": 0.46551724672317507, "step": 54585 }, { "epoch": 0.05498362779135833, "grad_norm": 11.703762405300274, "learning_rate": 4.9996945637914626e-05, "loss": 2.8361, "mean_token_accuracy": 0.3620689630508423, "step": 54590 }, { "epoch": 0.05498866384446251, "grad_norm": 12.581152816015233, "learning_rate": 4.999693946076413e-05, "loss": 2.7149, "mean_token_accuracy": 0.38275861740112305, "step": 54595 }, { "epoch": 0.05499369989756668, "grad_norm": 11.198934160804313, "learning_rate": 4.9996933277374036e-05, "loss": 2.682, "mean_token_accuracy": 0.4034482777118683, "step": 54600 }, { "epoch": 0.054998735950670855, "grad_norm": 10.951667333775461, "learning_rate": 4.999692708774433e-05, "loss": 2.5106, "mean_token_accuracy": 0.42068966031074523, "step": 54605 }, { "epoch": 0.05500377200377503, "grad_norm": 12.258298067842022, "learning_rate": 4.999692089187502e-05, "loss": 2.5145, "mean_token_accuracy": 0.4344827651977539, "step": 54610 }, { "epoch": 0.055008808056879195, "grad_norm": 12.57908508116266, "learning_rate": 4.99969146897661e-05, "loss": 2.6359, "mean_token_accuracy": 0.39655172228813174, "step": 54615 }, { "epoch": 0.05501384410998337, "grad_norm": 13.454749275789666, "learning_rate": 4.9996908481417583e-05, "loss": 2.2876, "mean_token_accuracy": 0.4517241358757019, "step": 54620 }, { "epoch": 0.05501888016308754, "grad_norm": 13.591296765081454, "learning_rate": 4.9996902266829454e-05, "loss": 2.6108, "mean_token_accuracy": 0.3999999940395355, "step": 54625 }, { "epoch": 0.055023916216191716, "grad_norm": 12.201824360800561, "learning_rate": 4.9996896046001734e-05, "loss": 2.5995, "mean_token_accuracy": 0.4137930989265442, "step": 54630 }, { "epoch": 0.05502895226929589, "grad_norm": 12.196068308050652, "learning_rate": 4.999688981893441e-05, "loss": 2.9516, "mean_token_accuracy": 0.38777979016304015, "step": 54635 }, { "epoch": 0.055033988322400064, "grad_norm": 9.339111483778993, "learning_rate": 4.999688358562749e-05, "loss": 2.761, "mean_token_accuracy": 0.4012704133987427, "step": 54640 }, { "epoch": 0.05503902437550424, "grad_norm": 10.293807014862155, "learning_rate": 4.9996877346080985e-05, "loss": 2.5875, "mean_token_accuracy": 0.3999999940395355, "step": 54645 }, { "epoch": 0.055044060428608405, "grad_norm": 18.971478335498414, "learning_rate": 4.999687110029487e-05, "loss": 2.67, "mean_token_accuracy": 0.38275861740112305, "step": 54650 }, { "epoch": 0.05504909648171258, "grad_norm": 8.637741971439697, "learning_rate": 4.999686484826918e-05, "loss": 2.4124, "mean_token_accuracy": 0.44273399114608764, "step": 54655 }, { "epoch": 0.05505413253481675, "grad_norm": 11.226614721447172, "learning_rate": 4.999685859000389e-05, "loss": 2.1836, "mean_token_accuracy": 0.4413793087005615, "step": 54660 }, { "epoch": 0.055059168587920926, "grad_norm": 12.386958927601722, "learning_rate": 4.9996852325499025e-05, "loss": 2.5318, "mean_token_accuracy": 0.37241379618644715, "step": 54665 }, { "epoch": 0.0550642046410251, "grad_norm": 9.939035573701915, "learning_rate": 4.999684605475456e-05, "loss": 2.6086, "mean_token_accuracy": 0.4, "step": 54670 }, { "epoch": 0.05506924069412927, "grad_norm": 11.172244204017614, "learning_rate": 4.999683977777052e-05, "loss": 2.6617, "mean_token_accuracy": 0.41379310488700866, "step": 54675 }, { "epoch": 0.05507427674723345, "grad_norm": 10.333740799497116, "learning_rate": 4.9996833494546894e-05, "loss": 2.3853, "mean_token_accuracy": 0.42934059500694277, "step": 54680 }, { "epoch": 0.055079312800337614, "grad_norm": 12.316261347915422, "learning_rate": 4.999682720508369e-05, "loss": 2.5662, "mean_token_accuracy": 0.37241379022598264, "step": 54685 }, { "epoch": 0.05508434885344179, "grad_norm": 16.82090551280434, "learning_rate": 4.9996820909380905e-05, "loss": 2.8953, "mean_token_accuracy": 0.3068965494632721, "step": 54690 }, { "epoch": 0.05508938490654596, "grad_norm": 16.188878317780716, "learning_rate": 4.999681460743854e-05, "loss": 2.7584, "mean_token_accuracy": 0.36551723480224607, "step": 54695 }, { "epoch": 0.055094420959650135, "grad_norm": 15.858139182180416, "learning_rate": 4.99968082992566e-05, "loss": 2.5018, "mean_token_accuracy": 0.41034482717514037, "step": 54700 }, { "epoch": 0.05509945701275431, "grad_norm": 11.342753520822262, "learning_rate": 4.999680198483509e-05, "loss": 2.7252, "mean_token_accuracy": 0.3517241358757019, "step": 54705 }, { "epoch": 0.05510449306585848, "grad_norm": 11.24062554770422, "learning_rate": 4.9996795664174006e-05, "loss": 3.0165, "mean_token_accuracy": 0.3931034505367279, "step": 54710 }, { "epoch": 0.05510952911896266, "grad_norm": 12.371929176551218, "learning_rate": 4.999678933727335e-05, "loss": 2.7066, "mean_token_accuracy": 0.42413793206214906, "step": 54715 }, { "epoch": 0.05511456517206682, "grad_norm": 11.723007719062226, "learning_rate": 4.999678300413313e-05, "loss": 2.3224, "mean_token_accuracy": 0.46206897497177124, "step": 54720 }, { "epoch": 0.055119601225171, "grad_norm": 10.453787645205768, "learning_rate": 4.999677666475335e-05, "loss": 2.2804, "mean_token_accuracy": 0.42413792610168455, "step": 54725 }, { "epoch": 0.05512463727827517, "grad_norm": 13.661984922244748, "learning_rate": 4.999677031913398e-05, "loss": 2.7923, "mean_token_accuracy": 0.39310344457626345, "step": 54730 }, { "epoch": 0.055129673331379345, "grad_norm": 9.843513305710964, "learning_rate": 4.9996763967275076e-05, "loss": 2.668, "mean_token_accuracy": 0.4413793087005615, "step": 54735 }, { "epoch": 0.05513470938448352, "grad_norm": 9.839692307253982, "learning_rate": 4.999675760917659e-05, "loss": 2.3811, "mean_token_accuracy": 0.4034482777118683, "step": 54740 }, { "epoch": 0.05513974543758769, "grad_norm": 13.187591163422763, "learning_rate": 4.9996751244838554e-05, "loss": 2.4789, "mean_token_accuracy": 0.4172413766384125, "step": 54745 }, { "epoch": 0.055144781490691866, "grad_norm": 16.90574741904902, "learning_rate": 4.999674487426096e-05, "loss": 2.9113, "mean_token_accuracy": 0.3655172407627106, "step": 54750 }, { "epoch": 0.05514981754379603, "grad_norm": 13.186336481179007, "learning_rate": 4.999673849744381e-05, "loss": 2.5756, "mean_token_accuracy": 0.37241379618644715, "step": 54755 }, { "epoch": 0.05515485359690021, "grad_norm": 14.696653946080515, "learning_rate": 4.99967321143871e-05, "loss": 2.5788, "mean_token_accuracy": 0.41379310488700866, "step": 54760 }, { "epoch": 0.05515988965000438, "grad_norm": 10.572708750101505, "learning_rate": 4.999672572509084e-05, "loss": 2.4267, "mean_token_accuracy": 0.44331517815589905, "step": 54765 }, { "epoch": 0.055164925703108554, "grad_norm": 13.739411337779165, "learning_rate": 4.999671932955503e-05, "loss": 2.4821, "mean_token_accuracy": 0.4137930989265442, "step": 54770 }, { "epoch": 0.05516996175621273, "grad_norm": 24.99825366576518, "learning_rate": 4.999671292777968e-05, "loss": 2.5, "mean_token_accuracy": 0.4068965494632721, "step": 54775 }, { "epoch": 0.0551749978093169, "grad_norm": 16.297228687295462, "learning_rate": 4.999670651976477e-05, "loss": 2.8089, "mean_token_accuracy": 0.4, "step": 54780 }, { "epoch": 0.055180033862421075, "grad_norm": 14.188861622743655, "learning_rate": 4.9996700105510315e-05, "loss": 2.4683, "mean_token_accuracy": 0.41379310488700866, "step": 54785 }, { "epoch": 0.05518506991552524, "grad_norm": 13.417367038458002, "learning_rate": 4.999669368501633e-05, "loss": 2.3871, "mean_token_accuracy": 0.4379310369491577, "step": 54790 }, { "epoch": 0.055190105968629416, "grad_norm": 11.90027213645835, "learning_rate": 4.999668725828278e-05, "loss": 2.4416, "mean_token_accuracy": 0.4137930929660797, "step": 54795 }, { "epoch": 0.05519514202173359, "grad_norm": 11.830590540658173, "learning_rate": 4.9996680825309704e-05, "loss": 2.5619, "mean_token_accuracy": 0.38965516686439516, "step": 54800 }, { "epoch": 0.055200178074837764, "grad_norm": 13.40633330215915, "learning_rate": 4.999667438609709e-05, "loss": 2.6303, "mean_token_accuracy": 0.42758620977401735, "step": 54805 }, { "epoch": 0.05520521412794194, "grad_norm": 11.422063169877035, "learning_rate": 4.999666794064494e-05, "loss": 2.6097, "mean_token_accuracy": 0.3793103516101837, "step": 54810 }, { "epoch": 0.05521025018104611, "grad_norm": 10.719332551447245, "learning_rate": 4.999666148895325e-05, "loss": 2.1441, "mean_token_accuracy": 0.44827585816383364, "step": 54815 }, { "epoch": 0.055215286234150285, "grad_norm": 14.500468249219592, "learning_rate": 4.9996655031022025e-05, "loss": 2.6859, "mean_token_accuracy": 0.37241379022598264, "step": 54820 }, { "epoch": 0.05522032228725445, "grad_norm": 12.90705665187925, "learning_rate": 4.999664856685128e-05, "loss": 2.7797, "mean_token_accuracy": 0.4, "step": 54825 }, { "epoch": 0.055225358340358625, "grad_norm": 11.555240877864215, "learning_rate": 4.9996642096441006e-05, "loss": 2.4373, "mean_token_accuracy": 0.4379310369491577, "step": 54830 }, { "epoch": 0.0552303943934628, "grad_norm": 11.067382229684112, "learning_rate": 4.9996635619791196e-05, "loss": 2.1078, "mean_token_accuracy": 0.4931034505367279, "step": 54835 }, { "epoch": 0.05523543044656697, "grad_norm": 9.955400835859779, "learning_rate": 4.9996629136901864e-05, "loss": 2.2166, "mean_token_accuracy": 0.4517241299152374, "step": 54840 }, { "epoch": 0.05524046649967115, "grad_norm": 13.587307787873684, "learning_rate": 4.9996622647772995e-05, "loss": 2.34, "mean_token_accuracy": 0.38275861740112305, "step": 54845 }, { "epoch": 0.05524550255277532, "grad_norm": 13.831716374193146, "learning_rate": 4.999661615240462e-05, "loss": 2.8683, "mean_token_accuracy": 0.33448276221752166, "step": 54850 }, { "epoch": 0.055250538605879494, "grad_norm": 18.409390174585916, "learning_rate": 4.9996609650796724e-05, "loss": 2.8376, "mean_token_accuracy": 0.3669691503047943, "step": 54855 }, { "epoch": 0.05525557465898366, "grad_norm": 14.096979531387243, "learning_rate": 4.99966031429493e-05, "loss": 2.6939, "mean_token_accuracy": 0.3448275804519653, "step": 54860 }, { "epoch": 0.055260610712087835, "grad_norm": 12.132522164587572, "learning_rate": 4.9996596628862363e-05, "loss": 2.4797, "mean_token_accuracy": 0.4434361755847931, "step": 54865 }, { "epoch": 0.05526564676519201, "grad_norm": 13.113101118002294, "learning_rate": 4.999659010853591e-05, "loss": 2.3233, "mean_token_accuracy": 0.4793103575706482, "step": 54870 }, { "epoch": 0.05527068281829618, "grad_norm": 12.160092444708887, "learning_rate": 4.9996583581969946e-05, "loss": 2.68, "mean_token_accuracy": 0.36896551251411436, "step": 54875 }, { "epoch": 0.055275718871400356, "grad_norm": 11.922595660041557, "learning_rate": 4.9996577049164474e-05, "loss": 2.5988, "mean_token_accuracy": 0.39310345649719236, "step": 54880 }, { "epoch": 0.05528075492450453, "grad_norm": 12.816866636640722, "learning_rate": 4.999657051011949e-05, "loss": 2.8605, "mean_token_accuracy": 0.36896551847457887, "step": 54885 }, { "epoch": 0.055285790977608704, "grad_norm": 13.323420661861707, "learning_rate": 4.9996563964835e-05, "loss": 2.4045, "mean_token_accuracy": 0.41034482419490814, "step": 54890 }, { "epoch": 0.05529082703071287, "grad_norm": 13.41144168505522, "learning_rate": 4.9996557413311e-05, "loss": 2.39, "mean_token_accuracy": 0.41034482717514037, "step": 54895 }, { "epoch": 0.055295863083817044, "grad_norm": 13.579021989642717, "learning_rate": 4.999655085554749e-05, "loss": 2.6607, "mean_token_accuracy": 0.3931034505367279, "step": 54900 }, { "epoch": 0.05530089913692122, "grad_norm": 11.506504825942018, "learning_rate": 4.999654429154449e-05, "loss": 2.7671, "mean_token_accuracy": 0.38620690405368807, "step": 54905 }, { "epoch": 0.05530593519002539, "grad_norm": 12.614480926633107, "learning_rate": 4.999653772130198e-05, "loss": 2.8117, "mean_token_accuracy": 0.3482758641242981, "step": 54910 }, { "epoch": 0.055310971243129566, "grad_norm": 18.655108904377578, "learning_rate": 4.9996531144819974e-05, "loss": 2.6272, "mean_token_accuracy": 0.4310344815254211, "step": 54915 }, { "epoch": 0.05531600729623374, "grad_norm": 16.31915096539104, "learning_rate": 4.999652456209847e-05, "loss": 2.2107, "mean_token_accuracy": 0.47241380214691164, "step": 54920 }, { "epoch": 0.05532104334933791, "grad_norm": 12.164337612431556, "learning_rate": 4.999651797313748e-05, "loss": 2.4467, "mean_token_accuracy": 0.41034482717514037, "step": 54925 }, { "epoch": 0.05532607940244208, "grad_norm": 11.09874962605666, "learning_rate": 4.999651137793699e-05, "loss": 2.5066, "mean_token_accuracy": 0.4137930989265442, "step": 54930 }, { "epoch": 0.055331115455546254, "grad_norm": 11.873227816740608, "learning_rate": 4.9996504776497e-05, "loss": 2.4948, "mean_token_accuracy": 0.4206896543502808, "step": 54935 }, { "epoch": 0.05533615150865043, "grad_norm": 9.231988415391495, "learning_rate": 4.999649816881754e-05, "loss": 2.5976, "mean_token_accuracy": 0.3862068861722946, "step": 54940 }, { "epoch": 0.0553411875617546, "grad_norm": 10.243498909869446, "learning_rate": 4.9996491554898585e-05, "loss": 2.3952, "mean_token_accuracy": 0.4486453115940094, "step": 54945 }, { "epoch": 0.055346223614858775, "grad_norm": 13.308800715201759, "learning_rate": 4.999648493474014e-05, "loss": 2.4796, "mean_token_accuracy": 0.4, "step": 54950 }, { "epoch": 0.05535125966796295, "grad_norm": 12.46458750927132, "learning_rate": 4.999647830834221e-05, "loss": 2.7676, "mean_token_accuracy": 0.4206896543502808, "step": 54955 }, { "epoch": 0.05535629572106712, "grad_norm": 12.439676291250857, "learning_rate": 4.9996471675704795e-05, "loss": 2.6972, "mean_token_accuracy": 0.43793103098869324, "step": 54960 }, { "epoch": 0.05536133177417129, "grad_norm": 15.166152034644938, "learning_rate": 4.9996465036827914e-05, "loss": 2.8801, "mean_token_accuracy": 0.341379314661026, "step": 54965 }, { "epoch": 0.05536636782727546, "grad_norm": 12.126232198079368, "learning_rate": 4.999645839171154e-05, "loss": 2.5715, "mean_token_accuracy": 0.37241379022598264, "step": 54970 }, { "epoch": 0.05537140388037964, "grad_norm": 20.05079463876575, "learning_rate": 4.9996451740355694e-05, "loss": 2.8157, "mean_token_accuracy": 0.40689654350280763, "step": 54975 }, { "epoch": 0.05537643993348381, "grad_norm": 15.326401851155078, "learning_rate": 4.9996445082760375e-05, "loss": 2.4846, "mean_token_accuracy": 0.391288560628891, "step": 54980 }, { "epoch": 0.055381475986587984, "grad_norm": 15.116986372941707, "learning_rate": 4.999643841892558e-05, "loss": 2.6962, "mean_token_accuracy": 0.4325468838214874, "step": 54985 }, { "epoch": 0.05538651203969216, "grad_norm": 11.004673847189965, "learning_rate": 4.999643174885132e-05, "loss": 2.357, "mean_token_accuracy": 0.4275862008333206, "step": 54990 }, { "epoch": 0.05539154809279633, "grad_norm": 13.684405634514107, "learning_rate": 4.999642507253759e-05, "loss": 2.4922, "mean_token_accuracy": 0.41034482717514037, "step": 54995 }, { "epoch": 0.0553965841459005, "grad_norm": 13.130869878476563, "learning_rate": 4.999641838998438e-05, "loss": 2.3238, "mean_token_accuracy": 0.458620685338974, "step": 55000 }, { "epoch": 0.05540162019900467, "grad_norm": 14.063261380293753, "learning_rate": 4.9996411701191714e-05, "loss": 2.467, "mean_token_accuracy": 0.3965517163276672, "step": 55005 }, { "epoch": 0.055406656252108846, "grad_norm": 12.096044865196449, "learning_rate": 4.9996405006159585e-05, "loss": 2.7491, "mean_token_accuracy": 0.38620689511299133, "step": 55010 }, { "epoch": 0.05541169230521302, "grad_norm": 14.141718681045582, "learning_rate": 4.999639830488799e-05, "loss": 2.956, "mean_token_accuracy": 0.38275861740112305, "step": 55015 }, { "epoch": 0.055416728358317194, "grad_norm": 11.175247956516836, "learning_rate": 4.999639159737694e-05, "loss": 2.4365, "mean_token_accuracy": 0.38620689511299133, "step": 55020 }, { "epoch": 0.05542176441142137, "grad_norm": 14.315991564166548, "learning_rate": 4.999638488362642e-05, "loss": 2.9045, "mean_token_accuracy": 0.3862069010734558, "step": 55025 }, { "epoch": 0.05542680046452554, "grad_norm": 9.565224112753468, "learning_rate": 4.999637816363646e-05, "loss": 2.6469, "mean_token_accuracy": 0.4186932861804962, "step": 55030 }, { "epoch": 0.05543183651762971, "grad_norm": 11.566151668107477, "learning_rate": 4.999637143740704e-05, "loss": 3.2032, "mean_token_accuracy": 0.33793103992938994, "step": 55035 }, { "epoch": 0.05543687257073388, "grad_norm": 10.812055382251499, "learning_rate": 4.9996364704938166e-05, "loss": 2.6318, "mean_token_accuracy": 0.4068965494632721, "step": 55040 }, { "epoch": 0.055441908623838056, "grad_norm": 12.330865894265942, "learning_rate": 4.9996357966229846e-05, "loss": 2.4412, "mean_token_accuracy": 0.4172413766384125, "step": 55045 }, { "epoch": 0.05544694467694223, "grad_norm": 10.842066863927242, "learning_rate": 4.9996351221282066e-05, "loss": 2.5264, "mean_token_accuracy": 0.39310344457626345, "step": 55050 }, { "epoch": 0.0554519807300464, "grad_norm": 9.823117900966967, "learning_rate": 4.9996344470094844e-05, "loss": 2.4135, "mean_token_accuracy": 0.43793103098869324, "step": 55055 }, { "epoch": 0.05545701678315058, "grad_norm": 13.950908591630437, "learning_rate": 4.999633771266818e-05, "loss": 2.5919, "mean_token_accuracy": 0.4172413766384125, "step": 55060 }, { "epoch": 0.05546205283625475, "grad_norm": 12.823019800805229, "learning_rate": 4.999633094900208e-05, "loss": 2.5533, "mean_token_accuracy": 0.42274652123451234, "step": 55065 }, { "epoch": 0.05546708888935892, "grad_norm": 10.38702277510885, "learning_rate": 4.9996324179096524e-05, "loss": 2.381, "mean_token_accuracy": 0.3931034505367279, "step": 55070 }, { "epoch": 0.05547212494246309, "grad_norm": 10.821313330688637, "learning_rate": 4.999631740295154e-05, "loss": 2.4651, "mean_token_accuracy": 0.39310344457626345, "step": 55075 }, { "epoch": 0.055477160995567265, "grad_norm": 11.114194031609195, "learning_rate": 4.999631062056712e-05, "loss": 2.7361, "mean_token_accuracy": 0.37586206793785093, "step": 55080 }, { "epoch": 0.05548219704867144, "grad_norm": 10.485797390707267, "learning_rate": 4.9996303831943254e-05, "loss": 2.7935, "mean_token_accuracy": 0.358620685338974, "step": 55085 }, { "epoch": 0.05548723310177561, "grad_norm": 12.536233359062974, "learning_rate": 4.999629703707995e-05, "loss": 2.4863, "mean_token_accuracy": 0.4137930989265442, "step": 55090 }, { "epoch": 0.055492269154879786, "grad_norm": 12.769839693722233, "learning_rate": 4.999629023597723e-05, "loss": 2.4737, "mean_token_accuracy": 0.42068966031074523, "step": 55095 }, { "epoch": 0.05549730520798396, "grad_norm": 9.258883644066165, "learning_rate": 4.999628342863507e-05, "loss": 2.6758, "mean_token_accuracy": 0.36757410764694215, "step": 55100 }, { "epoch": 0.05550234126108813, "grad_norm": 12.27060918185047, "learning_rate": 4.9996276615053475e-05, "loss": 2.6892, "mean_token_accuracy": 0.4068965494632721, "step": 55105 }, { "epoch": 0.0555073773141923, "grad_norm": 14.654778470302091, "learning_rate": 4.999626979523247e-05, "loss": 2.8632, "mean_token_accuracy": 0.39310344457626345, "step": 55110 }, { "epoch": 0.055512413367296475, "grad_norm": 11.163513667086654, "learning_rate": 4.999626296917204e-05, "loss": 2.4608, "mean_token_accuracy": 0.4137930989265442, "step": 55115 }, { "epoch": 0.05551744942040065, "grad_norm": 14.131974380245504, "learning_rate": 4.999625613687217e-05, "loss": 2.8842, "mean_token_accuracy": 0.379310342669487, "step": 55120 }, { "epoch": 0.05552248547350482, "grad_norm": 12.94364310046765, "learning_rate": 4.999624929833289e-05, "loss": 3.0354, "mean_token_accuracy": 0.3517241388559341, "step": 55125 }, { "epoch": 0.055527521526608996, "grad_norm": 11.806762270520412, "learning_rate": 4.9996242453554196e-05, "loss": 2.4638, "mean_token_accuracy": 0.417241370677948, "step": 55130 }, { "epoch": 0.05553255757971317, "grad_norm": 16.698416340214298, "learning_rate": 4.999623560253608e-05, "loss": 2.7585, "mean_token_accuracy": 0.3827586233615875, "step": 55135 }, { "epoch": 0.055537593632817336, "grad_norm": 10.985591156033887, "learning_rate": 4.999622874527855e-05, "loss": 2.5439, "mean_token_accuracy": 0.4620689630508423, "step": 55140 }, { "epoch": 0.05554262968592151, "grad_norm": 9.43855972912326, "learning_rate": 4.9996221881781605e-05, "loss": 2.3659, "mean_token_accuracy": 0.44652147889137267, "step": 55145 }, { "epoch": 0.055547665739025684, "grad_norm": 16.989438754881242, "learning_rate": 4.999621501204526e-05, "loss": 2.5003, "mean_token_accuracy": 0.44827585816383364, "step": 55150 }, { "epoch": 0.05555270179212986, "grad_norm": 14.04569726459809, "learning_rate": 4.999620813606948e-05, "loss": 2.23, "mean_token_accuracy": 0.4413793087005615, "step": 55155 }, { "epoch": 0.05555773784523403, "grad_norm": 13.265368926279924, "learning_rate": 4.999620125385432e-05, "loss": 3.3609, "mean_token_accuracy": 0.3551724076271057, "step": 55160 }, { "epoch": 0.055562773898338205, "grad_norm": 14.550531278267712, "learning_rate": 4.999619436539974e-05, "loss": 2.8696, "mean_token_accuracy": 0.3931034505367279, "step": 55165 }, { "epoch": 0.05556780995144238, "grad_norm": 11.762144796235553, "learning_rate": 4.999618747070576e-05, "loss": 2.6729, "mean_token_accuracy": 0.36896551251411436, "step": 55170 }, { "epoch": 0.055572846004546546, "grad_norm": 11.219606238524557, "learning_rate": 4.9996180569772385e-05, "loss": 2.5713, "mean_token_accuracy": 0.3703569293022156, "step": 55175 }, { "epoch": 0.05557788205765072, "grad_norm": 14.27059387550376, "learning_rate": 4.99961736625996e-05, "loss": 2.7238, "mean_token_accuracy": 0.37586206793785093, "step": 55180 }, { "epoch": 0.05558291811075489, "grad_norm": 11.34166498933833, "learning_rate": 4.999616674918742e-05, "loss": 2.5613, "mean_token_accuracy": 0.34827586114406583, "step": 55185 }, { "epoch": 0.05558795416385907, "grad_norm": 12.222003998124842, "learning_rate": 4.999615982953586e-05, "loss": 2.3422, "mean_token_accuracy": 0.42758620381355283, "step": 55190 }, { "epoch": 0.05559299021696324, "grad_norm": 12.704621209609298, "learning_rate": 4.9996152903644885e-05, "loss": 2.7622, "mean_token_accuracy": 0.3620689630508423, "step": 55195 }, { "epoch": 0.055598026270067415, "grad_norm": 10.818785098566572, "learning_rate": 4.9996145971514526e-05, "loss": 2.4925, "mean_token_accuracy": 0.4103448152542114, "step": 55200 }, { "epoch": 0.05560306232317159, "grad_norm": 14.540562821421046, "learning_rate": 4.999613903314477e-05, "loss": 2.8531, "mean_token_accuracy": 0.3068965464830399, "step": 55205 }, { "epoch": 0.055608098376275755, "grad_norm": 9.694080516003845, "learning_rate": 4.999613208853563e-05, "loss": 2.7534, "mean_token_accuracy": 0.40689654350280763, "step": 55210 }, { "epoch": 0.05561313442937993, "grad_norm": 11.99486416886706, "learning_rate": 4.999612513768711e-05, "loss": 2.2911, "mean_token_accuracy": 0.482758617401123, "step": 55215 }, { "epoch": 0.0556181704824841, "grad_norm": 12.69469958096461, "learning_rate": 4.9996118180599205e-05, "loss": 2.9784, "mean_token_accuracy": 0.29655172526836393, "step": 55220 }, { "epoch": 0.05562320653558828, "grad_norm": 13.064659655759353, "learning_rate": 4.999611121727191e-05, "loss": 2.7467, "mean_token_accuracy": 0.37241379022598264, "step": 55225 }, { "epoch": 0.05562824258869245, "grad_norm": 11.095888516987154, "learning_rate": 4.999610424770524e-05, "loss": 2.3222, "mean_token_accuracy": 0.4172413766384125, "step": 55230 }, { "epoch": 0.055633278641796624, "grad_norm": 11.891335015197479, "learning_rate": 4.999609727189919e-05, "loss": 2.4741, "mean_token_accuracy": 0.3862068921327591, "step": 55235 }, { "epoch": 0.0556383146949008, "grad_norm": 12.038023638940437, "learning_rate": 4.9996090289853766e-05, "loss": 2.6813, "mean_token_accuracy": 0.41034482717514037, "step": 55240 }, { "epoch": 0.055643350748004965, "grad_norm": 10.037299761142773, "learning_rate": 4.999608330156897e-05, "loss": 2.3029, "mean_token_accuracy": 0.4655172348022461, "step": 55245 }, { "epoch": 0.05564838680110914, "grad_norm": 11.698946265773344, "learning_rate": 4.999607630704479e-05, "loss": 2.4444, "mean_token_accuracy": 0.4034482717514038, "step": 55250 }, { "epoch": 0.05565342285421331, "grad_norm": 11.989167923008816, "learning_rate": 4.999606930628125e-05, "loss": 2.5002, "mean_token_accuracy": 0.3551724165678024, "step": 55255 }, { "epoch": 0.055658458907317486, "grad_norm": 11.92598926128034, "learning_rate": 4.999606229927834e-05, "loss": 2.6421, "mean_token_accuracy": 0.37931033968925476, "step": 55260 }, { "epoch": 0.05566349496042166, "grad_norm": 9.733177344807933, "learning_rate": 4.999605528603606e-05, "loss": 2.9483, "mean_token_accuracy": 0.3551724135875702, "step": 55265 }, { "epoch": 0.055668531013525833, "grad_norm": 12.984678228421926, "learning_rate": 4.9996048266554416e-05, "loss": 2.5744, "mean_token_accuracy": 0.4034482777118683, "step": 55270 }, { "epoch": 0.05567356706663001, "grad_norm": 11.436758068912521, "learning_rate": 4.999604124083341e-05, "loss": 3.0646, "mean_token_accuracy": 0.3724137842655182, "step": 55275 }, { "epoch": 0.055678603119734174, "grad_norm": 11.825761933554366, "learning_rate": 4.999603420887304e-05, "loss": 2.6151, "mean_token_accuracy": 0.4206896543502808, "step": 55280 }, { "epoch": 0.05568363917283835, "grad_norm": 12.182385662508128, "learning_rate": 4.999602717067332e-05, "loss": 2.3252, "mean_token_accuracy": 0.46551724076271056, "step": 55285 }, { "epoch": 0.05568867522594252, "grad_norm": 13.670035836345328, "learning_rate": 4.999602012623423e-05, "loss": 2.7682, "mean_token_accuracy": 0.3793103456497192, "step": 55290 }, { "epoch": 0.055693711279046695, "grad_norm": 11.172175710103827, "learning_rate": 4.999601307555579e-05, "loss": 2.2023, "mean_token_accuracy": 0.44827585816383364, "step": 55295 }, { "epoch": 0.05569874733215087, "grad_norm": 22.686569157079127, "learning_rate": 4.9996006018637995e-05, "loss": 2.7921, "mean_token_accuracy": 0.3413793116807938, "step": 55300 }, { "epoch": 0.05570378338525504, "grad_norm": 12.437272474566399, "learning_rate": 4.9995998955480854e-05, "loss": 2.5276, "mean_token_accuracy": 0.37586207389831544, "step": 55305 }, { "epoch": 0.05570881943835922, "grad_norm": 12.45842858314631, "learning_rate": 4.999599188608437e-05, "loss": 2.529, "mean_token_accuracy": 0.41379310488700866, "step": 55310 }, { "epoch": 0.055713855491463384, "grad_norm": 11.484855808485914, "learning_rate": 4.999598481044853e-05, "loss": 2.8285, "mean_token_accuracy": 0.38275861740112305, "step": 55315 }, { "epoch": 0.05571889154456756, "grad_norm": 10.70018745369348, "learning_rate": 4.999597772857334e-05, "loss": 2.78, "mean_token_accuracy": 0.3379310339689255, "step": 55320 }, { "epoch": 0.05572392759767173, "grad_norm": 10.836576268570814, "learning_rate": 4.999597064045881e-05, "loss": 2.383, "mean_token_accuracy": 0.4310344815254211, "step": 55325 }, { "epoch": 0.055728963650775905, "grad_norm": 8.854809769308766, "learning_rate": 4.999596354610495e-05, "loss": 2.8599, "mean_token_accuracy": 0.3793103456497192, "step": 55330 }, { "epoch": 0.05573399970388008, "grad_norm": 11.066354031969151, "learning_rate": 4.999595644551173e-05, "loss": 2.5095, "mean_token_accuracy": 0.41379310488700866, "step": 55335 }, { "epoch": 0.05573903575698425, "grad_norm": 12.040554830990093, "learning_rate": 4.9995949338679186e-05, "loss": 2.742, "mean_token_accuracy": 0.40344828367233276, "step": 55340 }, { "epoch": 0.055744071810088426, "grad_norm": 19.297636080693888, "learning_rate": 4.99959422256073e-05, "loss": 3.1752, "mean_token_accuracy": 0.31379309594631194, "step": 55345 }, { "epoch": 0.05574910786319259, "grad_norm": 11.888817621477607, "learning_rate": 4.9995935106296085e-05, "loss": 2.5193, "mean_token_accuracy": 0.39310344457626345, "step": 55350 }, { "epoch": 0.05575414391629677, "grad_norm": 10.485442336677337, "learning_rate": 4.9995927980745535e-05, "loss": 2.427, "mean_token_accuracy": 0.43448275327682495, "step": 55355 }, { "epoch": 0.05575917996940094, "grad_norm": 11.66947506324508, "learning_rate": 4.9995920848955666e-05, "loss": 2.5891, "mean_token_accuracy": 0.38620689511299133, "step": 55360 }, { "epoch": 0.055764216022505114, "grad_norm": 9.83141530421554, "learning_rate": 4.9995913710926464e-05, "loss": 2.2626, "mean_token_accuracy": 0.4448275864124298, "step": 55365 }, { "epoch": 0.05576925207560929, "grad_norm": 11.424608987189329, "learning_rate": 4.999590656665793e-05, "loss": 2.7924, "mean_token_accuracy": 0.35862069129943847, "step": 55370 }, { "epoch": 0.05577428812871346, "grad_norm": 10.748982709331269, "learning_rate": 4.9995899416150076e-05, "loss": 2.8064, "mean_token_accuracy": 0.3620689630508423, "step": 55375 }, { "epoch": 0.055779324181817636, "grad_norm": 12.826696067805928, "learning_rate": 4.99958922594029e-05, "loss": 2.7681, "mean_token_accuracy": 0.3674531102180481, "step": 55380 }, { "epoch": 0.0557843602349218, "grad_norm": 15.455034190302108, "learning_rate": 4.9995885096416404e-05, "loss": 2.5121, "mean_token_accuracy": 0.43448275327682495, "step": 55385 }, { "epoch": 0.055789396288025976, "grad_norm": 13.671120657076818, "learning_rate": 4.999587792719059e-05, "loss": 2.7879, "mean_token_accuracy": 0.358620685338974, "step": 55390 }, { "epoch": 0.05579443234113015, "grad_norm": 12.232140129938331, "learning_rate": 4.999587075172546e-05, "loss": 2.5779, "mean_token_accuracy": 0.4068965554237366, "step": 55395 }, { "epoch": 0.055799468394234324, "grad_norm": 11.820937504017776, "learning_rate": 4.9995863570021026e-05, "loss": 2.306, "mean_token_accuracy": 0.44827585220336913, "step": 55400 }, { "epoch": 0.0558045044473385, "grad_norm": 13.893171730636613, "learning_rate": 4.999585638207727e-05, "loss": 2.3531, "mean_token_accuracy": 0.43103448748588563, "step": 55405 }, { "epoch": 0.05580954050044267, "grad_norm": 24.812036864556504, "learning_rate": 4.99958491878942e-05, "loss": 2.469, "mean_token_accuracy": 0.4881773412227631, "step": 55410 }, { "epoch": 0.055814576553546845, "grad_norm": 13.471408796672502, "learning_rate": 4.9995841987471835e-05, "loss": 2.6876, "mean_token_accuracy": 0.4103448331356049, "step": 55415 }, { "epoch": 0.05581961260665101, "grad_norm": 15.37994208949692, "learning_rate": 4.9995834780810156e-05, "loss": 2.2825, "mean_token_accuracy": 0.46551724076271056, "step": 55420 }, { "epoch": 0.055824648659755186, "grad_norm": 14.850270434701018, "learning_rate": 4.9995827567909185e-05, "loss": 2.3724, "mean_token_accuracy": 0.41379310488700866, "step": 55425 }, { "epoch": 0.05582968471285936, "grad_norm": 10.873612191458651, "learning_rate": 4.9995820348768894e-05, "loss": 2.5251, "mean_token_accuracy": 0.41379311084747317, "step": 55430 }, { "epoch": 0.05583472076596353, "grad_norm": 10.343179244525905, "learning_rate": 4.999581312338931e-05, "loss": 2.365, "mean_token_accuracy": 0.44827587008476255, "step": 55435 }, { "epoch": 0.05583975681906771, "grad_norm": 14.56217552975939, "learning_rate": 4.9995805891770436e-05, "loss": 2.7866, "mean_token_accuracy": 0.3517241358757019, "step": 55440 }, { "epoch": 0.05584479287217188, "grad_norm": 12.006735489290199, "learning_rate": 4.999579865391226e-05, "loss": 2.4464, "mean_token_accuracy": 0.4109605967998505, "step": 55445 }, { "epoch": 0.055849828925276054, "grad_norm": 11.601561176441633, "learning_rate": 4.999579140981479e-05, "loss": 2.7549, "mean_token_accuracy": 0.3793103456497192, "step": 55450 }, { "epoch": 0.05585486497838022, "grad_norm": 13.315224014545223, "learning_rate": 4.999578415947803e-05, "loss": 2.5823, "mean_token_accuracy": 0.37586206793785093, "step": 55455 }, { "epoch": 0.055859901031484395, "grad_norm": 14.807845251258229, "learning_rate": 4.999577690290198e-05, "loss": 2.7736, "mean_token_accuracy": 0.4000000059604645, "step": 55460 }, { "epoch": 0.05586493708458857, "grad_norm": 10.425430771972495, "learning_rate": 4.9995769640086645e-05, "loss": 2.5801, "mean_token_accuracy": 0.37931033968925476, "step": 55465 }, { "epoch": 0.05586997313769274, "grad_norm": 12.085998532704712, "learning_rate": 4.9995762371032025e-05, "loss": 2.6439, "mean_token_accuracy": 0.3793103456497192, "step": 55470 }, { "epoch": 0.055875009190796916, "grad_norm": 10.901559273113412, "learning_rate": 4.999575509573812e-05, "loss": 2.3454, "mean_token_accuracy": 0.44827585816383364, "step": 55475 }, { "epoch": 0.05588004524390109, "grad_norm": 13.592038832710685, "learning_rate": 4.999574781420493e-05, "loss": 3.045, "mean_token_accuracy": 0.32068965435028074, "step": 55480 }, { "epoch": 0.055885081297005264, "grad_norm": 13.372528576934323, "learning_rate": 4.9995740526432456e-05, "loss": 2.6479, "mean_token_accuracy": 0.3931034505367279, "step": 55485 }, { "epoch": 0.05589011735010943, "grad_norm": 19.076839293726408, "learning_rate": 4.999573323242071e-05, "loss": 3.2535, "mean_token_accuracy": 0.320689657330513, "step": 55490 }, { "epoch": 0.055895153403213604, "grad_norm": 17.44462576839078, "learning_rate": 4.999572593216969e-05, "loss": 2.7354, "mean_token_accuracy": 0.3950393259525299, "step": 55495 }, { "epoch": 0.05590018945631778, "grad_norm": 12.35275274178227, "learning_rate": 4.9995718625679395e-05, "loss": 2.8131, "mean_token_accuracy": 0.33793103098869326, "step": 55500 }, { "epoch": 0.05590522550942195, "grad_norm": 11.678609786300013, "learning_rate": 4.999571131294983e-05, "loss": 2.5529, "mean_token_accuracy": 0.4172413766384125, "step": 55505 }, { "epoch": 0.055910261562526126, "grad_norm": 9.885349932085091, "learning_rate": 4.9995703993980995e-05, "loss": 2.4263, "mean_token_accuracy": 0.4291590929031372, "step": 55510 }, { "epoch": 0.0559152976156303, "grad_norm": 10.978215761057383, "learning_rate": 4.999569666877289e-05, "loss": 2.3606, "mean_token_accuracy": 0.3999999940395355, "step": 55515 }, { "epoch": 0.05592033366873447, "grad_norm": 11.804899225600598, "learning_rate": 4.999568933732552e-05, "loss": 2.4634, "mean_token_accuracy": 0.38620689511299133, "step": 55520 }, { "epoch": 0.05592536972183864, "grad_norm": 14.5273973854347, "learning_rate": 4.99956819996389e-05, "loss": 2.8908, "mean_token_accuracy": 0.3827586114406586, "step": 55525 }, { "epoch": 0.055930405774942814, "grad_norm": 11.646108210631468, "learning_rate": 4.9995674655712994e-05, "loss": 2.7042, "mean_token_accuracy": 0.3482758581638336, "step": 55530 }, { "epoch": 0.05593544182804699, "grad_norm": 12.103754202466247, "learning_rate": 4.9995667305547847e-05, "loss": 2.411, "mean_token_accuracy": 0.4034482717514038, "step": 55535 }, { "epoch": 0.05594047788115116, "grad_norm": 14.91880903910126, "learning_rate": 4.9995659949143434e-05, "loss": 2.7799, "mean_token_accuracy": 0.36654567122459414, "step": 55540 }, { "epoch": 0.055945513934255335, "grad_norm": 11.824224370379714, "learning_rate": 4.999565258649977e-05, "loss": 2.8428, "mean_token_accuracy": 0.39655172228813174, "step": 55545 }, { "epoch": 0.05595054998735951, "grad_norm": 13.08274366988657, "learning_rate": 4.9995645217616855e-05, "loss": 2.6681, "mean_token_accuracy": 0.37241379618644715, "step": 55550 }, { "epoch": 0.05595558604046368, "grad_norm": 12.739105684928443, "learning_rate": 4.999563784249468e-05, "loss": 2.6601, "mean_token_accuracy": 0.3655172288417816, "step": 55555 }, { "epoch": 0.05596062209356785, "grad_norm": 14.869876125362172, "learning_rate": 4.999563046113327e-05, "loss": 2.5195, "mean_token_accuracy": 0.3931034505367279, "step": 55560 }, { "epoch": 0.05596565814667202, "grad_norm": 13.541932171509355, "learning_rate": 4.99956230735326e-05, "loss": 2.4727, "mean_token_accuracy": 0.4034482777118683, "step": 55565 }, { "epoch": 0.0559706941997762, "grad_norm": 13.002342327516184, "learning_rate": 4.9995615679692685e-05, "loss": 2.6708, "mean_token_accuracy": 0.38620689511299133, "step": 55570 }, { "epoch": 0.05597573025288037, "grad_norm": 12.362158743427823, "learning_rate": 4.999560827961354e-05, "loss": 2.6806, "mean_token_accuracy": 0.3931034505367279, "step": 55575 }, { "epoch": 0.055980766305984545, "grad_norm": 11.83658952349421, "learning_rate": 4.999560087329514e-05, "loss": 2.4679, "mean_token_accuracy": 0.3931034505367279, "step": 55580 }, { "epoch": 0.05598580235908872, "grad_norm": 11.100836656050399, "learning_rate": 4.9995593460737514e-05, "loss": 2.148, "mean_token_accuracy": 0.4620689630508423, "step": 55585 }, { "epoch": 0.05599083841219289, "grad_norm": 12.676637616371321, "learning_rate": 4.999558604194064e-05, "loss": 2.6513, "mean_token_accuracy": 0.4, "step": 55590 }, { "epoch": 0.05599587446529706, "grad_norm": 10.279832895428571, "learning_rate": 4.999557861690454e-05, "loss": 2.3522, "mean_token_accuracy": 0.4517241358757019, "step": 55595 }, { "epoch": 0.05600091051840123, "grad_norm": 13.650754241162055, "learning_rate": 4.9995571185629195e-05, "loss": 2.4038, "mean_token_accuracy": 0.4310344815254211, "step": 55600 }, { "epoch": 0.056005946571505406, "grad_norm": 11.493670205352528, "learning_rate": 4.9995563748114635e-05, "loss": 2.3417, "mean_token_accuracy": 0.4413793206214905, "step": 55605 }, { "epoch": 0.05601098262460958, "grad_norm": 11.946136066864376, "learning_rate": 4.999555630436084e-05, "loss": 2.7034, "mean_token_accuracy": 0.37586206793785093, "step": 55610 }, { "epoch": 0.056016018677713754, "grad_norm": 13.4038665965711, "learning_rate": 4.9995548854367816e-05, "loss": 2.7833, "mean_token_accuracy": 0.37241379022598264, "step": 55615 }, { "epoch": 0.05602105473081793, "grad_norm": 11.994142765887478, "learning_rate": 4.999554139813556e-05, "loss": 2.4096, "mean_token_accuracy": 0.4310344934463501, "step": 55620 }, { "epoch": 0.0560260907839221, "grad_norm": 10.846849599478308, "learning_rate": 4.99955339356641e-05, "loss": 2.4347, "mean_token_accuracy": 0.4517241418361664, "step": 55625 }, { "epoch": 0.05603112683702627, "grad_norm": 15.755515820038195, "learning_rate": 4.9995526466953404e-05, "loss": 2.804, "mean_token_accuracy": 0.39310344457626345, "step": 55630 }, { "epoch": 0.05603616289013044, "grad_norm": 9.815562962525917, "learning_rate": 4.9995518992003505e-05, "loss": 2.5899, "mean_token_accuracy": 0.39310344457626345, "step": 55635 }, { "epoch": 0.056041198943234616, "grad_norm": 12.549017794520173, "learning_rate": 4.9995511510814375e-05, "loss": 3.0064, "mean_token_accuracy": 0.3172413736581802, "step": 55640 }, { "epoch": 0.05604623499633879, "grad_norm": 12.804856187185829, "learning_rate": 4.9995504023386034e-05, "loss": 2.658, "mean_token_accuracy": 0.3758620619773865, "step": 55645 }, { "epoch": 0.05605127104944296, "grad_norm": 12.347900923551641, "learning_rate": 4.999549652971849e-05, "loss": 2.6637, "mean_token_accuracy": 0.3551724076271057, "step": 55650 }, { "epoch": 0.05605630710254714, "grad_norm": 11.287367752838922, "learning_rate": 4.999548902981173e-05, "loss": 2.3648, "mean_token_accuracy": 0.4413793087005615, "step": 55655 }, { "epoch": 0.05606134315565131, "grad_norm": 12.158360580826953, "learning_rate": 4.999548152366576e-05, "loss": 2.6061, "mean_token_accuracy": 0.3344827502965927, "step": 55660 }, { "epoch": 0.05606637920875548, "grad_norm": 11.714547896287067, "learning_rate": 4.999547401128058e-05, "loss": 2.5602, "mean_token_accuracy": 0.32413793802261354, "step": 55665 }, { "epoch": 0.05607141526185965, "grad_norm": 10.959726244231172, "learning_rate": 4.9995466492656215e-05, "loss": 2.4428, "mean_token_accuracy": 0.42068966031074523, "step": 55670 }, { "epoch": 0.056076451314963825, "grad_norm": 12.691551244843508, "learning_rate": 4.999545896779263e-05, "loss": 2.4154, "mean_token_accuracy": 0.4188747763633728, "step": 55675 }, { "epoch": 0.056081487368068, "grad_norm": 12.216984822867625, "learning_rate": 4.9995451436689854e-05, "loss": 2.6198, "mean_token_accuracy": 0.4050211668014526, "step": 55680 }, { "epoch": 0.05608652342117217, "grad_norm": 12.83083705583494, "learning_rate": 4.9995443899347874e-05, "loss": 2.807, "mean_token_accuracy": 0.4068965494632721, "step": 55685 }, { "epoch": 0.05609155947427635, "grad_norm": 12.43070173344862, "learning_rate": 4.999543635576671e-05, "loss": 2.3477, "mean_token_accuracy": 0.4689655125141144, "step": 55690 }, { "epoch": 0.05609659552738052, "grad_norm": 11.726576799844116, "learning_rate": 4.999542880594634e-05, "loss": 2.8829, "mean_token_accuracy": 0.3655172407627106, "step": 55695 }, { "epoch": 0.05610163158048469, "grad_norm": 10.989027421236075, "learning_rate": 4.999542124988679e-05, "loss": 2.4286, "mean_token_accuracy": 0.4344827651977539, "step": 55700 }, { "epoch": 0.05610666763358886, "grad_norm": 14.314008801388935, "learning_rate": 4.999541368758804e-05, "loss": 2.7678, "mean_token_accuracy": 0.4, "step": 55705 }, { "epoch": 0.056111703686693035, "grad_norm": 11.22885216669747, "learning_rate": 4.999540611905012e-05, "loss": 2.7311, "mean_token_accuracy": 0.3827586114406586, "step": 55710 }, { "epoch": 0.05611673973979721, "grad_norm": 13.384945511617811, "learning_rate": 4.9995398544273e-05, "loss": 2.6346, "mean_token_accuracy": 0.37586206793785093, "step": 55715 }, { "epoch": 0.05612177579290138, "grad_norm": 9.91603656885435, "learning_rate": 4.999539096325671e-05, "loss": 2.7614, "mean_token_accuracy": 0.3448275804519653, "step": 55720 }, { "epoch": 0.056126811846005556, "grad_norm": 12.446619852554413, "learning_rate": 4.9995383376001225e-05, "loss": 2.4104, "mean_token_accuracy": 0.43309134244918823, "step": 55725 }, { "epoch": 0.05613184789910973, "grad_norm": 12.557396447300956, "learning_rate": 4.9995375782506564e-05, "loss": 2.5875, "mean_token_accuracy": 0.39655172228813174, "step": 55730 }, { "epoch": 0.0561368839522139, "grad_norm": 17.408677750925104, "learning_rate": 4.999536818277273e-05, "loss": 2.6708, "mean_token_accuracy": 0.41379310488700866, "step": 55735 }, { "epoch": 0.05614192000531807, "grad_norm": 11.915190773259242, "learning_rate": 4.9995360576799726e-05, "loss": 2.6907, "mean_token_accuracy": 0.334482753276825, "step": 55740 }, { "epoch": 0.056146956058422244, "grad_norm": 24.665368382376034, "learning_rate": 4.999535296458755e-05, "loss": 2.9262, "mean_token_accuracy": 0.34137930870056155, "step": 55745 }, { "epoch": 0.05615199211152642, "grad_norm": 13.810800373524092, "learning_rate": 4.9995345346136194e-05, "loss": 2.6177, "mean_token_accuracy": 0.35862069129943847, "step": 55750 }, { "epoch": 0.05615702816463059, "grad_norm": 12.4799682799755, "learning_rate": 4.9995337721445676e-05, "loss": 2.7356, "mean_token_accuracy": 0.3724137932062149, "step": 55755 }, { "epoch": 0.056162064217734765, "grad_norm": 10.216149877141273, "learning_rate": 4.999533009051599e-05, "loss": 2.3663, "mean_token_accuracy": 0.47586206793785096, "step": 55760 }, { "epoch": 0.05616710027083894, "grad_norm": 13.21183106887741, "learning_rate": 4.9995322453347143e-05, "loss": 2.6759, "mean_token_accuracy": 0.3671506345272064, "step": 55765 }, { "epoch": 0.056172136323943106, "grad_norm": 10.91491129085444, "learning_rate": 4.9995314809939135e-05, "loss": 2.663, "mean_token_accuracy": 0.4121597170829773, "step": 55770 }, { "epoch": 0.05617717237704728, "grad_norm": 11.026360965791449, "learning_rate": 4.9995307160291964e-05, "loss": 2.2988, "mean_token_accuracy": 0.3965517282485962, "step": 55775 }, { "epoch": 0.056182208430151453, "grad_norm": 12.154726158672773, "learning_rate": 4.999529950440564e-05, "loss": 2.1225, "mean_token_accuracy": 0.4517241358757019, "step": 55780 }, { "epoch": 0.05618724448325563, "grad_norm": 13.978067003386796, "learning_rate": 4.9995291842280165e-05, "loss": 2.7723, "mean_token_accuracy": 0.36896551847457887, "step": 55785 }, { "epoch": 0.0561922805363598, "grad_norm": 19.492661877672266, "learning_rate": 4.9995284173915524e-05, "loss": 2.563, "mean_token_accuracy": 0.3862069010734558, "step": 55790 }, { "epoch": 0.056197316589463975, "grad_norm": 10.692583344408439, "learning_rate": 4.999527649931174e-05, "loss": 2.3476, "mean_token_accuracy": 0.44827585816383364, "step": 55795 }, { "epoch": 0.05620235264256815, "grad_norm": 12.032537297566773, "learning_rate": 4.9995268818468806e-05, "loss": 2.299, "mean_token_accuracy": 0.44700543880462645, "step": 55800 }, { "epoch": 0.056207388695672315, "grad_norm": 13.987750025456895, "learning_rate": 4.999526113138672e-05, "loss": 2.4934, "mean_token_accuracy": 0.4448275864124298, "step": 55805 }, { "epoch": 0.05621242474877649, "grad_norm": 14.669170392521945, "learning_rate": 4.9995253438065495e-05, "loss": 2.6139, "mean_token_accuracy": 0.4310344815254211, "step": 55810 }, { "epoch": 0.05621746080188066, "grad_norm": 10.902668319945485, "learning_rate": 4.999524573850513e-05, "loss": 2.4678, "mean_token_accuracy": 0.37241379618644715, "step": 55815 }, { "epoch": 0.05622249685498484, "grad_norm": 12.862289039856709, "learning_rate": 4.999523803270562e-05, "loss": 2.7704, "mean_token_accuracy": 0.35862069129943847, "step": 55820 }, { "epoch": 0.05622753290808901, "grad_norm": 14.175467101771451, "learning_rate": 4.9995230320666976e-05, "loss": 2.7167, "mean_token_accuracy": 0.36206896901130675, "step": 55825 }, { "epoch": 0.056232568961193184, "grad_norm": 9.621561650234693, "learning_rate": 4.999522260238919e-05, "loss": 2.6136, "mean_token_accuracy": 0.4310344815254211, "step": 55830 }, { "epoch": 0.05623760501429736, "grad_norm": 9.619174855323118, "learning_rate": 4.9995214877872275e-05, "loss": 2.3365, "mean_token_accuracy": 0.4482758641242981, "step": 55835 }, { "epoch": 0.056242641067401525, "grad_norm": 10.634638051473122, "learning_rate": 4.999520714711623e-05, "loss": 2.3196, "mean_token_accuracy": 0.42758620381355283, "step": 55840 }, { "epoch": 0.0562476771205057, "grad_norm": 12.627631835259768, "learning_rate": 4.999519941012104e-05, "loss": 2.7594, "mean_token_accuracy": 0.3620689570903778, "step": 55845 }, { "epoch": 0.05625271317360987, "grad_norm": 12.526010201682999, "learning_rate": 4.999519166688674e-05, "loss": 2.7274, "mean_token_accuracy": 0.417241370677948, "step": 55850 }, { "epoch": 0.056257749226714046, "grad_norm": 13.640183196419043, "learning_rate": 4.999518391741331e-05, "loss": 2.7894, "mean_token_accuracy": 0.33448274731636046, "step": 55855 }, { "epoch": 0.05626278527981822, "grad_norm": 11.45198418309512, "learning_rate": 4.999517616170076e-05, "loss": 2.6135, "mean_token_accuracy": 0.3712643623352051, "step": 55860 }, { "epoch": 0.056267821332922394, "grad_norm": 11.337182460217367, "learning_rate": 4.9995168399749086e-05, "loss": 2.6033, "mean_token_accuracy": 0.3758620619773865, "step": 55865 }, { "epoch": 0.05627285738602657, "grad_norm": 12.815870179073693, "learning_rate": 4.999516063155829e-05, "loss": 2.5371, "mean_token_accuracy": 0.4068965494632721, "step": 55870 }, { "epoch": 0.056277893439130734, "grad_norm": 11.100899962844503, "learning_rate": 4.999515285712838e-05, "loss": 2.3923, "mean_token_accuracy": 0.4310344815254211, "step": 55875 }, { "epoch": 0.05628292949223491, "grad_norm": 9.908457361270326, "learning_rate": 4.9995145076459355e-05, "loss": 2.6029, "mean_token_accuracy": 0.41034482717514037, "step": 55880 }, { "epoch": 0.05628796554533908, "grad_norm": 10.497554776812228, "learning_rate": 4.999513728955122e-05, "loss": 2.293, "mean_token_accuracy": 0.4724137902259827, "step": 55885 }, { "epoch": 0.056293001598443256, "grad_norm": 14.141165602947416, "learning_rate": 4.999512949640397e-05, "loss": 2.8359, "mean_token_accuracy": 0.3379310369491577, "step": 55890 }, { "epoch": 0.05629803765154743, "grad_norm": 11.288246736623863, "learning_rate": 4.999512169701761e-05, "loss": 2.5181, "mean_token_accuracy": 0.458620685338974, "step": 55895 }, { "epoch": 0.0563030737046516, "grad_norm": 12.87596935670035, "learning_rate": 4.9995113891392145e-05, "loss": 2.7432, "mean_token_accuracy": 0.3827586233615875, "step": 55900 }, { "epoch": 0.05630810975775578, "grad_norm": 12.996531430316704, "learning_rate": 4.999510607952758e-05, "loss": 2.344, "mean_token_accuracy": 0.4983666181564331, "step": 55905 }, { "epoch": 0.056313145810859944, "grad_norm": 11.67859835841478, "learning_rate": 4.999509826142392e-05, "loss": 2.8238, "mean_token_accuracy": 0.3862068891525269, "step": 55910 }, { "epoch": 0.05631818186396412, "grad_norm": 12.2505877663274, "learning_rate": 4.9995090437081155e-05, "loss": 2.1617, "mean_token_accuracy": 0.46206897497177124, "step": 55915 }, { "epoch": 0.05632321791706829, "grad_norm": 12.447113254753495, "learning_rate": 4.999508260649929e-05, "loss": 2.8807, "mean_token_accuracy": 0.417241370677948, "step": 55920 }, { "epoch": 0.056328253970172465, "grad_norm": 15.289555280321078, "learning_rate": 4.999507476967833e-05, "loss": 2.9021, "mean_token_accuracy": 0.35862069129943847, "step": 55925 }, { "epoch": 0.05633329002327664, "grad_norm": 15.956216883924265, "learning_rate": 4.999506692661828e-05, "loss": 2.9148, "mean_token_accuracy": 0.34137930870056155, "step": 55930 }, { "epoch": 0.05633832607638081, "grad_norm": 12.094896997133805, "learning_rate": 4.999505907731913e-05, "loss": 2.1952, "mean_token_accuracy": 0.4206896543502808, "step": 55935 }, { "epoch": 0.056343362129484986, "grad_norm": 12.041920918747772, "learning_rate": 4.9995051221780904e-05, "loss": 2.5919, "mean_token_accuracy": 0.42758620977401735, "step": 55940 }, { "epoch": 0.05634839818258915, "grad_norm": 12.712888331040201, "learning_rate": 4.999504336000359e-05, "loss": 2.5698, "mean_token_accuracy": 0.38965516686439516, "step": 55945 }, { "epoch": 0.05635343423569333, "grad_norm": 11.715254665003515, "learning_rate": 4.999503549198718e-05, "loss": 2.7534, "mean_token_accuracy": 0.4, "step": 55950 }, { "epoch": 0.0563584702887975, "grad_norm": 13.25115930468187, "learning_rate": 4.99950276177317e-05, "loss": 2.6856, "mean_token_accuracy": 0.39086509346961973, "step": 55955 }, { "epoch": 0.056363506341901674, "grad_norm": 10.62711436502527, "learning_rate": 4.999501973723714e-05, "loss": 2.4892, "mean_token_accuracy": 0.4103448331356049, "step": 55960 }, { "epoch": 0.05636854239500585, "grad_norm": 14.387932801703723, "learning_rate": 4.99950118505035e-05, "loss": 3.1966, "mean_token_accuracy": 0.3517241418361664, "step": 55965 }, { "epoch": 0.05637357844811002, "grad_norm": 13.092175663752013, "learning_rate": 4.999500395753078e-05, "loss": 2.8125, "mean_token_accuracy": 0.38965516686439516, "step": 55970 }, { "epoch": 0.056378614501214196, "grad_norm": 9.395236459054871, "learning_rate": 4.9994996058318985e-05, "loss": 2.5765, "mean_token_accuracy": 0.3551724135875702, "step": 55975 }, { "epoch": 0.05638365055431836, "grad_norm": 11.487358980309494, "learning_rate": 4.999498815286812e-05, "loss": 2.6209, "mean_token_accuracy": 0.4068965494632721, "step": 55980 }, { "epoch": 0.056388686607422536, "grad_norm": 9.770517729132662, "learning_rate": 4.99949802411782e-05, "loss": 2.2947, "mean_token_accuracy": 0.4551724135875702, "step": 55985 }, { "epoch": 0.05639372266052671, "grad_norm": 23.677678380445162, "learning_rate": 4.99949723232492e-05, "loss": 3.2096, "mean_token_accuracy": 0.3103448301553726, "step": 55990 }, { "epoch": 0.056398758713630884, "grad_norm": 13.871452817833232, "learning_rate": 4.999496439908113e-05, "loss": 2.5721, "mean_token_accuracy": 0.3655172407627106, "step": 55995 }, { "epoch": 0.05640379476673506, "grad_norm": 13.299049886697127, "learning_rate": 4.9994956468674014e-05, "loss": 2.7384, "mean_token_accuracy": 0.42758620977401735, "step": 56000 }, { "epoch": 0.05640883081983923, "grad_norm": 20.86346437692601, "learning_rate": 4.9994948532027834e-05, "loss": 2.5434, "mean_token_accuracy": 0.4689655125141144, "step": 56005 }, { "epoch": 0.056413866872943405, "grad_norm": 11.747560814955518, "learning_rate": 4.999494058914259e-05, "loss": 2.2009, "mean_token_accuracy": 0.4758620738983154, "step": 56010 }, { "epoch": 0.05641890292604757, "grad_norm": 12.800483172331791, "learning_rate": 4.99949326400183e-05, "loss": 2.5922, "mean_token_accuracy": 0.39655172228813174, "step": 56015 }, { "epoch": 0.056423938979151746, "grad_norm": 10.591866492106657, "learning_rate": 4.999492468465494e-05, "loss": 2.4855, "mean_token_accuracy": 0.4448275864124298, "step": 56020 }, { "epoch": 0.05642897503225592, "grad_norm": 12.301025568847635, "learning_rate": 4.999491672305254e-05, "loss": 2.1691, "mean_token_accuracy": 0.44688445925712583, "step": 56025 }, { "epoch": 0.05643401108536009, "grad_norm": 12.424858740053356, "learning_rate": 4.999490875521109e-05, "loss": 2.4156, "mean_token_accuracy": 0.4068965494632721, "step": 56030 }, { "epoch": 0.05643904713846427, "grad_norm": 14.318980443474068, "learning_rate": 4.999490078113059e-05, "loss": 2.3559, "mean_token_accuracy": 0.417241370677948, "step": 56035 }, { "epoch": 0.05644408319156844, "grad_norm": 12.00690280503789, "learning_rate": 4.999489280081105e-05, "loss": 2.6854, "mean_token_accuracy": 0.4172413796186447, "step": 56040 }, { "epoch": 0.056449119244672614, "grad_norm": 9.933033563182349, "learning_rate": 4.999488481425247e-05, "loss": 2.5825, "mean_token_accuracy": 0.3709618806838989, "step": 56045 }, { "epoch": 0.05645415529777678, "grad_norm": 10.14170066469442, "learning_rate": 4.999487682145484e-05, "loss": 2.6453, "mean_token_accuracy": 0.37586206793785093, "step": 56050 }, { "epoch": 0.056459191350880955, "grad_norm": 11.864981174959619, "learning_rate": 4.999486882241818e-05, "loss": 3.2991, "mean_token_accuracy": 0.3275862067937851, "step": 56055 }, { "epoch": 0.05646422740398513, "grad_norm": 12.922577336153592, "learning_rate": 4.999486081714248e-05, "loss": 2.4108, "mean_token_accuracy": 0.4137930989265442, "step": 56060 }, { "epoch": 0.0564692634570893, "grad_norm": 16.35320997636631, "learning_rate": 4.9994852805627754e-05, "loss": 2.6591, "mean_token_accuracy": 0.37241379022598264, "step": 56065 }, { "epoch": 0.056474299510193476, "grad_norm": 24.246943848686776, "learning_rate": 4.999484478787399e-05, "loss": 2.8389, "mean_token_accuracy": 0.4263762950897217, "step": 56070 }, { "epoch": 0.05647933556329765, "grad_norm": 13.000042837197599, "learning_rate": 4.999483676388119e-05, "loss": 2.6574, "mean_token_accuracy": 0.4, "step": 56075 }, { "epoch": 0.056484371616401824, "grad_norm": 10.291728969346774, "learning_rate": 4.9994828733649376e-05, "loss": 2.2894, "mean_token_accuracy": 0.44857833385467527, "step": 56080 }, { "epoch": 0.05648940766950599, "grad_norm": 13.35767519366003, "learning_rate": 4.999482069717853e-05, "loss": 2.8827, "mean_token_accuracy": 0.37241379022598264, "step": 56085 }, { "epoch": 0.056494443722610165, "grad_norm": 17.04829436534816, "learning_rate": 4.999481265446866e-05, "loss": 2.7501, "mean_token_accuracy": 0.3724137991666794, "step": 56090 }, { "epoch": 0.05649947977571434, "grad_norm": 11.224689600497436, "learning_rate": 4.999480460551977e-05, "loss": 2.7012, "mean_token_accuracy": 0.3999999940395355, "step": 56095 }, { "epoch": 0.05650451582881851, "grad_norm": 9.264897784219167, "learning_rate": 4.999479655033187e-05, "loss": 2.2701, "mean_token_accuracy": 0.47447065711021424, "step": 56100 }, { "epoch": 0.056509551881922686, "grad_norm": 11.588187551590568, "learning_rate": 4.9994788488904956e-05, "loss": 2.9947, "mean_token_accuracy": 0.4, "step": 56105 }, { "epoch": 0.05651458793502686, "grad_norm": 11.780256155652385, "learning_rate": 4.9994780421239015e-05, "loss": 2.6263, "mean_token_accuracy": 0.38620689511299133, "step": 56110 }, { "epoch": 0.05651962398813103, "grad_norm": 19.174406558996136, "learning_rate": 4.9994772347334074e-05, "loss": 2.8523, "mean_token_accuracy": 0.4000000059604645, "step": 56115 }, { "epoch": 0.0565246600412352, "grad_norm": 14.08217172564536, "learning_rate": 4.999476426719012e-05, "loss": 2.4826, "mean_token_accuracy": 0.4068965494632721, "step": 56120 }, { "epoch": 0.056529696094339374, "grad_norm": 10.772878961266995, "learning_rate": 4.9994756180807156e-05, "loss": 2.35, "mean_token_accuracy": 0.4586206912994385, "step": 56125 }, { "epoch": 0.05653473214744355, "grad_norm": 10.95309091243865, "learning_rate": 4.9994748088185185e-05, "loss": 2.649, "mean_token_accuracy": 0.4103448212146759, "step": 56130 }, { "epoch": 0.05653976820054772, "grad_norm": 12.073484655424734, "learning_rate": 4.999473998932422e-05, "loss": 2.2722, "mean_token_accuracy": 0.44137930274009707, "step": 56135 }, { "epoch": 0.056544804253651895, "grad_norm": 9.428735317417326, "learning_rate": 4.999473188422425e-05, "loss": 2.1036, "mean_token_accuracy": 0.4551724135875702, "step": 56140 }, { "epoch": 0.05654984030675607, "grad_norm": 10.173790946437139, "learning_rate": 4.999472377288529e-05, "loss": 2.3319, "mean_token_accuracy": 0.37586206793785093, "step": 56145 }, { "epoch": 0.05655487635986024, "grad_norm": 11.114154994222837, "learning_rate": 4.999471565530732e-05, "loss": 2.2554, "mean_token_accuracy": 0.3896551787853241, "step": 56150 }, { "epoch": 0.05655991241296441, "grad_norm": 9.898848299447293, "learning_rate": 4.999470753149037e-05, "loss": 2.2788, "mean_token_accuracy": 0.4724137902259827, "step": 56155 }, { "epoch": 0.05656494846606858, "grad_norm": 12.956859726813956, "learning_rate": 4.999469940143442e-05, "loss": 2.7046, "mean_token_accuracy": 0.36896551251411436, "step": 56160 }, { "epoch": 0.05656998451917276, "grad_norm": 13.070449112114877, "learning_rate": 4.9994691265139486e-05, "loss": 2.4649, "mean_token_accuracy": 0.3896551728248596, "step": 56165 }, { "epoch": 0.05657502057227693, "grad_norm": 12.290126336581281, "learning_rate": 4.999468312260557e-05, "loss": 2.3197, "mean_token_accuracy": 0.4431336998939514, "step": 56170 }, { "epoch": 0.056580056625381105, "grad_norm": 14.128821139115507, "learning_rate": 4.999467497383266e-05, "loss": 2.755, "mean_token_accuracy": 0.3827586203813553, "step": 56175 }, { "epoch": 0.05658509267848528, "grad_norm": 12.364446326420033, "learning_rate": 4.999466681882077e-05, "loss": 2.5985, "mean_token_accuracy": 0.3931034505367279, "step": 56180 }, { "epoch": 0.05659012873158945, "grad_norm": 13.388260171930906, "learning_rate": 4.999465865756991e-05, "loss": 2.6423, "mean_token_accuracy": 0.32068965435028074, "step": 56185 }, { "epoch": 0.05659516478469362, "grad_norm": 13.957538619221943, "learning_rate": 4.9994650490080064e-05, "loss": 2.2126, "mean_token_accuracy": 0.4413793087005615, "step": 56190 }, { "epoch": 0.05660020083779779, "grad_norm": 16.75906020462544, "learning_rate": 4.999464231635124e-05, "loss": 2.8412, "mean_token_accuracy": 0.3620689660310745, "step": 56195 }, { "epoch": 0.056605236890901967, "grad_norm": 11.382926498965539, "learning_rate": 4.999463413638344e-05, "loss": 2.178, "mean_token_accuracy": 0.43103448748588563, "step": 56200 }, { "epoch": 0.05661027294400614, "grad_norm": 12.923868180315626, "learning_rate": 4.9994625950176684e-05, "loss": 2.5344, "mean_token_accuracy": 0.39655172228813174, "step": 56205 }, { "epoch": 0.056615308997110314, "grad_norm": 13.55973469079723, "learning_rate": 4.999461775773095e-05, "loss": 2.6146, "mean_token_accuracy": 0.3827586233615875, "step": 56210 }, { "epoch": 0.05662034505021449, "grad_norm": 9.751300016926047, "learning_rate": 4.9994609559046253e-05, "loss": 2.7455, "mean_token_accuracy": 0.41724138259887694, "step": 56215 }, { "epoch": 0.05662538110331866, "grad_norm": 13.519018713060435, "learning_rate": 4.999460135412259e-05, "loss": 2.2581, "mean_token_accuracy": 0.43103448748588563, "step": 56220 }, { "epoch": 0.05663041715642283, "grad_norm": 10.709237063735674, "learning_rate": 4.999459314295997e-05, "loss": 2.2562, "mean_token_accuracy": 0.4535390317440033, "step": 56225 }, { "epoch": 0.056635453209527, "grad_norm": 11.982745584180012, "learning_rate": 4.999458492555838e-05, "loss": 2.8691, "mean_token_accuracy": 0.32413792610168457, "step": 56230 }, { "epoch": 0.056640489262631176, "grad_norm": 14.51272503985993, "learning_rate": 4.9994576701917845e-05, "loss": 2.9809, "mean_token_accuracy": 0.3620689630508423, "step": 56235 }, { "epoch": 0.05664552531573535, "grad_norm": 11.658987217873193, "learning_rate": 4.999456847203835e-05, "loss": 2.5789, "mean_token_accuracy": 0.43103447556495667, "step": 56240 }, { "epoch": 0.056650561368839523, "grad_norm": 21.208352180773883, "learning_rate": 4.99945602359199e-05, "loss": 2.6997, "mean_token_accuracy": 0.3879612863063812, "step": 56245 }, { "epoch": 0.0566555974219437, "grad_norm": 16.53399268085399, "learning_rate": 4.99945519935625e-05, "loss": 3.6324, "mean_token_accuracy": 0.33793103098869326, "step": 56250 }, { "epoch": 0.05666063347504787, "grad_norm": 9.458524176170812, "learning_rate": 4.9994543744966156e-05, "loss": 2.7289, "mean_token_accuracy": 0.4103448331356049, "step": 56255 }, { "epoch": 0.05666566952815204, "grad_norm": 14.302232017194198, "learning_rate": 4.999453549013087e-05, "loss": 2.611, "mean_token_accuracy": 0.3620689630508423, "step": 56260 }, { "epoch": 0.05667070558125621, "grad_norm": 12.619289203556953, "learning_rate": 4.999452722905664e-05, "loss": 2.322, "mean_token_accuracy": 0.4137930989265442, "step": 56265 }, { "epoch": 0.056675741634360385, "grad_norm": 10.150124337348615, "learning_rate": 4.999451896174346e-05, "loss": 2.4294, "mean_token_accuracy": 0.3965517163276672, "step": 56270 }, { "epoch": 0.05668077768746456, "grad_norm": 10.242251614143063, "learning_rate": 4.999451068819134e-05, "loss": 2.3582, "mean_token_accuracy": 0.42758620381355283, "step": 56275 }, { "epoch": 0.05668581374056873, "grad_norm": 10.246147487295602, "learning_rate": 4.999450240840029e-05, "loss": 2.1904, "mean_token_accuracy": 0.4724137902259827, "step": 56280 }, { "epoch": 0.05669084979367291, "grad_norm": 11.057121714349869, "learning_rate": 4.999449412237031e-05, "loss": 2.1769, "mean_token_accuracy": 0.47241378426551817, "step": 56285 }, { "epoch": 0.05669588584677708, "grad_norm": 11.667221193909262, "learning_rate": 4.999448583010139e-05, "loss": 2.1731, "mean_token_accuracy": 0.4534785270690918, "step": 56290 }, { "epoch": 0.05670092189988125, "grad_norm": 12.065859193423591, "learning_rate": 4.999447753159355e-05, "loss": 2.4814, "mean_token_accuracy": 0.4068965554237366, "step": 56295 }, { "epoch": 0.05670595795298542, "grad_norm": 14.484821953608458, "learning_rate": 4.9994469226846766e-05, "loss": 2.5945, "mean_token_accuracy": 0.44482757449150084, "step": 56300 }, { "epoch": 0.056710994006089595, "grad_norm": 13.859050104033333, "learning_rate": 4.999446091586107e-05, "loss": 2.5457, "mean_token_accuracy": 0.3813067197799683, "step": 56305 }, { "epoch": 0.05671603005919377, "grad_norm": 11.60379609079237, "learning_rate": 4.9994452598636455e-05, "loss": 2.3094, "mean_token_accuracy": 0.4413793087005615, "step": 56310 }, { "epoch": 0.05672106611229794, "grad_norm": 10.271892577097054, "learning_rate": 4.9994444275172914e-05, "loss": 2.7395, "mean_token_accuracy": 0.3655172288417816, "step": 56315 }, { "epoch": 0.056726102165402116, "grad_norm": 10.677611282381557, "learning_rate": 4.9994435945470454e-05, "loss": 2.5213, "mean_token_accuracy": 0.4395320177078247, "step": 56320 }, { "epoch": 0.05673113821850629, "grad_norm": 13.53152976591521, "learning_rate": 4.9994427609529074e-05, "loss": 2.8761, "mean_token_accuracy": 0.3448275804519653, "step": 56325 }, { "epoch": 0.05673617427161046, "grad_norm": 11.493406745716078, "learning_rate": 4.999441926734879e-05, "loss": 2.9171, "mean_token_accuracy": 0.3965517282485962, "step": 56330 }, { "epoch": 0.05674121032471463, "grad_norm": 11.001316752283271, "learning_rate": 4.999441091892958e-05, "loss": 2.4803, "mean_token_accuracy": 0.38275861740112305, "step": 56335 }, { "epoch": 0.056746246377818804, "grad_norm": 14.269474406388586, "learning_rate": 4.999440256427148e-05, "loss": 3.3146, "mean_token_accuracy": 0.35862068831920624, "step": 56340 }, { "epoch": 0.05675128243092298, "grad_norm": 11.118253508436803, "learning_rate": 4.999439420337447e-05, "loss": 2.8236, "mean_token_accuracy": 0.35862069129943847, "step": 56345 }, { "epoch": 0.05675631848402715, "grad_norm": 14.291688247099108, "learning_rate": 4.9994385836238545e-05, "loss": 2.7143, "mean_token_accuracy": 0.3758620709180832, "step": 56350 }, { "epoch": 0.056761354537131326, "grad_norm": 11.205151955235964, "learning_rate": 4.999437746286372e-05, "loss": 2.5443, "mean_token_accuracy": 0.4103448212146759, "step": 56355 }, { "epoch": 0.0567663905902355, "grad_norm": 12.473834344319762, "learning_rate": 4.999436908325e-05, "loss": 2.5924, "mean_token_accuracy": 0.42413792610168455, "step": 56360 }, { "epoch": 0.056771426643339666, "grad_norm": 11.336414579736598, "learning_rate": 4.999436069739739e-05, "loss": 2.0133, "mean_token_accuracy": 0.49872957468032836, "step": 56365 }, { "epoch": 0.05677646269644384, "grad_norm": 20.3590680282211, "learning_rate": 4.999435230530587e-05, "loss": 2.7892, "mean_token_accuracy": 0.35862069129943847, "step": 56370 }, { "epoch": 0.056781498749548014, "grad_norm": 12.879898678239105, "learning_rate": 4.999434390697547e-05, "loss": 2.884, "mean_token_accuracy": 0.34137930274009703, "step": 56375 }, { "epoch": 0.05678653480265219, "grad_norm": 12.304504974064427, "learning_rate": 4.999433550240617e-05, "loss": 2.8961, "mean_token_accuracy": 0.3793103456497192, "step": 56380 }, { "epoch": 0.05679157085575636, "grad_norm": 8.071786596332304, "learning_rate": 4.999432709159799e-05, "loss": 2.2054, "mean_token_accuracy": 0.49458127617836, "step": 56385 }, { "epoch": 0.056796606908860535, "grad_norm": 11.572834566115532, "learning_rate": 4.999431867455092e-05, "loss": 2.8567, "mean_token_accuracy": 0.37241379022598264, "step": 56390 }, { "epoch": 0.05680164296196471, "grad_norm": 12.712594272603269, "learning_rate": 4.999431025126497e-05, "loss": 2.5742, "mean_token_accuracy": 0.42068966031074523, "step": 56395 }, { "epoch": 0.056806679015068876, "grad_norm": 14.400194125714002, "learning_rate": 4.9994301821740136e-05, "loss": 2.6776, "mean_token_accuracy": 0.3620689630508423, "step": 56400 }, { "epoch": 0.05681171506817305, "grad_norm": 10.380315261291337, "learning_rate": 4.9994293385976425e-05, "loss": 2.9552, "mean_token_accuracy": 0.37586206793785093, "step": 56405 }, { "epoch": 0.05681675112127722, "grad_norm": 13.227092326127044, "learning_rate": 4.999428494397384e-05, "loss": 2.7054, "mean_token_accuracy": 0.3655172407627106, "step": 56410 }, { "epoch": 0.0568217871743814, "grad_norm": 10.508610112598312, "learning_rate": 4.999427649573238e-05, "loss": 2.3083, "mean_token_accuracy": 0.42413792610168455, "step": 56415 }, { "epoch": 0.05682682322748557, "grad_norm": 12.624586416395621, "learning_rate": 4.9994268041252045e-05, "loss": 2.7765, "mean_token_accuracy": 0.4000000059604645, "step": 56420 }, { "epoch": 0.056831859280589744, "grad_norm": 10.41998066209456, "learning_rate": 4.9994259580532846e-05, "loss": 2.3733, "mean_token_accuracy": 0.41724138259887694, "step": 56425 }, { "epoch": 0.05683689533369392, "grad_norm": 10.45594154390414, "learning_rate": 4.9994251113574775e-05, "loss": 2.5369, "mean_token_accuracy": 0.4344827651977539, "step": 56430 }, { "epoch": 0.056841931386798085, "grad_norm": 11.681893947984362, "learning_rate": 4.9994242640377845e-05, "loss": 2.1973, "mean_token_accuracy": 0.4068965554237366, "step": 56435 }, { "epoch": 0.05684696743990226, "grad_norm": 9.10628743171219, "learning_rate": 4.999423416094204e-05, "loss": 2.2625, "mean_token_accuracy": 0.442241370677948, "step": 56440 }, { "epoch": 0.05685200349300643, "grad_norm": 11.80361943668972, "learning_rate": 4.999422567526739e-05, "loss": 2.9042, "mean_token_accuracy": 0.41034482717514037, "step": 56445 }, { "epoch": 0.056857039546110606, "grad_norm": 13.6678205562962, "learning_rate": 4.999421718335388e-05, "loss": 2.6146, "mean_token_accuracy": 0.35172412991523744, "step": 56450 }, { "epoch": 0.05686207559921478, "grad_norm": 13.859787509418128, "learning_rate": 4.999420868520151e-05, "loss": 2.3706, "mean_token_accuracy": 0.4689655125141144, "step": 56455 }, { "epoch": 0.056867111652318954, "grad_norm": 13.277506787650813, "learning_rate": 4.999420018081029e-05, "loss": 2.6024, "mean_token_accuracy": 0.4068965494632721, "step": 56460 }, { "epoch": 0.05687214770542313, "grad_norm": 14.109818025313617, "learning_rate": 4.999419167018022e-05, "loss": 2.694, "mean_token_accuracy": 0.42758620381355283, "step": 56465 }, { "epoch": 0.056877183758527294, "grad_norm": 11.269401403688033, "learning_rate": 4.99941831533113e-05, "loss": 2.297, "mean_token_accuracy": 0.4379310369491577, "step": 56470 }, { "epoch": 0.05688221981163147, "grad_norm": 12.709780661513586, "learning_rate": 4.9994174630203536e-05, "loss": 2.6484, "mean_token_accuracy": 0.38620689511299133, "step": 56475 }, { "epoch": 0.05688725586473564, "grad_norm": 17.303118051058295, "learning_rate": 4.9994166100856935e-05, "loss": 2.7457, "mean_token_accuracy": 0.34482758641242983, "step": 56480 }, { "epoch": 0.056892291917839816, "grad_norm": 13.01333729607657, "learning_rate": 4.999415756527148e-05, "loss": 2.6314, "mean_token_accuracy": 0.39655172228813174, "step": 56485 }, { "epoch": 0.05689732797094399, "grad_norm": 13.563553806049347, "learning_rate": 4.99941490234472e-05, "loss": 2.4584, "mean_token_accuracy": 0.36896551847457887, "step": 56490 }, { "epoch": 0.05690236402404816, "grad_norm": 13.334147009016538, "learning_rate": 4.999414047538407e-05, "loss": 2.8138, "mean_token_accuracy": 0.3827586114406586, "step": 56495 }, { "epoch": 0.05690740007715234, "grad_norm": 13.422772944943365, "learning_rate": 4.999413192108211e-05, "loss": 2.297, "mean_token_accuracy": 0.40689654350280763, "step": 56500 }, { "epoch": 0.056912436130256504, "grad_norm": 15.363227252479481, "learning_rate": 4.999412336054132e-05, "loss": 2.7919, "mean_token_accuracy": 0.37241379022598264, "step": 56505 }, { "epoch": 0.05691747218336068, "grad_norm": 11.604301231651428, "learning_rate": 4.999411479376171e-05, "loss": 2.85, "mean_token_accuracy": 0.42068966031074523, "step": 56510 }, { "epoch": 0.05692250823646485, "grad_norm": 9.96877686448567, "learning_rate": 4.9994106220743264e-05, "loss": 2.6506, "mean_token_accuracy": 0.37931033968925476, "step": 56515 }, { "epoch": 0.056927544289569025, "grad_norm": 12.941568440317083, "learning_rate": 4.9994097641486e-05, "loss": 2.6947, "mean_token_accuracy": 0.4137930929660797, "step": 56520 }, { "epoch": 0.0569325803426732, "grad_norm": 10.824598286122152, "learning_rate": 4.999408905598991e-05, "loss": 2.557, "mean_token_accuracy": 0.41379310488700866, "step": 56525 }, { "epoch": 0.05693761639577737, "grad_norm": 10.86162265677599, "learning_rate": 4.9994080464255e-05, "loss": 2.3411, "mean_token_accuracy": 0.4551724135875702, "step": 56530 }, { "epoch": 0.056942652448881546, "grad_norm": 11.236091300799007, "learning_rate": 4.9994071866281266e-05, "loss": 2.5978, "mean_token_accuracy": 0.42758620381355283, "step": 56535 }, { "epoch": 0.05694768850198571, "grad_norm": 13.017061893734818, "learning_rate": 4.999406326206873e-05, "loss": 2.8506, "mean_token_accuracy": 0.37586206793785093, "step": 56540 }, { "epoch": 0.05695272455508989, "grad_norm": 12.37134557196057, "learning_rate": 4.9994054651617375e-05, "loss": 2.7767, "mean_token_accuracy": 0.35517241060733795, "step": 56545 }, { "epoch": 0.05695776060819406, "grad_norm": 13.582973032604299, "learning_rate": 4.999404603492721e-05, "loss": 2.8826, "mean_token_accuracy": 0.37241379022598264, "step": 56550 }, { "epoch": 0.056962796661298234, "grad_norm": 14.301344107185333, "learning_rate": 4.999403741199824e-05, "loss": 2.7908, "mean_token_accuracy": 0.38620689511299133, "step": 56555 }, { "epoch": 0.05696783271440241, "grad_norm": 12.896444287071745, "learning_rate": 4.999402878283046e-05, "loss": 2.6563, "mean_token_accuracy": 0.37537809610366824, "step": 56560 }, { "epoch": 0.05697286876750658, "grad_norm": 12.670547878720026, "learning_rate": 4.9994020147423886e-05, "loss": 3.1056, "mean_token_accuracy": 0.3103448301553726, "step": 56565 }, { "epoch": 0.056977904820610756, "grad_norm": 12.158586390827315, "learning_rate": 4.9994011505778504e-05, "loss": 3.0262, "mean_token_accuracy": 0.39467633962631227, "step": 56570 }, { "epoch": 0.05698294087371492, "grad_norm": 10.703297164729264, "learning_rate": 4.999400285789433e-05, "loss": 2.3984, "mean_token_accuracy": 0.41034482717514037, "step": 56575 }, { "epoch": 0.056987976926819096, "grad_norm": 16.317895529075017, "learning_rate": 4.999399420377135e-05, "loss": 3.1215, "mean_token_accuracy": 0.3827586233615875, "step": 56580 }, { "epoch": 0.05699301297992327, "grad_norm": 12.666430448808347, "learning_rate": 4.999398554340959e-05, "loss": 2.8564, "mean_token_accuracy": 0.3310344755649567, "step": 56585 }, { "epoch": 0.056998049033027444, "grad_norm": 11.299648589087205, "learning_rate": 4.9993976876809026e-05, "loss": 2.1512, "mean_token_accuracy": 0.4379310369491577, "step": 56590 }, { "epoch": 0.05700308508613162, "grad_norm": 11.083467015914241, "learning_rate": 4.999396820396968e-05, "loss": 2.6386, "mean_token_accuracy": 0.33793103098869326, "step": 56595 }, { "epoch": 0.05700812113923579, "grad_norm": 10.61333249978888, "learning_rate": 4.999395952489156e-05, "loss": 2.4733, "mean_token_accuracy": 0.3965517163276672, "step": 56600 }, { "epoch": 0.057013157192339965, "grad_norm": 10.70603936225942, "learning_rate": 4.9993950839574645e-05, "loss": 2.744, "mean_token_accuracy": 0.35172414481639863, "step": 56605 }, { "epoch": 0.05701819324544413, "grad_norm": 12.308270656272272, "learning_rate": 4.9993942148018944e-05, "loss": 2.4612, "mean_token_accuracy": 0.3586206793785095, "step": 56610 }, { "epoch": 0.057023229298548306, "grad_norm": 9.368210681493663, "learning_rate": 4.9993933450224465e-05, "loss": 2.6723, "mean_token_accuracy": 0.3482758581638336, "step": 56615 }, { "epoch": 0.05702826535165248, "grad_norm": 15.327668951134902, "learning_rate": 4.999392474619122e-05, "loss": 2.6949, "mean_token_accuracy": 0.4172413766384125, "step": 56620 }, { "epoch": 0.05703330140475665, "grad_norm": 15.044080408132025, "learning_rate": 4.999391603591919e-05, "loss": 2.6487, "mean_token_accuracy": 0.38620689809322356, "step": 56625 }, { "epoch": 0.05703833745786083, "grad_norm": 14.224665452817053, "learning_rate": 4.999390731940839e-05, "loss": 2.9042, "mean_token_accuracy": 0.39655172228813174, "step": 56630 }, { "epoch": 0.057043373510965, "grad_norm": 11.596556895308801, "learning_rate": 4.999389859665883e-05, "loss": 2.4464, "mean_token_accuracy": 0.4206896543502808, "step": 56635 }, { "epoch": 0.057048409564069175, "grad_norm": 13.644686007689952, "learning_rate": 4.99938898676705e-05, "loss": 2.6181, "mean_token_accuracy": 0.4068965494632721, "step": 56640 }, { "epoch": 0.05705344561717334, "grad_norm": 10.98998780923295, "learning_rate": 4.999388113244339e-05, "loss": 2.5684, "mean_token_accuracy": 0.34137930870056155, "step": 56645 }, { "epoch": 0.057058481670277515, "grad_norm": 13.100379761807801, "learning_rate": 4.999387239097754e-05, "loss": 2.5429, "mean_token_accuracy": 0.42413792610168455, "step": 56650 }, { "epoch": 0.05706351772338169, "grad_norm": 12.669400216197241, "learning_rate": 4.9993863643272925e-05, "loss": 2.6447, "mean_token_accuracy": 0.4068965494632721, "step": 56655 }, { "epoch": 0.05706855377648586, "grad_norm": 9.991243376319659, "learning_rate": 4.999385488932955e-05, "loss": 2.5012, "mean_token_accuracy": 0.43103448748588563, "step": 56660 }, { "epoch": 0.057073589829590037, "grad_norm": 12.684805165409262, "learning_rate": 4.999384612914741e-05, "loss": 2.5898, "mean_token_accuracy": 0.4517241418361664, "step": 56665 }, { "epoch": 0.05707862588269421, "grad_norm": 14.863206011557716, "learning_rate": 4.9993837362726535e-05, "loss": 2.6497, "mean_token_accuracy": 0.38965517580509185, "step": 56670 }, { "epoch": 0.057083661935798384, "grad_norm": 9.54376402274776, "learning_rate": 4.99938285900669e-05, "loss": 2.8495, "mean_token_accuracy": 0.38451301455497744, "step": 56675 }, { "epoch": 0.05708869798890255, "grad_norm": 11.451032319023579, "learning_rate": 4.999381981116852e-05, "loss": 2.5374, "mean_token_accuracy": 0.44137930274009707, "step": 56680 }, { "epoch": 0.057093734042006725, "grad_norm": 12.017098853339334, "learning_rate": 4.99938110260314e-05, "loss": 2.8144, "mean_token_accuracy": 0.41911675930023196, "step": 56685 }, { "epoch": 0.0570987700951109, "grad_norm": 9.809498333264006, "learning_rate": 4.999380223465554e-05, "loss": 2.2108, "mean_token_accuracy": 0.5000000119209289, "step": 56690 }, { "epoch": 0.05710380614821507, "grad_norm": 13.157593622371417, "learning_rate": 4.999379343704092e-05, "loss": 2.4579, "mean_token_accuracy": 0.4020568668842316, "step": 56695 }, { "epoch": 0.057108842201319246, "grad_norm": 12.011618339410687, "learning_rate": 4.999378463318758e-05, "loss": 2.1722, "mean_token_accuracy": 0.47586206197738645, "step": 56700 }, { "epoch": 0.05711387825442342, "grad_norm": 11.168214245040689, "learning_rate": 4.99937758230955e-05, "loss": 2.5496, "mean_token_accuracy": 0.4206896543502808, "step": 56705 }, { "epoch": 0.05711891430752759, "grad_norm": 12.698882778783561, "learning_rate": 4.999376700676469e-05, "loss": 3.0421, "mean_token_accuracy": 0.3620689630508423, "step": 56710 }, { "epoch": 0.05712395036063176, "grad_norm": 14.221623990764911, "learning_rate": 4.9993758184195155e-05, "loss": 2.574, "mean_token_accuracy": 0.35862068831920624, "step": 56715 }, { "epoch": 0.057128986413735934, "grad_norm": 9.818318443324813, "learning_rate": 4.999374935538688e-05, "loss": 1.9802, "mean_token_accuracy": 0.5034482717514038, "step": 56720 }, { "epoch": 0.05713402246684011, "grad_norm": 10.501232626200357, "learning_rate": 4.999374052033988e-05, "loss": 2.5839, "mean_token_accuracy": 0.45517240166664125, "step": 56725 }, { "epoch": 0.05713905851994428, "grad_norm": 11.922148685766393, "learning_rate": 4.999373167905417e-05, "loss": 2.2856, "mean_token_accuracy": 0.42068966031074523, "step": 56730 }, { "epoch": 0.057144094573048455, "grad_norm": 13.573548065732542, "learning_rate": 4.9993722831529726e-05, "loss": 2.4923, "mean_token_accuracy": 0.41379310488700866, "step": 56735 }, { "epoch": 0.05714913062615263, "grad_norm": 9.992344237366302, "learning_rate": 4.999371397776658e-05, "loss": 2.6208, "mean_token_accuracy": 0.37241379022598264, "step": 56740 }, { "epoch": 0.0571541666792568, "grad_norm": 10.399378884825357, "learning_rate": 4.999370511776471e-05, "loss": 2.6868, "mean_token_accuracy": 0.40108892917633054, "step": 56745 }, { "epoch": 0.05715920273236097, "grad_norm": 10.276211302833515, "learning_rate": 4.999369625152412e-05, "loss": 2.289, "mean_token_accuracy": 0.4172413766384125, "step": 56750 }, { "epoch": 0.057164238785465143, "grad_norm": 11.692390152843773, "learning_rate": 4.9993687379044825e-05, "loss": 2.518, "mean_token_accuracy": 0.42068964838981626, "step": 56755 }, { "epoch": 0.05716927483856932, "grad_norm": 11.997247677473002, "learning_rate": 4.999367850032682e-05, "loss": 2.7736, "mean_token_accuracy": 0.37931033968925476, "step": 56760 }, { "epoch": 0.05717431089167349, "grad_norm": 14.042188805674451, "learning_rate": 4.999366961537011e-05, "loss": 2.3709, "mean_token_accuracy": 0.43103447556495667, "step": 56765 }, { "epoch": 0.057179346944777665, "grad_norm": 11.875428311547322, "learning_rate": 4.99936607241747e-05, "loss": 2.2407, "mean_token_accuracy": 0.4586206912994385, "step": 56770 }, { "epoch": 0.05718438299788184, "grad_norm": 12.342037824124963, "learning_rate": 4.999365182674059e-05, "loss": 2.5136, "mean_token_accuracy": 0.4000000059604645, "step": 56775 }, { "epoch": 0.05718941905098601, "grad_norm": 11.703384937812958, "learning_rate": 4.999364292306778e-05, "loss": 2.8979, "mean_token_accuracy": 0.3896551787853241, "step": 56780 }, { "epoch": 0.05719445510409018, "grad_norm": 18.73220542652447, "learning_rate": 4.999363401315627e-05, "loss": 2.4692, "mean_token_accuracy": 0.44827585816383364, "step": 56785 }, { "epoch": 0.05719949115719435, "grad_norm": 11.41014236617544, "learning_rate": 4.999362509700607e-05, "loss": 2.8347, "mean_token_accuracy": 0.34137930274009703, "step": 56790 }, { "epoch": 0.05720452721029853, "grad_norm": 11.296188985940232, "learning_rate": 4.9993616174617176e-05, "loss": 2.8115, "mean_token_accuracy": 0.36896551847457887, "step": 56795 }, { "epoch": 0.0572095632634027, "grad_norm": 12.720673603196303, "learning_rate": 4.99936072459896e-05, "loss": 2.7329, "mean_token_accuracy": 0.3896551787853241, "step": 56800 }, { "epoch": 0.057214599316506874, "grad_norm": 16.44234651430403, "learning_rate": 4.999359831112333e-05, "loss": 2.9081, "mean_token_accuracy": 0.3551724076271057, "step": 56805 }, { "epoch": 0.05721963536961105, "grad_norm": 11.070626955878454, "learning_rate": 4.9993589370018386e-05, "loss": 2.4931, "mean_token_accuracy": 0.37241379618644715, "step": 56810 }, { "epoch": 0.057224671422715215, "grad_norm": 12.989076161573143, "learning_rate": 4.999358042267476e-05, "loss": 2.8422, "mean_token_accuracy": 0.43611615896224976, "step": 56815 }, { "epoch": 0.05722970747581939, "grad_norm": 12.911740067532618, "learning_rate": 4.9993571469092446e-05, "loss": 2.6351, "mean_token_accuracy": 0.38620689511299133, "step": 56820 }, { "epoch": 0.05723474352892356, "grad_norm": 12.05249379890571, "learning_rate": 4.999356250927147e-05, "loss": 2.8662, "mean_token_accuracy": 0.3655172407627106, "step": 56825 }, { "epoch": 0.057239779582027736, "grad_norm": 13.663370097536497, "learning_rate": 4.999355354321181e-05, "loss": 2.4579, "mean_token_accuracy": 0.42589232325553894, "step": 56830 }, { "epoch": 0.05724481563513191, "grad_norm": 10.314765474430244, "learning_rate": 4.999354457091348e-05, "loss": 2.6578, "mean_token_accuracy": 0.4103448212146759, "step": 56835 }, { "epoch": 0.057249851688236084, "grad_norm": 12.31622455841909, "learning_rate": 4.999353559237649e-05, "loss": 2.5917, "mean_token_accuracy": 0.4103448212146759, "step": 56840 }, { "epoch": 0.05725488774134026, "grad_norm": 11.728756520791755, "learning_rate": 4.999352660760082e-05, "loss": 2.7015, "mean_token_accuracy": 0.42413792610168455, "step": 56845 }, { "epoch": 0.057259923794444424, "grad_norm": 11.259289049534395, "learning_rate": 4.9993517616586495e-05, "loss": 2.342, "mean_token_accuracy": 0.4310344815254211, "step": 56850 }, { "epoch": 0.0572649598475486, "grad_norm": 15.92954637707232, "learning_rate": 4.9993508619333507e-05, "loss": 2.656, "mean_token_accuracy": 0.37586206793785093, "step": 56855 }, { "epoch": 0.05726999590065277, "grad_norm": 10.858295477319485, "learning_rate": 4.999349961584186e-05, "loss": 2.585, "mean_token_accuracy": 0.4000000059604645, "step": 56860 }, { "epoch": 0.057275031953756946, "grad_norm": 16.730859815588158, "learning_rate": 4.9993490606111564e-05, "loss": 2.6695, "mean_token_accuracy": 0.4103448212146759, "step": 56865 }, { "epoch": 0.05728006800686112, "grad_norm": 12.761229812272866, "learning_rate": 4.9993481590142596e-05, "loss": 2.8642, "mean_token_accuracy": 0.3551724135875702, "step": 56870 }, { "epoch": 0.05728510405996529, "grad_norm": 11.078519546809535, "learning_rate": 4.9993472567935e-05, "loss": 2.5934, "mean_token_accuracy": 0.42758620381355283, "step": 56875 }, { "epoch": 0.05729014011306947, "grad_norm": 11.131810978330439, "learning_rate": 4.9993463539488735e-05, "loss": 2.8056, "mean_token_accuracy": 0.35862069129943847, "step": 56880 }, { "epoch": 0.057295176166173634, "grad_norm": 11.377291993784423, "learning_rate": 4.9993454504803836e-05, "loss": 2.8052, "mean_token_accuracy": 0.39655172228813174, "step": 56885 }, { "epoch": 0.05730021221927781, "grad_norm": 12.631874717573584, "learning_rate": 4.99934454638803e-05, "loss": 3.2904, "mean_token_accuracy": 0.3275862097740173, "step": 56890 }, { "epoch": 0.05730524827238198, "grad_norm": 12.074191134010285, "learning_rate": 4.9993436416718106e-05, "loss": 2.6655, "mean_token_accuracy": 0.38620689511299133, "step": 56895 }, { "epoch": 0.057310284325486155, "grad_norm": 11.418836705153414, "learning_rate": 4.999342736331729e-05, "loss": 2.4377, "mean_token_accuracy": 0.41379311084747317, "step": 56900 }, { "epoch": 0.05731532037859033, "grad_norm": 13.317354684661773, "learning_rate": 4.999341830367782e-05, "loss": 2.7542, "mean_token_accuracy": 0.37241379022598264, "step": 56905 }, { "epoch": 0.0573203564316945, "grad_norm": 11.29641471057222, "learning_rate": 4.999340923779973e-05, "loss": 2.5982, "mean_token_accuracy": 0.4551724076271057, "step": 56910 }, { "epoch": 0.057325392484798676, "grad_norm": 14.907122514986007, "learning_rate": 4.9993400165683014e-05, "loss": 2.7071, "mean_token_accuracy": 0.3931034505367279, "step": 56915 }, { "epoch": 0.05733042853790284, "grad_norm": 14.089748028717697, "learning_rate": 4.999339108732766e-05, "loss": 2.5909, "mean_token_accuracy": 0.43448275327682495, "step": 56920 }, { "epoch": 0.05733546459100702, "grad_norm": 11.361229564412609, "learning_rate": 4.999338200273368e-05, "loss": 2.779, "mean_token_accuracy": 0.37586206793785093, "step": 56925 }, { "epoch": 0.05734050064411119, "grad_norm": 14.691571966199357, "learning_rate": 4.999337291190107e-05, "loss": 2.5925, "mean_token_accuracy": 0.358620685338974, "step": 56930 }, { "epoch": 0.057345536697215364, "grad_norm": 12.8469023059474, "learning_rate": 4.999336381482985e-05, "loss": 2.4472, "mean_token_accuracy": 0.4275862067937851, "step": 56935 }, { "epoch": 0.05735057275031954, "grad_norm": 11.871279079412997, "learning_rate": 4.999335471152001e-05, "loss": 2.3841, "mean_token_accuracy": 0.41034482717514037, "step": 56940 }, { "epoch": 0.05735560880342371, "grad_norm": 10.799570568070731, "learning_rate": 4.999334560197156e-05, "loss": 2.8568, "mean_token_accuracy": 0.4068965494632721, "step": 56945 }, { "epoch": 0.057360644856527886, "grad_norm": 11.653328137726932, "learning_rate": 4.9993336486184484e-05, "loss": 2.5634, "mean_token_accuracy": 0.4000000059604645, "step": 56950 }, { "epoch": 0.05736568090963205, "grad_norm": 15.016177322586971, "learning_rate": 4.9993327364158805e-05, "loss": 2.3711, "mean_token_accuracy": 0.43793103396892546, "step": 56955 }, { "epoch": 0.057370716962736226, "grad_norm": 12.96952385783414, "learning_rate": 4.999331823589451e-05, "loss": 2.7216, "mean_token_accuracy": 0.38275861740112305, "step": 56960 }, { "epoch": 0.0573757530158404, "grad_norm": 12.860157160238321, "learning_rate": 4.999330910139162e-05, "loss": 2.8701, "mean_token_accuracy": 0.3655172407627106, "step": 56965 }, { "epoch": 0.057380789068944574, "grad_norm": 11.812576295148537, "learning_rate": 4.999329996065012e-05, "loss": 2.5399, "mean_token_accuracy": 0.42891712188720704, "step": 56970 }, { "epoch": 0.05738582512204875, "grad_norm": 12.062505926389953, "learning_rate": 4.999329081367002e-05, "loss": 2.1792, "mean_token_accuracy": 0.4482758641242981, "step": 56975 }, { "epoch": 0.05739086117515292, "grad_norm": 11.09370697907473, "learning_rate": 4.9993281660451324e-05, "loss": 2.4699, "mean_token_accuracy": 0.3999999940395355, "step": 56980 }, { "epoch": 0.057395897228257095, "grad_norm": 13.293259854541144, "learning_rate": 4.999327250099403e-05, "loss": 2.4625, "mean_token_accuracy": 0.4551724076271057, "step": 56985 }, { "epoch": 0.05740093328136126, "grad_norm": 12.8649436675316, "learning_rate": 4.999326333529815e-05, "loss": 2.5167, "mean_token_accuracy": 0.41034482717514037, "step": 56990 }, { "epoch": 0.057405969334465436, "grad_norm": 9.749384495903454, "learning_rate": 4.999325416336367e-05, "loss": 2.7343, "mean_token_accuracy": 0.40689654350280763, "step": 56995 }, { "epoch": 0.05741100538756961, "grad_norm": 15.035200528519873, "learning_rate": 4.99932449851906e-05, "loss": 3.0004, "mean_token_accuracy": 0.37241379022598264, "step": 57000 }, { "epoch": 0.05741604144067378, "grad_norm": 14.398701262274576, "learning_rate": 4.9993235800778955e-05, "loss": 2.6973, "mean_token_accuracy": 0.3310344755649567, "step": 57005 }, { "epoch": 0.05742107749377796, "grad_norm": 17.007631412929175, "learning_rate": 4.999322661012872e-05, "loss": 2.6348, "mean_token_accuracy": 0.3517241388559341, "step": 57010 }, { "epoch": 0.05742611354688213, "grad_norm": 11.797415074860707, "learning_rate": 4.9993217413239905e-05, "loss": 2.688, "mean_token_accuracy": 0.38130671381950376, "step": 57015 }, { "epoch": 0.057431149599986304, "grad_norm": 13.729055890320819, "learning_rate": 4.9993208210112516e-05, "loss": 2.2852, "mean_token_accuracy": 0.45862067937850953, "step": 57020 }, { "epoch": 0.05743618565309047, "grad_norm": 13.161836475681744, "learning_rate": 4.9993199000746546e-05, "loss": 2.6303, "mean_token_accuracy": 0.45517241954803467, "step": 57025 }, { "epoch": 0.057441221706194645, "grad_norm": 11.79899314016646, "learning_rate": 4.9993189785142016e-05, "loss": 2.7679, "mean_token_accuracy": 0.4, "step": 57030 }, { "epoch": 0.05744625775929882, "grad_norm": 13.946650389403713, "learning_rate": 4.9993180563298905e-05, "loss": 2.8332, "mean_token_accuracy": 0.3896551728248596, "step": 57035 }, { "epoch": 0.05745129381240299, "grad_norm": 11.48907705757755, "learning_rate": 4.9993171335217234e-05, "loss": 2.5309, "mean_token_accuracy": 0.44966726899147036, "step": 57040 }, { "epoch": 0.057456329865507166, "grad_norm": 11.972960574446068, "learning_rate": 4.999316210089698e-05, "loss": 2.6538, "mean_token_accuracy": 0.3915305495262146, "step": 57045 }, { "epoch": 0.05746136591861134, "grad_norm": 11.594613079123697, "learning_rate": 4.9993152860338185e-05, "loss": 2.6242, "mean_token_accuracy": 0.3827586233615875, "step": 57050 }, { "epoch": 0.057466401971715514, "grad_norm": 10.357912538196837, "learning_rate": 4.999314361354082e-05, "loss": 2.3195, "mean_token_accuracy": 0.4689655065536499, "step": 57055 }, { "epoch": 0.05747143802481968, "grad_norm": 11.731625652063055, "learning_rate": 4.99931343605049e-05, "loss": 2.7205, "mean_token_accuracy": 0.3482758641242981, "step": 57060 }, { "epoch": 0.057476474077923854, "grad_norm": 13.112582550156544, "learning_rate": 4.999312510123042e-05, "loss": 2.23, "mean_token_accuracy": 0.4398064225912094, "step": 57065 }, { "epoch": 0.05748151013102803, "grad_norm": 11.962309472100149, "learning_rate": 4.9993115835717395e-05, "loss": 2.7023, "mean_token_accuracy": 0.39655172228813174, "step": 57070 }, { "epoch": 0.0574865461841322, "grad_norm": 11.37899721418775, "learning_rate": 4.9993106563965816e-05, "loss": 2.5962, "mean_token_accuracy": 0.358620685338974, "step": 57075 }, { "epoch": 0.057491582237236376, "grad_norm": 12.192532695870286, "learning_rate": 4.999309728597569e-05, "loss": 2.5903, "mean_token_accuracy": 0.38965516686439516, "step": 57080 }, { "epoch": 0.05749661829034055, "grad_norm": 13.706159168820415, "learning_rate": 4.9993088001747024e-05, "loss": 2.7902, "mean_token_accuracy": 0.39655172228813174, "step": 57085 }, { "epoch": 0.05750165434344472, "grad_norm": 10.113394495890585, "learning_rate": 4.999307871127982e-05, "loss": 2.4097, "mean_token_accuracy": 0.42758620381355283, "step": 57090 }, { "epoch": 0.05750669039654889, "grad_norm": 10.613393503025039, "learning_rate": 4.9993069414574065e-05, "loss": 2.1837, "mean_token_accuracy": 0.4620689630508423, "step": 57095 }, { "epoch": 0.057511726449653064, "grad_norm": 12.050989443295737, "learning_rate": 4.999306011162978e-05, "loss": 2.6479, "mean_token_accuracy": 0.3965517163276672, "step": 57100 }, { "epoch": 0.05751676250275724, "grad_norm": 15.467484510326555, "learning_rate": 4.999305080244695e-05, "loss": 2.5776, "mean_token_accuracy": 0.36551723480224607, "step": 57105 }, { "epoch": 0.05752179855586141, "grad_norm": 12.342780615064195, "learning_rate": 4.99930414870256e-05, "loss": 2.7746, "mean_token_accuracy": 0.3896551728248596, "step": 57110 }, { "epoch": 0.057526834608965585, "grad_norm": 11.820305853203543, "learning_rate": 4.999303216536572e-05, "loss": 2.6576, "mean_token_accuracy": 0.37586206793785093, "step": 57115 }, { "epoch": 0.05753187066206976, "grad_norm": 13.260307017033362, "learning_rate": 4.999302283746731e-05, "loss": 2.5661, "mean_token_accuracy": 0.42068964838981626, "step": 57120 }, { "epoch": 0.05753690671517393, "grad_norm": 11.97019432492, "learning_rate": 4.9993013503330385e-05, "loss": 2.6747, "mean_token_accuracy": 0.38620689511299133, "step": 57125 }, { "epoch": 0.0575419427682781, "grad_norm": 13.20226543863521, "learning_rate": 4.9993004162954934e-05, "loss": 2.7406, "mean_token_accuracy": 0.29999999403953553, "step": 57130 }, { "epoch": 0.05754697882138227, "grad_norm": 13.67484690078131, "learning_rate": 4.999299481634096e-05, "loss": 2.8391, "mean_token_accuracy": 0.3965517282485962, "step": 57135 }, { "epoch": 0.05755201487448645, "grad_norm": 16.107649669539523, "learning_rate": 4.999298546348847e-05, "loss": 2.5779, "mean_token_accuracy": 0.39655172228813174, "step": 57140 }, { "epoch": 0.05755705092759062, "grad_norm": 9.318478617579295, "learning_rate": 4.999297610439747e-05, "loss": 2.5527, "mean_token_accuracy": 0.4034482717514038, "step": 57145 }, { "epoch": 0.057562086980694795, "grad_norm": 14.936182827666997, "learning_rate": 4.999296673906796e-05, "loss": 2.9974, "mean_token_accuracy": 0.36551724672317504, "step": 57150 }, { "epoch": 0.05756712303379897, "grad_norm": 10.308502224547889, "learning_rate": 4.9992957367499945e-05, "loss": 2.4932, "mean_token_accuracy": 0.4068965554237366, "step": 57155 }, { "epoch": 0.05757215908690314, "grad_norm": 11.885303757486795, "learning_rate": 4.9992947989693417e-05, "loss": 2.6451, "mean_token_accuracy": 0.4344827592372894, "step": 57160 }, { "epoch": 0.05757719514000731, "grad_norm": 12.399681148125469, "learning_rate": 4.999293860564838e-05, "loss": 3.4179, "mean_token_accuracy": 0.3493042945861816, "step": 57165 }, { "epoch": 0.05758223119311148, "grad_norm": 11.336888019123098, "learning_rate": 4.9992929215364855e-05, "loss": 2.6676, "mean_token_accuracy": 0.3655172407627106, "step": 57170 }, { "epoch": 0.057587267246215657, "grad_norm": 12.544787637386213, "learning_rate": 4.999291981884283e-05, "loss": 2.7191, "mean_token_accuracy": 0.41034482717514037, "step": 57175 }, { "epoch": 0.05759230329931983, "grad_norm": 9.074175836044176, "learning_rate": 4.999291041608231e-05, "loss": 2.6352, "mean_token_accuracy": 0.3586206823587418, "step": 57180 }, { "epoch": 0.057597339352424004, "grad_norm": 8.90452963227952, "learning_rate": 4.999290100708329e-05, "loss": 2.2349, "mean_token_accuracy": 0.4344827592372894, "step": 57185 }, { "epoch": 0.05760237540552818, "grad_norm": 12.793787761560177, "learning_rate": 4.999289159184579e-05, "loss": 2.4676, "mean_token_accuracy": 0.4034482717514038, "step": 57190 }, { "epoch": 0.05760741145863235, "grad_norm": 13.610230889253506, "learning_rate": 4.99928821703698e-05, "loss": 3.0442, "mean_token_accuracy": 0.334482753276825, "step": 57195 }, { "epoch": 0.05761244751173652, "grad_norm": 12.090811196471845, "learning_rate": 4.9992872742655324e-05, "loss": 2.6437, "mean_token_accuracy": 0.382758629322052, "step": 57200 }, { "epoch": 0.05761748356484069, "grad_norm": 15.814008938062297, "learning_rate": 4.999286330870236e-05, "loss": 2.3298, "mean_token_accuracy": 0.44482758045196535, "step": 57205 }, { "epoch": 0.057622519617944866, "grad_norm": 11.704183741350963, "learning_rate": 4.999285386851092e-05, "loss": 2.7125, "mean_token_accuracy": 0.36206896901130675, "step": 57210 }, { "epoch": 0.05762755567104904, "grad_norm": 9.948972792238973, "learning_rate": 4.999284442208101e-05, "loss": 2.3295, "mean_token_accuracy": 0.441379314661026, "step": 57215 }, { "epoch": 0.05763259172415321, "grad_norm": 11.496668419502647, "learning_rate": 4.999283496941262e-05, "loss": 2.3162, "mean_token_accuracy": 0.41379310488700866, "step": 57220 }, { "epoch": 0.05763762777725739, "grad_norm": 12.070797776477239, "learning_rate": 4.9992825510505756e-05, "loss": 2.4811, "mean_token_accuracy": 0.4068965554237366, "step": 57225 }, { "epoch": 0.05764266383036156, "grad_norm": 9.575953221627952, "learning_rate": 4.999281604536043e-05, "loss": 2.195, "mean_token_accuracy": 0.4413793087005615, "step": 57230 }, { "epoch": 0.05764769988346573, "grad_norm": 9.53784330188606, "learning_rate": 4.9992806573976633e-05, "loss": 2.3927, "mean_token_accuracy": 0.38620689511299133, "step": 57235 }, { "epoch": 0.0576527359365699, "grad_norm": 11.774543382035088, "learning_rate": 4.9992797096354374e-05, "loss": 2.8829, "mean_token_accuracy": 0.34482758641242983, "step": 57240 }, { "epoch": 0.057657771989674075, "grad_norm": 12.537331619660613, "learning_rate": 4.999278761249365e-05, "loss": 2.6089, "mean_token_accuracy": 0.39310344159603117, "step": 57245 }, { "epoch": 0.05766280804277825, "grad_norm": 11.2690024717912, "learning_rate": 4.9992778122394465e-05, "loss": 2.7197, "mean_token_accuracy": 0.4068965494632721, "step": 57250 }, { "epoch": 0.05766784409588242, "grad_norm": 15.50691234585414, "learning_rate": 4.999276862605683e-05, "loss": 2.5386, "mean_token_accuracy": 0.4068965554237366, "step": 57255 }, { "epoch": 0.0576728801489866, "grad_norm": 9.044296447289152, "learning_rate": 4.999275912348075e-05, "loss": 2.2877, "mean_token_accuracy": 0.4400483965873718, "step": 57260 }, { "epoch": 0.05767791620209077, "grad_norm": 13.047760548798507, "learning_rate": 4.99927496146662e-05, "loss": 2.5269, "mean_token_accuracy": 0.4103448331356049, "step": 57265 }, { "epoch": 0.05768295225519494, "grad_norm": 15.508047282207551, "learning_rate": 4.999274009961322e-05, "loss": 2.9094, "mean_token_accuracy": 0.4000000059604645, "step": 57270 }, { "epoch": 0.05768798830829911, "grad_norm": 20.046362997074926, "learning_rate": 4.9992730578321776e-05, "loss": 2.6688, "mean_token_accuracy": 0.3620689570903778, "step": 57275 }, { "epoch": 0.057693024361403285, "grad_norm": 13.237840251920288, "learning_rate": 4.99927210507919e-05, "loss": 3.0317, "mean_token_accuracy": 0.3551724135875702, "step": 57280 }, { "epoch": 0.05769806041450746, "grad_norm": 14.038161316368479, "learning_rate": 4.999271151702359e-05, "loss": 2.3832, "mean_token_accuracy": 0.42413793206214906, "step": 57285 }, { "epoch": 0.05770309646761163, "grad_norm": 13.342003811419074, "learning_rate": 4.999270197701684e-05, "loss": 3.2633, "mean_token_accuracy": 0.3310344785451889, "step": 57290 }, { "epoch": 0.057708132520715806, "grad_norm": 10.644195659735548, "learning_rate": 4.999269243077164e-05, "loss": 2.9996, "mean_token_accuracy": 0.35353902280330657, "step": 57295 }, { "epoch": 0.05771316857381998, "grad_norm": 11.76812814495945, "learning_rate": 4.999268287828803e-05, "loss": 2.5163, "mean_token_accuracy": 0.41724138259887694, "step": 57300 }, { "epoch": 0.05771820462692415, "grad_norm": 11.970856248790895, "learning_rate": 4.9992673319565974e-05, "loss": 2.5546, "mean_token_accuracy": 0.37586207389831544, "step": 57305 }, { "epoch": 0.05772324068002832, "grad_norm": 12.144989780344805, "learning_rate": 4.9992663754605504e-05, "loss": 2.8722, "mean_token_accuracy": 0.37241379618644715, "step": 57310 }, { "epoch": 0.057728276733132494, "grad_norm": 14.625796514674176, "learning_rate": 4.9992654183406595e-05, "loss": 2.6345, "mean_token_accuracy": 0.3999999940395355, "step": 57315 }, { "epoch": 0.05773331278623667, "grad_norm": 12.539052656071842, "learning_rate": 4.9992644605969275e-05, "loss": 2.7417, "mean_token_accuracy": 0.43793103098869324, "step": 57320 }, { "epoch": 0.05773834883934084, "grad_norm": 11.27053693874464, "learning_rate": 4.999263502229353e-05, "loss": 2.5806, "mean_token_accuracy": 0.4310344815254211, "step": 57325 }, { "epoch": 0.057743384892445015, "grad_norm": 12.801565739271933, "learning_rate": 4.999262543237937e-05, "loss": 2.6702, "mean_token_accuracy": 0.35517241060733795, "step": 57330 }, { "epoch": 0.05774842094554919, "grad_norm": 10.219399920074073, "learning_rate": 4.99926158362268e-05, "loss": 2.5418, "mean_token_accuracy": 0.43218390345573426, "step": 57335 }, { "epoch": 0.057753456998653356, "grad_norm": 11.451479382917173, "learning_rate": 4.999260623383582e-05, "loss": 2.1758, "mean_token_accuracy": 0.45862069725990295, "step": 57340 }, { "epoch": 0.05775849305175753, "grad_norm": 11.395144014385595, "learning_rate": 4.9992596625206425e-05, "loss": 2.5118, "mean_token_accuracy": 0.3896551728248596, "step": 57345 }, { "epoch": 0.057763529104861704, "grad_norm": 13.331076268662285, "learning_rate": 4.9992587010338624e-05, "loss": 3.0526, "mean_token_accuracy": 0.3758620619773865, "step": 57350 }, { "epoch": 0.05776856515796588, "grad_norm": 10.323330932600783, "learning_rate": 4.999257738923243e-05, "loss": 2.5068, "mean_token_accuracy": 0.43103447556495667, "step": 57355 }, { "epoch": 0.05777360121107005, "grad_norm": 12.252978462491669, "learning_rate": 4.999256776188783e-05, "loss": 2.4116, "mean_token_accuracy": 0.40000000298023225, "step": 57360 }, { "epoch": 0.057778637264174225, "grad_norm": 11.888449251354396, "learning_rate": 4.9992558128304836e-05, "loss": 2.0593, "mean_token_accuracy": 0.4482758641242981, "step": 57365 }, { "epoch": 0.0577836733172784, "grad_norm": 11.733392916596229, "learning_rate": 4.9992548488483444e-05, "loss": 2.3107, "mean_token_accuracy": 0.4498487591743469, "step": 57370 }, { "epoch": 0.057788709370382566, "grad_norm": 17.838861536484373, "learning_rate": 4.999253884242366e-05, "loss": 2.8911, "mean_token_accuracy": 0.35923645198345183, "step": 57375 }, { "epoch": 0.05779374542348674, "grad_norm": 11.747631666976154, "learning_rate": 4.999252919012549e-05, "loss": 2.7115, "mean_token_accuracy": 0.39999999701976774, "step": 57380 }, { "epoch": 0.05779878147659091, "grad_norm": 11.147413750359455, "learning_rate": 4.9992519531588924e-05, "loss": 2.4241, "mean_token_accuracy": 0.403448274731636, "step": 57385 }, { "epoch": 0.05780381752969509, "grad_norm": 10.121058995504669, "learning_rate": 4.999250986681398e-05, "loss": 2.4734, "mean_token_accuracy": 0.41379311084747317, "step": 57390 }, { "epoch": 0.05780885358279926, "grad_norm": 12.908633713141727, "learning_rate": 4.999250019580065e-05, "loss": 2.8051, "mean_token_accuracy": 0.41034482717514037, "step": 57395 }, { "epoch": 0.057813889635903434, "grad_norm": 12.442985635396475, "learning_rate": 4.9992490518548954e-05, "loss": 2.8611, "mean_token_accuracy": 0.317241370677948, "step": 57400 }, { "epoch": 0.05781892568900761, "grad_norm": 11.799025195749557, "learning_rate": 4.999248083505886e-05, "loss": 2.2785, "mean_token_accuracy": 0.4413793087005615, "step": 57405 }, { "epoch": 0.057823961742111775, "grad_norm": 12.040343127760002, "learning_rate": 4.999247114533041e-05, "loss": 2.8043, "mean_token_accuracy": 0.36551724672317504, "step": 57410 }, { "epoch": 0.05782899779521595, "grad_norm": 11.121771006601772, "learning_rate": 4.999246144936359e-05, "loss": 2.2651, "mean_token_accuracy": 0.4, "step": 57415 }, { "epoch": 0.05783403384832012, "grad_norm": 12.317496100583458, "learning_rate": 4.999245174715839e-05, "loss": 2.2893, "mean_token_accuracy": 0.47241380214691164, "step": 57420 }, { "epoch": 0.057839069901424296, "grad_norm": 13.761624120829946, "learning_rate": 4.9992442038714834e-05, "loss": 2.4881, "mean_token_accuracy": 0.4, "step": 57425 }, { "epoch": 0.05784410595452847, "grad_norm": 15.254943869277371, "learning_rate": 4.9992432324032915e-05, "loss": 2.9177, "mean_token_accuracy": 0.33448275923728943, "step": 57430 }, { "epoch": 0.057849142007632644, "grad_norm": 14.307423581770168, "learning_rate": 4.9992422603112625e-05, "loss": 2.3086, "mean_token_accuracy": 0.4592364549636841, "step": 57435 }, { "epoch": 0.05785417806073682, "grad_norm": 13.165139153254296, "learning_rate": 4.9992412875953985e-05, "loss": 2.9389, "mean_token_accuracy": 0.3620689630508423, "step": 57440 }, { "epoch": 0.057859214113840984, "grad_norm": 12.932970189358256, "learning_rate": 4.9992403142557e-05, "loss": 2.6163, "mean_token_accuracy": 0.4172413766384125, "step": 57445 }, { "epoch": 0.05786425016694516, "grad_norm": 14.332783040365282, "learning_rate": 4.999239340292165e-05, "loss": 2.3578, "mean_token_accuracy": 0.4379310369491577, "step": 57450 }, { "epoch": 0.05786928622004933, "grad_norm": 10.263400752145794, "learning_rate": 4.9992383657047956e-05, "loss": 2.3915, "mean_token_accuracy": 0.44482759237289426, "step": 57455 }, { "epoch": 0.057874322273153506, "grad_norm": 11.723888353251912, "learning_rate": 4.9992373904935915e-05, "loss": 2.4145, "mean_token_accuracy": 0.4206896543502808, "step": 57460 }, { "epoch": 0.05787935832625768, "grad_norm": 13.521375727847918, "learning_rate": 4.9992364146585524e-05, "loss": 2.8984, "mean_token_accuracy": 0.34482758641242983, "step": 57465 }, { "epoch": 0.05788439437936185, "grad_norm": 10.161957143438055, "learning_rate": 4.9992354381996796e-05, "loss": 2.2156, "mean_token_accuracy": 0.4534785270690918, "step": 57470 }, { "epoch": 0.05788943043246603, "grad_norm": 12.792552184339858, "learning_rate": 4.999234461116973e-05, "loss": 2.7206, "mean_token_accuracy": 0.36896551549434664, "step": 57475 }, { "epoch": 0.057894466485570194, "grad_norm": 15.32101753239313, "learning_rate": 4.999233483410433e-05, "loss": 2.6226, "mean_token_accuracy": 0.358620685338974, "step": 57480 }, { "epoch": 0.05789950253867437, "grad_norm": 10.854834194514783, "learning_rate": 4.99923250508006e-05, "loss": 2.6908, "mean_token_accuracy": 0.36896551251411436, "step": 57485 }, { "epoch": 0.05790453859177854, "grad_norm": 11.283132763968833, "learning_rate": 4.999231526125854e-05, "loss": 2.6612, "mean_token_accuracy": 0.4137930989265442, "step": 57490 }, { "epoch": 0.057909574644882715, "grad_norm": 12.346559381771973, "learning_rate": 4.9992305465478146e-05, "loss": 2.4768, "mean_token_accuracy": 0.3862069010734558, "step": 57495 }, { "epoch": 0.05791461069798689, "grad_norm": 11.681939082177314, "learning_rate": 4.999229566345943e-05, "loss": 2.0069, "mean_token_accuracy": 0.501477837562561, "step": 57500 }, { "epoch": 0.05791964675109106, "grad_norm": 14.769353369377393, "learning_rate": 4.9992285855202395e-05, "loss": 2.7215, "mean_token_accuracy": 0.3931034505367279, "step": 57505 }, { "epoch": 0.057924682804195236, "grad_norm": 12.2244556083009, "learning_rate": 4.999227604070703e-05, "loss": 2.4174, "mean_token_accuracy": 0.4310344815254211, "step": 57510 }, { "epoch": 0.0579297188572994, "grad_norm": 10.291547084749382, "learning_rate": 4.999226621997336e-05, "loss": 2.3936, "mean_token_accuracy": 0.45517241954803467, "step": 57515 }, { "epoch": 0.05793475491040358, "grad_norm": 12.48981549632761, "learning_rate": 4.999225639300137e-05, "loss": 2.8655, "mean_token_accuracy": 0.3793103456497192, "step": 57520 }, { "epoch": 0.05793979096350775, "grad_norm": 11.235521559472508, "learning_rate": 4.9992246559791076e-05, "loss": 2.529, "mean_token_accuracy": 0.37586206793785093, "step": 57525 }, { "epoch": 0.057944827016611924, "grad_norm": 12.106284504725325, "learning_rate": 4.9992236720342464e-05, "loss": 2.6062, "mean_token_accuracy": 0.3827586263418198, "step": 57530 }, { "epoch": 0.0579498630697161, "grad_norm": 11.635058009067654, "learning_rate": 4.999222687465555e-05, "loss": 2.6488, "mean_token_accuracy": 0.3620689630508423, "step": 57535 }, { "epoch": 0.05795489912282027, "grad_norm": 13.136728557414289, "learning_rate": 4.999221702273034e-05, "loss": 2.9238, "mean_token_accuracy": 0.324137932062149, "step": 57540 }, { "epoch": 0.057959935175924446, "grad_norm": 12.273208997040715, "learning_rate": 4.9992207164566824e-05, "loss": 3.92, "mean_token_accuracy": 0.2827586218714714, "step": 57545 }, { "epoch": 0.05796497122902861, "grad_norm": 16.738844625018178, "learning_rate": 4.999219730016501e-05, "loss": 2.3522, "mean_token_accuracy": 0.4068965494632721, "step": 57550 }, { "epoch": 0.057970007282132786, "grad_norm": 10.973796224215365, "learning_rate": 4.999218742952489e-05, "loss": 3.0449, "mean_token_accuracy": 0.358620685338974, "step": 57555 }, { "epoch": 0.05797504333523696, "grad_norm": 11.081445871501213, "learning_rate": 4.999217755264649e-05, "loss": 2.1733, "mean_token_accuracy": 0.4813067138195038, "step": 57560 }, { "epoch": 0.057980079388341134, "grad_norm": 16.197548411130477, "learning_rate": 4.99921676695298e-05, "loss": 2.9741, "mean_token_accuracy": 0.38620689511299133, "step": 57565 }, { "epoch": 0.05798511544144531, "grad_norm": 13.843367774552254, "learning_rate": 4.999215778017482e-05, "loss": 2.3978, "mean_token_accuracy": 0.43303084969520567, "step": 57570 }, { "epoch": 0.05799015149454948, "grad_norm": 15.754287428143902, "learning_rate": 4.999214788458156e-05, "loss": 2.655, "mean_token_accuracy": 0.41379311084747317, "step": 57575 }, { "epoch": 0.057995187547653655, "grad_norm": 11.91576056316204, "learning_rate": 4.999213798275002e-05, "loss": 2.6878, "mean_token_accuracy": 0.36206896901130675, "step": 57580 }, { "epoch": 0.05800022360075782, "grad_norm": 14.166664682472806, "learning_rate": 4.99921280746802e-05, "loss": 2.5356, "mean_token_accuracy": 0.3793103456497192, "step": 57585 }, { "epoch": 0.058005259653861996, "grad_norm": 21.730674564662785, "learning_rate": 4.9992118160372103e-05, "loss": 2.8743, "mean_token_accuracy": 0.40344826877117157, "step": 57590 }, { "epoch": 0.05801029570696617, "grad_norm": 13.078804639144298, "learning_rate": 4.999210823982573e-05, "loss": 2.8637, "mean_token_accuracy": 0.3790078639984131, "step": 57595 }, { "epoch": 0.05801533176007034, "grad_norm": 12.663240104256626, "learning_rate": 4.999209831304109e-05, "loss": 2.5151, "mean_token_accuracy": 0.4103448212146759, "step": 57600 }, { "epoch": 0.05802036781317452, "grad_norm": 9.719288688943896, "learning_rate": 4.999208838001818e-05, "loss": 2.4989, "mean_token_accuracy": 0.4, "step": 57605 }, { "epoch": 0.05802540386627869, "grad_norm": 11.700511829390239, "learning_rate": 4.9992078440757015e-05, "loss": 2.5765, "mean_token_accuracy": 0.44137930274009707, "step": 57610 }, { "epoch": 0.058030439919382865, "grad_norm": 13.876858200714418, "learning_rate": 4.999206849525759e-05, "loss": 2.8379, "mean_token_accuracy": 0.36896551847457887, "step": 57615 }, { "epoch": 0.05803547597248703, "grad_norm": 12.327747228256067, "learning_rate": 4.999205854351989e-05, "loss": 2.8716, "mean_token_accuracy": 0.38965516686439516, "step": 57620 }, { "epoch": 0.058040512025591205, "grad_norm": 17.440115893457662, "learning_rate": 4.999204858554394e-05, "loss": 2.5576, "mean_token_accuracy": 0.43103447556495667, "step": 57625 }, { "epoch": 0.05804554807869538, "grad_norm": 12.300303390321904, "learning_rate": 4.999203862132974e-05, "loss": 3.0254, "mean_token_accuracy": 0.33448275923728943, "step": 57630 }, { "epoch": 0.05805058413179955, "grad_norm": 10.964151359838722, "learning_rate": 4.999202865087729e-05, "loss": 2.3905, "mean_token_accuracy": 0.46896551847457885, "step": 57635 }, { "epoch": 0.058055620184903726, "grad_norm": 9.47427538400915, "learning_rate": 4.9992018674186585e-05, "loss": 2.6198, "mean_token_accuracy": 0.4362976372241974, "step": 57640 }, { "epoch": 0.0580606562380079, "grad_norm": 15.123975525876906, "learning_rate": 4.999200869125764e-05, "loss": 2.4058, "mean_token_accuracy": 0.47241379618644713, "step": 57645 }, { "epoch": 0.058065692291112074, "grad_norm": 14.146877072745678, "learning_rate": 4.999199870209045e-05, "loss": 2.6921, "mean_token_accuracy": 0.3482758581638336, "step": 57650 }, { "epoch": 0.05807072834421624, "grad_norm": 10.985118527789368, "learning_rate": 4.9991988706685015e-05, "loss": 2.4065, "mean_token_accuracy": 0.41034482717514037, "step": 57655 }, { "epoch": 0.058075764397320415, "grad_norm": 13.4627334283685, "learning_rate": 4.999197870504136e-05, "loss": 2.9795, "mean_token_accuracy": 0.30689655244350433, "step": 57660 }, { "epoch": 0.05808080045042459, "grad_norm": 12.403825298055832, "learning_rate": 4.999196869715945e-05, "loss": 3.1009, "mean_token_accuracy": 0.33448275923728943, "step": 57665 }, { "epoch": 0.05808583650352876, "grad_norm": 10.995858145357698, "learning_rate": 4.999195868303932e-05, "loss": 2.2313, "mean_token_accuracy": 0.45172414779663084, "step": 57670 }, { "epoch": 0.058090872556632936, "grad_norm": 10.302747809018088, "learning_rate": 4.9991948662680966e-05, "loss": 2.3699, "mean_token_accuracy": 0.43103448748588563, "step": 57675 }, { "epoch": 0.05809590860973711, "grad_norm": 10.87509910651572, "learning_rate": 4.9991938636084374e-05, "loss": 2.2866, "mean_token_accuracy": 0.4034482777118683, "step": 57680 }, { "epoch": 0.05810094466284128, "grad_norm": 12.901040367542265, "learning_rate": 4.999192860324957e-05, "loss": 2.4967, "mean_token_accuracy": 0.3586206942796707, "step": 57685 }, { "epoch": 0.05810598071594545, "grad_norm": 10.36413112704375, "learning_rate": 4.9991918564176534e-05, "loss": 2.7469, "mean_token_accuracy": 0.4068965494632721, "step": 57690 }, { "epoch": 0.058111016769049624, "grad_norm": 14.56691083345873, "learning_rate": 4.999190851886529e-05, "loss": 2.3583, "mean_token_accuracy": 0.43793103098869324, "step": 57695 }, { "epoch": 0.0581160528221538, "grad_norm": 12.067074123999799, "learning_rate": 4.999189846731583e-05, "loss": 2.4572, "mean_token_accuracy": 0.42758620381355283, "step": 57700 }, { "epoch": 0.05812108887525797, "grad_norm": 10.259203268914284, "learning_rate": 4.999188840952816e-05, "loss": 2.8742, "mean_token_accuracy": 0.35862068831920624, "step": 57705 }, { "epoch": 0.058126124928362145, "grad_norm": 10.825345728105177, "learning_rate": 4.999187834550228e-05, "loss": 2.3084, "mean_token_accuracy": 0.4413793087005615, "step": 57710 }, { "epoch": 0.05813116098146632, "grad_norm": 12.591825416151927, "learning_rate": 4.9991868275238176e-05, "loss": 2.5617, "mean_token_accuracy": 0.4, "step": 57715 }, { "epoch": 0.05813619703457049, "grad_norm": 12.088768302424826, "learning_rate": 4.9991858198735895e-05, "loss": 2.7255, "mean_token_accuracy": 0.38275861740112305, "step": 57720 }, { "epoch": 0.05814123308767466, "grad_norm": 10.785434828475342, "learning_rate": 4.99918481159954e-05, "loss": 2.3296, "mean_token_accuracy": 0.37586206793785093, "step": 57725 }, { "epoch": 0.05814626914077883, "grad_norm": 10.54794844790711, "learning_rate": 4.999183802701671e-05, "loss": 2.9022, "mean_token_accuracy": 0.38620688319206237, "step": 57730 }, { "epoch": 0.05815130519388301, "grad_norm": 16.51180580278284, "learning_rate": 4.999182793179982e-05, "loss": 2.2915, "mean_token_accuracy": 0.4206896543502808, "step": 57735 }, { "epoch": 0.05815634124698718, "grad_norm": 28.158845464716098, "learning_rate": 4.999181783034474e-05, "loss": 2.8655, "mean_token_accuracy": 0.42758621871471403, "step": 57740 }, { "epoch": 0.058161377300091355, "grad_norm": 10.648634792828563, "learning_rate": 4.999180772265148e-05, "loss": 2.5358, "mean_token_accuracy": 0.41203871965408323, "step": 57745 }, { "epoch": 0.05816641335319553, "grad_norm": 13.731288030418478, "learning_rate": 4.999179760872002e-05, "loss": 2.4154, "mean_token_accuracy": 0.42068964838981626, "step": 57750 }, { "epoch": 0.0581714494062997, "grad_norm": 13.008247167496366, "learning_rate": 4.9991787488550385e-05, "loss": 2.3941, "mean_token_accuracy": 0.4328493714332581, "step": 57755 }, { "epoch": 0.05817648545940387, "grad_norm": 17.049905067474427, "learning_rate": 4.999177736214256e-05, "loss": 2.2495, "mean_token_accuracy": 0.4689655125141144, "step": 57760 }, { "epoch": 0.05818152151250804, "grad_norm": 15.982037971520343, "learning_rate": 4.999176722949656e-05, "loss": 3.2155, "mean_token_accuracy": 0.31724137961864474, "step": 57765 }, { "epoch": 0.05818655756561222, "grad_norm": 13.135824375298366, "learning_rate": 4.999175709061239e-05, "loss": 3.0663, "mean_token_accuracy": 0.37241379618644715, "step": 57770 }, { "epoch": 0.05819159361871639, "grad_norm": 12.334039478210656, "learning_rate": 4.9991746945490035e-05, "loss": 2.6428, "mean_token_accuracy": 0.3620689630508423, "step": 57775 }, { "epoch": 0.058196629671820564, "grad_norm": 13.47687592940928, "learning_rate": 4.999173679412952e-05, "loss": 2.7659, "mean_token_accuracy": 0.3551724076271057, "step": 57780 }, { "epoch": 0.05820166572492474, "grad_norm": 11.064908352222812, "learning_rate": 4.999172663653084e-05, "loss": 2.1654, "mean_token_accuracy": 0.49655171632766726, "step": 57785 }, { "epoch": 0.05820670177802891, "grad_norm": 14.19722561308654, "learning_rate": 4.999171647269399e-05, "loss": 2.5727, "mean_token_accuracy": 0.4, "step": 57790 }, { "epoch": 0.05821173783113308, "grad_norm": 13.1108835999998, "learning_rate": 4.999170630261898e-05, "loss": 2.6319, "mean_token_accuracy": 0.4068965494632721, "step": 57795 }, { "epoch": 0.05821677388423725, "grad_norm": 11.90530903715145, "learning_rate": 4.999169612630581e-05, "loss": 2.2424, "mean_token_accuracy": 0.4275861978530884, "step": 57800 }, { "epoch": 0.058221809937341426, "grad_norm": 16.733961714294786, "learning_rate": 4.999168594375449e-05, "loss": 2.3704, "mean_token_accuracy": 0.398064124584198, "step": 57805 }, { "epoch": 0.0582268459904456, "grad_norm": 11.639555534171263, "learning_rate": 4.999167575496501e-05, "loss": 2.6935, "mean_token_accuracy": 0.3517241418361664, "step": 57810 }, { "epoch": 0.058231882043549774, "grad_norm": 11.754040016050185, "learning_rate": 4.999166555993738e-05, "loss": 2.9862, "mean_token_accuracy": 0.3551724165678024, "step": 57815 }, { "epoch": 0.05823691809665395, "grad_norm": 12.054007186036795, "learning_rate": 4.9991655358671603e-05, "loss": 2.6064, "mean_token_accuracy": 0.4103448212146759, "step": 57820 }, { "epoch": 0.05824195414975812, "grad_norm": 16.254996743767766, "learning_rate": 4.999164515116769e-05, "loss": 2.3429, "mean_token_accuracy": 0.45517241954803467, "step": 57825 }, { "epoch": 0.05824699020286229, "grad_norm": 13.008722324888442, "learning_rate": 4.999163493742563e-05, "loss": 2.9736, "mean_token_accuracy": 0.32413792610168457, "step": 57830 }, { "epoch": 0.05825202625596646, "grad_norm": 9.588482074195866, "learning_rate": 4.999162471744543e-05, "loss": 2.6122, "mean_token_accuracy": 0.4034482717514038, "step": 57835 }, { "epoch": 0.058257062309070635, "grad_norm": 10.654409263734909, "learning_rate": 4.999161449122709e-05, "loss": 2.6545, "mean_token_accuracy": 0.4379310429096222, "step": 57840 }, { "epoch": 0.05826209836217481, "grad_norm": 13.525322120422883, "learning_rate": 4.999160425877063e-05, "loss": 2.3582, "mean_token_accuracy": 0.4310344815254211, "step": 57845 }, { "epoch": 0.05826713441527898, "grad_norm": 11.672146926890395, "learning_rate": 4.9991594020076024e-05, "loss": 2.5143, "mean_token_accuracy": 0.4172413766384125, "step": 57850 }, { "epoch": 0.05827217046838316, "grad_norm": 13.544392515386573, "learning_rate": 4.9991583775143294e-05, "loss": 2.0403, "mean_token_accuracy": 0.5103448152542114, "step": 57855 }, { "epoch": 0.05827720652148733, "grad_norm": 12.36115802947356, "learning_rate": 4.9991573523972444e-05, "loss": 2.4344, "mean_token_accuracy": 0.4137930989265442, "step": 57860 }, { "epoch": 0.0582822425745915, "grad_norm": 15.13027655154658, "learning_rate": 4.999156326656347e-05, "loss": 2.531, "mean_token_accuracy": 0.3758620619773865, "step": 57865 }, { "epoch": 0.05828727862769567, "grad_norm": 15.975172065166893, "learning_rate": 4.999155300291638e-05, "loss": 2.9955, "mean_token_accuracy": 0.31724137663841245, "step": 57870 }, { "epoch": 0.058292314680799845, "grad_norm": 14.975872428801766, "learning_rate": 4.9991542733031175e-05, "loss": 2.8214, "mean_token_accuracy": 0.3517241418361664, "step": 57875 }, { "epoch": 0.05829735073390402, "grad_norm": 20.413676515678013, "learning_rate": 4.9991532456907846e-05, "loss": 3.0033, "mean_token_accuracy": 0.3655172437429428, "step": 57880 }, { "epoch": 0.05830238678700819, "grad_norm": 10.05592673239935, "learning_rate": 4.999152217454642e-05, "loss": 2.403, "mean_token_accuracy": 0.41724138259887694, "step": 57885 }, { "epoch": 0.058307422840112366, "grad_norm": 14.11237520890565, "learning_rate": 4.9991511885946875e-05, "loss": 2.5631, "mean_token_accuracy": 0.42758620977401735, "step": 57890 }, { "epoch": 0.05831245889321654, "grad_norm": 14.223319435263825, "learning_rate": 4.999150159110923e-05, "loss": 3.6761, "mean_token_accuracy": 0.2965517222881317, "step": 57895 }, { "epoch": 0.05831749494632071, "grad_norm": 20.94054315704241, "learning_rate": 4.9991491290033486e-05, "loss": 2.7996, "mean_token_accuracy": 0.3896551787853241, "step": 57900 }, { "epoch": 0.05832253099942488, "grad_norm": 10.271261499703916, "learning_rate": 4.999148098271963e-05, "loss": 2.8756, "mean_token_accuracy": 0.38620689511299133, "step": 57905 }, { "epoch": 0.058327567052529054, "grad_norm": 12.064438465710602, "learning_rate": 4.99914706691677e-05, "loss": 2.5901, "mean_token_accuracy": 0.42413793206214906, "step": 57910 }, { "epoch": 0.05833260310563323, "grad_norm": 12.995473435766087, "learning_rate": 4.999146034937766e-05, "loss": 2.5431, "mean_token_accuracy": 0.36551723778247835, "step": 57915 }, { "epoch": 0.0583376391587374, "grad_norm": 13.863187038851837, "learning_rate": 4.999145002334954e-05, "loss": 2.329, "mean_token_accuracy": 0.4137930989265442, "step": 57920 }, { "epoch": 0.058342675211841576, "grad_norm": 11.49930694142943, "learning_rate": 4.999143969108332e-05, "loss": 3.1369, "mean_token_accuracy": 0.31379310190677645, "step": 57925 }, { "epoch": 0.05834771126494575, "grad_norm": 10.141836428712791, "learning_rate": 4.9991429352579026e-05, "loss": 2.7194, "mean_token_accuracy": 0.38747731447219846, "step": 57930 }, { "epoch": 0.058352747318049916, "grad_norm": 16.391698204565703, "learning_rate": 4.999141900783665e-05, "loss": 2.6354, "mean_token_accuracy": 0.3965517163276672, "step": 57935 }, { "epoch": 0.05835778337115409, "grad_norm": 13.077879931804969, "learning_rate": 4.999140865685619e-05, "loss": 2.697, "mean_token_accuracy": 0.41379310488700866, "step": 57940 }, { "epoch": 0.058362819424258264, "grad_norm": 11.990448768023217, "learning_rate": 4.999139829963765e-05, "loss": 2.4792, "mean_token_accuracy": 0.39310344457626345, "step": 57945 }, { "epoch": 0.05836785547736244, "grad_norm": 12.498307081894962, "learning_rate": 4.9991387936181046e-05, "loss": 2.5416, "mean_token_accuracy": 0.41379310488700866, "step": 57950 }, { "epoch": 0.05837289153046661, "grad_norm": 10.477484880842788, "learning_rate": 4.9991377566486366e-05, "loss": 2.9785, "mean_token_accuracy": 0.37241379022598264, "step": 57955 }, { "epoch": 0.058377927583570785, "grad_norm": 15.894034534519928, "learning_rate": 4.999136719055363e-05, "loss": 2.6587, "mean_token_accuracy": 0.38965516686439516, "step": 57960 }, { "epoch": 0.05838296363667496, "grad_norm": 14.148774096539112, "learning_rate": 4.999135680838281e-05, "loss": 2.6312, "mean_token_accuracy": 0.39310344457626345, "step": 57965 }, { "epoch": 0.058387999689779126, "grad_norm": 10.181345441465206, "learning_rate": 4.999134641997394e-05, "loss": 2.7742, "mean_token_accuracy": 0.35862069129943847, "step": 57970 }, { "epoch": 0.0583930357428833, "grad_norm": 11.195069592652244, "learning_rate": 4.999133602532701e-05, "loss": 2.9507, "mean_token_accuracy": 0.358620685338974, "step": 57975 }, { "epoch": 0.05839807179598747, "grad_norm": 11.86939297354784, "learning_rate": 4.9991325624442025e-05, "loss": 2.7666, "mean_token_accuracy": 0.3724137842655182, "step": 57980 }, { "epoch": 0.05840310784909165, "grad_norm": 12.89826127961735, "learning_rate": 4.9991315217318986e-05, "loss": 2.7215, "mean_token_accuracy": 0.36896551847457887, "step": 57985 }, { "epoch": 0.05840814390219582, "grad_norm": 13.718473336137832, "learning_rate": 4.9991304803957894e-05, "loss": 2.7096, "mean_token_accuracy": 0.3620689630508423, "step": 57990 }, { "epoch": 0.058413179955299994, "grad_norm": 11.34960453657506, "learning_rate": 4.999129438435876e-05, "loss": 2.5112, "mean_token_accuracy": 0.42934059500694277, "step": 57995 }, { "epoch": 0.05841821600840417, "grad_norm": 11.598299525067583, "learning_rate": 4.9991283958521576e-05, "loss": 2.3874, "mean_token_accuracy": 0.43103447556495667, "step": 58000 }, { "epoch": 0.058423252061508335, "grad_norm": 9.250771122393951, "learning_rate": 4.9991273526446355e-05, "loss": 2.4969, "mean_token_accuracy": 0.39310344457626345, "step": 58005 }, { "epoch": 0.05842828811461251, "grad_norm": 16.090905288244663, "learning_rate": 4.9991263088133096e-05, "loss": 2.8318, "mean_token_accuracy": 0.3517241358757019, "step": 58010 }, { "epoch": 0.05843332416771668, "grad_norm": 11.153094155757385, "learning_rate": 4.999125264358179e-05, "loss": 2.4945, "mean_token_accuracy": 0.42413792610168455, "step": 58015 }, { "epoch": 0.058438360220820856, "grad_norm": 10.082462112520616, "learning_rate": 4.999124219279246e-05, "loss": 2.2097, "mean_token_accuracy": 0.41724138259887694, "step": 58020 }, { "epoch": 0.05844339627392503, "grad_norm": 11.85228843738946, "learning_rate": 4.99912317357651e-05, "loss": 2.772, "mean_token_accuracy": 0.37241379022598264, "step": 58025 }, { "epoch": 0.058448432327029204, "grad_norm": 11.813587319919813, "learning_rate": 4.9991221272499716e-05, "loss": 2.114, "mean_token_accuracy": 0.46551724076271056, "step": 58030 }, { "epoch": 0.05845346838013338, "grad_norm": 12.360024985062351, "learning_rate": 4.9991210802996305e-05, "loss": 2.6769, "mean_token_accuracy": 0.3896551728248596, "step": 58035 }, { "epoch": 0.058458504433237544, "grad_norm": 11.953774505596208, "learning_rate": 4.9991200327254876e-05, "loss": 2.5199, "mean_token_accuracy": 0.4, "step": 58040 }, { "epoch": 0.05846354048634172, "grad_norm": 14.770105386271911, "learning_rate": 4.999118984527542e-05, "loss": 3.0267, "mean_token_accuracy": 0.31034482419490816, "step": 58045 }, { "epoch": 0.05846857653944589, "grad_norm": 13.226518206451283, "learning_rate": 4.999117935705796e-05, "loss": 2.6368, "mean_token_accuracy": 0.4068965494632721, "step": 58050 }, { "epoch": 0.058473612592550066, "grad_norm": 11.8642711839088, "learning_rate": 4.999116886260247e-05, "loss": 2.6854, "mean_token_accuracy": 0.4, "step": 58055 }, { "epoch": 0.05847864864565424, "grad_norm": 13.057276474699377, "learning_rate": 4.999115836190899e-05, "loss": 2.7641, "mean_token_accuracy": 0.41203871965408323, "step": 58060 }, { "epoch": 0.05848368469875841, "grad_norm": 11.440452499953622, "learning_rate": 4.999114785497749e-05, "loss": 2.5918, "mean_token_accuracy": 0.3793103456497192, "step": 58065 }, { "epoch": 0.05848872075186259, "grad_norm": 10.868024124000197, "learning_rate": 4.9991137341807994e-05, "loss": 2.95, "mean_token_accuracy": 0.39310343861579894, "step": 58070 }, { "epoch": 0.058493756804966754, "grad_norm": 11.160758586213356, "learning_rate": 4.99911268224005e-05, "loss": 2.7233, "mean_token_accuracy": 0.4, "step": 58075 }, { "epoch": 0.05849879285807093, "grad_norm": 12.002629460198245, "learning_rate": 4.9991116296755e-05, "loss": 2.4347, "mean_token_accuracy": 0.4068965554237366, "step": 58080 }, { "epoch": 0.0585038289111751, "grad_norm": 12.41181055114698, "learning_rate": 4.999110576487151e-05, "loss": 2.7508, "mean_token_accuracy": 0.3620689570903778, "step": 58085 }, { "epoch": 0.058508864964279275, "grad_norm": 16.269076589220752, "learning_rate": 4.999109522675002e-05, "loss": 2.5703, "mean_token_accuracy": 0.3896551728248596, "step": 58090 }, { "epoch": 0.05851390101738345, "grad_norm": 12.44436747315813, "learning_rate": 4.9991084682390545e-05, "loss": 2.5524, "mean_token_accuracy": 0.3827586233615875, "step": 58095 }, { "epoch": 0.05851893707048762, "grad_norm": 17.237771108475272, "learning_rate": 4.999107413179309e-05, "loss": 2.8552, "mean_token_accuracy": 0.324137932062149, "step": 58100 }, { "epoch": 0.058523973123591796, "grad_norm": 13.402741473614277, "learning_rate": 4.999106357495764e-05, "loss": 2.6562, "mean_token_accuracy": 0.37931033968925476, "step": 58105 }, { "epoch": 0.05852900917669596, "grad_norm": 11.566139204000446, "learning_rate": 4.999105301188421e-05, "loss": 2.6592, "mean_token_accuracy": 0.3793103456497192, "step": 58110 }, { "epoch": 0.05853404522980014, "grad_norm": 10.331620575840399, "learning_rate": 4.999104244257282e-05, "loss": 2.5494, "mean_token_accuracy": 0.3931034505367279, "step": 58115 }, { "epoch": 0.05853908128290431, "grad_norm": 9.523379002408879, "learning_rate": 4.999103186702344e-05, "loss": 2.0584, "mean_token_accuracy": 0.5008620619773865, "step": 58120 }, { "epoch": 0.058544117336008485, "grad_norm": 14.473476228343763, "learning_rate": 4.9991021285236096e-05, "loss": 2.5162, "mean_token_accuracy": 0.417241370677948, "step": 58125 }, { "epoch": 0.05854915338911266, "grad_norm": 11.399740809464562, "learning_rate": 4.999101069721078e-05, "loss": 2.6497, "mean_token_accuracy": 0.3551724135875702, "step": 58130 }, { "epoch": 0.05855418944221683, "grad_norm": 12.546754776768667, "learning_rate": 4.999100010294749e-05, "loss": 2.5099, "mean_token_accuracy": 0.3620689630508423, "step": 58135 }, { "epoch": 0.058559225495321006, "grad_norm": 12.632080453992177, "learning_rate": 4.999098950244625e-05, "loss": 2.7654, "mean_token_accuracy": 0.4034482717514038, "step": 58140 }, { "epoch": 0.05856426154842517, "grad_norm": 12.261728765929192, "learning_rate": 4.9990978895707055e-05, "loss": 3.2115, "mean_token_accuracy": 0.341379314661026, "step": 58145 }, { "epoch": 0.058569297601529346, "grad_norm": 11.980163944109137, "learning_rate": 4.999096828272989e-05, "loss": 2.8505, "mean_token_accuracy": 0.3551724195480347, "step": 58150 }, { "epoch": 0.05857433365463352, "grad_norm": 14.016943305420114, "learning_rate": 4.9990957663514774e-05, "loss": 2.4265, "mean_token_accuracy": 0.43103447556495667, "step": 58155 }, { "epoch": 0.058579369707737694, "grad_norm": 12.126653259054368, "learning_rate": 4.999094703806171e-05, "loss": 2.5097, "mean_token_accuracy": 0.38965516686439516, "step": 58160 }, { "epoch": 0.05858440576084187, "grad_norm": 11.905316722668148, "learning_rate": 4.999093640637069e-05, "loss": 2.7617, "mean_token_accuracy": 0.36551723480224607, "step": 58165 }, { "epoch": 0.05858944181394604, "grad_norm": 12.682281901996925, "learning_rate": 4.999092576844173e-05, "loss": 2.2564, "mean_token_accuracy": 0.47931033968925474, "step": 58170 }, { "epoch": 0.058594477867050215, "grad_norm": 12.867172448281504, "learning_rate": 4.999091512427483e-05, "loss": 2.6203, "mean_token_accuracy": 0.3724137932062149, "step": 58175 }, { "epoch": 0.05859951392015438, "grad_norm": 9.872803945522755, "learning_rate": 4.999090447386999e-05, "loss": 2.246, "mean_token_accuracy": 0.41379311084747317, "step": 58180 }, { "epoch": 0.058604549973258556, "grad_norm": 13.591890382368568, "learning_rate": 4.999089381722722e-05, "loss": 2.828, "mean_token_accuracy": 0.3965517163276672, "step": 58185 }, { "epoch": 0.05860958602636273, "grad_norm": 12.799307227230408, "learning_rate": 4.99908831543465e-05, "loss": 2.4925, "mean_token_accuracy": 0.4379310369491577, "step": 58190 }, { "epoch": 0.0586146220794669, "grad_norm": 13.631473350222166, "learning_rate": 4.9990872485227865e-05, "loss": 2.4108, "mean_token_accuracy": 0.4344827651977539, "step": 58195 }, { "epoch": 0.05861965813257108, "grad_norm": 12.171327677650542, "learning_rate": 4.999086180987129e-05, "loss": 2.7575, "mean_token_accuracy": 0.37241379022598264, "step": 58200 }, { "epoch": 0.05862469418567525, "grad_norm": 11.03572367657223, "learning_rate": 4.9990851128276806e-05, "loss": 2.5818, "mean_token_accuracy": 0.4034482717514038, "step": 58205 }, { "epoch": 0.058629730238779425, "grad_norm": 11.281731532225864, "learning_rate": 4.9990840440444385e-05, "loss": 2.5378, "mean_token_accuracy": 0.44482757449150084, "step": 58210 }, { "epoch": 0.05863476629188359, "grad_norm": 11.412665405880785, "learning_rate": 4.9990829746374054e-05, "loss": 2.5639, "mean_token_accuracy": 0.43103448748588563, "step": 58215 }, { "epoch": 0.058639802344987765, "grad_norm": 11.682116446048864, "learning_rate": 4.999081904606581e-05, "loss": 2.6572, "mean_token_accuracy": 0.37586206793785093, "step": 58220 }, { "epoch": 0.05864483839809194, "grad_norm": 14.395120572704545, "learning_rate": 4.999080833951964e-05, "loss": 2.9651, "mean_token_accuracy": 0.3551724076271057, "step": 58225 }, { "epoch": 0.05864987445119611, "grad_norm": 12.194332282673388, "learning_rate": 4.9990797626735576e-05, "loss": 2.6309, "mean_token_accuracy": 0.4, "step": 58230 }, { "epoch": 0.05865491050430029, "grad_norm": 13.362911112583335, "learning_rate": 4.99907869077136e-05, "loss": 2.8832, "mean_token_accuracy": 0.32068965435028074, "step": 58235 }, { "epoch": 0.05865994655740446, "grad_norm": 21.273003475449, "learning_rate": 4.999077618245372e-05, "loss": 2.6794, "mean_token_accuracy": 0.4034482717514038, "step": 58240 }, { "epoch": 0.058664982610508634, "grad_norm": 12.925637928370348, "learning_rate": 4.999076545095594e-05, "loss": 2.5548, "mean_token_accuracy": 0.38620689511299133, "step": 58245 }, { "epoch": 0.0586700186636128, "grad_norm": 13.418241772915989, "learning_rate": 4.9990754713220265e-05, "loss": 2.9526, "mean_token_accuracy": 0.3413793116807938, "step": 58250 }, { "epoch": 0.058675054716716975, "grad_norm": 11.212893672015234, "learning_rate": 4.999074396924668e-05, "loss": 2.7394, "mean_token_accuracy": 0.38620689511299133, "step": 58255 }, { "epoch": 0.05868009076982115, "grad_norm": 13.179140378005286, "learning_rate": 4.9990733219035225e-05, "loss": 2.7069, "mean_token_accuracy": 0.37241379618644715, "step": 58260 }, { "epoch": 0.05868512682292532, "grad_norm": 12.43030802440109, "learning_rate": 4.9990722462585864e-05, "loss": 2.6006, "mean_token_accuracy": 0.36896551847457887, "step": 58265 }, { "epoch": 0.058690162876029496, "grad_norm": 14.2593349061398, "learning_rate": 4.999071169989862e-05, "loss": 2.6138, "mean_token_accuracy": 0.4068965494632721, "step": 58270 }, { "epoch": 0.05869519892913367, "grad_norm": 10.21676172874777, "learning_rate": 4.999070093097351e-05, "loss": 2.5327, "mean_token_accuracy": 0.4137930989265442, "step": 58275 }, { "epoch": 0.058700234982237844, "grad_norm": 15.391777245551568, "learning_rate": 4.9990690155810506e-05, "loss": 2.6252, "mean_token_accuracy": 0.41724138259887694, "step": 58280 }, { "epoch": 0.05870527103534201, "grad_norm": 15.132827438776575, "learning_rate": 4.999067937440963e-05, "loss": 2.8262, "mean_token_accuracy": 0.4034482777118683, "step": 58285 }, { "epoch": 0.058710307088446184, "grad_norm": 10.65237468266514, "learning_rate": 4.9990668586770874e-05, "loss": 2.1596, "mean_token_accuracy": 0.44482758045196535, "step": 58290 }, { "epoch": 0.05871534314155036, "grad_norm": 11.283660606828796, "learning_rate": 4.9990657792894245e-05, "loss": 2.6391, "mean_token_accuracy": 0.4172413766384125, "step": 58295 }, { "epoch": 0.05872037919465453, "grad_norm": 12.993867137727007, "learning_rate": 4.999064699277976e-05, "loss": 2.4458, "mean_token_accuracy": 0.4137930989265442, "step": 58300 }, { "epoch": 0.058725415247758705, "grad_norm": 14.4344808791683, "learning_rate": 4.99906361864274e-05, "loss": 2.7955, "mean_token_accuracy": 0.38275861740112305, "step": 58305 }, { "epoch": 0.05873045130086288, "grad_norm": 9.9734801806251, "learning_rate": 4.999062537383718e-05, "loss": 2.3838, "mean_token_accuracy": 0.42068964838981626, "step": 58310 }, { "epoch": 0.05873548735396705, "grad_norm": 12.078830607747605, "learning_rate": 4.99906145550091e-05, "loss": 3.6281, "mean_token_accuracy": 0.31724137961864474, "step": 58315 }, { "epoch": 0.05874052340707122, "grad_norm": 11.67002858074103, "learning_rate": 4.999060372994317e-05, "loss": 2.5244, "mean_token_accuracy": 0.4034482777118683, "step": 58320 }, { "epoch": 0.058745559460175394, "grad_norm": 14.060579593033284, "learning_rate": 4.9990592898639385e-05, "loss": 2.4573, "mean_token_accuracy": 0.4052026629447937, "step": 58325 }, { "epoch": 0.05875059551327957, "grad_norm": 10.370626652725292, "learning_rate": 4.9990582061097746e-05, "loss": 2.4551, "mean_token_accuracy": 0.4, "step": 58330 }, { "epoch": 0.05875563156638374, "grad_norm": 11.677830279831795, "learning_rate": 4.999057121731826e-05, "loss": 2.307, "mean_token_accuracy": 0.3965517163276672, "step": 58335 }, { "epoch": 0.058760667619487915, "grad_norm": 11.009148218948424, "learning_rate": 4.9990560367300936e-05, "loss": 2.8369, "mean_token_accuracy": 0.3827586203813553, "step": 58340 }, { "epoch": 0.05876570367259209, "grad_norm": 14.235536079942097, "learning_rate": 4.999054951104578e-05, "loss": 2.5526, "mean_token_accuracy": 0.4310344815254211, "step": 58345 }, { "epoch": 0.05877073972569626, "grad_norm": 10.210656930119285, "learning_rate": 4.999053864855277e-05, "loss": 2.4531, "mean_token_accuracy": 0.39655172228813174, "step": 58350 }, { "epoch": 0.05877577577880043, "grad_norm": 11.769400229235206, "learning_rate": 4.999052777982193e-05, "loss": 2.5959, "mean_token_accuracy": 0.4103448212146759, "step": 58355 }, { "epoch": 0.0587808118319046, "grad_norm": 12.651299092601084, "learning_rate": 4.999051690485326e-05, "loss": 2.4817, "mean_token_accuracy": 0.43793103098869324, "step": 58360 }, { "epoch": 0.05878584788500878, "grad_norm": 10.99216600684398, "learning_rate": 4.999050602364675e-05, "loss": 2.6773, "mean_token_accuracy": 0.38965516090393065, "step": 58365 }, { "epoch": 0.05879088393811295, "grad_norm": 12.900921248889595, "learning_rate": 4.9990495136202425e-05, "loss": 2.8438, "mean_token_accuracy": 0.3620689570903778, "step": 58370 }, { "epoch": 0.058795919991217124, "grad_norm": 12.818526582345942, "learning_rate": 4.999048424252028e-05, "loss": 3.0079, "mean_token_accuracy": 0.3448275923728943, "step": 58375 }, { "epoch": 0.0588009560443213, "grad_norm": 13.85222009221222, "learning_rate": 4.999047334260031e-05, "loss": 2.0867, "mean_token_accuracy": 0.4988505780696869, "step": 58380 }, { "epoch": 0.05880599209742547, "grad_norm": 13.26861068753513, "learning_rate": 4.9990462436442526e-05, "loss": 2.1906, "mean_token_accuracy": 0.4804187178611755, "step": 58385 }, { "epoch": 0.05881102815052964, "grad_norm": 16.18407443104682, "learning_rate": 4.9990451524046926e-05, "loss": 2.6036, "mean_token_accuracy": 0.37586206793785093, "step": 58390 }, { "epoch": 0.05881606420363381, "grad_norm": 10.280644257513085, "learning_rate": 4.999044060541352e-05, "loss": 2.5311, "mean_token_accuracy": 0.4068965494632721, "step": 58395 }, { "epoch": 0.058821100256737986, "grad_norm": 13.714916978082842, "learning_rate": 4.999042968054231e-05, "loss": 2.7961, "mean_token_accuracy": 0.36896551847457887, "step": 58400 }, { "epoch": 0.05882613630984216, "grad_norm": 12.940845277379792, "learning_rate": 4.9990418749433275e-05, "loss": 2.5207, "mean_token_accuracy": 0.39310344457626345, "step": 58405 }, { "epoch": 0.058831172362946334, "grad_norm": 14.326161257822992, "learning_rate": 4.999040781208646e-05, "loss": 2.7674, "mean_token_accuracy": 0.3724137932062149, "step": 58410 }, { "epoch": 0.05883620841605051, "grad_norm": 10.90764893764017, "learning_rate": 4.9990396868501835e-05, "loss": 2.5689, "mean_token_accuracy": 0.39310344457626345, "step": 58415 }, { "epoch": 0.05884124446915468, "grad_norm": 13.166823241845668, "learning_rate": 4.999038591867942e-05, "loss": 2.7041, "mean_token_accuracy": 0.4189352631568909, "step": 58420 }, { "epoch": 0.05884628052225885, "grad_norm": 16.672581592606456, "learning_rate": 4.999037496261921e-05, "loss": 2.729, "mean_token_accuracy": 0.39655172228813174, "step": 58425 }, { "epoch": 0.05885131657536302, "grad_norm": 10.810355758828537, "learning_rate": 4.999036400032121e-05, "loss": 3.0283, "mean_token_accuracy": 0.4068965524435043, "step": 58430 }, { "epoch": 0.058856352628467196, "grad_norm": 13.479095774881834, "learning_rate": 4.999035303178542e-05, "loss": 3.0113, "mean_token_accuracy": 0.35172414481639863, "step": 58435 }, { "epoch": 0.05886138868157137, "grad_norm": 8.664528726341905, "learning_rate": 4.999034205701185e-05, "loss": 2.1789, "mean_token_accuracy": 0.45517241954803467, "step": 58440 }, { "epoch": 0.05886642473467554, "grad_norm": 14.32364779979314, "learning_rate": 4.99903310760005e-05, "loss": 2.8923, "mean_token_accuracy": 0.36551724672317504, "step": 58445 }, { "epoch": 0.05887146078777972, "grad_norm": 11.928761527027048, "learning_rate": 4.999032008875138e-05, "loss": 2.8976, "mean_token_accuracy": 0.3689655244350433, "step": 58450 }, { "epoch": 0.05887649684088389, "grad_norm": 10.869123869897926, "learning_rate": 4.999030909526448e-05, "loss": 2.61, "mean_token_accuracy": 0.37241379022598264, "step": 58455 }, { "epoch": 0.05888153289398806, "grad_norm": 13.104803976666249, "learning_rate": 4.999029809553981e-05, "loss": 2.7761, "mean_token_accuracy": 0.4034482777118683, "step": 58460 }, { "epoch": 0.05888656894709223, "grad_norm": 13.322490311816996, "learning_rate": 4.999028708957737e-05, "loss": 2.4232, "mean_token_accuracy": 0.42413792610168455, "step": 58465 }, { "epoch": 0.058891605000196405, "grad_norm": 11.526170585399552, "learning_rate": 4.999027607737716e-05, "loss": 2.6289, "mean_token_accuracy": 0.3724137932062149, "step": 58470 }, { "epoch": 0.05889664105330058, "grad_norm": 12.589440447052223, "learning_rate": 4.9990265058939194e-05, "loss": 2.3249, "mean_token_accuracy": 0.47241378426551817, "step": 58475 }, { "epoch": 0.05890167710640475, "grad_norm": 9.612419431975118, "learning_rate": 4.9990254034263474e-05, "loss": 2.2433, "mean_token_accuracy": 0.4551724076271057, "step": 58480 }, { "epoch": 0.058906713159508926, "grad_norm": 10.977731267675512, "learning_rate": 4.999024300334999e-05, "loss": 2.492, "mean_token_accuracy": 0.3827586233615875, "step": 58485 }, { "epoch": 0.0589117492126131, "grad_norm": 12.074313522164669, "learning_rate": 4.999023196619876e-05, "loss": 2.871, "mean_token_accuracy": 0.38112522959709166, "step": 58490 }, { "epoch": 0.05891678526571727, "grad_norm": 11.14220313225627, "learning_rate": 4.999022092280977e-05, "loss": 2.0878, "mean_token_accuracy": 0.47931034564971925, "step": 58495 }, { "epoch": 0.05892182131882144, "grad_norm": 11.298115960860097, "learning_rate": 4.999020987318304e-05, "loss": 2.0852, "mean_token_accuracy": 0.4206896543502808, "step": 58500 }, { "epoch": 0.058926857371925614, "grad_norm": 11.542361760487763, "learning_rate": 4.9990198817318566e-05, "loss": 2.8396, "mean_token_accuracy": 0.3586206823587418, "step": 58505 }, { "epoch": 0.05893189342502979, "grad_norm": 11.342139309668278, "learning_rate": 4.999018775521635e-05, "loss": 2.8269, "mean_token_accuracy": 0.3758620649576187, "step": 58510 }, { "epoch": 0.05893692947813396, "grad_norm": 11.40022600407774, "learning_rate": 4.9990176686876394e-05, "loss": 2.5803, "mean_token_accuracy": 0.3551724135875702, "step": 58515 }, { "epoch": 0.058941965531238136, "grad_norm": 11.216281018400004, "learning_rate": 4.999016561229871e-05, "loss": 2.5116, "mean_token_accuracy": 0.41379310488700866, "step": 58520 }, { "epoch": 0.05894700158434231, "grad_norm": 11.6746912367142, "learning_rate": 4.999015453148329e-05, "loss": 2.1654, "mean_token_accuracy": 0.4482758641242981, "step": 58525 }, { "epoch": 0.058952037637446476, "grad_norm": 11.64503849487109, "learning_rate": 4.999014344443015e-05, "loss": 2.6106, "mean_token_accuracy": 0.3862068891525269, "step": 58530 }, { "epoch": 0.05895707369055065, "grad_norm": 13.018170076737198, "learning_rate": 4.999013235113927e-05, "loss": 2.3881, "mean_token_accuracy": 0.45862069725990295, "step": 58535 }, { "epoch": 0.058962109743654824, "grad_norm": 10.367387122473016, "learning_rate": 4.999012125161068e-05, "loss": 2.6577, "mean_token_accuracy": 0.4482758641242981, "step": 58540 }, { "epoch": 0.058967145796759, "grad_norm": 13.97914158018676, "learning_rate": 4.999011014584437e-05, "loss": 2.6244, "mean_token_accuracy": 0.3896551728248596, "step": 58545 }, { "epoch": 0.05897218184986317, "grad_norm": 10.521847327121893, "learning_rate": 4.999009903384034e-05, "loss": 2.2097, "mean_token_accuracy": 0.4724137902259827, "step": 58550 }, { "epoch": 0.058977217902967345, "grad_norm": 13.48655769158509, "learning_rate": 4.9990087915598596e-05, "loss": 2.5905, "mean_token_accuracy": 0.38275861740112305, "step": 58555 }, { "epoch": 0.05898225395607152, "grad_norm": 12.16582228205211, "learning_rate": 4.999007679111915e-05, "loss": 2.3999, "mean_token_accuracy": 0.3862069010734558, "step": 58560 }, { "epoch": 0.058987290009175686, "grad_norm": 15.793279579127145, "learning_rate": 4.999006566040199e-05, "loss": 2.723, "mean_token_accuracy": 0.37931033968925476, "step": 58565 }, { "epoch": 0.05899232606227986, "grad_norm": 11.257375885495158, "learning_rate": 4.999005452344713e-05, "loss": 2.575, "mean_token_accuracy": 0.4034482777118683, "step": 58570 }, { "epoch": 0.05899736211538403, "grad_norm": 12.876908535610417, "learning_rate": 4.999004338025456e-05, "loss": 2.3862, "mean_token_accuracy": 0.43448275327682495, "step": 58575 }, { "epoch": 0.05900239816848821, "grad_norm": 15.801554971622984, "learning_rate": 4.9990032230824306e-05, "loss": 2.8597, "mean_token_accuracy": 0.3517241418361664, "step": 58580 }, { "epoch": 0.05900743422159238, "grad_norm": 11.464720521975142, "learning_rate": 4.999002107515636e-05, "loss": 2.5469, "mean_token_accuracy": 0.3931034505367279, "step": 58585 }, { "epoch": 0.059012470274696555, "grad_norm": 14.328906988062338, "learning_rate": 4.999000991325071e-05, "loss": 2.9102, "mean_token_accuracy": 0.39310344457626345, "step": 58590 }, { "epoch": 0.05901750632780073, "grad_norm": 14.894068611859675, "learning_rate": 4.9989998745107374e-05, "loss": 2.685, "mean_token_accuracy": 0.3551724076271057, "step": 58595 }, { "epoch": 0.059022542380904895, "grad_norm": 13.077909713655979, "learning_rate": 4.998998757072635e-05, "loss": 2.637, "mean_token_accuracy": 0.43448276817798615, "step": 58600 }, { "epoch": 0.05902757843400907, "grad_norm": 13.600544804941952, "learning_rate": 4.998997639010766e-05, "loss": 2.3094, "mean_token_accuracy": 0.41881427764892576, "step": 58605 }, { "epoch": 0.05903261448711324, "grad_norm": 11.293686919184719, "learning_rate": 4.998996520325128e-05, "loss": 2.2382, "mean_token_accuracy": 0.43103447556495667, "step": 58610 }, { "epoch": 0.059037650540217416, "grad_norm": 12.04888194881693, "learning_rate": 4.998995401015723e-05, "loss": 2.8035, "mean_token_accuracy": 0.36896551251411436, "step": 58615 }, { "epoch": 0.05904268659332159, "grad_norm": 13.571958763294521, "learning_rate": 4.99899428108255e-05, "loss": 2.5817, "mean_token_accuracy": 0.3931034505367279, "step": 58620 }, { "epoch": 0.059047722646425764, "grad_norm": 13.126813654059614, "learning_rate": 4.99899316052561e-05, "loss": 2.1028, "mean_token_accuracy": 0.482758617401123, "step": 58625 }, { "epoch": 0.05905275869952994, "grad_norm": 10.855866496352922, "learning_rate": 4.9989920393449035e-05, "loss": 2.3827, "mean_token_accuracy": 0.4344827592372894, "step": 58630 }, { "epoch": 0.059057794752634105, "grad_norm": 11.483846777040888, "learning_rate": 4.998990917540431e-05, "loss": 2.2769, "mean_token_accuracy": 0.43103448748588563, "step": 58635 }, { "epoch": 0.05906283080573828, "grad_norm": 25.1884739134712, "learning_rate": 4.9989897951121925e-05, "loss": 3.0654, "mean_token_accuracy": 0.32758620381355286, "step": 58640 }, { "epoch": 0.05906786685884245, "grad_norm": 20.865457539453736, "learning_rate": 4.998988672060188e-05, "loss": 2.7352, "mean_token_accuracy": 0.41379310488700866, "step": 58645 }, { "epoch": 0.059072902911946626, "grad_norm": 10.575030327891007, "learning_rate": 4.998987548384418e-05, "loss": 2.1478, "mean_token_accuracy": 0.46551724672317507, "step": 58650 }, { "epoch": 0.0590779389650508, "grad_norm": 13.024581534294235, "learning_rate": 4.998986424084884e-05, "loss": 2.4822, "mean_token_accuracy": 0.3965517163276672, "step": 58655 }, { "epoch": 0.05908297501815497, "grad_norm": 11.03628367800722, "learning_rate": 4.998985299161584e-05, "loss": 2.7691, "mean_token_accuracy": 0.3517241358757019, "step": 58660 }, { "epoch": 0.05908801107125915, "grad_norm": 16.19723759953313, "learning_rate": 4.9989841736145193e-05, "loss": 2.6805, "mean_token_accuracy": 0.36206896901130675, "step": 58665 }, { "epoch": 0.059093047124363314, "grad_norm": 13.677810962693576, "learning_rate": 4.998983047443692e-05, "loss": 2.7276, "mean_token_accuracy": 0.3793103456497192, "step": 58670 }, { "epoch": 0.05909808317746749, "grad_norm": 12.968755204008977, "learning_rate": 4.9989819206491e-05, "loss": 3.2961, "mean_token_accuracy": 0.31724137514829637, "step": 58675 }, { "epoch": 0.05910311923057166, "grad_norm": 10.642179541018857, "learning_rate": 4.998980793230743e-05, "loss": 2.6734, "mean_token_accuracy": 0.41034482717514037, "step": 58680 }, { "epoch": 0.059108155283675835, "grad_norm": 11.85518441420973, "learning_rate": 4.998979665188625e-05, "loss": 2.1298, "mean_token_accuracy": 0.4586206912994385, "step": 58685 }, { "epoch": 0.05911319133678001, "grad_norm": 13.04737656379371, "learning_rate": 4.998978536522743e-05, "loss": 2.2877, "mean_token_accuracy": 0.4155474901199341, "step": 58690 }, { "epoch": 0.05911822738988418, "grad_norm": 8.967105319313283, "learning_rate": 4.9989774072330985e-05, "loss": 2.7519, "mean_token_accuracy": 0.40689656138420105, "step": 58695 }, { "epoch": 0.05912326344298836, "grad_norm": 14.133498256694917, "learning_rate": 4.998976277319692e-05, "loss": 2.7748, "mean_token_accuracy": 0.3896551787853241, "step": 58700 }, { "epoch": 0.05912829949609252, "grad_norm": 11.674818432265326, "learning_rate": 4.9989751467825234e-05, "loss": 2.4714, "mean_token_accuracy": 0.4724137783050537, "step": 58705 }, { "epoch": 0.0591333355491967, "grad_norm": 11.657068020650277, "learning_rate": 4.998974015621593e-05, "loss": 2.4509, "mean_token_accuracy": 0.3808832406997681, "step": 58710 }, { "epoch": 0.05913837160230087, "grad_norm": 12.474614161025137, "learning_rate": 4.9989728838369017e-05, "loss": 2.344, "mean_token_accuracy": 0.45517241954803467, "step": 58715 }, { "epoch": 0.059143407655405045, "grad_norm": 13.133626532397379, "learning_rate": 4.998971751428449e-05, "loss": 2.5609, "mean_token_accuracy": 0.39655172228813174, "step": 58720 }, { "epoch": 0.05914844370850922, "grad_norm": 13.074275921696513, "learning_rate": 4.998970618396236e-05, "loss": 2.8272, "mean_token_accuracy": 0.3931034505367279, "step": 58725 }, { "epoch": 0.05915347976161339, "grad_norm": 10.788938455627191, "learning_rate": 4.998969484740262e-05, "loss": 2.3963, "mean_token_accuracy": 0.42413793206214906, "step": 58730 }, { "epoch": 0.059158515814717566, "grad_norm": 11.316826019995348, "learning_rate": 4.998968350460528e-05, "loss": 2.353, "mean_token_accuracy": 0.42068964838981626, "step": 58735 }, { "epoch": 0.05916355186782173, "grad_norm": 12.39146491039513, "learning_rate": 4.998967215557036e-05, "loss": 2.4522, "mean_token_accuracy": 0.41724138259887694, "step": 58740 }, { "epoch": 0.05916858792092591, "grad_norm": 10.612586463996262, "learning_rate": 4.998966080029782e-05, "loss": 2.6475, "mean_token_accuracy": 0.3931034505367279, "step": 58745 }, { "epoch": 0.05917362397403008, "grad_norm": 17.499743318498293, "learning_rate": 4.99896494387877e-05, "loss": 2.7012, "mean_token_accuracy": 0.4310344815254211, "step": 58750 }, { "epoch": 0.059178660027134254, "grad_norm": 11.600560531866211, "learning_rate": 4.998963807103999e-05, "loss": 2.5386, "mean_token_accuracy": 0.4689655065536499, "step": 58755 }, { "epoch": 0.05918369608023843, "grad_norm": 13.13240034680552, "learning_rate": 4.99896266970547e-05, "loss": 2.7715, "mean_token_accuracy": 0.3517241418361664, "step": 58760 }, { "epoch": 0.0591887321333426, "grad_norm": 10.103749376060998, "learning_rate": 4.998961531683182e-05, "loss": 2.3537, "mean_token_accuracy": 0.42068964838981626, "step": 58765 }, { "epoch": 0.059193768186446775, "grad_norm": 11.161734177668214, "learning_rate": 4.998960393037136e-05, "loss": 3.0349, "mean_token_accuracy": 0.3241379350423813, "step": 58770 }, { "epoch": 0.05919880423955094, "grad_norm": 11.848904103937022, "learning_rate": 4.998959253767334e-05, "loss": 2.7163, "mean_token_accuracy": 0.3655172407627106, "step": 58775 }, { "epoch": 0.059203840292655116, "grad_norm": 13.130009703028195, "learning_rate": 4.998958113873774e-05, "loss": 2.8563, "mean_token_accuracy": 0.324137932062149, "step": 58780 }, { "epoch": 0.05920887634575929, "grad_norm": 11.74073416222131, "learning_rate": 4.9989569733564565e-05, "loss": 2.4175, "mean_token_accuracy": 0.41379311084747317, "step": 58785 }, { "epoch": 0.059213912398863464, "grad_norm": 14.925767644174199, "learning_rate": 4.998955832215382e-05, "loss": 2.3978, "mean_token_accuracy": 0.43201970160007475, "step": 58790 }, { "epoch": 0.05921894845196764, "grad_norm": 10.30197493523468, "learning_rate": 4.998954690450553e-05, "loss": 2.5684, "mean_token_accuracy": 0.37931033968925476, "step": 58795 }, { "epoch": 0.05922398450507181, "grad_norm": 12.764724089239957, "learning_rate": 4.998953548061966e-05, "loss": 2.6887, "mean_token_accuracy": 0.3551724135875702, "step": 58800 }, { "epoch": 0.059229020558175985, "grad_norm": 11.109116743531258, "learning_rate": 4.998952405049625e-05, "loss": 2.2581, "mean_token_accuracy": 0.4448275864124298, "step": 58805 }, { "epoch": 0.05923405661128015, "grad_norm": 11.743442505319946, "learning_rate": 4.998951261413528e-05, "loss": 3.3181, "mean_token_accuracy": 0.2965517193078995, "step": 58810 }, { "epoch": 0.059239092664384325, "grad_norm": 8.93081909462916, "learning_rate": 4.998950117153676e-05, "loss": 2.288, "mean_token_accuracy": 0.4194192349910736, "step": 58815 }, { "epoch": 0.0592441287174885, "grad_norm": 12.971720090174436, "learning_rate": 4.99894897227007e-05, "loss": 2.3667, "mean_token_accuracy": 0.41034482419490814, "step": 58820 }, { "epoch": 0.05924916477059267, "grad_norm": 13.140478678687847, "learning_rate": 4.998947826762708e-05, "loss": 3.114, "mean_token_accuracy": 0.3862068891525269, "step": 58825 }, { "epoch": 0.05925420082369685, "grad_norm": 10.627860052609678, "learning_rate": 4.9989466806315924e-05, "loss": 2.4268, "mean_token_accuracy": 0.3827586233615875, "step": 58830 }, { "epoch": 0.05925923687680102, "grad_norm": 11.184528999212581, "learning_rate": 4.998945533876724e-05, "loss": 2.3816, "mean_token_accuracy": 0.43793103098869324, "step": 58835 }, { "epoch": 0.059264272929905194, "grad_norm": 11.639119160357138, "learning_rate": 4.998944386498101e-05, "loss": 2.5424, "mean_token_accuracy": 0.4344827651977539, "step": 58840 }, { "epoch": 0.05926930898300936, "grad_norm": 8.394337740541758, "learning_rate": 4.998943238495726e-05, "loss": 2.3434, "mean_token_accuracy": 0.409558367729187, "step": 58845 }, { "epoch": 0.059274345036113535, "grad_norm": 10.806652685193619, "learning_rate": 4.9989420898695974e-05, "loss": 2.6451, "mean_token_accuracy": 0.37241379618644715, "step": 58850 }, { "epoch": 0.05927938108921771, "grad_norm": 11.597558016421797, "learning_rate": 4.998940940619716e-05, "loss": 2.4474, "mean_token_accuracy": 0.43103447556495667, "step": 58855 }, { "epoch": 0.05928441714232188, "grad_norm": 10.309277776711358, "learning_rate": 4.998939790746083e-05, "loss": 2.6181, "mean_token_accuracy": 0.42915910482406616, "step": 58860 }, { "epoch": 0.059289453195426056, "grad_norm": 10.463224979661796, "learning_rate": 4.9989386402486985e-05, "loss": 2.207, "mean_token_accuracy": 0.48275862336158754, "step": 58865 }, { "epoch": 0.05929448924853023, "grad_norm": 11.234273239966175, "learning_rate": 4.998937489127563e-05, "loss": 2.4072, "mean_token_accuracy": 0.4689655065536499, "step": 58870 }, { "epoch": 0.059299525301634404, "grad_norm": 11.068259497123158, "learning_rate": 4.998936337382675e-05, "loss": 2.2707, "mean_token_accuracy": 0.48275861144065857, "step": 58875 }, { "epoch": 0.05930456135473857, "grad_norm": 11.06848020478526, "learning_rate": 4.9989351850140364e-05, "loss": 2.3281, "mean_token_accuracy": 0.4344827592372894, "step": 58880 }, { "epoch": 0.059309597407842744, "grad_norm": 13.723681499008585, "learning_rate": 4.998934032021648e-05, "loss": 2.555, "mean_token_accuracy": 0.4015124022960663, "step": 58885 }, { "epoch": 0.05931463346094692, "grad_norm": 11.040905928725397, "learning_rate": 4.998932878405508e-05, "loss": 2.5701, "mean_token_accuracy": 0.41724138855934145, "step": 58890 }, { "epoch": 0.05931966951405109, "grad_norm": 14.75788704568464, "learning_rate": 4.9989317241656195e-05, "loss": 2.2684, "mean_token_accuracy": 0.42546883821487425, "step": 58895 }, { "epoch": 0.059324705567155266, "grad_norm": 15.88856058941254, "learning_rate": 4.9989305693019806e-05, "loss": 2.7051, "mean_token_accuracy": 0.4517241418361664, "step": 58900 }, { "epoch": 0.05932974162025944, "grad_norm": 14.567054702115339, "learning_rate": 4.998929413814593e-05, "loss": 2.5808, "mean_token_accuracy": 0.462431937456131, "step": 58905 }, { "epoch": 0.05933477767336361, "grad_norm": 11.032849426141484, "learning_rate": 4.998928257703456e-05, "loss": 2.2975, "mean_token_accuracy": 0.4206896543502808, "step": 58910 }, { "epoch": 0.05933981372646778, "grad_norm": 12.105191391268958, "learning_rate": 4.99892710096857e-05, "loss": 2.75, "mean_token_accuracy": 0.34996975064277647, "step": 58915 }, { "epoch": 0.059344849779571954, "grad_norm": 12.244677299270796, "learning_rate": 4.998925943609936e-05, "loss": 2.7432, "mean_token_accuracy": 0.39655172228813174, "step": 58920 }, { "epoch": 0.05934988583267613, "grad_norm": 15.820009419247432, "learning_rate": 4.998924785627555e-05, "loss": 3.0405, "mean_token_accuracy": 0.36896551847457887, "step": 58925 }, { "epoch": 0.0593549218857803, "grad_norm": 13.246164002115812, "learning_rate": 4.998923627021425e-05, "loss": 2.4165, "mean_token_accuracy": 0.41034482717514037, "step": 58930 }, { "epoch": 0.059359957938884475, "grad_norm": 10.337498680249992, "learning_rate": 4.998922467791547e-05, "loss": 2.794, "mean_token_accuracy": 0.35862069129943847, "step": 58935 }, { "epoch": 0.05936499399198865, "grad_norm": 14.248775080146833, "learning_rate": 4.998921307937923e-05, "loss": 2.8217, "mean_token_accuracy": 0.36551723480224607, "step": 58940 }, { "epoch": 0.05937003004509282, "grad_norm": 12.796284043959828, "learning_rate": 4.998920147460553e-05, "loss": 2.4711, "mean_token_accuracy": 0.4, "step": 58945 }, { "epoch": 0.05937506609819699, "grad_norm": 11.454191912198333, "learning_rate": 4.9989189863594355e-05, "loss": 2.7239, "mean_token_accuracy": 0.3931034505367279, "step": 58950 }, { "epoch": 0.05938010215130116, "grad_norm": 12.23214085029897, "learning_rate": 4.998917824634572e-05, "loss": 2.5001, "mean_token_accuracy": 0.441379314661026, "step": 58955 }, { "epoch": 0.05938513820440534, "grad_norm": 9.973659262778462, "learning_rate": 4.998916662285963e-05, "loss": 2.3519, "mean_token_accuracy": 0.4534785270690918, "step": 58960 }, { "epoch": 0.05939017425750951, "grad_norm": 11.93709296416662, "learning_rate": 4.998915499313609e-05, "loss": 2.4867, "mean_token_accuracy": 0.40689654350280763, "step": 58965 }, { "epoch": 0.059395210310613684, "grad_norm": 9.876055243894868, "learning_rate": 4.9989143357175095e-05, "loss": 2.4771, "mean_token_accuracy": 0.3862069010734558, "step": 58970 }, { "epoch": 0.05940024636371786, "grad_norm": 11.41139172671979, "learning_rate": 4.998913171497666e-05, "loss": 2.6784, "mean_token_accuracy": 0.39310344457626345, "step": 58975 }, { "epoch": 0.05940528241682203, "grad_norm": 11.828379832320408, "learning_rate": 4.998912006654077e-05, "loss": 2.4829, "mean_token_accuracy": 0.4379310429096222, "step": 58980 }, { "epoch": 0.0594103184699262, "grad_norm": 13.485698050340023, "learning_rate": 4.9989108411867434e-05, "loss": 2.4139, "mean_token_accuracy": 0.4679975748062134, "step": 58985 }, { "epoch": 0.05941535452303037, "grad_norm": 11.069308520352765, "learning_rate": 4.998909675095667e-05, "loss": 2.7838, "mean_token_accuracy": 0.35517241060733795, "step": 58990 }, { "epoch": 0.059420390576134546, "grad_norm": 11.984372879863162, "learning_rate": 4.9989085083808467e-05, "loss": 2.4866, "mean_token_accuracy": 0.3965517282485962, "step": 58995 }, { "epoch": 0.05942542662923872, "grad_norm": 13.220351423790104, "learning_rate": 4.998907341042284e-05, "loss": 2.503, "mean_token_accuracy": 0.43793103098869324, "step": 59000 }, { "epoch": 0.059430462682342894, "grad_norm": 11.247430702188277, "learning_rate": 4.998906173079977e-05, "loss": 2.1379, "mean_token_accuracy": 0.44482758045196535, "step": 59005 }, { "epoch": 0.05943549873544707, "grad_norm": 9.37822968939124, "learning_rate": 4.9989050044939294e-05, "loss": 2.2321, "mean_token_accuracy": 0.4379310429096222, "step": 59010 }, { "epoch": 0.05944053478855124, "grad_norm": 10.850501384633876, "learning_rate": 4.998903835284138e-05, "loss": 2.6105, "mean_token_accuracy": 0.3965517282485962, "step": 59015 }, { "epoch": 0.05944557084165541, "grad_norm": 9.866232552324306, "learning_rate": 4.9989026654506055e-05, "loss": 2.8135, "mean_token_accuracy": 0.4068965494632721, "step": 59020 }, { "epoch": 0.05945060689475958, "grad_norm": 14.181364156638187, "learning_rate": 4.998901494993331e-05, "loss": 2.8982, "mean_token_accuracy": 0.337931028008461, "step": 59025 }, { "epoch": 0.059455642947863756, "grad_norm": 13.415891054843744, "learning_rate": 4.998900323912316e-05, "loss": 2.7652, "mean_token_accuracy": 0.37586206793785093, "step": 59030 }, { "epoch": 0.05946067900096793, "grad_norm": 11.956808598122114, "learning_rate": 4.998899152207559e-05, "loss": 2.83, "mean_token_accuracy": 0.36206896901130675, "step": 59035 }, { "epoch": 0.0594657150540721, "grad_norm": 11.500521746110556, "learning_rate": 4.998897979879063e-05, "loss": 2.1805, "mean_token_accuracy": 0.4517241299152374, "step": 59040 }, { "epoch": 0.05947075110717628, "grad_norm": 12.949250173088183, "learning_rate": 4.998896806926825e-05, "loss": 2.3127, "mean_token_accuracy": 0.43448275327682495, "step": 59045 }, { "epoch": 0.05947578716028045, "grad_norm": 14.719307788093833, "learning_rate": 4.998895633350848e-05, "loss": 2.6667, "mean_token_accuracy": 0.42068966031074523, "step": 59050 }, { "epoch": 0.05948082321338462, "grad_norm": 13.919527423465116, "learning_rate": 4.998894459151131e-05, "loss": 2.5504, "mean_token_accuracy": 0.38275861740112305, "step": 59055 }, { "epoch": 0.05948585926648879, "grad_norm": 12.331897373957322, "learning_rate": 4.9988932843276755e-05, "loss": 2.3281, "mean_token_accuracy": 0.4379310369491577, "step": 59060 }, { "epoch": 0.059490895319592965, "grad_norm": 12.866995286202128, "learning_rate": 4.99889210888048e-05, "loss": 2.8571, "mean_token_accuracy": 0.36896551251411436, "step": 59065 }, { "epoch": 0.05949593137269714, "grad_norm": 11.681512910371756, "learning_rate": 4.998890932809547e-05, "loss": 2.5211, "mean_token_accuracy": 0.4, "step": 59070 }, { "epoch": 0.05950096742580131, "grad_norm": 9.361534693454695, "learning_rate": 4.9988897561148753e-05, "loss": 2.2687, "mean_token_accuracy": 0.45366000533103945, "step": 59075 }, { "epoch": 0.059506003478905486, "grad_norm": 12.491551574861692, "learning_rate": 4.998888578796465e-05, "loss": 2.2045, "mean_token_accuracy": 0.47241379618644713, "step": 59080 }, { "epoch": 0.05951103953200966, "grad_norm": 13.831472514405807, "learning_rate": 4.998887400854318e-05, "loss": 2.5779, "mean_token_accuracy": 0.41034482717514037, "step": 59085 }, { "epoch": 0.05951607558511383, "grad_norm": 12.575829861660889, "learning_rate": 4.9988862222884334e-05, "loss": 2.4249, "mean_token_accuracy": 0.42413792610168455, "step": 59090 }, { "epoch": 0.059521111638218, "grad_norm": 11.649071540014495, "learning_rate": 4.998885043098811e-05, "loss": 2.6822, "mean_token_accuracy": 0.4275861978530884, "step": 59095 }, { "epoch": 0.059526147691322175, "grad_norm": 10.069215234913727, "learning_rate": 4.998883863285453e-05, "loss": 2.6944, "mean_token_accuracy": 0.398064124584198, "step": 59100 }, { "epoch": 0.05953118374442635, "grad_norm": 13.129619059399054, "learning_rate": 4.998882682848358e-05, "loss": 2.612, "mean_token_accuracy": 0.43623715043067934, "step": 59105 }, { "epoch": 0.05953621979753052, "grad_norm": 12.093226529297628, "learning_rate": 4.998881501787528e-05, "loss": 2.5563, "mean_token_accuracy": 0.3793103456497192, "step": 59110 }, { "epoch": 0.059541255850634696, "grad_norm": 13.509121380601837, "learning_rate": 4.998880320102961e-05, "loss": 2.1198, "mean_token_accuracy": 0.4379310250282288, "step": 59115 }, { "epoch": 0.05954629190373887, "grad_norm": 12.451777149945938, "learning_rate": 4.9988791377946596e-05, "loss": 2.3015, "mean_token_accuracy": 0.4551724135875702, "step": 59120 }, { "epoch": 0.059551327956843036, "grad_norm": 11.11355618957618, "learning_rate": 4.998877954862622e-05, "loss": 2.567, "mean_token_accuracy": 0.40344828367233276, "step": 59125 }, { "epoch": 0.05955636400994721, "grad_norm": 12.93909149387017, "learning_rate": 4.9988767713068515e-05, "loss": 2.5374, "mean_token_accuracy": 0.35862069129943847, "step": 59130 }, { "epoch": 0.059561400063051384, "grad_norm": 12.987545314469871, "learning_rate": 4.998875587127346e-05, "loss": 2.4685, "mean_token_accuracy": 0.36896551251411436, "step": 59135 }, { "epoch": 0.05956643611615556, "grad_norm": 11.218906518217826, "learning_rate": 4.9988744023241065e-05, "loss": 2.5418, "mean_token_accuracy": 0.39310343861579894, "step": 59140 }, { "epoch": 0.05957147216925973, "grad_norm": 10.566109615957771, "learning_rate": 4.998873216897132e-05, "loss": 2.5067, "mean_token_accuracy": 0.3827586233615875, "step": 59145 }, { "epoch": 0.059576508222363905, "grad_norm": 8.851009062316985, "learning_rate": 4.998872030846425e-05, "loss": 2.4225, "mean_token_accuracy": 0.471082878112793, "step": 59150 }, { "epoch": 0.05958154427546808, "grad_norm": 16.534884292833965, "learning_rate": 4.998870844171985e-05, "loss": 2.7906, "mean_token_accuracy": 0.4082512348890305, "step": 59155 }, { "epoch": 0.059586580328572246, "grad_norm": 12.053098994529302, "learning_rate": 4.9988696568738125e-05, "loss": 2.9011, "mean_token_accuracy": 0.3862068891525269, "step": 59160 }, { "epoch": 0.05959161638167642, "grad_norm": 9.493123152225449, "learning_rate": 4.998868468951908e-05, "loss": 2.4774, "mean_token_accuracy": 0.41724138855934145, "step": 59165 }, { "epoch": 0.05959665243478059, "grad_norm": 11.565401476227244, "learning_rate": 4.998867280406271e-05, "loss": 2.168, "mean_token_accuracy": 0.43103447556495667, "step": 59170 }, { "epoch": 0.05960168848788477, "grad_norm": 13.477905766948895, "learning_rate": 4.998866091236902e-05, "loss": 2.2617, "mean_token_accuracy": 0.458620685338974, "step": 59175 }, { "epoch": 0.05960672454098894, "grad_norm": 11.043773767713418, "learning_rate": 4.9988649014438014e-05, "loss": 2.6218, "mean_token_accuracy": 0.3827586233615875, "step": 59180 }, { "epoch": 0.059611760594093115, "grad_norm": 10.945185808575705, "learning_rate": 4.9988637110269696e-05, "loss": 2.613, "mean_token_accuracy": 0.4344827651977539, "step": 59185 }, { "epoch": 0.05961679664719729, "grad_norm": 12.134704144445134, "learning_rate": 4.998862519986408e-05, "loss": 2.537, "mean_token_accuracy": 0.4137930989265442, "step": 59190 }, { "epoch": 0.059621832700301455, "grad_norm": 11.120474640907574, "learning_rate": 4.998861328322115e-05, "loss": 2.3208, "mean_token_accuracy": 0.4448275864124298, "step": 59195 }, { "epoch": 0.05962686875340563, "grad_norm": 12.757172908089746, "learning_rate": 4.998860136034093e-05, "loss": 2.9531, "mean_token_accuracy": 0.334482753276825, "step": 59200 }, { "epoch": 0.0596319048065098, "grad_norm": 12.826090039123786, "learning_rate": 4.9988589431223404e-05, "loss": 2.7802, "mean_token_accuracy": 0.3827586233615875, "step": 59205 }, { "epoch": 0.05963694085961398, "grad_norm": 15.39407109151909, "learning_rate": 4.9988577495868586e-05, "loss": 2.6464, "mean_token_accuracy": 0.39310344457626345, "step": 59210 }, { "epoch": 0.05964197691271815, "grad_norm": 13.059895387948467, "learning_rate": 4.998856555427648e-05, "loss": 2.6999, "mean_token_accuracy": 0.36551723480224607, "step": 59215 }, { "epoch": 0.059647012965822324, "grad_norm": 10.062127209586782, "learning_rate": 4.9988553606447085e-05, "loss": 2.7815, "mean_token_accuracy": 0.3379310339689255, "step": 59220 }, { "epoch": 0.0596520490189265, "grad_norm": 12.271155622465557, "learning_rate": 4.998854165238041e-05, "loss": 2.4575, "mean_token_accuracy": 0.3999999940395355, "step": 59225 }, { "epoch": 0.059657085072030665, "grad_norm": 11.936558622545698, "learning_rate": 4.9988529692076445e-05, "loss": 2.303, "mean_token_accuracy": 0.42413793206214906, "step": 59230 }, { "epoch": 0.05966212112513484, "grad_norm": 13.517049830487577, "learning_rate": 4.99885177255352e-05, "loss": 2.9089, "mean_token_accuracy": 0.37241379618644715, "step": 59235 }, { "epoch": 0.05966715717823901, "grad_norm": 11.049540743935365, "learning_rate": 4.99885057527567e-05, "loss": 2.8311, "mean_token_accuracy": 0.36896551847457887, "step": 59240 }, { "epoch": 0.059672193231343186, "grad_norm": 11.453550879188398, "learning_rate": 4.9988493773740906e-05, "loss": 2.2979, "mean_token_accuracy": 0.441379314661026, "step": 59245 }, { "epoch": 0.05967722928444736, "grad_norm": 10.921660403326115, "learning_rate": 4.998848178848786e-05, "loss": 2.614, "mean_token_accuracy": 0.4034482777118683, "step": 59250 }, { "epoch": 0.059682265337551534, "grad_norm": 11.108212582161558, "learning_rate": 4.998846979699755e-05, "loss": 3.0907, "mean_token_accuracy": 0.3482758641242981, "step": 59255 }, { "epoch": 0.05968730139065571, "grad_norm": 11.748110530126153, "learning_rate": 4.9988457799269965e-05, "loss": 2.4208, "mean_token_accuracy": 0.4257108271121979, "step": 59260 }, { "epoch": 0.059692337443759874, "grad_norm": 10.912580708692309, "learning_rate": 4.9988445795305133e-05, "loss": 2.4398, "mean_token_accuracy": 0.44827585816383364, "step": 59265 }, { "epoch": 0.05969737349686405, "grad_norm": 13.800260412086224, "learning_rate": 4.998843378510304e-05, "loss": 2.5043, "mean_token_accuracy": 0.3862069010734558, "step": 59270 }, { "epoch": 0.05970240954996822, "grad_norm": 14.446100492260674, "learning_rate": 4.9988421768663705e-05, "loss": 2.7125, "mean_token_accuracy": 0.3517241358757019, "step": 59275 }, { "epoch": 0.059707445603072395, "grad_norm": 10.16095486423684, "learning_rate": 4.9988409745987116e-05, "loss": 2.5052, "mean_token_accuracy": 0.4284482777118683, "step": 59280 }, { "epoch": 0.05971248165617657, "grad_norm": 14.02129291494595, "learning_rate": 4.998839771707329e-05, "loss": 2.4124, "mean_token_accuracy": 0.4813067138195038, "step": 59285 }, { "epoch": 0.05971751770928074, "grad_norm": 12.391065431821701, "learning_rate": 4.9988385681922214e-05, "loss": 2.7404, "mean_token_accuracy": 0.3517241358757019, "step": 59290 }, { "epoch": 0.05972255376238492, "grad_norm": 12.877249695923679, "learning_rate": 4.998837364053391e-05, "loss": 2.712, "mean_token_accuracy": 0.37241379022598264, "step": 59295 }, { "epoch": 0.059727589815489084, "grad_norm": 12.090281693100781, "learning_rate": 4.998836159290836e-05, "loss": 2.5805, "mean_token_accuracy": 0.41379310488700866, "step": 59300 }, { "epoch": 0.05973262586859326, "grad_norm": 11.646682585206323, "learning_rate": 4.9988349539045585e-05, "loss": 2.439, "mean_token_accuracy": 0.38965516686439516, "step": 59305 }, { "epoch": 0.05973766192169743, "grad_norm": 11.084913954681372, "learning_rate": 4.998833747894558e-05, "loss": 2.3275, "mean_token_accuracy": 0.41379310488700866, "step": 59310 }, { "epoch": 0.059742697974801605, "grad_norm": 10.549612511443312, "learning_rate": 4.9988325412608355e-05, "loss": 2.3279, "mean_token_accuracy": 0.4344827473163605, "step": 59315 }, { "epoch": 0.05974773402790578, "grad_norm": 11.13607974083221, "learning_rate": 4.99883133400339e-05, "loss": 2.4961, "mean_token_accuracy": 0.4344827592372894, "step": 59320 }, { "epoch": 0.05975277008100995, "grad_norm": 11.657041666816866, "learning_rate": 4.9988301261222244e-05, "loss": 2.3488, "mean_token_accuracy": 0.4172413766384125, "step": 59325 }, { "epoch": 0.059757806134114126, "grad_norm": 12.983451479916209, "learning_rate": 4.998828917617336e-05, "loss": 2.1332, "mean_token_accuracy": 0.47241379618644713, "step": 59330 }, { "epoch": 0.05976284218721829, "grad_norm": 12.21424864363209, "learning_rate": 4.998827708488727e-05, "loss": 2.6681, "mean_token_accuracy": 0.3896551728248596, "step": 59335 }, { "epoch": 0.05976787824032247, "grad_norm": 11.457072119560102, "learning_rate": 4.998826498736397e-05, "loss": 3.3909, "mean_token_accuracy": 0.3689655214548111, "step": 59340 }, { "epoch": 0.05977291429342664, "grad_norm": 9.971550609495056, "learning_rate": 4.998825288360347e-05, "loss": 2.556, "mean_token_accuracy": 0.3827586233615875, "step": 59345 }, { "epoch": 0.059777950346530814, "grad_norm": 10.560008599451335, "learning_rate": 4.998824077360577e-05, "loss": 2.4283, "mean_token_accuracy": 0.45517240166664125, "step": 59350 }, { "epoch": 0.05978298639963499, "grad_norm": 11.614668656238733, "learning_rate": 4.998822865737086e-05, "loss": 2.5327, "mean_token_accuracy": 0.4206896543502808, "step": 59355 }, { "epoch": 0.05978802245273916, "grad_norm": 17.717088870462884, "learning_rate": 4.9988216534898766e-05, "loss": 2.7206, "mean_token_accuracy": 0.4191167622804642, "step": 59360 }, { "epoch": 0.059793058505843336, "grad_norm": 11.410887643105244, "learning_rate": 4.9988204406189474e-05, "loss": 2.6089, "mean_token_accuracy": 0.36896551847457887, "step": 59365 }, { "epoch": 0.0597980945589475, "grad_norm": 14.251915393865902, "learning_rate": 4.998819227124301e-05, "loss": 2.5267, "mean_token_accuracy": 0.42068964838981626, "step": 59370 }, { "epoch": 0.059803130612051676, "grad_norm": 11.63581847261669, "learning_rate": 4.998818013005936e-05, "loss": 2.4243, "mean_token_accuracy": 0.36551723480224607, "step": 59375 }, { "epoch": 0.05980816666515585, "grad_norm": 12.923047952378496, "learning_rate": 4.998816798263851e-05, "loss": 2.8772, "mean_token_accuracy": 0.3517241388559341, "step": 59380 }, { "epoch": 0.059813202718260024, "grad_norm": 10.343788155452241, "learning_rate": 4.99881558289805e-05, "loss": 2.5149, "mean_token_accuracy": 0.4055656373500824, "step": 59385 }, { "epoch": 0.0598182387713642, "grad_norm": 12.49373128049217, "learning_rate": 4.9988143669085305e-05, "loss": 2.8911, "mean_token_accuracy": 0.341379314661026, "step": 59390 }, { "epoch": 0.05982327482446837, "grad_norm": 14.094836569974099, "learning_rate": 4.998813150295295e-05, "loss": 2.6158, "mean_token_accuracy": 0.3999999940395355, "step": 59395 }, { "epoch": 0.059828310877572545, "grad_norm": 10.713253083037117, "learning_rate": 4.9988119330583416e-05, "loss": 2.5476, "mean_token_accuracy": 0.4, "step": 59400 }, { "epoch": 0.05983334693067671, "grad_norm": 12.83443578572033, "learning_rate": 4.9988107151976734e-05, "loss": 2.781, "mean_token_accuracy": 0.36206896901130675, "step": 59405 }, { "epoch": 0.059838382983780886, "grad_norm": 16.660705207425867, "learning_rate": 4.998809496713288e-05, "loss": 2.4138, "mean_token_accuracy": 0.3945553541183472, "step": 59410 }, { "epoch": 0.05984341903688506, "grad_norm": 10.606487049228424, "learning_rate": 4.998808277605187e-05, "loss": 2.6592, "mean_token_accuracy": 0.3758620709180832, "step": 59415 }, { "epoch": 0.05984845508998923, "grad_norm": 12.620624992318984, "learning_rate": 4.998807057873371e-05, "loss": 2.8247, "mean_token_accuracy": 0.3655172407627106, "step": 59420 }, { "epoch": 0.05985349114309341, "grad_norm": 13.185249411779646, "learning_rate": 4.998805837517839e-05, "loss": 2.9325, "mean_token_accuracy": 0.2944343626499176, "step": 59425 }, { "epoch": 0.05985852719619758, "grad_norm": 11.126089390281564, "learning_rate": 4.9988046165385936e-05, "loss": 2.6194, "mean_token_accuracy": 0.42068964838981626, "step": 59430 }, { "epoch": 0.059863563249301754, "grad_norm": 12.025941591309918, "learning_rate": 4.998803394935634e-05, "loss": 2.2773, "mean_token_accuracy": 0.4348457396030426, "step": 59435 }, { "epoch": 0.05986859930240592, "grad_norm": 14.000797982976875, "learning_rate": 4.998802172708959e-05, "loss": 2.8515, "mean_token_accuracy": 0.35862069129943847, "step": 59440 }, { "epoch": 0.059873635355510095, "grad_norm": 13.233763103786222, "learning_rate": 4.998800949858571e-05, "loss": 2.4839, "mean_token_accuracy": 0.39310344159603117, "step": 59445 }, { "epoch": 0.05987867140861427, "grad_norm": 8.877295375036313, "learning_rate": 4.99879972638447e-05, "loss": 2.514, "mean_token_accuracy": 0.4172413766384125, "step": 59450 }, { "epoch": 0.05988370746171844, "grad_norm": 11.806624309802467, "learning_rate": 4.9987985022866565e-05, "loss": 2.3429, "mean_token_accuracy": 0.4344827592372894, "step": 59455 }, { "epoch": 0.059888743514822616, "grad_norm": 15.883750006449572, "learning_rate": 4.9987972775651285e-05, "loss": 2.97, "mean_token_accuracy": 0.341379314661026, "step": 59460 }, { "epoch": 0.05989377956792679, "grad_norm": 12.344167519307211, "learning_rate": 4.9987960522198896e-05, "loss": 2.6452, "mean_token_accuracy": 0.31724137961864474, "step": 59465 }, { "epoch": 0.059898815621030964, "grad_norm": 9.142959610492534, "learning_rate": 4.998794826250939e-05, "loss": 2.1563, "mean_token_accuracy": 0.43793103098869324, "step": 59470 }, { "epoch": 0.05990385167413513, "grad_norm": 12.735447448286457, "learning_rate": 4.998793599658276e-05, "loss": 3.3874, "mean_token_accuracy": 0.33103448152542114, "step": 59475 }, { "epoch": 0.059908887727239304, "grad_norm": 11.979825693716485, "learning_rate": 4.998792372441902e-05, "loss": 2.5736, "mean_token_accuracy": 0.4068965554237366, "step": 59480 }, { "epoch": 0.05991392378034348, "grad_norm": 13.062132417989382, "learning_rate": 4.998791144601817e-05, "loss": 2.9357, "mean_token_accuracy": 0.3551724135875702, "step": 59485 }, { "epoch": 0.05991895983344765, "grad_norm": 13.019736661118591, "learning_rate": 4.998789916138021e-05, "loss": 2.7705, "mean_token_accuracy": 0.39800362586975097, "step": 59490 }, { "epoch": 0.059923995886551826, "grad_norm": 9.867417397999207, "learning_rate": 4.998788687050515e-05, "loss": 2.7306, "mean_token_accuracy": 0.44954627752304077, "step": 59495 }, { "epoch": 0.059929031939656, "grad_norm": 12.536633716292986, "learning_rate": 4.9987874573392996e-05, "loss": 2.4962, "mean_token_accuracy": 0.3931034505367279, "step": 59500 }, { "epoch": 0.05993406799276017, "grad_norm": 12.695959644494598, "learning_rate": 4.998786227004373e-05, "loss": 2.3952, "mean_token_accuracy": 0.44482758045196535, "step": 59505 }, { "epoch": 0.05993910404586434, "grad_norm": 10.85802627210156, "learning_rate": 4.998784996045739e-05, "loss": 2.1494, "mean_token_accuracy": 0.475862056016922, "step": 59510 }, { "epoch": 0.059944140098968514, "grad_norm": 11.654137978600769, "learning_rate": 4.998783764463395e-05, "loss": 2.1496, "mean_token_accuracy": 0.42758620977401735, "step": 59515 }, { "epoch": 0.05994917615207269, "grad_norm": 11.12386974931033, "learning_rate": 4.998782532257343e-05, "loss": 2.1136, "mean_token_accuracy": 0.45716878175735476, "step": 59520 }, { "epoch": 0.05995421220517686, "grad_norm": 16.392503275748204, "learning_rate": 4.998781299427583e-05, "loss": 2.6689, "mean_token_accuracy": 0.3862069010734558, "step": 59525 }, { "epoch": 0.059959248258281035, "grad_norm": 11.379326196915587, "learning_rate": 4.9987800659741145e-05, "loss": 2.2285, "mean_token_accuracy": 0.45517241954803467, "step": 59530 }, { "epoch": 0.05996428431138521, "grad_norm": 26.328991251545858, "learning_rate": 4.998778831896939e-05, "loss": 3.0549, "mean_token_accuracy": 0.3206896513700485, "step": 59535 }, { "epoch": 0.05996932036448938, "grad_norm": 11.743714546901664, "learning_rate": 4.998777597196056e-05, "loss": 2.3982, "mean_token_accuracy": 0.41379310488700866, "step": 59540 }, { "epoch": 0.05997435641759355, "grad_norm": 13.119055653771602, "learning_rate": 4.998776361871465e-05, "loss": 2.6844, "mean_token_accuracy": 0.41724138259887694, "step": 59545 }, { "epoch": 0.05997939247069772, "grad_norm": 11.189087024210414, "learning_rate": 4.998775125923169e-05, "loss": 2.2738, "mean_token_accuracy": 0.4482758641242981, "step": 59550 }, { "epoch": 0.0599844285238019, "grad_norm": 10.742250809119756, "learning_rate": 4.9987738893511664e-05, "loss": 2.6016, "mean_token_accuracy": 0.36206896901130675, "step": 59555 }, { "epoch": 0.05998946457690607, "grad_norm": 10.333434906491423, "learning_rate": 4.998772652155458e-05, "loss": 2.044, "mean_token_accuracy": 0.4776164650917053, "step": 59560 }, { "epoch": 0.059994500630010245, "grad_norm": 14.350533203427775, "learning_rate": 4.9987714143360436e-05, "loss": 2.8602, "mean_token_accuracy": 0.4034482777118683, "step": 59565 }, { "epoch": 0.05999953668311442, "grad_norm": 12.550634542419782, "learning_rate": 4.998770175892925e-05, "loss": 2.408, "mean_token_accuracy": 0.3965517163276672, "step": 59570 }, { "epoch": 0.06000457273621859, "grad_norm": 15.459495366372062, "learning_rate": 4.998768936826102e-05, "loss": 2.9993, "mean_token_accuracy": 0.358620685338974, "step": 59575 }, { "epoch": 0.06000960878932276, "grad_norm": 14.553089608833684, "learning_rate": 4.998767697135573e-05, "loss": 2.3786, "mean_token_accuracy": 0.5068965435028077, "step": 59580 }, { "epoch": 0.06001464484242693, "grad_norm": 13.07238082776562, "learning_rate": 4.99876645682134e-05, "loss": 2.3192, "mean_token_accuracy": 0.4431337058544159, "step": 59585 }, { "epoch": 0.060019680895531106, "grad_norm": 12.545785352888924, "learning_rate": 4.9987652158834044e-05, "loss": 2.6139, "mean_token_accuracy": 0.39897156953811647, "step": 59590 }, { "epoch": 0.06002471694863528, "grad_norm": 12.962010170514223, "learning_rate": 4.9987639743217644e-05, "loss": 2.9514, "mean_token_accuracy": 0.3517241358757019, "step": 59595 }, { "epoch": 0.060029753001739454, "grad_norm": 15.15373373828142, "learning_rate": 4.998762732136422e-05, "loss": 2.5135, "mean_token_accuracy": 0.41379310488700866, "step": 59600 }, { "epoch": 0.06003478905484363, "grad_norm": 11.435366842002603, "learning_rate": 4.9987614893273764e-05, "loss": 2.3548, "mean_token_accuracy": 0.4068965494632721, "step": 59605 }, { "epoch": 0.0600398251079478, "grad_norm": 9.093383276254944, "learning_rate": 4.9987602458946284e-05, "loss": 2.1739, "mean_token_accuracy": 0.47126436829566953, "step": 59610 }, { "epoch": 0.06004486116105197, "grad_norm": 13.400935093259095, "learning_rate": 4.998759001838179e-05, "loss": 2.6708, "mean_token_accuracy": 0.34137930870056155, "step": 59615 }, { "epoch": 0.06004989721415614, "grad_norm": 12.972030144694294, "learning_rate": 4.998757757158027e-05, "loss": 2.7125, "mean_token_accuracy": 0.41379310488700866, "step": 59620 }, { "epoch": 0.060054933267260316, "grad_norm": 14.487095415789456, "learning_rate": 4.998756511854175e-05, "loss": 2.8431, "mean_token_accuracy": 0.3482758581638336, "step": 59625 }, { "epoch": 0.06005996932036449, "grad_norm": 20.125952498857895, "learning_rate": 4.9987552659266205e-05, "loss": 2.5321, "mean_token_accuracy": 0.4000000059604645, "step": 59630 }, { "epoch": 0.06006500537346866, "grad_norm": 13.314513484836636, "learning_rate": 4.998754019375366e-05, "loss": 2.3828, "mean_token_accuracy": 0.38620689511299133, "step": 59635 }, { "epoch": 0.06007004142657284, "grad_norm": 11.785292901370154, "learning_rate": 4.998752772200411e-05, "loss": 2.4689, "mean_token_accuracy": 0.43103448748588563, "step": 59640 }, { "epoch": 0.06007507747967701, "grad_norm": 14.120998984492328, "learning_rate": 4.9987515244017564e-05, "loss": 2.6114, "mean_token_accuracy": 0.37241379022598264, "step": 59645 }, { "epoch": 0.06008011353278118, "grad_norm": 13.584983502600508, "learning_rate": 4.9987502759794016e-05, "loss": 2.4416, "mean_token_accuracy": 0.43103448748588563, "step": 59650 }, { "epoch": 0.06008514958588535, "grad_norm": 11.1294115075235, "learning_rate": 4.998749026933348e-05, "loss": 2.4503, "mean_token_accuracy": 0.40490018129348754, "step": 59655 }, { "epoch": 0.060090185638989525, "grad_norm": 15.49364027934721, "learning_rate": 4.9987477772635956e-05, "loss": 2.2914, "mean_token_accuracy": 0.45517241954803467, "step": 59660 }, { "epoch": 0.0600952216920937, "grad_norm": 12.152018785689359, "learning_rate": 4.998746526970144e-05, "loss": 2.4934, "mean_token_accuracy": 0.3999999910593033, "step": 59665 }, { "epoch": 0.06010025774519787, "grad_norm": 12.977575433875804, "learning_rate": 4.998745276052995e-05, "loss": 2.4053, "mean_token_accuracy": 0.4344827592372894, "step": 59670 }, { "epoch": 0.06010529379830205, "grad_norm": 12.122288629443316, "learning_rate": 4.998744024512147e-05, "loss": 2.4169, "mean_token_accuracy": 0.42413793206214906, "step": 59675 }, { "epoch": 0.06011032985140622, "grad_norm": 11.11940454693258, "learning_rate": 4.998742772347603e-05, "loss": 2.5868, "mean_token_accuracy": 0.37586206793785093, "step": 59680 }, { "epoch": 0.06011536590451039, "grad_norm": 14.151276432393722, "learning_rate": 4.99874151955936e-05, "loss": 2.4643, "mean_token_accuracy": 0.4103448331356049, "step": 59685 }, { "epoch": 0.06012040195761456, "grad_norm": 11.130885793871853, "learning_rate": 4.9987402661474206e-05, "loss": 2.7559, "mean_token_accuracy": 0.4103448212146759, "step": 59690 }, { "epoch": 0.060125438010718735, "grad_norm": 12.525075176265739, "learning_rate": 4.998739012111785e-05, "loss": 2.6125, "mean_token_accuracy": 0.43103448748588563, "step": 59695 }, { "epoch": 0.06013047406382291, "grad_norm": 12.392339569503312, "learning_rate": 4.9987377574524544e-05, "loss": 2.7337, "mean_token_accuracy": 0.4034482717514038, "step": 59700 }, { "epoch": 0.06013551011692708, "grad_norm": 10.784317121496919, "learning_rate": 4.998736502169427e-05, "loss": 2.3148, "mean_token_accuracy": 0.4241379380226135, "step": 59705 }, { "epoch": 0.060140546170031256, "grad_norm": 11.10753607184445, "learning_rate": 4.998735246262704e-05, "loss": 2.3083, "mean_token_accuracy": 0.4068965494632721, "step": 59710 }, { "epoch": 0.06014558222313543, "grad_norm": 12.124979964809748, "learning_rate": 4.998733989732286e-05, "loss": 2.5379, "mean_token_accuracy": 0.42934059500694277, "step": 59715 }, { "epoch": 0.0601506182762396, "grad_norm": 18.34314546619084, "learning_rate": 4.998732732578173e-05, "loss": 2.8352, "mean_token_accuracy": 0.3565638244152069, "step": 59720 }, { "epoch": 0.06015565432934377, "grad_norm": 11.672721160694284, "learning_rate": 4.9987314748003655e-05, "loss": 2.6699, "mean_token_accuracy": 0.42758620977401735, "step": 59725 }, { "epoch": 0.060160690382447944, "grad_norm": 14.416140762131478, "learning_rate": 4.998730216398865e-05, "loss": 3.0213, "mean_token_accuracy": 0.33448276221752166, "step": 59730 }, { "epoch": 0.06016572643555212, "grad_norm": 11.746530634189202, "learning_rate": 4.9987289573736694e-05, "loss": 2.3977, "mean_token_accuracy": 0.42758620381355283, "step": 59735 }, { "epoch": 0.06017076248865629, "grad_norm": 11.895933316095848, "learning_rate": 4.998727697724781e-05, "loss": 2.638, "mean_token_accuracy": 0.37241379022598264, "step": 59740 }, { "epoch": 0.060175798541760465, "grad_norm": 15.502253553471702, "learning_rate": 4.9987264374522004e-05, "loss": 2.4929, "mean_token_accuracy": 0.4137930989265442, "step": 59745 }, { "epoch": 0.06018083459486464, "grad_norm": 12.755146662967555, "learning_rate": 4.9987251765559255e-05, "loss": 2.6315, "mean_token_accuracy": 0.3655172437429428, "step": 59750 }, { "epoch": 0.060185870647968806, "grad_norm": 12.660283827053775, "learning_rate": 4.99872391503596e-05, "loss": 2.4188, "mean_token_accuracy": 0.5034482717514038, "step": 59755 }, { "epoch": 0.06019090670107298, "grad_norm": 11.730197487064284, "learning_rate": 4.9987226528923015e-05, "loss": 2.6482, "mean_token_accuracy": 0.4103448331356049, "step": 59760 }, { "epoch": 0.060195942754177154, "grad_norm": 12.52773561348265, "learning_rate": 4.998721390124951e-05, "loss": 2.6061, "mean_token_accuracy": 0.3965517282485962, "step": 59765 }, { "epoch": 0.06020097880728133, "grad_norm": 17.613043304394427, "learning_rate": 4.99872012673391e-05, "loss": 3.0406, "mean_token_accuracy": 0.3724137991666794, "step": 59770 }, { "epoch": 0.0602060148603855, "grad_norm": 14.670522293326249, "learning_rate": 4.998718862719178e-05, "loss": 2.2898, "mean_token_accuracy": 0.41724138259887694, "step": 59775 }, { "epoch": 0.060211050913489675, "grad_norm": 12.757640893468812, "learning_rate": 4.998717598080756e-05, "loss": 2.5494, "mean_token_accuracy": 0.4363581418991089, "step": 59780 }, { "epoch": 0.06021608696659385, "grad_norm": 14.013815541766238, "learning_rate": 4.9987163328186426e-05, "loss": 2.5006, "mean_token_accuracy": 0.42413792610168455, "step": 59785 }, { "epoch": 0.060221123019698015, "grad_norm": 10.166078086108945, "learning_rate": 4.99871506693284e-05, "loss": 2.4605, "mean_token_accuracy": 0.4068965554237366, "step": 59790 }, { "epoch": 0.06022615907280219, "grad_norm": 12.971790277309642, "learning_rate": 4.998713800423348e-05, "loss": 2.6372, "mean_token_accuracy": 0.3999999940395355, "step": 59795 }, { "epoch": 0.06023119512590636, "grad_norm": 10.596786219396497, "learning_rate": 4.9987125332901664e-05, "loss": 2.3203, "mean_token_accuracy": 0.4230490028858185, "step": 59800 }, { "epoch": 0.06023623117901054, "grad_norm": 11.690829737780179, "learning_rate": 4.998711265533296e-05, "loss": 2.7754, "mean_token_accuracy": 0.3310344755649567, "step": 59805 }, { "epoch": 0.06024126723211471, "grad_norm": 11.50016808744197, "learning_rate": 4.9987099971527375e-05, "loss": 2.4702, "mean_token_accuracy": 0.39310344457626345, "step": 59810 }, { "epoch": 0.060246303285218884, "grad_norm": 10.879362455649536, "learning_rate": 4.9987087281484904e-05, "loss": 2.5458, "mean_token_accuracy": 0.3931034505367279, "step": 59815 }, { "epoch": 0.06025133933832306, "grad_norm": 14.771462504508076, "learning_rate": 4.998707458520556e-05, "loss": 2.6046, "mean_token_accuracy": 0.4137930989265442, "step": 59820 }, { "epoch": 0.060256375391427225, "grad_norm": 11.848969213101583, "learning_rate": 4.998706188268934e-05, "loss": 2.7071, "mean_token_accuracy": 0.36896551251411436, "step": 59825 }, { "epoch": 0.0602614114445314, "grad_norm": 10.907298145077508, "learning_rate": 4.998704917393625e-05, "loss": 2.3334, "mean_token_accuracy": 0.4241379380226135, "step": 59830 }, { "epoch": 0.06026644749763557, "grad_norm": 18.027190068014384, "learning_rate": 4.9987036458946296e-05, "loss": 2.4879, "mean_token_accuracy": 0.441379314661026, "step": 59835 }, { "epoch": 0.060271483550739746, "grad_norm": 10.418753322740224, "learning_rate": 4.998702373771948e-05, "loss": 2.9406, "mean_token_accuracy": 0.39655172228813174, "step": 59840 }, { "epoch": 0.06027651960384392, "grad_norm": 10.525568920771065, "learning_rate": 4.99870110102558e-05, "loss": 2.1265, "mean_token_accuracy": 0.4896551728248596, "step": 59845 }, { "epoch": 0.060281555656948094, "grad_norm": 10.658760513044863, "learning_rate": 4.9986998276555257e-05, "loss": 2.84, "mean_token_accuracy": 0.3827586233615875, "step": 59850 }, { "epoch": 0.06028659171005227, "grad_norm": 11.348452341551063, "learning_rate": 4.998698553661786e-05, "loss": 2.6081, "mean_token_accuracy": 0.3844525098800659, "step": 59855 }, { "epoch": 0.060291627763156434, "grad_norm": 16.848964426182967, "learning_rate": 4.998697279044363e-05, "loss": 2.5082, "mean_token_accuracy": 0.39310344457626345, "step": 59860 }, { "epoch": 0.06029666381626061, "grad_norm": 10.504821671303686, "learning_rate": 4.998696003803254e-05, "loss": 2.4431, "mean_token_accuracy": 0.3793103456497192, "step": 59865 }, { "epoch": 0.06030169986936478, "grad_norm": 12.233948083948667, "learning_rate": 4.998694727938462e-05, "loss": 2.3754, "mean_token_accuracy": 0.46551724672317507, "step": 59870 }, { "epoch": 0.060306735922468956, "grad_norm": 11.48442236794374, "learning_rate": 4.9986934514499854e-05, "loss": 2.3734, "mean_token_accuracy": 0.44295220971107485, "step": 59875 }, { "epoch": 0.06031177197557313, "grad_norm": 15.487155269098027, "learning_rate": 4.998692174337826e-05, "loss": 2.9844, "mean_token_accuracy": 0.3827586233615875, "step": 59880 }, { "epoch": 0.0603168080286773, "grad_norm": 11.275179843203274, "learning_rate": 4.998690896601982e-05, "loss": 2.7116, "mean_token_accuracy": 0.4, "step": 59885 }, { "epoch": 0.06032184408178148, "grad_norm": 14.19080205594785, "learning_rate": 4.998689618242456e-05, "loss": 2.6233, "mean_token_accuracy": 0.3758620619773865, "step": 59890 }, { "epoch": 0.060326880134885644, "grad_norm": 12.957617768697727, "learning_rate": 4.9986883392592474e-05, "loss": 2.4637, "mean_token_accuracy": 0.4344827651977539, "step": 59895 }, { "epoch": 0.06033191618798982, "grad_norm": 9.967806201063379, "learning_rate": 4.998687059652357e-05, "loss": 2.5508, "mean_token_accuracy": 0.3896551787853241, "step": 59900 }, { "epoch": 0.06033695224109399, "grad_norm": 13.926913038634073, "learning_rate": 4.9986857794217844e-05, "loss": 2.6041, "mean_token_accuracy": 0.39310344457626345, "step": 59905 }, { "epoch": 0.060341988294198165, "grad_norm": 15.341317250958774, "learning_rate": 4.998684498567531e-05, "loss": 2.5893, "mean_token_accuracy": 0.40544464290142057, "step": 59910 }, { "epoch": 0.06034702434730234, "grad_norm": 10.336816873410708, "learning_rate": 4.998683217089596e-05, "loss": 2.3364, "mean_token_accuracy": 0.44110060334205625, "step": 59915 }, { "epoch": 0.06035206040040651, "grad_norm": 14.068893860348433, "learning_rate": 4.9986819349879806e-05, "loss": 2.6563, "mean_token_accuracy": 0.38620689511299133, "step": 59920 }, { "epoch": 0.060357096453510686, "grad_norm": 12.236963696234776, "learning_rate": 4.998680652262685e-05, "loss": 2.3348, "mean_token_accuracy": 0.4413793087005615, "step": 59925 }, { "epoch": 0.06036213250661485, "grad_norm": 10.288734821083162, "learning_rate": 4.998679368913709e-05, "loss": 2.1933, "mean_token_accuracy": 0.4620689570903778, "step": 59930 }, { "epoch": 0.06036716855971903, "grad_norm": 12.441265787756322, "learning_rate": 4.9986780849410534e-05, "loss": 2.6126, "mean_token_accuracy": 0.37931033968925476, "step": 59935 }, { "epoch": 0.0603722046128232, "grad_norm": 10.879870699239097, "learning_rate": 4.9986768003447187e-05, "loss": 2.6538, "mean_token_accuracy": 0.42915910482406616, "step": 59940 }, { "epoch": 0.060377240665927374, "grad_norm": 14.206694797349522, "learning_rate": 4.9986755151247054e-05, "loss": 2.5902, "mean_token_accuracy": 0.41034482717514037, "step": 59945 }, { "epoch": 0.06038227671903155, "grad_norm": 12.869044612626936, "learning_rate": 4.998674229281014e-05, "loss": 2.4545, "mean_token_accuracy": 0.4, "step": 59950 }, { "epoch": 0.06038731277213572, "grad_norm": 10.429904428146598, "learning_rate": 4.998672942813643e-05, "loss": 2.6385, "mean_token_accuracy": 0.4379310369491577, "step": 59955 }, { "epoch": 0.060392348825239896, "grad_norm": 11.98122458392187, "learning_rate": 4.998671655722595e-05, "loss": 2.8358, "mean_token_accuracy": 0.3931034505367279, "step": 59960 }, { "epoch": 0.06039738487834406, "grad_norm": 10.411633204926014, "learning_rate": 4.9986703680078696e-05, "loss": 2.4479, "mean_token_accuracy": 0.43103447556495667, "step": 59965 }, { "epoch": 0.060402420931448236, "grad_norm": 12.818387006602684, "learning_rate": 4.9986690796694667e-05, "loss": 2.2345, "mean_token_accuracy": 0.44827585816383364, "step": 59970 }, { "epoch": 0.06040745698455241, "grad_norm": 11.67885513876231, "learning_rate": 4.998667790707387e-05, "loss": 2.6997, "mean_token_accuracy": 0.4034482777118683, "step": 59975 }, { "epoch": 0.060412493037656584, "grad_norm": 16.688956989975097, "learning_rate": 4.998666501121631e-05, "loss": 3.2698, "mean_token_accuracy": 0.3310344755649567, "step": 59980 }, { "epoch": 0.06041752909076076, "grad_norm": 12.06363331229713, "learning_rate": 4.9986652109122e-05, "loss": 2.5187, "mean_token_accuracy": 0.42068964838981626, "step": 59985 }, { "epoch": 0.06042256514386493, "grad_norm": 14.86786945248949, "learning_rate": 4.998663920079092e-05, "loss": 2.9713, "mean_token_accuracy": 0.3827586114406586, "step": 59990 }, { "epoch": 0.060427601196969105, "grad_norm": 11.984329780165446, "learning_rate": 4.998662628622309e-05, "loss": 2.612, "mean_token_accuracy": 0.4517241358757019, "step": 59995 }, { "epoch": 0.06043263725007327, "grad_norm": 12.580185968798194, "learning_rate": 4.998661336541851e-05, "loss": 2.981, "mean_token_accuracy": 0.3620689570903778, "step": 60000 }, { "epoch": 0.060437673303177446, "grad_norm": 15.529730774839786, "learning_rate": 4.998660043837719e-05, "loss": 2.6903, "mean_token_accuracy": 0.3827586233615875, "step": 60005 }, { "epoch": 0.06044270935628162, "grad_norm": 12.360154325550063, "learning_rate": 4.998658750509913e-05, "loss": 2.5538, "mean_token_accuracy": 0.4221415638923645, "step": 60010 }, { "epoch": 0.06044774540938579, "grad_norm": 10.574775660906774, "learning_rate": 4.998657456558432e-05, "loss": 2.7388, "mean_token_accuracy": 0.41034482717514037, "step": 60015 }, { "epoch": 0.06045278146248997, "grad_norm": 8.96101463114581, "learning_rate": 4.998656161983278e-05, "loss": 2.3431, "mean_token_accuracy": 0.45741077661514284, "step": 60020 }, { "epoch": 0.06045781751559414, "grad_norm": 11.930530223985759, "learning_rate": 4.998654866784451e-05, "loss": 2.5322, "mean_token_accuracy": 0.43448275327682495, "step": 60025 }, { "epoch": 0.060462853568698315, "grad_norm": 10.778106719263068, "learning_rate": 4.9986535709619506e-05, "loss": 2.6913, "mean_token_accuracy": 0.39310344159603117, "step": 60030 }, { "epoch": 0.06046788962180248, "grad_norm": 9.993204034145037, "learning_rate": 4.998652274515778e-05, "loss": 2.3516, "mean_token_accuracy": 0.42758620977401735, "step": 60035 }, { "epoch": 0.060472925674906655, "grad_norm": 12.269064043754108, "learning_rate": 4.9986509774459325e-05, "loss": 2.4559, "mean_token_accuracy": 0.4, "step": 60040 }, { "epoch": 0.06047796172801083, "grad_norm": 9.82140621958726, "learning_rate": 4.9986496797524165e-05, "loss": 2.4714, "mean_token_accuracy": 0.417241370677948, "step": 60045 }, { "epoch": 0.060482997781115, "grad_norm": 11.787171908430809, "learning_rate": 4.998648381435229e-05, "loss": 2.6083, "mean_token_accuracy": 0.4034482777118683, "step": 60050 }, { "epoch": 0.060488033834219176, "grad_norm": 12.684875947742814, "learning_rate": 4.99864708249437e-05, "loss": 2.537, "mean_token_accuracy": 0.4206896543502808, "step": 60055 }, { "epoch": 0.06049306988732335, "grad_norm": 12.088832710673248, "learning_rate": 4.9986457829298404e-05, "loss": 3.0092, "mean_token_accuracy": 0.3275862097740173, "step": 60060 }, { "epoch": 0.060498105940427524, "grad_norm": 14.581697280791245, "learning_rate": 4.998644482741641e-05, "loss": 2.6427, "mean_token_accuracy": 0.4068965554237366, "step": 60065 }, { "epoch": 0.06050314199353169, "grad_norm": 11.586149085424024, "learning_rate": 4.9986431819297705e-05, "loss": 2.4205, "mean_token_accuracy": 0.3896551728248596, "step": 60070 }, { "epoch": 0.060508178046635865, "grad_norm": 11.718066779780797, "learning_rate": 4.998641880494232e-05, "loss": 2.3301, "mean_token_accuracy": 0.4517241418361664, "step": 60075 }, { "epoch": 0.06051321409974004, "grad_norm": 11.820044481061363, "learning_rate": 4.998640578435023e-05, "loss": 2.3732, "mean_token_accuracy": 0.46896552443504336, "step": 60080 }, { "epoch": 0.06051825015284421, "grad_norm": 13.244326264345723, "learning_rate": 4.998639275752146e-05, "loss": 2.3792, "mean_token_accuracy": 0.4344827592372894, "step": 60085 }, { "epoch": 0.060523286205948386, "grad_norm": 14.350680706776217, "learning_rate": 4.9986379724456e-05, "loss": 2.4969, "mean_token_accuracy": 0.4241379380226135, "step": 60090 }, { "epoch": 0.06052832225905256, "grad_norm": 10.040245267785558, "learning_rate": 4.9986366685153865e-05, "loss": 2.7277, "mean_token_accuracy": 0.4, "step": 60095 }, { "epoch": 0.06053335831215673, "grad_norm": 12.280614296063147, "learning_rate": 4.998635363961505e-05, "loss": 2.5743, "mean_token_accuracy": 0.38620689511299133, "step": 60100 }, { "epoch": 0.0605383943652609, "grad_norm": 12.334375084236823, "learning_rate": 4.998634058783955e-05, "loss": 2.4669, "mean_token_accuracy": 0.3965517163276672, "step": 60105 }, { "epoch": 0.060543430418365074, "grad_norm": 12.741120290193614, "learning_rate": 4.9986327529827396e-05, "loss": 2.7242, "mean_token_accuracy": 0.3655172407627106, "step": 60110 }, { "epoch": 0.06054846647146925, "grad_norm": 13.062818067139155, "learning_rate": 4.998631446557856e-05, "loss": 2.684, "mean_token_accuracy": 0.3724137842655182, "step": 60115 }, { "epoch": 0.06055350252457342, "grad_norm": 13.82010109560419, "learning_rate": 4.998630139509307e-05, "loss": 2.5226, "mean_token_accuracy": 0.4363581359386444, "step": 60120 }, { "epoch": 0.060558538577677595, "grad_norm": 10.64260053966043, "learning_rate": 4.998628831837092e-05, "loss": 2.5423, "mean_token_accuracy": 0.3862069010734558, "step": 60125 }, { "epoch": 0.06056357463078177, "grad_norm": 11.216303083397731, "learning_rate": 4.998627523541211e-05, "loss": 2.3611, "mean_token_accuracy": 0.4068965554237366, "step": 60130 }, { "epoch": 0.06056861068388594, "grad_norm": 10.923251052414535, "learning_rate": 4.998626214621666e-05, "loss": 2.7107, "mean_token_accuracy": 0.3827586233615875, "step": 60135 }, { "epoch": 0.06057364673699011, "grad_norm": 28.46770741694154, "learning_rate": 4.9986249050784545e-05, "loss": 2.972, "mean_token_accuracy": 0.38759830594062805, "step": 60140 }, { "epoch": 0.06057868279009428, "grad_norm": 11.801732654688525, "learning_rate": 4.9986235949115794e-05, "loss": 2.7585, "mean_token_accuracy": 0.3620689630508423, "step": 60145 }, { "epoch": 0.06058371884319846, "grad_norm": 11.786285933769491, "learning_rate": 4.9986222841210395e-05, "loss": 2.4121, "mean_token_accuracy": 0.46049606800079346, "step": 60150 }, { "epoch": 0.06058875489630263, "grad_norm": 24.889343474540897, "learning_rate": 4.998620972706836e-05, "loss": 2.9434, "mean_token_accuracy": 0.34640048146247865, "step": 60155 }, { "epoch": 0.060593790949406805, "grad_norm": 12.101806703842897, "learning_rate": 4.99861966066897e-05, "loss": 2.853, "mean_token_accuracy": 0.36551723480224607, "step": 60160 }, { "epoch": 0.06059882700251098, "grad_norm": 11.007724488234762, "learning_rate": 4.998618348007441e-05, "loss": 2.2967, "mean_token_accuracy": 0.4862069070339203, "step": 60165 }, { "epoch": 0.06060386305561515, "grad_norm": 10.705870518944355, "learning_rate": 4.998617034722248e-05, "loss": 2.0629, "mean_token_accuracy": 0.48275862336158754, "step": 60170 }, { "epoch": 0.06060889910871932, "grad_norm": 12.08846234051216, "learning_rate": 4.9986157208133934e-05, "loss": 2.3347, "mean_token_accuracy": 0.4482758641242981, "step": 60175 }, { "epoch": 0.06061393516182349, "grad_norm": 12.276982540208824, "learning_rate": 4.9986144062808766e-05, "loss": 3.1694, "mean_token_accuracy": 0.3482758581638336, "step": 60180 }, { "epoch": 0.06061897121492767, "grad_norm": 11.411186142119309, "learning_rate": 4.9986130911246984e-05, "loss": 2.3465, "mean_token_accuracy": 0.42068966031074523, "step": 60185 }, { "epoch": 0.06062400726803184, "grad_norm": 10.937005503246166, "learning_rate": 4.998611775344858e-05, "loss": 2.4895, "mean_token_accuracy": 0.39655172228813174, "step": 60190 }, { "epoch": 0.060629043321136014, "grad_norm": 12.61097424752664, "learning_rate": 4.998610458941358e-05, "loss": 2.6122, "mean_token_accuracy": 0.4068965494632721, "step": 60195 }, { "epoch": 0.06063407937424019, "grad_norm": 11.564774950674384, "learning_rate": 4.998609141914197e-05, "loss": 3.028, "mean_token_accuracy": 0.324137932062149, "step": 60200 }, { "epoch": 0.06063911542734436, "grad_norm": 12.018986871849423, "learning_rate": 4.998607824263376e-05, "loss": 2.5712, "mean_token_accuracy": 0.3655172407627106, "step": 60205 }, { "epoch": 0.06064415148044853, "grad_norm": 11.369298761463416, "learning_rate": 4.998606505988896e-05, "loss": 2.1714, "mean_token_accuracy": 0.4781004309654236, "step": 60210 }, { "epoch": 0.0606491875335527, "grad_norm": 10.687775661142522, "learning_rate": 4.998605187090755e-05, "loss": 2.7691, "mean_token_accuracy": 0.3758620619773865, "step": 60215 }, { "epoch": 0.060654223586656876, "grad_norm": 17.628866667855135, "learning_rate": 4.998603867568956e-05, "loss": 2.7761, "mean_token_accuracy": 0.3655172407627106, "step": 60220 }, { "epoch": 0.06065925963976105, "grad_norm": 10.446900215293864, "learning_rate": 4.9986025474234984e-05, "loss": 2.3303, "mean_token_accuracy": 0.41379310488700866, "step": 60225 }, { "epoch": 0.060664295692865224, "grad_norm": 13.925298463867044, "learning_rate": 4.998601226654382e-05, "loss": 2.8146, "mean_token_accuracy": 0.3551724076271057, "step": 60230 }, { "epoch": 0.0606693317459694, "grad_norm": 13.651459381222505, "learning_rate": 4.998599905261608e-05, "loss": 2.8703, "mean_token_accuracy": 0.37586207389831544, "step": 60235 }, { "epoch": 0.06067436779907357, "grad_norm": 12.524136696148837, "learning_rate": 4.998598583245176e-05, "loss": 2.7203, "mean_token_accuracy": 0.3896551787853241, "step": 60240 }, { "epoch": 0.06067940385217774, "grad_norm": 11.7518991894446, "learning_rate": 4.998597260605088e-05, "loss": 2.5573, "mean_token_accuracy": 0.39310344457626345, "step": 60245 }, { "epoch": 0.06068443990528191, "grad_norm": 10.30108572371737, "learning_rate": 4.998595937341342e-05, "loss": 2.2024, "mean_token_accuracy": 0.4896551787853241, "step": 60250 }, { "epoch": 0.060689475958386085, "grad_norm": 16.99647508997087, "learning_rate": 4.998594613453939e-05, "loss": 2.5761, "mean_token_accuracy": 0.36551723480224607, "step": 60255 }, { "epoch": 0.06069451201149026, "grad_norm": 12.451184333997794, "learning_rate": 4.998593288942881e-05, "loss": 2.6463, "mean_token_accuracy": 0.43103447556495667, "step": 60260 }, { "epoch": 0.06069954806459443, "grad_norm": 11.329699917084962, "learning_rate": 4.9985919638081666e-05, "loss": 2.8007, "mean_token_accuracy": 0.38070176243782045, "step": 60265 }, { "epoch": 0.06070458411769861, "grad_norm": 12.135995239458376, "learning_rate": 4.998590638049797e-05, "loss": 2.207, "mean_token_accuracy": 0.4655172348022461, "step": 60270 }, { "epoch": 0.06070962017080278, "grad_norm": 11.286267948409566, "learning_rate": 4.998589311667773e-05, "loss": 2.3756, "mean_token_accuracy": 0.42413793206214906, "step": 60275 }, { "epoch": 0.06071465622390695, "grad_norm": 11.414847165245911, "learning_rate": 4.998587984662094e-05, "loss": 2.3012, "mean_token_accuracy": 0.4793103337287903, "step": 60280 }, { "epoch": 0.06071969227701112, "grad_norm": 14.536249151442135, "learning_rate": 4.9985866570327604e-05, "loss": 2.8038, "mean_token_accuracy": 0.3068965464830399, "step": 60285 }, { "epoch": 0.060724728330115295, "grad_norm": 13.07535618216538, "learning_rate": 4.9985853287797736e-05, "loss": 2.8498, "mean_token_accuracy": 0.3986085891723633, "step": 60290 }, { "epoch": 0.06072976438321947, "grad_norm": 10.869326955763011, "learning_rate": 4.998583999903133e-05, "loss": 2.18, "mean_token_accuracy": 0.45396249294281005, "step": 60295 }, { "epoch": 0.06073480043632364, "grad_norm": 11.741379002789953, "learning_rate": 4.998582670402839e-05, "loss": 2.3583, "mean_token_accuracy": 0.4275862157344818, "step": 60300 }, { "epoch": 0.060739836489427816, "grad_norm": 10.981064595351743, "learning_rate": 4.9985813402788914e-05, "loss": 2.478, "mean_token_accuracy": 0.4448275864124298, "step": 60305 }, { "epoch": 0.06074487254253199, "grad_norm": 14.230856898344285, "learning_rate": 4.998580009531293e-05, "loss": 2.6615, "mean_token_accuracy": 0.4, "step": 60310 }, { "epoch": 0.06074990859563616, "grad_norm": 12.479934422119927, "learning_rate": 4.998578678160041e-05, "loss": 2.4785, "mean_token_accuracy": 0.41034482717514037, "step": 60315 }, { "epoch": 0.06075494464874033, "grad_norm": 10.677533580463997, "learning_rate": 4.9985773461651386e-05, "loss": 2.443, "mean_token_accuracy": 0.44482758045196535, "step": 60320 }, { "epoch": 0.060759980701844504, "grad_norm": 12.637577450779567, "learning_rate": 4.9985760135465846e-05, "loss": 2.436, "mean_token_accuracy": 0.43448275327682495, "step": 60325 }, { "epoch": 0.06076501675494868, "grad_norm": 10.81078548379524, "learning_rate": 4.9985746803043786e-05, "loss": 2.3427, "mean_token_accuracy": 0.3827586203813553, "step": 60330 }, { "epoch": 0.06077005280805285, "grad_norm": 11.68373646296189, "learning_rate": 4.998573346438524e-05, "loss": 2.1599, "mean_token_accuracy": 0.4517241418361664, "step": 60335 }, { "epoch": 0.060775088861157026, "grad_norm": 12.653157064650847, "learning_rate": 4.998572011949018e-05, "loss": 2.7148, "mean_token_accuracy": 0.40344826579093934, "step": 60340 }, { "epoch": 0.0607801249142612, "grad_norm": 13.949734212715072, "learning_rate": 4.998570676835862e-05, "loss": 2.6103, "mean_token_accuracy": 0.3655172407627106, "step": 60345 }, { "epoch": 0.060785160967365366, "grad_norm": 15.034014465596217, "learning_rate": 4.998569341099056e-05, "loss": 2.6412, "mean_token_accuracy": 0.4413793087005615, "step": 60350 }, { "epoch": 0.06079019702046954, "grad_norm": 12.86815841928514, "learning_rate": 4.998568004738602e-05, "loss": 3.0264, "mean_token_accuracy": 0.3551724195480347, "step": 60355 }, { "epoch": 0.060795233073573714, "grad_norm": 12.862917287364654, "learning_rate": 4.9985666677544994e-05, "loss": 2.7915, "mean_token_accuracy": 0.4034482717514038, "step": 60360 }, { "epoch": 0.06080026912667789, "grad_norm": 12.81333125531648, "learning_rate": 4.998565330146748e-05, "loss": 2.6665, "mean_token_accuracy": 0.3655172407627106, "step": 60365 }, { "epoch": 0.06080530517978206, "grad_norm": 11.228291208523077, "learning_rate": 4.9985639919153484e-05, "loss": 2.028, "mean_token_accuracy": 0.47241379618644713, "step": 60370 }, { "epoch": 0.060810341232886235, "grad_norm": 9.111681455818363, "learning_rate": 4.9985626530603016e-05, "loss": 2.5606, "mean_token_accuracy": 0.4574107676744461, "step": 60375 }, { "epoch": 0.06081537728599041, "grad_norm": 11.525290645037707, "learning_rate": 4.9985613135816075e-05, "loss": 2.5915, "mean_token_accuracy": 0.38275861740112305, "step": 60380 }, { "epoch": 0.060820413339094576, "grad_norm": 14.753366910506818, "learning_rate": 4.998559973479266e-05, "loss": 2.7312, "mean_token_accuracy": 0.3551724135875702, "step": 60385 }, { "epoch": 0.06082544939219875, "grad_norm": 11.988090048112577, "learning_rate": 4.9985586327532794e-05, "loss": 2.4103, "mean_token_accuracy": 0.5057471334934235, "step": 60390 }, { "epoch": 0.06083048544530292, "grad_norm": 11.051442766305067, "learning_rate": 4.9985572914036446e-05, "loss": 2.2724, "mean_token_accuracy": 0.4294010877609253, "step": 60395 }, { "epoch": 0.0608355214984071, "grad_norm": 11.824832207162391, "learning_rate": 4.998555949430365e-05, "loss": 3.1633, "mean_token_accuracy": 0.35172414481639863, "step": 60400 }, { "epoch": 0.06084055755151127, "grad_norm": 19.420752650911407, "learning_rate": 4.998554606833441e-05, "loss": 2.8683, "mean_token_accuracy": 0.3896551728248596, "step": 60405 }, { "epoch": 0.060845593604615444, "grad_norm": 13.053557668476259, "learning_rate": 4.998553263612871e-05, "loss": 2.6707, "mean_token_accuracy": 0.31034481823444365, "step": 60410 }, { "epoch": 0.06085062965771962, "grad_norm": 11.689821421754035, "learning_rate": 4.998551919768656e-05, "loss": 2.4285, "mean_token_accuracy": 0.39122806787490844, "step": 60415 }, { "epoch": 0.060855665710823785, "grad_norm": 13.695400171664188, "learning_rate": 4.998550575300798e-05, "loss": 2.9181, "mean_token_accuracy": 0.38620689511299133, "step": 60420 }, { "epoch": 0.06086070176392796, "grad_norm": 13.170333591299064, "learning_rate": 4.998549230209295e-05, "loss": 2.6872, "mean_token_accuracy": 0.3655172407627106, "step": 60425 }, { "epoch": 0.06086573781703213, "grad_norm": 17.380961133057244, "learning_rate": 4.998547884494148e-05, "loss": 2.5364, "mean_token_accuracy": 0.4310344815254211, "step": 60430 }, { "epoch": 0.060870773870136306, "grad_norm": 9.877438870099555, "learning_rate": 4.998546538155359e-05, "loss": 2.1051, "mean_token_accuracy": 0.5334543228149414, "step": 60435 }, { "epoch": 0.06087580992324048, "grad_norm": 11.476739883362733, "learning_rate": 4.998545191192927e-05, "loss": 3.0133, "mean_token_accuracy": 0.32413793802261354, "step": 60440 }, { "epoch": 0.060880845976344654, "grad_norm": 13.418254303902149, "learning_rate": 4.998543843606853e-05, "loss": 2.4849, "mean_token_accuracy": 0.35172414481639863, "step": 60445 }, { "epoch": 0.06088588202944882, "grad_norm": 11.520542978845263, "learning_rate": 4.998542495397136e-05, "loss": 2.3928, "mean_token_accuracy": 0.41034482419490814, "step": 60450 }, { "epoch": 0.060890918082552994, "grad_norm": 13.561369119546196, "learning_rate": 4.998541146563778e-05, "loss": 2.4797, "mean_token_accuracy": 0.3965517282485962, "step": 60455 }, { "epoch": 0.06089595413565717, "grad_norm": 11.238565536828915, "learning_rate": 4.998539797106778e-05, "loss": 2.6553, "mean_token_accuracy": 0.4413793087005615, "step": 60460 }, { "epoch": 0.06090099018876134, "grad_norm": 13.100258335234958, "learning_rate": 4.998538447026138e-05, "loss": 2.7412, "mean_token_accuracy": 0.37241379618644715, "step": 60465 }, { "epoch": 0.060906026241865516, "grad_norm": 11.084291485640083, "learning_rate": 4.9985370963218575e-05, "loss": 3.0759, "mean_token_accuracy": 0.34137930274009703, "step": 60470 }, { "epoch": 0.06091106229496969, "grad_norm": 12.976846685434413, "learning_rate": 4.998535744993936e-05, "loss": 2.7507, "mean_token_accuracy": 0.4068965554237366, "step": 60475 }, { "epoch": 0.06091609834807386, "grad_norm": 11.412004920752665, "learning_rate": 4.9985343930423756e-05, "loss": 2.2269, "mean_token_accuracy": 0.44482759237289426, "step": 60480 }, { "epoch": 0.06092113440117803, "grad_norm": 17.03733666817087, "learning_rate": 4.998533040467175e-05, "loss": 2.3482, "mean_token_accuracy": 0.42758620977401735, "step": 60485 }, { "epoch": 0.060926170454282204, "grad_norm": 12.677766350963694, "learning_rate": 4.998531687268336e-05, "loss": 2.9805, "mean_token_accuracy": 0.31379310190677645, "step": 60490 }, { "epoch": 0.06093120650738638, "grad_norm": 10.642242725827334, "learning_rate": 4.998530333445858e-05, "loss": 2.662, "mean_token_accuracy": 0.3793103456497192, "step": 60495 }, { "epoch": 0.06093624256049055, "grad_norm": 11.043577991225295, "learning_rate": 4.998528978999742e-05, "loss": 2.6405, "mean_token_accuracy": 0.3655172407627106, "step": 60500 }, { "epoch": 0.060941278613594725, "grad_norm": 11.167720525855971, "learning_rate": 4.998527623929988e-05, "loss": 2.3208, "mean_token_accuracy": 0.4586206912994385, "step": 60505 }, { "epoch": 0.0609463146666989, "grad_norm": 12.521070552822009, "learning_rate": 4.998526268236596e-05, "loss": 2.4588, "mean_token_accuracy": 0.4137930989265442, "step": 60510 }, { "epoch": 0.06095135071980307, "grad_norm": 10.416724015822673, "learning_rate": 4.9985249119195674e-05, "loss": 2.2841, "mean_token_accuracy": 0.47241379618644713, "step": 60515 }, { "epoch": 0.06095638677290724, "grad_norm": 14.473165870052979, "learning_rate": 4.998523554978902e-05, "loss": 2.5636, "mean_token_accuracy": 0.4, "step": 60520 }, { "epoch": 0.06096142282601141, "grad_norm": 12.749256885024645, "learning_rate": 4.9985221974146004e-05, "loss": 2.8357, "mean_token_accuracy": 0.38275861740112305, "step": 60525 }, { "epoch": 0.06096645887911559, "grad_norm": 13.803196445154077, "learning_rate": 4.998520839226662e-05, "loss": 3.0427, "mean_token_accuracy": 0.35862069129943847, "step": 60530 }, { "epoch": 0.06097149493221976, "grad_norm": 9.377474000977855, "learning_rate": 4.998519480415089e-05, "loss": 2.2234, "mean_token_accuracy": 0.47241379618644713, "step": 60535 }, { "epoch": 0.060976530985323935, "grad_norm": 10.02835186439114, "learning_rate": 4.99851812097988e-05, "loss": 3.2793, "mean_token_accuracy": 0.3137931048870087, "step": 60540 }, { "epoch": 0.06098156703842811, "grad_norm": 11.205267450980456, "learning_rate": 4.998516760921036e-05, "loss": 2.5558, "mean_token_accuracy": 0.4034482717514038, "step": 60545 }, { "epoch": 0.06098660309153228, "grad_norm": 12.091668180846625, "learning_rate": 4.998515400238558e-05, "loss": 2.7795, "mean_token_accuracy": 0.36551723480224607, "step": 60550 }, { "epoch": 0.06099163914463645, "grad_norm": 11.662943833008136, "learning_rate": 4.998514038932446e-05, "loss": 2.4262, "mean_token_accuracy": 0.35862069129943847, "step": 60555 }, { "epoch": 0.06099667519774062, "grad_norm": 11.842173556365761, "learning_rate": 4.9985126770027e-05, "loss": 2.4756, "mean_token_accuracy": 0.44482759237289426, "step": 60560 }, { "epoch": 0.061001711250844796, "grad_norm": 12.62525317520617, "learning_rate": 4.99851131444932e-05, "loss": 2.651, "mean_token_accuracy": 0.3827586233615875, "step": 60565 }, { "epoch": 0.06100674730394897, "grad_norm": 11.575389337232481, "learning_rate": 4.998509951272308e-05, "loss": 2.848, "mean_token_accuracy": 0.34137930274009703, "step": 60570 }, { "epoch": 0.061011783357053144, "grad_norm": 11.513188914590241, "learning_rate": 4.998508587471662e-05, "loss": 2.905, "mean_token_accuracy": 0.37241379618644715, "step": 60575 }, { "epoch": 0.06101681941015732, "grad_norm": 14.516846633896915, "learning_rate": 4.998507223047386e-05, "loss": 2.7737, "mean_token_accuracy": 0.3551724076271057, "step": 60580 }, { "epoch": 0.06102185546326149, "grad_norm": 11.786578284217796, "learning_rate": 4.998505857999476e-05, "loss": 2.4641, "mean_token_accuracy": 0.4291590988636017, "step": 60585 }, { "epoch": 0.06102689151636566, "grad_norm": 12.067663404063822, "learning_rate": 4.9985044923279354e-05, "loss": 2.6296, "mean_token_accuracy": 0.4379310369491577, "step": 60590 }, { "epoch": 0.06103192756946983, "grad_norm": 10.738264213435809, "learning_rate": 4.9985031260327636e-05, "loss": 2.3867, "mean_token_accuracy": 0.4172413766384125, "step": 60595 }, { "epoch": 0.061036963622574006, "grad_norm": 12.952086503885884, "learning_rate": 4.998501759113961e-05, "loss": 3.0961, "mean_token_accuracy": 0.3448275923728943, "step": 60600 }, { "epoch": 0.06104199967567818, "grad_norm": 10.884299905413041, "learning_rate": 4.9985003915715276e-05, "loss": 2.264, "mean_token_accuracy": 0.4034482717514038, "step": 60605 }, { "epoch": 0.06104703572878235, "grad_norm": 11.46011344216903, "learning_rate": 4.9984990234054654e-05, "loss": 2.2516, "mean_token_accuracy": 0.4620689690113068, "step": 60610 }, { "epoch": 0.06105207178188653, "grad_norm": 16.747914550529572, "learning_rate": 4.998497654615773e-05, "loss": 3.0367, "mean_token_accuracy": 0.3655172437429428, "step": 60615 }, { "epoch": 0.0610571078349907, "grad_norm": 12.088291194497128, "learning_rate": 4.9984962852024505e-05, "loss": 2.5426, "mean_token_accuracy": 0.3655172407627106, "step": 60620 }, { "epoch": 0.06106214388809487, "grad_norm": 14.280081083323502, "learning_rate": 4.9984949151655005e-05, "loss": 2.3373, "mean_token_accuracy": 0.4724137902259827, "step": 60625 }, { "epoch": 0.06106717994119904, "grad_norm": 13.259700781870801, "learning_rate": 4.998493544504922e-05, "loss": 2.5638, "mean_token_accuracy": 0.39310344457626345, "step": 60630 }, { "epoch": 0.061072215994303215, "grad_norm": 12.51917726089093, "learning_rate": 4.998492173220715e-05, "loss": 2.4645, "mean_token_accuracy": 0.39310343861579894, "step": 60635 }, { "epoch": 0.06107725204740739, "grad_norm": 11.935221135321953, "learning_rate": 4.99849080131288e-05, "loss": 2.3157, "mean_token_accuracy": 0.4103448152542114, "step": 60640 }, { "epoch": 0.06108228810051156, "grad_norm": 10.784701545140745, "learning_rate": 4.998489428781418e-05, "loss": 2.4664, "mean_token_accuracy": 0.4330308556556702, "step": 60645 }, { "epoch": 0.06108732415361574, "grad_norm": 14.174958720568027, "learning_rate": 4.998488055626329e-05, "loss": 3.0776, "mean_token_accuracy": 0.33103448152542114, "step": 60650 }, { "epoch": 0.06109236020671991, "grad_norm": 12.999759848422103, "learning_rate": 4.9984866818476136e-05, "loss": 2.3552, "mean_token_accuracy": 0.4543859601020813, "step": 60655 }, { "epoch": 0.06109739625982408, "grad_norm": 10.469801022448646, "learning_rate": 4.998485307445271e-05, "loss": 2.7086, "mean_token_accuracy": 0.3724137932062149, "step": 60660 }, { "epoch": 0.06110243231292825, "grad_norm": 12.20101436416402, "learning_rate": 4.998483932419305e-05, "loss": 2.1409, "mean_token_accuracy": 0.4640048325061798, "step": 60665 }, { "epoch": 0.061107468366032425, "grad_norm": 11.6016952055602, "learning_rate": 4.998482556769711e-05, "loss": 2.6587, "mean_token_accuracy": 0.38620689511299133, "step": 60670 }, { "epoch": 0.0611125044191366, "grad_norm": 15.07959621201417, "learning_rate": 4.998481180496494e-05, "loss": 2.6004, "mean_token_accuracy": 0.42068964838981626, "step": 60675 }, { "epoch": 0.06111754047224077, "grad_norm": 10.706297776946737, "learning_rate": 4.998479803599651e-05, "loss": 2.414, "mean_token_accuracy": 0.4, "step": 60680 }, { "epoch": 0.061122576525344946, "grad_norm": 12.583561090343297, "learning_rate": 4.998478426079184e-05, "loss": 2.3766, "mean_token_accuracy": 0.46079854369163514, "step": 60685 }, { "epoch": 0.06112761257844912, "grad_norm": 11.375496134716098, "learning_rate": 4.998477047935094e-05, "loss": 2.852, "mean_token_accuracy": 0.3482758581638336, "step": 60690 }, { "epoch": 0.06113264863155329, "grad_norm": 11.921027954536335, "learning_rate": 4.998475669167379e-05, "loss": 2.2144, "mean_token_accuracy": 0.46551724672317507, "step": 60695 }, { "epoch": 0.06113768468465746, "grad_norm": 11.75837918289072, "learning_rate": 4.998474289776042e-05, "loss": 2.4636, "mean_token_accuracy": 0.358620685338974, "step": 60700 }, { "epoch": 0.061142720737761634, "grad_norm": 11.73011377112593, "learning_rate": 4.998472909761082e-05, "loss": 2.5863, "mean_token_accuracy": 0.41379310488700866, "step": 60705 }, { "epoch": 0.06114775679086581, "grad_norm": 13.210835905967208, "learning_rate": 4.9984715291225e-05, "loss": 2.6836, "mean_token_accuracy": 0.3448275804519653, "step": 60710 }, { "epoch": 0.06115279284396998, "grad_norm": 10.622805748016017, "learning_rate": 4.9984701478602955e-05, "loss": 2.184, "mean_token_accuracy": 0.482758617401123, "step": 60715 }, { "epoch": 0.061157828897074155, "grad_norm": 11.914364265685528, "learning_rate": 4.998468765974469e-05, "loss": 2.3602, "mean_token_accuracy": 0.41724138259887694, "step": 60720 }, { "epoch": 0.06116286495017833, "grad_norm": 9.665203000557105, "learning_rate": 4.998467383465022e-05, "loss": 2.2192, "mean_token_accuracy": 0.4310344815254211, "step": 60725 }, { "epoch": 0.061167901003282496, "grad_norm": 11.508144277754866, "learning_rate": 4.998466000331954e-05, "loss": 2.2544, "mean_token_accuracy": 0.43793103098869324, "step": 60730 }, { "epoch": 0.06117293705638667, "grad_norm": 13.371516394193609, "learning_rate": 4.998464616575266e-05, "loss": 2.5949, "mean_token_accuracy": 0.36896551847457887, "step": 60735 }, { "epoch": 0.061177973109490844, "grad_norm": 13.640361312297172, "learning_rate": 4.998463232194957e-05, "loss": 2.6681, "mean_token_accuracy": 0.4, "step": 60740 }, { "epoch": 0.06118300916259502, "grad_norm": 13.46600094455518, "learning_rate": 4.99846184719103e-05, "loss": 2.372, "mean_token_accuracy": 0.4185722887516022, "step": 60745 }, { "epoch": 0.06118804521569919, "grad_norm": 12.70777933808871, "learning_rate": 4.9984604615634815e-05, "loss": 2.8276, "mean_token_accuracy": 0.3827586233615875, "step": 60750 }, { "epoch": 0.061193081268803365, "grad_norm": 11.533950931569853, "learning_rate": 4.998459075312316e-05, "loss": 2.4559, "mean_token_accuracy": 0.4448880791664124, "step": 60755 }, { "epoch": 0.06119811732190754, "grad_norm": 12.28890369858678, "learning_rate": 4.9984576884375306e-05, "loss": 2.631, "mean_token_accuracy": 0.4068965554237366, "step": 60760 }, { "epoch": 0.061203153375011705, "grad_norm": 12.632913584543306, "learning_rate": 4.998456300939128e-05, "loss": 2.69, "mean_token_accuracy": 0.40000001192092893, "step": 60765 }, { "epoch": 0.06120818942811588, "grad_norm": 11.72593839271051, "learning_rate": 4.998454912817107e-05, "loss": 2.8603, "mean_token_accuracy": 0.38064125180244446, "step": 60770 }, { "epoch": 0.06121322548122005, "grad_norm": 13.16968082160041, "learning_rate": 4.998453524071469e-05, "loss": 2.3786, "mean_token_accuracy": 0.4724137902259827, "step": 60775 }, { "epoch": 0.06121826153432423, "grad_norm": 13.710325395783022, "learning_rate": 4.998452134702214e-05, "loss": 2.612, "mean_token_accuracy": 0.358620685338974, "step": 60780 }, { "epoch": 0.0612232975874284, "grad_norm": 13.88846842862795, "learning_rate": 4.998450744709342e-05, "loss": 2.7457, "mean_token_accuracy": 0.36896551251411436, "step": 60785 }, { "epoch": 0.061228333640532574, "grad_norm": 11.178429985622644, "learning_rate": 4.9984493540928545e-05, "loss": 2.422, "mean_token_accuracy": 0.4206896543502808, "step": 60790 }, { "epoch": 0.06123336969363675, "grad_norm": 12.246653927549643, "learning_rate": 4.9984479628527515e-05, "loss": 2.4918, "mean_token_accuracy": 0.4248768508434296, "step": 60795 }, { "epoch": 0.061238405746740915, "grad_norm": 14.601659334825566, "learning_rate": 4.9984465709890324e-05, "loss": 2.3189, "mean_token_accuracy": 0.41379308700561523, "step": 60800 }, { "epoch": 0.06124344179984509, "grad_norm": 9.825405226156088, "learning_rate": 4.9984451785016984e-05, "loss": 2.1877, "mean_token_accuracy": 0.44827585220336913, "step": 60805 }, { "epoch": 0.06124847785294926, "grad_norm": 14.2518741044702, "learning_rate": 4.99844378539075e-05, "loss": 2.4936, "mean_token_accuracy": 0.4448275864124298, "step": 60810 }, { "epoch": 0.061253513906053436, "grad_norm": 12.503127658708125, "learning_rate": 4.998442391656186e-05, "loss": 2.8253, "mean_token_accuracy": 0.37241379022598264, "step": 60815 }, { "epoch": 0.06125854995915761, "grad_norm": 11.365220650088974, "learning_rate": 4.998440997298009e-05, "loss": 2.8196, "mean_token_accuracy": 0.38275861740112305, "step": 60820 }, { "epoch": 0.061263586012261784, "grad_norm": 11.19644098456086, "learning_rate": 4.998439602316219e-05, "loss": 2.2451, "mean_token_accuracy": 0.4379310369491577, "step": 60825 }, { "epoch": 0.06126862206536596, "grad_norm": 10.843849066229062, "learning_rate": 4.9984382067108157e-05, "loss": 2.2162, "mean_token_accuracy": 0.4344827592372894, "step": 60830 }, { "epoch": 0.061273658118470124, "grad_norm": 9.978040903871667, "learning_rate": 4.9984368104818e-05, "loss": 2.1801, "mean_token_accuracy": 0.41724138259887694, "step": 60835 }, { "epoch": 0.0612786941715743, "grad_norm": 15.48391771231129, "learning_rate": 4.998435413629171e-05, "loss": 2.765, "mean_token_accuracy": 0.37586206793785093, "step": 60840 }, { "epoch": 0.06128373022467847, "grad_norm": 12.79379909886921, "learning_rate": 4.99843401615293e-05, "loss": 2.6533, "mean_token_accuracy": 0.37586206793785093, "step": 60845 }, { "epoch": 0.061288766277782646, "grad_norm": 12.432238199306587, "learning_rate": 4.998432618053078e-05, "loss": 2.5838, "mean_token_accuracy": 0.4137930989265442, "step": 60850 }, { "epoch": 0.06129380233088682, "grad_norm": 9.906281777612866, "learning_rate": 4.998431219329615e-05, "loss": 2.0733, "mean_token_accuracy": 0.4965517222881317, "step": 60855 }, { "epoch": 0.06129883838399099, "grad_norm": 11.667680713918477, "learning_rate": 4.998429819982541e-05, "loss": 2.5069, "mean_token_accuracy": 0.42413793206214906, "step": 60860 }, { "epoch": 0.06130387443709517, "grad_norm": 9.985311962092275, "learning_rate": 4.9984284200118566e-05, "loss": 2.678, "mean_token_accuracy": 0.4275862067937851, "step": 60865 }, { "epoch": 0.061308910490199334, "grad_norm": 9.87021724048607, "learning_rate": 4.998427019417562e-05, "loss": 2.3677, "mean_token_accuracy": 0.4586206912994385, "step": 60870 }, { "epoch": 0.06131394654330351, "grad_norm": 12.870368588308825, "learning_rate": 4.9984256181996584e-05, "loss": 2.8536, "mean_token_accuracy": 0.36551724672317504, "step": 60875 }, { "epoch": 0.06131898259640768, "grad_norm": 11.81527390751178, "learning_rate": 4.9984242163581445e-05, "loss": 2.352, "mean_token_accuracy": 0.4068965554237366, "step": 60880 }, { "epoch": 0.061324018649511855, "grad_norm": 14.361974485249515, "learning_rate": 4.998422813893022e-05, "loss": 2.8835, "mean_token_accuracy": 0.3551724076271057, "step": 60885 }, { "epoch": 0.06132905470261603, "grad_norm": 8.293796362084402, "learning_rate": 4.998421410804292e-05, "loss": 2.2778, "mean_token_accuracy": 0.4816697001457214, "step": 60890 }, { "epoch": 0.0613340907557202, "grad_norm": 12.337936978376892, "learning_rate": 4.9984200070919536e-05, "loss": 2.2816, "mean_token_accuracy": 0.4620689630508423, "step": 60895 }, { "epoch": 0.061339126808824376, "grad_norm": 14.269805893767883, "learning_rate": 4.998418602756007e-05, "loss": 2.541, "mean_token_accuracy": 0.4034482777118683, "step": 60900 }, { "epoch": 0.06134416286192854, "grad_norm": 12.014824054042165, "learning_rate": 4.998417197796454e-05, "loss": 2.7445, "mean_token_accuracy": 0.34482758939266206, "step": 60905 }, { "epoch": 0.06134919891503272, "grad_norm": 12.578812375891, "learning_rate": 4.998415792213293e-05, "loss": 2.9219, "mean_token_accuracy": 0.3896551728248596, "step": 60910 }, { "epoch": 0.06135423496813689, "grad_norm": 11.515401433789897, "learning_rate": 4.998414386006526e-05, "loss": 2.5048, "mean_token_accuracy": 0.4300060421228409, "step": 60915 }, { "epoch": 0.061359271021241064, "grad_norm": 11.896152881686483, "learning_rate": 4.998412979176153e-05, "loss": 2.4348, "mean_token_accuracy": 0.39655172228813174, "step": 60920 }, { "epoch": 0.06136430707434524, "grad_norm": 11.382969923357996, "learning_rate": 4.9984115717221735e-05, "loss": 2.7127, "mean_token_accuracy": 0.41034482717514037, "step": 60925 }, { "epoch": 0.06136934312744941, "grad_norm": 10.758223073812248, "learning_rate": 4.99841016364459e-05, "loss": 2.8235, "mean_token_accuracy": 0.3551724135875702, "step": 60930 }, { "epoch": 0.061374379180553586, "grad_norm": 12.881763063486847, "learning_rate": 4.9984087549434005e-05, "loss": 2.7333, "mean_token_accuracy": 0.33103448152542114, "step": 60935 }, { "epoch": 0.06137941523365775, "grad_norm": 10.519227375378943, "learning_rate": 4.998407345618607e-05, "loss": 2.3672, "mean_token_accuracy": 0.41034482717514037, "step": 60940 }, { "epoch": 0.061384451286761926, "grad_norm": 12.902271769599553, "learning_rate": 4.998405935670209e-05, "loss": 2.4911, "mean_token_accuracy": 0.3931034505367279, "step": 60945 }, { "epoch": 0.0613894873398661, "grad_norm": 11.681012936576305, "learning_rate": 4.998404525098207e-05, "loss": 2.4153, "mean_token_accuracy": 0.37586206793785093, "step": 60950 }, { "epoch": 0.061394523392970274, "grad_norm": 11.446626796566896, "learning_rate": 4.9984031139026024e-05, "loss": 2.6148, "mean_token_accuracy": 0.4413793087005615, "step": 60955 }, { "epoch": 0.06139955944607445, "grad_norm": 12.24068049422373, "learning_rate": 4.998401702083394e-05, "loss": 2.7793, "mean_token_accuracy": 0.3809437394142151, "step": 60960 }, { "epoch": 0.06140459549917862, "grad_norm": 13.282359976507566, "learning_rate": 4.9984002896405834e-05, "loss": 2.6898, "mean_token_accuracy": 0.37241379618644715, "step": 60965 }, { "epoch": 0.061409631552282795, "grad_norm": 12.229371942059815, "learning_rate": 4.99839887657417e-05, "loss": 2.4021, "mean_token_accuracy": 0.42758620977401735, "step": 60970 }, { "epoch": 0.06141466760538696, "grad_norm": 9.72245085093366, "learning_rate": 4.998397462884156e-05, "loss": 2.5888, "mean_token_accuracy": 0.4427707254886627, "step": 60975 }, { "epoch": 0.061419703658491136, "grad_norm": 11.576517467180311, "learning_rate": 4.99839604857054e-05, "loss": 2.2239, "mean_token_accuracy": 0.48469448685646055, "step": 60980 }, { "epoch": 0.06142473971159531, "grad_norm": 14.575449453874423, "learning_rate": 4.998394633633323e-05, "loss": 2.3665, "mean_token_accuracy": 0.42758620381355283, "step": 60985 }, { "epoch": 0.06142977576469948, "grad_norm": 10.703681873516116, "learning_rate": 4.998393218072505e-05, "loss": 2.6956, "mean_token_accuracy": 0.3793103516101837, "step": 60990 }, { "epoch": 0.06143481181780366, "grad_norm": 12.821291466823785, "learning_rate": 4.9983918018880876e-05, "loss": 2.7018, "mean_token_accuracy": 0.38620689511299133, "step": 60995 }, { "epoch": 0.06143984787090783, "grad_norm": 12.437399177555061, "learning_rate": 4.9983903850800695e-05, "loss": 2.6565, "mean_token_accuracy": 0.3655172437429428, "step": 61000 }, { "epoch": 0.061444883924012005, "grad_norm": 8.836579479929444, "learning_rate": 4.998388967648452e-05, "loss": 2.5141, "mean_token_accuracy": 0.4068965554237366, "step": 61005 }, { "epoch": 0.06144991997711617, "grad_norm": 10.358773795347407, "learning_rate": 4.9983875495932355e-05, "loss": 2.2272, "mean_token_accuracy": 0.4275862157344818, "step": 61010 }, { "epoch": 0.061454956030220345, "grad_norm": 10.655984042280286, "learning_rate": 4.998386130914421e-05, "loss": 2.617, "mean_token_accuracy": 0.4225045442581177, "step": 61015 }, { "epoch": 0.06145999208332452, "grad_norm": 10.06587294477688, "learning_rate": 4.998384711612007e-05, "loss": 2.1148, "mean_token_accuracy": 0.458620685338974, "step": 61020 }, { "epoch": 0.06146502813642869, "grad_norm": 10.959257957527587, "learning_rate": 4.998383291685996e-05, "loss": 2.6675, "mean_token_accuracy": 0.38965516686439516, "step": 61025 }, { "epoch": 0.061470064189532866, "grad_norm": 12.979696527583272, "learning_rate": 4.998381871136387e-05, "loss": 2.8825, "mean_token_accuracy": 0.37241379022598264, "step": 61030 }, { "epoch": 0.06147510024263704, "grad_norm": 11.820341484208374, "learning_rate": 4.9983804499631815e-05, "loss": 2.6082, "mean_token_accuracy": 0.3655172407627106, "step": 61035 }, { "epoch": 0.061480136295741214, "grad_norm": 10.594816698177208, "learning_rate": 4.998379028166379e-05, "loss": 1.8772, "mean_token_accuracy": 0.5125226855278016, "step": 61040 }, { "epoch": 0.06148517234884538, "grad_norm": 13.748667149548465, "learning_rate": 4.99837760574598e-05, "loss": 2.448, "mean_token_accuracy": 0.3551724165678024, "step": 61045 }, { "epoch": 0.061490208401949555, "grad_norm": 20.695947567361593, "learning_rate": 4.9983761827019857e-05, "loss": 2.5128, "mean_token_accuracy": 0.46551724672317507, "step": 61050 }, { "epoch": 0.06149524445505373, "grad_norm": 25.19566171312912, "learning_rate": 4.998374759034395e-05, "loss": 2.5511, "mean_token_accuracy": 0.44827587008476255, "step": 61055 }, { "epoch": 0.0615002805081579, "grad_norm": 12.622428596948168, "learning_rate": 4.9983733347432096e-05, "loss": 2.6227, "mean_token_accuracy": 0.417241370677948, "step": 61060 }, { "epoch": 0.061505316561262076, "grad_norm": 11.744017226978869, "learning_rate": 4.9983719098284295e-05, "loss": 2.4806, "mean_token_accuracy": 0.4034482777118683, "step": 61065 }, { "epoch": 0.06151035261436625, "grad_norm": 11.555240398490348, "learning_rate": 4.998370484290055e-05, "loss": 2.2825, "mean_token_accuracy": 0.4413793087005615, "step": 61070 }, { "epoch": 0.06151538866747042, "grad_norm": 12.039358341119847, "learning_rate": 4.9983690581280866e-05, "loss": 2.7447, "mean_token_accuracy": 0.31724137663841245, "step": 61075 }, { "epoch": 0.06152042472057459, "grad_norm": 12.732539508710907, "learning_rate": 4.9983676313425245e-05, "loss": 2.7096, "mean_token_accuracy": 0.36896551847457887, "step": 61080 }, { "epoch": 0.061525460773678764, "grad_norm": 10.941224027987998, "learning_rate": 4.998366203933369e-05, "loss": 2.4579, "mean_token_accuracy": 0.4551724076271057, "step": 61085 }, { "epoch": 0.06153049682678294, "grad_norm": 22.649235881120084, "learning_rate": 4.9983647759006216e-05, "loss": 2.6894, "mean_token_accuracy": 0.4000000059604645, "step": 61090 }, { "epoch": 0.06153553287988711, "grad_norm": 9.160445996849305, "learning_rate": 4.9983633472442815e-05, "loss": 2.499, "mean_token_accuracy": 0.3655172407627106, "step": 61095 }, { "epoch": 0.061540568932991285, "grad_norm": 11.617730606290884, "learning_rate": 4.998361917964349e-05, "loss": 2.0935, "mean_token_accuracy": 0.4502117335796356, "step": 61100 }, { "epoch": 0.06154560498609546, "grad_norm": 10.068171510737814, "learning_rate": 4.998360488060826e-05, "loss": 2.6928, "mean_token_accuracy": 0.3758620649576187, "step": 61105 }, { "epoch": 0.06155064103919963, "grad_norm": 11.915712620806389, "learning_rate": 4.99835905753371e-05, "loss": 2.4442, "mean_token_accuracy": 0.4379310369491577, "step": 61110 }, { "epoch": 0.0615556770923038, "grad_norm": 12.12056400646725, "learning_rate": 4.998357626383005e-05, "loss": 2.5804, "mean_token_accuracy": 0.33448275923728943, "step": 61115 }, { "epoch": 0.06156071314540797, "grad_norm": 14.717487990470058, "learning_rate": 4.998356194608709e-05, "loss": 3.0927, "mean_token_accuracy": 0.3551724135875702, "step": 61120 }, { "epoch": 0.06156574919851215, "grad_norm": 16.075721285934897, "learning_rate": 4.998354762210823e-05, "loss": 2.7387, "mean_token_accuracy": 0.36896551847457887, "step": 61125 }, { "epoch": 0.06157078525161632, "grad_norm": 10.377626617221766, "learning_rate": 4.998353329189347e-05, "loss": 2.5762, "mean_token_accuracy": 0.4068965584039688, "step": 61130 }, { "epoch": 0.061575821304720495, "grad_norm": 12.658106343727441, "learning_rate": 4.998351895544283e-05, "loss": 2.674, "mean_token_accuracy": 0.3827586233615875, "step": 61135 }, { "epoch": 0.06158085735782467, "grad_norm": 11.341667485107928, "learning_rate": 4.9983504612756297e-05, "loss": 2.286, "mean_token_accuracy": 0.49491833448410033, "step": 61140 }, { "epoch": 0.06158589341092884, "grad_norm": 13.858331409055655, "learning_rate": 4.998349026383387e-05, "loss": 2.5145, "mean_token_accuracy": 0.4, "step": 61145 }, { "epoch": 0.06159092946403301, "grad_norm": 9.79071140793119, "learning_rate": 4.998347590867557e-05, "loss": 2.8608, "mean_token_accuracy": 0.3724137991666794, "step": 61150 }, { "epoch": 0.06159596551713718, "grad_norm": 11.974664670359026, "learning_rate": 4.99834615472814e-05, "loss": 2.9615, "mean_token_accuracy": 0.33793103098869326, "step": 61155 }, { "epoch": 0.06160100157024136, "grad_norm": 12.170581596823617, "learning_rate": 4.9983447179651346e-05, "loss": 2.3914, "mean_token_accuracy": 0.44482758045196535, "step": 61160 }, { "epoch": 0.06160603762334553, "grad_norm": 11.937746124976512, "learning_rate": 4.998343280578543e-05, "loss": 3.018, "mean_token_accuracy": 0.3379310369491577, "step": 61165 }, { "epoch": 0.061611073676449704, "grad_norm": 11.425222585585725, "learning_rate": 4.9983418425683654e-05, "loss": 2.5276, "mean_token_accuracy": 0.4103448331356049, "step": 61170 }, { "epoch": 0.06161610972955388, "grad_norm": 10.972341242013544, "learning_rate": 4.998340403934601e-05, "loss": 2.2552, "mean_token_accuracy": 0.4728372633457184, "step": 61175 }, { "epoch": 0.06162114578265805, "grad_norm": 13.238054488495989, "learning_rate": 4.9983389646772515e-05, "loss": 2.4851, "mean_token_accuracy": 0.4355911314487457, "step": 61180 }, { "epoch": 0.06162618183576222, "grad_norm": 12.658361503474714, "learning_rate": 4.9983375247963174e-05, "loss": 2.5511, "mean_token_accuracy": 0.38965516686439516, "step": 61185 }, { "epoch": 0.06163121788886639, "grad_norm": 10.903764341131021, "learning_rate": 4.998336084291798e-05, "loss": 2.6009, "mean_token_accuracy": 0.3827586114406586, "step": 61190 }, { "epoch": 0.061636253941970566, "grad_norm": 12.255867419784378, "learning_rate": 4.9983346431636943e-05, "loss": 2.728, "mean_token_accuracy": 0.3620689630508423, "step": 61195 }, { "epoch": 0.06164128999507474, "grad_norm": 11.86671729161742, "learning_rate": 4.998333201412006e-05, "loss": 2.578, "mean_token_accuracy": 0.38965516686439516, "step": 61200 }, { "epoch": 0.061646326048178914, "grad_norm": 10.095263787843054, "learning_rate": 4.998331759036734e-05, "loss": 2.6843, "mean_token_accuracy": 0.3379310339689255, "step": 61205 }, { "epoch": 0.06165136210128309, "grad_norm": 12.394359044802973, "learning_rate": 4.998330316037879e-05, "loss": 2.811, "mean_token_accuracy": 0.3379310369491577, "step": 61210 }, { "epoch": 0.06165639815438726, "grad_norm": 10.650654758974728, "learning_rate": 4.998328872415441e-05, "loss": 2.5656, "mean_token_accuracy": 0.4, "step": 61215 }, { "epoch": 0.06166143420749143, "grad_norm": 13.700319957960788, "learning_rate": 4.998327428169422e-05, "loss": 2.9714, "mean_token_accuracy": 0.3827586233615875, "step": 61220 }, { "epoch": 0.0616664702605956, "grad_norm": 12.871321434088754, "learning_rate": 4.9983259832998195e-05, "loss": 2.546, "mean_token_accuracy": 0.4241379380226135, "step": 61225 }, { "epoch": 0.061671506313699775, "grad_norm": 13.541510953289714, "learning_rate": 4.998324537806637e-05, "loss": 2.8257, "mean_token_accuracy": 0.3620689630508423, "step": 61230 }, { "epoch": 0.06167654236680395, "grad_norm": 10.485501814367387, "learning_rate": 4.998323091689871e-05, "loss": 2.5537, "mean_token_accuracy": 0.42758620381355283, "step": 61235 }, { "epoch": 0.06168157841990812, "grad_norm": 11.510750758742542, "learning_rate": 4.9983216449495255e-05, "loss": 2.3985, "mean_token_accuracy": 0.4206896543502808, "step": 61240 }, { "epoch": 0.0616866144730123, "grad_norm": 11.042795540177016, "learning_rate": 4.9983201975856e-05, "loss": 2.7364, "mean_token_accuracy": 0.324137932062149, "step": 61245 }, { "epoch": 0.06169165052611647, "grad_norm": 14.562593856534097, "learning_rate": 4.9983187495980946e-05, "loss": 2.317, "mean_token_accuracy": 0.4344827592372894, "step": 61250 }, { "epoch": 0.06169668657922064, "grad_norm": 12.035032599055087, "learning_rate": 4.9983173009870094e-05, "loss": 2.3153, "mean_token_accuracy": 0.42413793206214906, "step": 61255 }, { "epoch": 0.06170172263232481, "grad_norm": 13.366867866959446, "learning_rate": 4.998315851752344e-05, "loss": 2.9947, "mean_token_accuracy": 0.3965517163276672, "step": 61260 }, { "epoch": 0.061706758685428985, "grad_norm": 9.908334140111782, "learning_rate": 4.9983144018941005e-05, "loss": 2.4755, "mean_token_accuracy": 0.39310344457626345, "step": 61265 }, { "epoch": 0.06171179473853316, "grad_norm": 11.736464688904853, "learning_rate": 4.998312951412279e-05, "loss": 2.4541, "mean_token_accuracy": 0.4344827592372894, "step": 61270 }, { "epoch": 0.06171683079163733, "grad_norm": 11.181537124416623, "learning_rate": 4.998311500306879e-05, "loss": 2.1947, "mean_token_accuracy": 0.4241379380226135, "step": 61275 }, { "epoch": 0.061721866844741506, "grad_norm": 11.468012607208172, "learning_rate": 4.998310048577901e-05, "loss": 2.3766, "mean_token_accuracy": 0.42413793206214906, "step": 61280 }, { "epoch": 0.06172690289784568, "grad_norm": 14.570667613752613, "learning_rate": 4.998308596225347e-05, "loss": 2.4263, "mean_token_accuracy": 0.42758620381355283, "step": 61285 }, { "epoch": 0.06173193895094985, "grad_norm": 13.077842010640376, "learning_rate": 4.998307143249215e-05, "loss": 2.5806, "mean_token_accuracy": 0.42758620977401735, "step": 61290 }, { "epoch": 0.06173697500405402, "grad_norm": 14.151620355347168, "learning_rate": 4.9983056896495076e-05, "loss": 2.8954, "mean_token_accuracy": 0.38965516686439516, "step": 61295 }, { "epoch": 0.061742011057158194, "grad_norm": 11.624414056422784, "learning_rate": 4.998304235426224e-05, "loss": 2.5111, "mean_token_accuracy": 0.35862069129943847, "step": 61300 }, { "epoch": 0.06174704711026237, "grad_norm": 10.152922198967312, "learning_rate": 4.998302780579365e-05, "loss": 2.3029, "mean_token_accuracy": 0.44482758045196535, "step": 61305 }, { "epoch": 0.06175208316336654, "grad_norm": 13.498896543630305, "learning_rate": 4.99830132510893e-05, "loss": 2.8595, "mean_token_accuracy": 0.36551724672317504, "step": 61310 }, { "epoch": 0.061757119216470716, "grad_norm": 13.329928019732176, "learning_rate": 4.9982998690149205e-05, "loss": 2.8889, "mean_token_accuracy": 0.3482758641242981, "step": 61315 }, { "epoch": 0.06176215526957489, "grad_norm": 11.494763865806634, "learning_rate": 4.998298412297337e-05, "loss": 2.4117, "mean_token_accuracy": 0.46896552443504336, "step": 61320 }, { "epoch": 0.061767191322679056, "grad_norm": 10.730761202838421, "learning_rate": 4.99829695495618e-05, "loss": 2.313, "mean_token_accuracy": 0.4448275864124298, "step": 61325 }, { "epoch": 0.06177222737578323, "grad_norm": 15.295204853239804, "learning_rate": 4.998295496991448e-05, "loss": 2.8224, "mean_token_accuracy": 0.3689655065536499, "step": 61330 }, { "epoch": 0.061777263428887404, "grad_norm": 11.66116157561056, "learning_rate": 4.9982940384031445e-05, "loss": 2.6388, "mean_token_accuracy": 0.4068965494632721, "step": 61335 }, { "epoch": 0.06178229948199158, "grad_norm": 13.158571961052234, "learning_rate": 4.998292579191268e-05, "loss": 2.4017, "mean_token_accuracy": 0.4137930989265442, "step": 61340 }, { "epoch": 0.06178733553509575, "grad_norm": 11.415450732643984, "learning_rate": 4.998291119355818e-05, "loss": 2.6976, "mean_token_accuracy": 0.35862069129943847, "step": 61345 }, { "epoch": 0.061792371588199925, "grad_norm": 9.88108759925936, "learning_rate": 4.998289658896797e-05, "loss": 2.4407, "mean_token_accuracy": 0.42068966031074523, "step": 61350 }, { "epoch": 0.0617974076413041, "grad_norm": 9.796304636079038, "learning_rate": 4.998288197814204e-05, "loss": 2.469, "mean_token_accuracy": 0.4517241418361664, "step": 61355 }, { "epoch": 0.061802443694408266, "grad_norm": 10.544512717436971, "learning_rate": 4.99828673610804e-05, "loss": 3.0007, "mean_token_accuracy": 0.3482758641242981, "step": 61360 }, { "epoch": 0.06180747974751244, "grad_norm": 10.84167618452037, "learning_rate": 4.998285273778306e-05, "loss": 2.5306, "mean_token_accuracy": 0.42758620381355283, "step": 61365 }, { "epoch": 0.06181251580061661, "grad_norm": 12.565363374042542, "learning_rate": 4.998283810825001e-05, "loss": 2.6312, "mean_token_accuracy": 0.40217784941196444, "step": 61370 }, { "epoch": 0.06181755185372079, "grad_norm": 13.163550310429573, "learning_rate": 4.998282347248126e-05, "loss": 2.7995, "mean_token_accuracy": 0.3758620619773865, "step": 61375 }, { "epoch": 0.06182258790682496, "grad_norm": 11.57908482655264, "learning_rate": 4.998280883047682e-05, "loss": 2.3172, "mean_token_accuracy": 0.482819128036499, "step": 61380 }, { "epoch": 0.061827623959929134, "grad_norm": 10.656431164734363, "learning_rate": 4.998279418223668e-05, "loss": 2.8556, "mean_token_accuracy": 0.3241379290819168, "step": 61385 }, { "epoch": 0.06183266001303331, "grad_norm": 11.238398042517932, "learning_rate": 4.998277952776087e-05, "loss": 2.6576, "mean_token_accuracy": 0.4, "step": 61390 }, { "epoch": 0.061837696066137475, "grad_norm": 16.50338690932847, "learning_rate": 4.998276486704936e-05, "loss": 2.4246, "mean_token_accuracy": 0.42413792610168455, "step": 61395 }, { "epoch": 0.06184273211924165, "grad_norm": 10.755975690163028, "learning_rate": 4.998275020010217e-05, "loss": 2.6001, "mean_token_accuracy": 0.4052026629447937, "step": 61400 }, { "epoch": 0.06184776817234582, "grad_norm": 11.747854794290209, "learning_rate": 4.998273552691932e-05, "loss": 2.2736, "mean_token_accuracy": 0.4739866852760315, "step": 61405 }, { "epoch": 0.061852804225449996, "grad_norm": 11.624956893573927, "learning_rate": 4.9982720847500794e-05, "loss": 2.6577, "mean_token_accuracy": 0.42413792610168455, "step": 61410 }, { "epoch": 0.06185784027855417, "grad_norm": 11.109034234775965, "learning_rate": 4.9982706161846605e-05, "loss": 2.702, "mean_token_accuracy": 0.37241379022598264, "step": 61415 }, { "epoch": 0.061862876331658344, "grad_norm": 11.007641647308722, "learning_rate": 4.998269146995674e-05, "loss": 1.9984, "mean_token_accuracy": 0.4676950931549072, "step": 61420 }, { "epoch": 0.06186791238476252, "grad_norm": 11.01345259138696, "learning_rate": 4.9982676771831234e-05, "loss": 2.6161, "mean_token_accuracy": 0.4, "step": 61425 }, { "epoch": 0.061872948437866684, "grad_norm": 12.415990840501886, "learning_rate": 4.998266206747006e-05, "loss": 2.2763, "mean_token_accuracy": 0.4379310369491577, "step": 61430 }, { "epoch": 0.06187798449097086, "grad_norm": 11.7144231041422, "learning_rate": 4.998264735687324e-05, "loss": 2.4861, "mean_token_accuracy": 0.3999999940395355, "step": 61435 }, { "epoch": 0.06188302054407503, "grad_norm": 15.08400134978886, "learning_rate": 4.9982632640040774e-05, "loss": 2.7548, "mean_token_accuracy": 0.4448275864124298, "step": 61440 }, { "epoch": 0.061888056597179206, "grad_norm": 10.997616597536501, "learning_rate": 4.9982617916972666e-05, "loss": 2.6518, "mean_token_accuracy": 0.41379310488700866, "step": 61445 }, { "epoch": 0.06189309265028338, "grad_norm": 13.172450491629998, "learning_rate": 4.998260318766892e-05, "loss": 2.5827, "mean_token_accuracy": 0.4310344815254211, "step": 61450 }, { "epoch": 0.06189812870338755, "grad_norm": 13.653861908568981, "learning_rate": 4.9982588452129537e-05, "loss": 2.9313, "mean_token_accuracy": 0.3517241388559341, "step": 61455 }, { "epoch": 0.06190316475649173, "grad_norm": 17.443022336596822, "learning_rate": 4.998257371035453e-05, "loss": 2.9806, "mean_token_accuracy": 0.3356321781873703, "step": 61460 }, { "epoch": 0.061908200809595894, "grad_norm": 11.553446681609449, "learning_rate": 4.9982558962343896e-05, "loss": 2.5557, "mean_token_accuracy": 0.4275861978530884, "step": 61465 }, { "epoch": 0.06191323686270007, "grad_norm": 10.864588855604435, "learning_rate": 4.998254420809764e-05, "loss": 2.8106, "mean_token_accuracy": 0.3724137872457504, "step": 61470 }, { "epoch": 0.06191827291580424, "grad_norm": 16.738330702560827, "learning_rate": 4.9982529447615764e-05, "loss": 2.3987, "mean_token_accuracy": 0.4344827592372894, "step": 61475 }, { "epoch": 0.061923308968908415, "grad_norm": 13.050785724502227, "learning_rate": 4.9982514680898286e-05, "loss": 2.679, "mean_token_accuracy": 0.36896551251411436, "step": 61480 }, { "epoch": 0.06192834502201259, "grad_norm": 9.085657887867605, "learning_rate": 4.998249990794518e-05, "loss": 2.182, "mean_token_accuracy": 0.5034482777118683, "step": 61485 }, { "epoch": 0.06193338107511676, "grad_norm": 13.003771993114151, "learning_rate": 4.998248512875648e-05, "loss": 2.5409, "mean_token_accuracy": 0.4261947929859161, "step": 61490 }, { "epoch": 0.061938417128220936, "grad_norm": 10.261386855504673, "learning_rate": 4.9982470343332184e-05, "loss": 2.3032, "mean_token_accuracy": 0.4965517222881317, "step": 61495 }, { "epoch": 0.0619434531813251, "grad_norm": 11.90611017119523, "learning_rate": 4.998245555167228e-05, "loss": 2.5309, "mean_token_accuracy": 0.4068965554237366, "step": 61500 }, { "epoch": 0.06194848923442928, "grad_norm": 11.928910853836483, "learning_rate": 4.998244075377679e-05, "loss": 2.3976, "mean_token_accuracy": 0.40689656138420105, "step": 61505 }, { "epoch": 0.06195352528753345, "grad_norm": 10.719240542862194, "learning_rate": 4.998242594964571e-05, "loss": 2.7833, "mean_token_accuracy": 0.3931034505367279, "step": 61510 }, { "epoch": 0.061958561340637625, "grad_norm": 12.64631656298641, "learning_rate": 4.998241113927904e-05, "loss": 2.3892, "mean_token_accuracy": 0.4379310369491577, "step": 61515 }, { "epoch": 0.0619635973937418, "grad_norm": 12.055597877717409, "learning_rate": 4.998239632267679e-05, "loss": 2.5573, "mean_token_accuracy": 0.44827585816383364, "step": 61520 }, { "epoch": 0.06196863344684597, "grad_norm": 10.132957613726102, "learning_rate": 4.998238149983897e-05, "loss": 2.308, "mean_token_accuracy": 0.4432546854019165, "step": 61525 }, { "epoch": 0.061973669499950146, "grad_norm": 10.988356005985198, "learning_rate": 4.998236667076557e-05, "loss": 2.446, "mean_token_accuracy": 0.43448275327682495, "step": 61530 }, { "epoch": 0.06197870555305431, "grad_norm": 17.548017373955883, "learning_rate": 4.998235183545661e-05, "loss": 2.7644, "mean_token_accuracy": 0.4153055131435394, "step": 61535 }, { "epoch": 0.061983741606158486, "grad_norm": 11.903423291765865, "learning_rate": 4.998233699391208e-05, "loss": 2.6269, "mean_token_accuracy": 0.3604355752468109, "step": 61540 }, { "epoch": 0.06198877765926266, "grad_norm": 18.07622877201099, "learning_rate": 4.998232214613199e-05, "loss": 2.8877, "mean_token_accuracy": 0.37586206793785093, "step": 61545 }, { "epoch": 0.061993813712366834, "grad_norm": 13.071540668714718, "learning_rate": 4.998230729211635e-05, "loss": 3.1695, "mean_token_accuracy": 0.358620685338974, "step": 61550 }, { "epoch": 0.06199884976547101, "grad_norm": 13.043094008636514, "learning_rate": 4.998229243186516e-05, "loss": 2.7874, "mean_token_accuracy": 0.36896551847457887, "step": 61555 }, { "epoch": 0.06200388581857518, "grad_norm": 14.24071937558595, "learning_rate": 4.9982277565378413e-05, "loss": 2.5308, "mean_token_accuracy": 0.4137930989265442, "step": 61560 }, { "epoch": 0.062008921871679355, "grad_norm": 11.834815486386319, "learning_rate": 4.9982262692656124e-05, "loss": 2.2782, "mean_token_accuracy": 0.4379310369491577, "step": 61565 }, { "epoch": 0.06201395792478352, "grad_norm": 14.797756083500381, "learning_rate": 4.99822478136983e-05, "loss": 2.5491, "mean_token_accuracy": 0.43968542516231535, "step": 61570 }, { "epoch": 0.062018993977887696, "grad_norm": 9.609729725483426, "learning_rate": 4.998223292850494e-05, "loss": 2.6458, "mean_token_accuracy": 0.35862068831920624, "step": 61575 }, { "epoch": 0.06202403003099187, "grad_norm": 10.514796291873681, "learning_rate": 4.998221803707604e-05, "loss": 2.6998, "mean_token_accuracy": 0.4206896543502808, "step": 61580 }, { "epoch": 0.06202906608409604, "grad_norm": 13.643705881042427, "learning_rate": 4.998220313941162e-05, "loss": 2.2231, "mean_token_accuracy": 0.43448275327682495, "step": 61585 }, { "epoch": 0.06203410213720022, "grad_norm": 11.050182656052737, "learning_rate": 4.998218823551168e-05, "loss": 2.6722, "mean_token_accuracy": 0.3931034505367279, "step": 61590 }, { "epoch": 0.06203913819030439, "grad_norm": 12.576074982804448, "learning_rate": 4.998217332537621e-05, "loss": 2.6449, "mean_token_accuracy": 0.41167573928833007, "step": 61595 }, { "epoch": 0.062044174243408565, "grad_norm": 12.006587185633139, "learning_rate": 4.9982158409005236e-05, "loss": 3.498, "mean_token_accuracy": 0.35862068831920624, "step": 61600 }, { "epoch": 0.06204921029651273, "grad_norm": 12.362629218756272, "learning_rate": 4.998214348639876e-05, "loss": 2.7446, "mean_token_accuracy": 0.39655172228813174, "step": 61605 }, { "epoch": 0.062054246349616905, "grad_norm": 13.26180847716348, "learning_rate": 4.998212855755676e-05, "loss": 2.7949, "mean_token_accuracy": 0.3655172407627106, "step": 61610 }, { "epoch": 0.06205928240272108, "grad_norm": 14.37448408636518, "learning_rate": 4.998211362247927e-05, "loss": 2.5717, "mean_token_accuracy": 0.417241370677948, "step": 61615 }, { "epoch": 0.06206431845582525, "grad_norm": 12.695326992367068, "learning_rate": 4.9982098681166276e-05, "loss": 2.7182, "mean_token_accuracy": 0.3862068891525269, "step": 61620 }, { "epoch": 0.06206935450892943, "grad_norm": 8.761091798657166, "learning_rate": 4.998208373361779e-05, "loss": 2.3486, "mean_token_accuracy": 0.43103448748588563, "step": 61625 }, { "epoch": 0.0620743905620336, "grad_norm": 20.943093883948997, "learning_rate": 4.998206877983381e-05, "loss": 2.2828, "mean_token_accuracy": 0.46551724672317507, "step": 61630 }, { "epoch": 0.062079426615137774, "grad_norm": 11.689794179888644, "learning_rate": 4.9982053819814345e-05, "loss": 2.7125, "mean_token_accuracy": 0.3827586233615875, "step": 61635 }, { "epoch": 0.06208446266824194, "grad_norm": 10.345225077018595, "learning_rate": 4.99820388535594e-05, "loss": 2.3627, "mean_token_accuracy": 0.42758620381355283, "step": 61640 }, { "epoch": 0.062089498721346115, "grad_norm": 13.681093662428259, "learning_rate": 4.998202388106898e-05, "loss": 3.1751, "mean_token_accuracy": 0.3620689630508423, "step": 61645 }, { "epoch": 0.06209453477445029, "grad_norm": 11.27277845197705, "learning_rate": 4.998200890234308e-05, "loss": 2.3893, "mean_token_accuracy": 0.38620689511299133, "step": 61650 }, { "epoch": 0.06209957082755446, "grad_norm": 11.057330853131813, "learning_rate": 4.9981993917381715e-05, "loss": 2.5688, "mean_token_accuracy": 0.37931033968925476, "step": 61655 }, { "epoch": 0.062104606880658636, "grad_norm": 11.908632076717923, "learning_rate": 4.998197892618489e-05, "loss": 2.4993, "mean_token_accuracy": 0.38620689511299133, "step": 61660 }, { "epoch": 0.06210964293376281, "grad_norm": 16.94587915679362, "learning_rate": 4.99819639287526e-05, "loss": 2.778, "mean_token_accuracy": 0.36896551847457887, "step": 61665 }, { "epoch": 0.062114678986866984, "grad_norm": 9.49488581888489, "learning_rate": 4.998194892508486e-05, "loss": 2.3184, "mean_token_accuracy": 0.4068965494632721, "step": 61670 }, { "epoch": 0.06211971503997115, "grad_norm": 11.843151707883353, "learning_rate": 4.9981933915181654e-05, "loss": 2.842, "mean_token_accuracy": 0.33793102502822875, "step": 61675 }, { "epoch": 0.062124751093075324, "grad_norm": 12.028622846928004, "learning_rate": 4.9981918899043006e-05, "loss": 2.7788, "mean_token_accuracy": 0.37586207389831544, "step": 61680 }, { "epoch": 0.0621297871461795, "grad_norm": 11.432723118710356, "learning_rate": 4.998190387666892e-05, "loss": 2.4197, "mean_token_accuracy": 0.4316502511501312, "step": 61685 }, { "epoch": 0.06213482319928367, "grad_norm": 11.607407856322915, "learning_rate": 4.9981888848059386e-05, "loss": 2.6172, "mean_token_accuracy": 0.4034482717514038, "step": 61690 }, { "epoch": 0.062139859252387845, "grad_norm": 11.953709685273008, "learning_rate": 4.998187381321443e-05, "loss": 2.5251, "mean_token_accuracy": 0.41379310488700866, "step": 61695 }, { "epoch": 0.06214489530549202, "grad_norm": 14.634164138506524, "learning_rate": 4.998185877213402e-05, "loss": 2.841, "mean_token_accuracy": 0.3827586203813553, "step": 61700 }, { "epoch": 0.06214993135859619, "grad_norm": 11.020396881208146, "learning_rate": 4.99818437248182e-05, "loss": 2.1501, "mean_token_accuracy": 0.42758620381355283, "step": 61705 }, { "epoch": 0.06215496741170036, "grad_norm": 10.582910782870869, "learning_rate": 4.998182867126695e-05, "loss": 2.9725, "mean_token_accuracy": 0.38965516686439516, "step": 61710 }, { "epoch": 0.062160003464804534, "grad_norm": 15.826538511016324, "learning_rate": 4.998181361148028e-05, "loss": 2.8664, "mean_token_accuracy": 0.34482758343219755, "step": 61715 }, { "epoch": 0.06216503951790871, "grad_norm": 12.476983410135572, "learning_rate": 4.9981798545458194e-05, "loss": 2.8217, "mean_token_accuracy": 0.3793103456497192, "step": 61720 }, { "epoch": 0.06217007557101288, "grad_norm": 11.32230152982007, "learning_rate": 4.9981783473200706e-05, "loss": 2.4206, "mean_token_accuracy": 0.42758620381355283, "step": 61725 }, { "epoch": 0.062175111624117055, "grad_norm": 13.521489465661448, "learning_rate": 4.998176839470781e-05, "loss": 2.6459, "mean_token_accuracy": 0.358620685338974, "step": 61730 }, { "epoch": 0.06218014767722123, "grad_norm": 14.177422435647726, "learning_rate": 4.99817533099795e-05, "loss": 3.3368, "mean_token_accuracy": 0.33103448450565337, "step": 61735 }, { "epoch": 0.0621851837303254, "grad_norm": 11.518514194533516, "learning_rate": 4.9981738219015805e-05, "loss": 2.4828, "mean_token_accuracy": 0.4206896543502808, "step": 61740 }, { "epoch": 0.06219021978342957, "grad_norm": 12.296069505257273, "learning_rate": 4.9981723121816706e-05, "loss": 2.5777, "mean_token_accuracy": 0.4482758641242981, "step": 61745 }, { "epoch": 0.06219525583653374, "grad_norm": 10.738606660014648, "learning_rate": 4.9981708018382226e-05, "loss": 2.446, "mean_token_accuracy": 0.441379314661026, "step": 61750 }, { "epoch": 0.06220029188963792, "grad_norm": 13.339306566492768, "learning_rate": 4.9981692908712356e-05, "loss": 2.7554, "mean_token_accuracy": 0.36896551251411436, "step": 61755 }, { "epoch": 0.06220532794274209, "grad_norm": 16.269448035814364, "learning_rate": 4.9981677792807104e-05, "loss": 2.1499, "mean_token_accuracy": 0.4801724135875702, "step": 61760 }, { "epoch": 0.062210363995846264, "grad_norm": 14.117542349461383, "learning_rate": 4.998166267066648e-05, "loss": 2.7082, "mean_token_accuracy": 0.41034482717514037, "step": 61765 }, { "epoch": 0.06221540004895044, "grad_norm": 10.645616699602432, "learning_rate": 4.998164754229048e-05, "loss": 2.2028, "mean_token_accuracy": 0.4620689630508423, "step": 61770 }, { "epoch": 0.06222043610205461, "grad_norm": 10.826693667004337, "learning_rate": 4.99816324076791e-05, "loss": 2.2169, "mean_token_accuracy": 0.4437386512756348, "step": 61775 }, { "epoch": 0.06222547215515878, "grad_norm": 12.707149006296968, "learning_rate": 4.998161726683236e-05, "loss": 2.6857, "mean_token_accuracy": 0.4, "step": 61780 }, { "epoch": 0.06223050820826295, "grad_norm": 10.261297904611293, "learning_rate": 4.9981602119750274e-05, "loss": 2.2401, "mean_token_accuracy": 0.43793103098869324, "step": 61785 }, { "epoch": 0.062235544261367126, "grad_norm": 14.694302372307588, "learning_rate": 4.9981586966432817e-05, "loss": 3.0429, "mean_token_accuracy": 0.3103448271751404, "step": 61790 }, { "epoch": 0.0622405803144713, "grad_norm": 10.908509107415773, "learning_rate": 4.998157180688002e-05, "loss": 2.4925, "mean_token_accuracy": 0.4, "step": 61795 }, { "epoch": 0.062245616367575474, "grad_norm": 9.635762385977953, "learning_rate": 4.998155664109187e-05, "loss": 2.1652, "mean_token_accuracy": 0.4429521977901459, "step": 61800 }, { "epoch": 0.06225065242067965, "grad_norm": 13.38136062659689, "learning_rate": 4.998154146906837e-05, "loss": 2.777, "mean_token_accuracy": 0.37586206793785093, "step": 61805 }, { "epoch": 0.06225568847378382, "grad_norm": 10.620972850122714, "learning_rate": 4.998152629080954e-05, "loss": 2.3338, "mean_token_accuracy": 0.42413793206214906, "step": 61810 }, { "epoch": 0.06226072452688799, "grad_norm": 12.519783467993364, "learning_rate": 4.9981511106315373e-05, "loss": 2.4997, "mean_token_accuracy": 0.3980036318302155, "step": 61815 }, { "epoch": 0.06226576057999216, "grad_norm": 11.343947943883633, "learning_rate": 4.998149591558586e-05, "loss": 2.4283, "mean_token_accuracy": 0.40689654648303986, "step": 61820 }, { "epoch": 0.062270796633096336, "grad_norm": 12.52557943799043, "learning_rate": 4.998148071862104e-05, "loss": 2.3068, "mean_token_accuracy": 0.44482758045196535, "step": 61825 }, { "epoch": 0.06227583268620051, "grad_norm": 14.261396731502801, "learning_rate": 4.998146551542089e-05, "loss": 2.552, "mean_token_accuracy": 0.3620689630508423, "step": 61830 }, { "epoch": 0.06228086873930468, "grad_norm": 11.74810635695176, "learning_rate": 4.998145030598542e-05, "loss": 2.7293, "mean_token_accuracy": 0.3965517282485962, "step": 61835 }, { "epoch": 0.06228590479240886, "grad_norm": 10.307344834578016, "learning_rate": 4.998143509031464e-05, "loss": 2.569, "mean_token_accuracy": 0.41034482717514037, "step": 61840 }, { "epoch": 0.06229094084551303, "grad_norm": 10.998418980740832, "learning_rate": 4.998141986840855e-05, "loss": 2.2983, "mean_token_accuracy": 0.4551724076271057, "step": 61845 }, { "epoch": 0.0622959768986172, "grad_norm": 9.847837404921052, "learning_rate": 4.998140464026715e-05, "loss": 2.4723, "mean_token_accuracy": 0.4310344845056534, "step": 61850 }, { "epoch": 0.06230101295172137, "grad_norm": 14.109341428392359, "learning_rate": 4.998138940589045e-05, "loss": 2.7926, "mean_token_accuracy": 0.36896551847457887, "step": 61855 }, { "epoch": 0.062306049004825545, "grad_norm": 11.709216617380044, "learning_rate": 4.998137416527846e-05, "loss": 2.4043, "mean_token_accuracy": 0.4034482777118683, "step": 61860 }, { "epoch": 0.06231108505792972, "grad_norm": 33.766816009536704, "learning_rate": 4.9981358918431164e-05, "loss": 3.0452, "mean_token_accuracy": 0.32758620083332063, "step": 61865 }, { "epoch": 0.06231612111103389, "grad_norm": 12.504112267695515, "learning_rate": 4.998134366534859e-05, "loss": 2.645, "mean_token_accuracy": 0.37241379022598264, "step": 61870 }, { "epoch": 0.062321157164138066, "grad_norm": 11.65046722838868, "learning_rate": 4.9981328406030724e-05, "loss": 2.5522, "mean_token_accuracy": 0.4103448212146759, "step": 61875 }, { "epoch": 0.06232619321724224, "grad_norm": 12.863270455853554, "learning_rate": 4.9981313140477586e-05, "loss": 2.3978, "mean_token_accuracy": 0.41584996581077577, "step": 61880 }, { "epoch": 0.06233122927034641, "grad_norm": 16.634138675513306, "learning_rate": 4.998129786868916e-05, "loss": 2.7586, "mean_token_accuracy": 0.3827586233615875, "step": 61885 }, { "epoch": 0.06233626532345058, "grad_norm": 10.80945416969448, "learning_rate": 4.9981282590665476e-05, "loss": 2.6719, "mean_token_accuracy": 0.38620689511299133, "step": 61890 }, { "epoch": 0.062341301376554754, "grad_norm": 12.800422405253357, "learning_rate": 4.998126730640652e-05, "loss": 2.7708, "mean_token_accuracy": 0.39310345649719236, "step": 61895 }, { "epoch": 0.06234633742965893, "grad_norm": 18.586244439152633, "learning_rate": 4.99812520159123e-05, "loss": 2.8749, "mean_token_accuracy": 0.36551724672317504, "step": 61900 }, { "epoch": 0.0623513734827631, "grad_norm": 23.62107819192551, "learning_rate": 4.998123671918281e-05, "loss": 2.6968, "mean_token_accuracy": 0.4068965554237366, "step": 61905 }, { "epoch": 0.062356409535867276, "grad_norm": 11.65694559386593, "learning_rate": 4.998122141621808e-05, "loss": 2.3701, "mean_token_accuracy": 0.42413793206214906, "step": 61910 }, { "epoch": 0.06236144558897145, "grad_norm": 16.95050829440943, "learning_rate": 4.998120610701809e-05, "loss": 2.9717, "mean_token_accuracy": 0.3586206942796707, "step": 61915 }, { "epoch": 0.062366481642075616, "grad_norm": 10.624334752438582, "learning_rate": 4.998119079158286e-05, "loss": 2.5054, "mean_token_accuracy": 0.4172413766384125, "step": 61920 }, { "epoch": 0.06237151769517979, "grad_norm": 13.58626828102806, "learning_rate": 4.9981175469912385e-05, "loss": 2.8818, "mean_token_accuracy": 0.38275861740112305, "step": 61925 }, { "epoch": 0.062376553748283964, "grad_norm": 13.01411404810873, "learning_rate": 4.998116014200667e-05, "loss": 2.1237, "mean_token_accuracy": 0.4896551728248596, "step": 61930 }, { "epoch": 0.06238158980138814, "grad_norm": 12.770864170944165, "learning_rate": 4.9981144807865726e-05, "loss": 2.8048, "mean_token_accuracy": 0.32413792610168457, "step": 61935 }, { "epoch": 0.06238662585449231, "grad_norm": 10.946546030009342, "learning_rate": 4.9981129467489546e-05, "loss": 2.2685, "mean_token_accuracy": 0.4226860284805298, "step": 61940 }, { "epoch": 0.062391661907596485, "grad_norm": 13.55843980299917, "learning_rate": 4.998111412087815e-05, "loss": 2.1777, "mean_token_accuracy": 0.4344827592372894, "step": 61945 }, { "epoch": 0.06239669796070066, "grad_norm": 12.043078623741616, "learning_rate": 4.998109876803153e-05, "loss": 2.5453, "mean_token_accuracy": 0.37434967160224913, "step": 61950 }, { "epoch": 0.062401734013804826, "grad_norm": 10.397129641310412, "learning_rate": 4.9981083408949684e-05, "loss": 2.3512, "mean_token_accuracy": 0.41034482717514037, "step": 61955 }, { "epoch": 0.062406770066909, "grad_norm": 11.412631668368094, "learning_rate": 4.998106804363263e-05, "loss": 2.5047, "mean_token_accuracy": 0.39655172228813174, "step": 61960 }, { "epoch": 0.06241180612001317, "grad_norm": 11.519264941441573, "learning_rate": 4.9981052672080376e-05, "loss": 2.797, "mean_token_accuracy": 0.3655172437429428, "step": 61965 }, { "epoch": 0.06241684217311735, "grad_norm": 11.808168315591233, "learning_rate": 4.9981037294292906e-05, "loss": 2.6116, "mean_token_accuracy": 0.38275861740112305, "step": 61970 }, { "epoch": 0.06242187822622152, "grad_norm": 10.822011851370267, "learning_rate": 4.998102191027025e-05, "loss": 2.5593, "mean_token_accuracy": 0.4068965554237366, "step": 61975 }, { "epoch": 0.062426914279325695, "grad_norm": 18.2423352482425, "learning_rate": 4.9981006520012396e-05, "loss": 3.2342, "mean_token_accuracy": 0.3620689570903778, "step": 61980 }, { "epoch": 0.06243195033242987, "grad_norm": 11.10424193673708, "learning_rate": 4.998099112351935e-05, "loss": 2.2929, "mean_token_accuracy": 0.42068966031074523, "step": 61985 }, { "epoch": 0.062436986385534035, "grad_norm": 12.850949859280323, "learning_rate": 4.998097572079111e-05, "loss": 2.6371, "mean_token_accuracy": 0.4068965494632721, "step": 61990 }, { "epoch": 0.06244202243863821, "grad_norm": 11.093332127385423, "learning_rate": 4.99809603118277e-05, "loss": 2.7391, "mean_token_accuracy": 0.40344828069210054, "step": 61995 }, { "epoch": 0.06244705849174238, "grad_norm": 10.780032226152947, "learning_rate": 4.99809448966291e-05, "loss": 2.4933, "mean_token_accuracy": 0.39310343861579894, "step": 62000 }, { "epoch": 0.062452094544846556, "grad_norm": 11.848169836114655, "learning_rate": 4.998092947519533e-05, "loss": 2.6122, "mean_token_accuracy": 0.3551724135875702, "step": 62005 }, { "epoch": 0.06245713059795073, "grad_norm": 14.059507591029904, "learning_rate": 4.998091404752639e-05, "loss": 2.9683, "mean_token_accuracy": 0.36896551251411436, "step": 62010 }, { "epoch": 0.062462166651054904, "grad_norm": 10.40035764032004, "learning_rate": 4.998089861362229e-05, "loss": 2.1151, "mean_token_accuracy": 0.4620689570903778, "step": 62015 }, { "epoch": 0.06246720270415908, "grad_norm": 14.344305624532506, "learning_rate": 4.9980883173483015e-05, "loss": 2.9559, "mean_token_accuracy": 0.38275861740112305, "step": 62020 }, { "epoch": 0.062472238757263245, "grad_norm": 10.842127439917341, "learning_rate": 4.99808677271086e-05, "loss": 2.6955, "mean_token_accuracy": 0.3931034475564957, "step": 62025 }, { "epoch": 0.06247727481036742, "grad_norm": 11.413335416232472, "learning_rate": 4.998085227449902e-05, "loss": 2.7276, "mean_token_accuracy": 0.3965517282485962, "step": 62030 }, { "epoch": 0.06248231086347159, "grad_norm": 10.829702277830625, "learning_rate": 4.99808368156543e-05, "loss": 2.6741, "mean_token_accuracy": 0.3926799833774567, "step": 62035 }, { "epoch": 0.062487346916575766, "grad_norm": 12.277328026949816, "learning_rate": 4.9980821350574426e-05, "loss": 2.533, "mean_token_accuracy": 0.39310344159603117, "step": 62040 }, { "epoch": 0.06249238296967994, "grad_norm": 11.53507928483763, "learning_rate": 4.9980805879259415e-05, "loss": 2.8252, "mean_token_accuracy": 0.37241379022598264, "step": 62045 }, { "epoch": 0.06249741902278411, "grad_norm": 13.82972267864647, "learning_rate": 4.998079040170928e-05, "loss": 2.2513, "mean_token_accuracy": 0.42068964838981626, "step": 62050 }, { "epoch": 0.06250245507588828, "grad_norm": 12.446997992507965, "learning_rate": 4.9980774917924004e-05, "loss": 2.535, "mean_token_accuracy": 0.42068964838981626, "step": 62055 }, { "epoch": 0.06250749112899245, "grad_norm": 9.69013145088671, "learning_rate": 4.9980759427903605e-05, "loss": 2.3636, "mean_token_accuracy": 0.41379310488700866, "step": 62060 }, { "epoch": 0.06251252718209663, "grad_norm": 13.894256486106459, "learning_rate": 4.998074393164808e-05, "loss": 2.4733, "mean_token_accuracy": 0.4517241358757019, "step": 62065 }, { "epoch": 0.0625175632352008, "grad_norm": 11.621507297185293, "learning_rate": 4.998072842915744e-05, "loss": 2.5019, "mean_token_accuracy": 0.38275861740112305, "step": 62070 }, { "epoch": 0.06252259928830498, "grad_norm": 15.41459338095443, "learning_rate": 4.998071292043168e-05, "loss": 2.5948, "mean_token_accuracy": 0.38965516686439516, "step": 62075 }, { "epoch": 0.06252763534140915, "grad_norm": 10.021925979295649, "learning_rate": 4.998069740547082e-05, "loss": 2.2906, "mean_token_accuracy": 0.4675136089324951, "step": 62080 }, { "epoch": 0.06253267139451332, "grad_norm": 12.014763205989091, "learning_rate": 4.998068188427485e-05, "loss": 2.4702, "mean_token_accuracy": 0.45680580139160154, "step": 62085 }, { "epoch": 0.0625377074476175, "grad_norm": 14.312796546645393, "learning_rate": 4.998066635684378e-05, "loss": 2.3265, "mean_token_accuracy": 0.452339905500412, "step": 62090 }, { "epoch": 0.06254274350072167, "grad_norm": 10.44504548052275, "learning_rate": 4.998065082317761e-05, "loss": 2.1138, "mean_token_accuracy": 0.46896551847457885, "step": 62095 }, { "epoch": 0.06254777955382584, "grad_norm": 11.502758684110288, "learning_rate": 4.9980635283276355e-05, "loss": 2.4433, "mean_token_accuracy": 0.37931033968925476, "step": 62100 }, { "epoch": 0.06255281560693002, "grad_norm": 13.332451955401085, "learning_rate": 4.998061973714e-05, "loss": 3.1752, "mean_token_accuracy": 0.3034482717514038, "step": 62105 }, { "epoch": 0.06255785166003419, "grad_norm": 14.440886858729266, "learning_rate": 4.9980604184768574e-05, "loss": 2.9196, "mean_token_accuracy": 0.4, "step": 62110 }, { "epoch": 0.06256288771313835, "grad_norm": 11.32702333989073, "learning_rate": 4.998058862616206e-05, "loss": 2.8711, "mean_token_accuracy": 0.36551723480224607, "step": 62115 }, { "epoch": 0.06256792376624253, "grad_norm": 11.505385622910655, "learning_rate": 4.9980573061320476e-05, "loss": 2.6412, "mean_token_accuracy": 0.42758620381355283, "step": 62120 }, { "epoch": 0.0625729598193467, "grad_norm": 12.269783643147237, "learning_rate": 4.9980557490243814e-05, "loss": 2.4396, "mean_token_accuracy": 0.4275861978530884, "step": 62125 }, { "epoch": 0.06257799587245087, "grad_norm": 12.531133988426804, "learning_rate": 4.9980541912932096e-05, "loss": 2.4807, "mean_token_accuracy": 0.4206896543502808, "step": 62130 }, { "epoch": 0.06258303192555505, "grad_norm": 12.765495797590255, "learning_rate": 4.998052632938531e-05, "loss": 2.5847, "mean_token_accuracy": 0.36551724672317504, "step": 62135 }, { "epoch": 0.06258806797865922, "grad_norm": 11.297646077447837, "learning_rate": 4.998051073960346e-05, "loss": 2.6343, "mean_token_accuracy": 0.4034482717514038, "step": 62140 }, { "epoch": 0.0625931040317634, "grad_norm": 11.952887798249673, "learning_rate": 4.998049514358657e-05, "loss": 2.6151, "mean_token_accuracy": 0.3758620649576187, "step": 62145 }, { "epoch": 0.06259814008486757, "grad_norm": 12.026730173799285, "learning_rate": 4.9980479541334626e-05, "loss": 2.7523, "mean_token_accuracy": 0.29310344755649564, "step": 62150 }, { "epoch": 0.06260317613797174, "grad_norm": 12.232513431985785, "learning_rate": 4.998046393284763e-05, "loss": 2.4238, "mean_token_accuracy": 0.4034482717514038, "step": 62155 }, { "epoch": 0.06260821219107592, "grad_norm": 10.806554916932077, "learning_rate": 4.99804483181256e-05, "loss": 2.3376, "mean_token_accuracy": 0.42413792610168455, "step": 62160 }, { "epoch": 0.06261324824418009, "grad_norm": 12.470881576532385, "learning_rate": 4.9980432697168534e-05, "loss": 2.3288, "mean_token_accuracy": 0.39655172228813174, "step": 62165 }, { "epoch": 0.06261828429728426, "grad_norm": 12.778385305951138, "learning_rate": 4.9980417069976434e-05, "loss": 2.8237, "mean_token_accuracy": 0.37586207389831544, "step": 62170 }, { "epoch": 0.06262332035038844, "grad_norm": 11.538637777292367, "learning_rate": 4.99804014365493e-05, "loss": 2.3894, "mean_token_accuracy": 0.37586207389831544, "step": 62175 }, { "epoch": 0.06262835640349261, "grad_norm": 9.922858878422783, "learning_rate": 4.998038579688715e-05, "loss": 2.7127, "mean_token_accuracy": 0.3987295746803284, "step": 62180 }, { "epoch": 0.06263339245659677, "grad_norm": 14.53588101588764, "learning_rate": 4.9980370150989984e-05, "loss": 2.6879, "mean_token_accuracy": 0.36896551847457887, "step": 62185 }, { "epoch": 0.06263842850970094, "grad_norm": 10.241607436354613, "learning_rate": 4.99803544988578e-05, "loss": 2.2922, "mean_token_accuracy": 0.42068966031074523, "step": 62190 }, { "epoch": 0.06264346456280512, "grad_norm": 11.796449185512044, "learning_rate": 4.998033884049061e-05, "loss": 2.6286, "mean_token_accuracy": 0.37586206793785093, "step": 62195 }, { "epoch": 0.06264850061590929, "grad_norm": 10.365798586997801, "learning_rate": 4.998032317588841e-05, "loss": 2.6028, "mean_token_accuracy": 0.37241379022598264, "step": 62200 }, { "epoch": 0.06265353666901347, "grad_norm": 11.7155269448105, "learning_rate": 4.998030750505121e-05, "loss": 2.7615, "mean_token_accuracy": 0.3517241358757019, "step": 62205 }, { "epoch": 0.06265857272211764, "grad_norm": 13.962457311917545, "learning_rate": 4.9980291827979004e-05, "loss": 2.537, "mean_token_accuracy": 0.3946763455867767, "step": 62210 }, { "epoch": 0.06266360877522181, "grad_norm": 13.24449080095707, "learning_rate": 4.998027614467182e-05, "loss": 2.8266, "mean_token_accuracy": 0.33448274731636046, "step": 62215 }, { "epoch": 0.06266864482832599, "grad_norm": 53.2126738579196, "learning_rate": 4.998026045512964e-05, "loss": 2.4494, "mean_token_accuracy": 0.3862068891525269, "step": 62220 }, { "epoch": 0.06267368088143016, "grad_norm": 10.560050104835417, "learning_rate": 4.9980244759352474e-05, "loss": 2.5377, "mean_token_accuracy": 0.3999999940395355, "step": 62225 }, { "epoch": 0.06267871693453433, "grad_norm": 12.351089354460358, "learning_rate": 4.998022905734033e-05, "loss": 2.3385, "mean_token_accuracy": 0.403448274731636, "step": 62230 }, { "epoch": 0.06268375298763851, "grad_norm": 9.651060796633615, "learning_rate": 4.9980213349093206e-05, "loss": 2.4843, "mean_token_accuracy": 0.41379310488700866, "step": 62235 }, { "epoch": 0.06268878904074268, "grad_norm": 12.666003900243949, "learning_rate": 4.998019763461112e-05, "loss": 2.3178, "mean_token_accuracy": 0.38620689511299133, "step": 62240 }, { "epoch": 0.06269382509384686, "grad_norm": 9.644461810277864, "learning_rate": 4.9980181913894056e-05, "loss": 2.527, "mean_token_accuracy": 0.4034482717514038, "step": 62245 }, { "epoch": 0.06269886114695103, "grad_norm": 14.743235851091555, "learning_rate": 4.998016618694204e-05, "loss": 2.4974, "mean_token_accuracy": 0.3896551728248596, "step": 62250 }, { "epoch": 0.06270389720005519, "grad_norm": 17.25992273652925, "learning_rate": 4.9980150453755065e-05, "loss": 3.2558, "mean_token_accuracy": 0.31034482419490816, "step": 62255 }, { "epoch": 0.06270893325315936, "grad_norm": 12.113644535999981, "learning_rate": 4.998013471433313e-05, "loss": 3.2247, "mean_token_accuracy": 0.35862069129943847, "step": 62260 }, { "epoch": 0.06271396930626354, "grad_norm": 12.398181084451958, "learning_rate": 4.998011896867625e-05, "loss": 2.5813, "mean_token_accuracy": 0.3793103456497192, "step": 62265 }, { "epoch": 0.06271900535936771, "grad_norm": 14.211156881198992, "learning_rate": 4.998010321678443e-05, "loss": 2.877, "mean_token_accuracy": 0.38275861740112305, "step": 62270 }, { "epoch": 0.06272404141247188, "grad_norm": 12.704639576900126, "learning_rate": 4.9980087458657665e-05, "loss": 2.4824, "mean_token_accuracy": 0.4241379380226135, "step": 62275 }, { "epoch": 0.06272907746557606, "grad_norm": 10.103183824034012, "learning_rate": 4.998007169429596e-05, "loss": 2.7058, "mean_token_accuracy": 0.42232305407524107, "step": 62280 }, { "epoch": 0.06273411351868023, "grad_norm": 11.425038295772477, "learning_rate": 4.998005592369933e-05, "loss": 2.6438, "mean_token_accuracy": 0.4034482777118683, "step": 62285 }, { "epoch": 0.0627391495717844, "grad_norm": 12.882669653487197, "learning_rate": 4.9980040146867777e-05, "loss": 2.7466, "mean_token_accuracy": 0.37586206793785093, "step": 62290 }, { "epoch": 0.06274418562488858, "grad_norm": 12.94803587512305, "learning_rate": 4.9980024363801286e-05, "loss": 2.5969, "mean_token_accuracy": 0.4012704133987427, "step": 62295 }, { "epoch": 0.06274922167799275, "grad_norm": 12.035849375951202, "learning_rate": 4.998000857449988e-05, "loss": 2.4455, "mean_token_accuracy": 0.4935960590839386, "step": 62300 }, { "epoch": 0.06275425773109693, "grad_norm": 11.447652566367399, "learning_rate": 4.997999277896357e-05, "loss": 2.6115, "mean_token_accuracy": 0.38620689809322356, "step": 62305 }, { "epoch": 0.0627592937842011, "grad_norm": 11.777480976953067, "learning_rate": 4.997997697719234e-05, "loss": 2.7785, "mean_token_accuracy": 0.36896551847457887, "step": 62310 }, { "epoch": 0.06276432983730527, "grad_norm": 11.486996245868356, "learning_rate": 4.9979961169186204e-05, "loss": 2.3841, "mean_token_accuracy": 0.43103448748588563, "step": 62315 }, { "epoch": 0.06276936589040943, "grad_norm": 13.241748370206066, "learning_rate": 4.997994535494518e-05, "loss": 2.5649, "mean_token_accuracy": 0.4137930989265442, "step": 62320 }, { "epoch": 0.06277440194351361, "grad_norm": 15.575348358716179, "learning_rate": 4.997992953446925e-05, "loss": 2.18, "mean_token_accuracy": 0.4413793087005615, "step": 62325 }, { "epoch": 0.06277943799661778, "grad_norm": 10.078360050648557, "learning_rate": 4.997991370775843e-05, "loss": 2.3921, "mean_token_accuracy": 0.4137930989265442, "step": 62330 }, { "epoch": 0.06278447404972196, "grad_norm": 12.581449749526632, "learning_rate": 4.997989787481271e-05, "loss": 2.3721, "mean_token_accuracy": 0.41034482717514037, "step": 62335 }, { "epoch": 0.06278951010282613, "grad_norm": 13.209318898230753, "learning_rate": 4.997988203563212e-05, "loss": 2.5787, "mean_token_accuracy": 0.3551724135875702, "step": 62340 }, { "epoch": 0.0627945461559303, "grad_norm": 11.748341230340182, "learning_rate": 4.997986619021664e-05, "loss": 2.4305, "mean_token_accuracy": 0.4068965554237366, "step": 62345 }, { "epoch": 0.06279958220903448, "grad_norm": 11.188471622974934, "learning_rate": 4.99798503385663e-05, "loss": 2.3276, "mean_token_accuracy": 0.4839901506900787, "step": 62350 }, { "epoch": 0.06280461826213865, "grad_norm": 12.34101110526177, "learning_rate": 4.9979834480681086e-05, "loss": 2.6021, "mean_token_accuracy": 0.4034482777118683, "step": 62355 }, { "epoch": 0.06280965431524282, "grad_norm": 12.61302959898061, "learning_rate": 4.9979818616561e-05, "loss": 2.9714, "mean_token_accuracy": 0.3643073230981827, "step": 62360 }, { "epoch": 0.062814690368347, "grad_norm": 13.245669597804927, "learning_rate": 4.997980274620605e-05, "loss": 2.7502, "mean_token_accuracy": 0.3896551787853241, "step": 62365 }, { "epoch": 0.06281972642145117, "grad_norm": 15.808861519887156, "learning_rate": 4.997978686961625e-05, "loss": 2.7324, "mean_token_accuracy": 0.41379310488700866, "step": 62370 }, { "epoch": 0.06282476247455535, "grad_norm": 11.41619169676549, "learning_rate": 4.9979770986791605e-05, "loss": 2.8882, "mean_token_accuracy": 0.3862068891525269, "step": 62375 }, { "epoch": 0.06282979852765952, "grad_norm": 10.40929396866401, "learning_rate": 4.9979755097732094e-05, "loss": 2.1438, "mean_token_accuracy": 0.4915305495262146, "step": 62380 }, { "epoch": 0.0628348345807637, "grad_norm": 12.645847794576438, "learning_rate": 4.997973920243775e-05, "loss": 2.6442, "mean_token_accuracy": 0.39655172228813174, "step": 62385 }, { "epoch": 0.06283987063386785, "grad_norm": 10.774207501865742, "learning_rate": 4.997972330090856e-05, "loss": 2.5731, "mean_token_accuracy": 0.38965516686439516, "step": 62390 }, { "epoch": 0.06284490668697203, "grad_norm": 12.35902198080692, "learning_rate": 4.997970739314454e-05, "loss": 2.4009, "mean_token_accuracy": 0.4206896543502808, "step": 62395 }, { "epoch": 0.0628499427400762, "grad_norm": 11.223575094144403, "learning_rate": 4.997969147914569e-05, "loss": 2.59, "mean_token_accuracy": 0.3965517282485962, "step": 62400 }, { "epoch": 0.06285497879318037, "grad_norm": 11.181386096219601, "learning_rate": 4.997967555891201e-05, "loss": 2.2681, "mean_token_accuracy": 0.4344827592372894, "step": 62405 }, { "epoch": 0.06286001484628455, "grad_norm": 11.092854536486229, "learning_rate": 4.997965963244351e-05, "loss": 2.3509, "mean_token_accuracy": 0.42758620381355283, "step": 62410 }, { "epoch": 0.06286505089938872, "grad_norm": 10.449986378487464, "learning_rate": 4.9979643699740195e-05, "loss": 2.3912, "mean_token_accuracy": 0.441379314661026, "step": 62415 }, { "epoch": 0.0628700869524929, "grad_norm": 12.587710324529981, "learning_rate": 4.997962776080206e-05, "loss": 2.4135, "mean_token_accuracy": 0.3807622492313385, "step": 62420 }, { "epoch": 0.06287512300559707, "grad_norm": 15.892085125642838, "learning_rate": 4.997961181562912e-05, "loss": 2.6002, "mean_token_accuracy": 0.42068966031074523, "step": 62425 }, { "epoch": 0.06288015905870124, "grad_norm": 12.967656971103171, "learning_rate": 4.997959586422138e-05, "loss": 2.2028, "mean_token_accuracy": 0.4793103337287903, "step": 62430 }, { "epoch": 0.06288519511180542, "grad_norm": 13.225917543495527, "learning_rate": 4.9979579906578836e-05, "loss": 3.1354, "mean_token_accuracy": 0.36817906498909, "step": 62435 }, { "epoch": 0.06289023116490959, "grad_norm": 13.472481209756575, "learning_rate": 4.9979563942701494e-05, "loss": 2.3012, "mean_token_accuracy": 0.4482758641242981, "step": 62440 }, { "epoch": 0.06289526721801376, "grad_norm": 11.42313288857788, "learning_rate": 4.9979547972589366e-05, "loss": 2.3888, "mean_token_accuracy": 0.420689657330513, "step": 62445 }, { "epoch": 0.06290030327111794, "grad_norm": 11.673780985232318, "learning_rate": 4.9979531996242446e-05, "loss": 2.2684, "mean_token_accuracy": 0.4517241299152374, "step": 62450 }, { "epoch": 0.06290533932422211, "grad_norm": 9.654641446594203, "learning_rate": 4.9979516013660746e-05, "loss": 2.4958, "mean_token_accuracy": 0.39310344457626345, "step": 62455 }, { "epoch": 0.06291037537732627, "grad_norm": 11.093014663417016, "learning_rate": 4.9979500024844274e-05, "loss": 2.3332, "mean_token_accuracy": 0.42667876482009887, "step": 62460 }, { "epoch": 0.06291541143043045, "grad_norm": 12.458585914553815, "learning_rate": 4.997948402979302e-05, "loss": 2.5219, "mean_token_accuracy": 0.3999999940395355, "step": 62465 }, { "epoch": 0.06292044748353462, "grad_norm": 11.048079183779482, "learning_rate": 4.9979468028507006e-05, "loss": 2.3889, "mean_token_accuracy": 0.4068965554237366, "step": 62470 }, { "epoch": 0.0629254835366388, "grad_norm": 11.244244600912156, "learning_rate": 4.997945202098623e-05, "loss": 2.7073, "mean_token_accuracy": 0.3827586114406586, "step": 62475 }, { "epoch": 0.06293051958974297, "grad_norm": 22.257837315165585, "learning_rate": 4.9979436007230676e-05, "loss": 2.6666, "mean_token_accuracy": 0.401935875415802, "step": 62480 }, { "epoch": 0.06293555564284714, "grad_norm": 12.109950462166607, "learning_rate": 4.9979419987240376e-05, "loss": 2.7719, "mean_token_accuracy": 0.38771929740905764, "step": 62485 }, { "epoch": 0.06294059169595131, "grad_norm": 12.61607693666439, "learning_rate": 4.997940396101533e-05, "loss": 2.3295, "mean_token_accuracy": 0.43448275327682495, "step": 62490 }, { "epoch": 0.06294562774905549, "grad_norm": 10.711281986169466, "learning_rate": 4.997938792855554e-05, "loss": 3.1789, "mean_token_accuracy": 0.3379310339689255, "step": 62495 }, { "epoch": 0.06295066380215966, "grad_norm": 10.510978886405585, "learning_rate": 4.9979371889861e-05, "loss": 2.5674, "mean_token_accuracy": 0.4172413766384125, "step": 62500 }, { "epoch": 0.06295569985526384, "grad_norm": 12.330968358361359, "learning_rate": 4.9979355844931724e-05, "loss": 2.4827, "mean_token_accuracy": 0.4344827473163605, "step": 62505 }, { "epoch": 0.06296073590836801, "grad_norm": 11.424444234372416, "learning_rate": 4.997933979376772e-05, "loss": 2.8833, "mean_token_accuracy": 0.3705989122390747, "step": 62510 }, { "epoch": 0.06296577196147218, "grad_norm": 11.735958911456319, "learning_rate": 4.9979323736368974e-05, "loss": 2.9061, "mean_token_accuracy": 0.3517241358757019, "step": 62515 }, { "epoch": 0.06297080801457636, "grad_norm": 18.22613404235153, "learning_rate": 4.997930767273552e-05, "loss": 2.7004, "mean_token_accuracy": 0.4000000059604645, "step": 62520 }, { "epoch": 0.06297584406768053, "grad_norm": 23.61983593764508, "learning_rate": 4.997929160286734e-05, "loss": 2.8826, "mean_token_accuracy": 0.4034482717514038, "step": 62525 }, { "epoch": 0.06298088012078469, "grad_norm": 11.853141917536039, "learning_rate": 4.997927552676444e-05, "loss": 2.8192, "mean_token_accuracy": 0.3931034505367279, "step": 62530 }, { "epoch": 0.06298591617388886, "grad_norm": 12.224987472827817, "learning_rate": 4.9979259444426836e-05, "loss": 2.3744, "mean_token_accuracy": 0.41724138259887694, "step": 62535 }, { "epoch": 0.06299095222699304, "grad_norm": 14.501782720695113, "learning_rate": 4.997924335585452e-05, "loss": 2.6304, "mean_token_accuracy": 0.36896551847457887, "step": 62540 }, { "epoch": 0.06299598828009721, "grad_norm": 12.22618132731522, "learning_rate": 4.997922726104751e-05, "loss": 2.6139, "mean_token_accuracy": 0.41724138259887694, "step": 62545 }, { "epoch": 0.06300102433320139, "grad_norm": 12.805109773441645, "learning_rate": 4.997921116000579e-05, "loss": 2.4927, "mean_token_accuracy": 0.42758620381355283, "step": 62550 }, { "epoch": 0.06300606038630556, "grad_norm": 10.616842235617703, "learning_rate": 4.9979195052729384e-05, "loss": 2.6438, "mean_token_accuracy": 0.39655172228813174, "step": 62555 }, { "epoch": 0.06301109643940973, "grad_norm": 10.899057300856157, "learning_rate": 4.9979178939218285e-05, "loss": 2.3265, "mean_token_accuracy": 0.4068965494632721, "step": 62560 }, { "epoch": 0.06301613249251391, "grad_norm": 9.906509931671884, "learning_rate": 4.997916281947251e-05, "loss": 2.4277, "mean_token_accuracy": 0.38421053290367124, "step": 62565 }, { "epoch": 0.06302116854561808, "grad_norm": 11.847082871403346, "learning_rate": 4.997914669349205e-05, "loss": 2.4823, "mean_token_accuracy": 0.44568965435028074, "step": 62570 }, { "epoch": 0.06302620459872225, "grad_norm": 17.997542366909325, "learning_rate": 4.9979130561276915e-05, "loss": 2.492, "mean_token_accuracy": 0.41724138259887694, "step": 62575 }, { "epoch": 0.06303124065182643, "grad_norm": 16.56944355255586, "learning_rate": 4.9979114422827106e-05, "loss": 3.0716, "mean_token_accuracy": 0.33103448450565337, "step": 62580 }, { "epoch": 0.0630362767049306, "grad_norm": 11.862315666691755, "learning_rate": 4.997909827814264e-05, "loss": 2.4111, "mean_token_accuracy": 0.4379310369491577, "step": 62585 }, { "epoch": 0.06304131275803478, "grad_norm": 11.48370584649829, "learning_rate": 4.997908212722351e-05, "loss": 2.7825, "mean_token_accuracy": 0.3793103456497192, "step": 62590 }, { "epoch": 0.06304634881113895, "grad_norm": 14.275106085885836, "learning_rate": 4.9979065970069716e-05, "loss": 2.8273, "mean_token_accuracy": 0.38275861740112305, "step": 62595 }, { "epoch": 0.06305138486424311, "grad_norm": 13.054366175325045, "learning_rate": 4.997904980668127e-05, "loss": 2.6159, "mean_token_accuracy": 0.4152450144290924, "step": 62600 }, { "epoch": 0.06305642091734728, "grad_norm": 11.460283129677492, "learning_rate": 4.997903363705818e-05, "loss": 2.7363, "mean_token_accuracy": 0.3551724135875702, "step": 62605 }, { "epoch": 0.06306145697045146, "grad_norm": 16.86719803656878, "learning_rate": 4.997901746120045e-05, "loss": 2.7191, "mean_token_accuracy": 0.4206896543502808, "step": 62610 }, { "epoch": 0.06306649302355563, "grad_norm": 11.60301527465864, "learning_rate": 4.9979001279108076e-05, "loss": 2.7312, "mean_token_accuracy": 0.38275861740112305, "step": 62615 }, { "epoch": 0.0630715290766598, "grad_norm": 9.59334785716302, "learning_rate": 4.9978985090781064e-05, "loss": 2.4335, "mean_token_accuracy": 0.415426504611969, "step": 62620 }, { "epoch": 0.06307656512976398, "grad_norm": 10.853431205632479, "learning_rate": 4.997896889621942e-05, "loss": 2.6157, "mean_token_accuracy": 0.39655172228813174, "step": 62625 }, { "epoch": 0.06308160118286815, "grad_norm": 12.081148675996888, "learning_rate": 4.9978952695423156e-05, "loss": 2.493, "mean_token_accuracy": 0.43448275327682495, "step": 62630 }, { "epoch": 0.06308663723597233, "grad_norm": 12.755527740367013, "learning_rate": 4.997893648839227e-05, "loss": 2.5389, "mean_token_accuracy": 0.4913793087005615, "step": 62635 }, { "epoch": 0.0630916732890765, "grad_norm": 11.795269004460417, "learning_rate": 4.997892027512677e-05, "loss": 3.228, "mean_token_accuracy": 0.33793103098869326, "step": 62640 }, { "epoch": 0.06309670934218067, "grad_norm": 10.671812678460117, "learning_rate": 4.997890405562665e-05, "loss": 2.7223, "mean_token_accuracy": 0.3551724135875702, "step": 62645 }, { "epoch": 0.06310174539528485, "grad_norm": 11.676922883180511, "learning_rate": 4.997888782989193e-05, "loss": 2.6014, "mean_token_accuracy": 0.39655172228813174, "step": 62650 }, { "epoch": 0.06310678144838902, "grad_norm": 10.538287782999582, "learning_rate": 4.99788715979226e-05, "loss": 2.2629, "mean_token_accuracy": 0.4206896543502808, "step": 62655 }, { "epoch": 0.0631118175014932, "grad_norm": 14.90855415026501, "learning_rate": 4.997885535971867e-05, "loss": 2.7905, "mean_token_accuracy": 0.4137930989265442, "step": 62660 }, { "epoch": 0.06311685355459737, "grad_norm": 9.237068849497524, "learning_rate": 4.9978839115280144e-05, "loss": 2.0829, "mean_token_accuracy": 0.49602670073509214, "step": 62665 }, { "epoch": 0.06312188960770153, "grad_norm": 10.361769433594771, "learning_rate": 4.997882286460704e-05, "loss": 2.3876, "mean_token_accuracy": 0.4241379380226135, "step": 62670 }, { "epoch": 0.0631269256608057, "grad_norm": 12.489250870863886, "learning_rate": 4.997880660769934e-05, "loss": 2.696, "mean_token_accuracy": 0.39655172228813174, "step": 62675 }, { "epoch": 0.06313196171390988, "grad_norm": 13.983554636084667, "learning_rate": 4.997879034455707e-05, "loss": 2.5567, "mean_token_accuracy": 0.37586206793785093, "step": 62680 }, { "epoch": 0.06313699776701405, "grad_norm": 9.814819149116904, "learning_rate": 4.997877407518021e-05, "loss": 2.4475, "mean_token_accuracy": 0.42758620977401735, "step": 62685 }, { "epoch": 0.06314203382011822, "grad_norm": 10.93821320454277, "learning_rate": 4.9978757799568784e-05, "loss": 2.7511, "mean_token_accuracy": 0.3758620649576187, "step": 62690 }, { "epoch": 0.0631470698732224, "grad_norm": 10.829397335044291, "learning_rate": 4.99787415177228e-05, "loss": 2.245, "mean_token_accuracy": 0.4851784646511078, "step": 62695 }, { "epoch": 0.06315210592632657, "grad_norm": 9.712067111178953, "learning_rate": 4.997872522964224e-05, "loss": 2.1791, "mean_token_accuracy": 0.41379310488700866, "step": 62700 }, { "epoch": 0.06315714197943074, "grad_norm": 11.873982588167959, "learning_rate": 4.997870893532713e-05, "loss": 2.7286, "mean_token_accuracy": 0.4344827473163605, "step": 62705 }, { "epoch": 0.06316217803253492, "grad_norm": 24.710975842656893, "learning_rate": 4.997869263477746e-05, "loss": 2.8054, "mean_token_accuracy": 0.3379310369491577, "step": 62710 }, { "epoch": 0.06316721408563909, "grad_norm": 11.575714797171193, "learning_rate": 4.997867632799324e-05, "loss": 2.5979, "mean_token_accuracy": 0.34137930870056155, "step": 62715 }, { "epoch": 0.06317225013874327, "grad_norm": 10.513917086488547, "learning_rate": 4.997866001497448e-05, "loss": 2.4359, "mean_token_accuracy": 0.4119782209396362, "step": 62720 }, { "epoch": 0.06317728619184744, "grad_norm": 10.698267865912632, "learning_rate": 4.9978643695721184e-05, "loss": 2.4976, "mean_token_accuracy": 0.4103448331356049, "step": 62725 }, { "epoch": 0.06318232224495161, "grad_norm": 18.16761284445716, "learning_rate": 4.997862737023334e-05, "loss": 2.4092, "mean_token_accuracy": 0.41584996581077577, "step": 62730 }, { "epoch": 0.06318735829805579, "grad_norm": 12.508270562390786, "learning_rate": 4.9978611038510976e-05, "loss": 2.654, "mean_token_accuracy": 0.38965516686439516, "step": 62735 }, { "epoch": 0.06319239435115995, "grad_norm": 11.397088166177975, "learning_rate": 4.997859470055408e-05, "loss": 2.968, "mean_token_accuracy": 0.34482758641242983, "step": 62740 }, { "epoch": 0.06319743040426412, "grad_norm": 10.100531089945259, "learning_rate": 4.997857835636266e-05, "loss": 2.2038, "mean_token_accuracy": 0.4551724135875702, "step": 62745 }, { "epoch": 0.0632024664573683, "grad_norm": 11.226503582629404, "learning_rate": 4.9978562005936724e-05, "loss": 2.9389, "mean_token_accuracy": 0.36896551847457887, "step": 62750 }, { "epoch": 0.06320750251047247, "grad_norm": 12.870337472897727, "learning_rate": 4.997854564927628e-05, "loss": 2.9107, "mean_token_accuracy": 0.36896551251411436, "step": 62755 }, { "epoch": 0.06321253856357664, "grad_norm": 12.829992082370925, "learning_rate": 4.997852928638132e-05, "loss": 2.7187, "mean_token_accuracy": 0.41724138855934145, "step": 62760 }, { "epoch": 0.06321757461668082, "grad_norm": 12.229133994137596, "learning_rate": 4.997851291725186e-05, "loss": 2.6039, "mean_token_accuracy": 0.41034482717514037, "step": 62765 }, { "epoch": 0.06322261066978499, "grad_norm": 11.061082915404828, "learning_rate": 4.99784965418879e-05, "loss": 2.3104, "mean_token_accuracy": 0.4068965554237366, "step": 62770 }, { "epoch": 0.06322764672288916, "grad_norm": 15.358048199655533, "learning_rate": 4.9978480160289447e-05, "loss": 2.4977, "mean_token_accuracy": 0.42413793206214906, "step": 62775 }, { "epoch": 0.06323268277599334, "grad_norm": 10.643038854491767, "learning_rate": 4.997846377245651e-05, "loss": 2.578, "mean_token_accuracy": 0.39655172228813174, "step": 62780 }, { "epoch": 0.06323771882909751, "grad_norm": 12.07324951302468, "learning_rate": 4.997844737838907e-05, "loss": 2.5732, "mean_token_accuracy": 0.4206896543502808, "step": 62785 }, { "epoch": 0.06324275488220168, "grad_norm": 13.497796717479257, "learning_rate": 4.9978430978087157e-05, "loss": 2.6489, "mean_token_accuracy": 0.38620689511299133, "step": 62790 }, { "epoch": 0.06324779093530586, "grad_norm": 11.293321670814565, "learning_rate": 4.997841457155077e-05, "loss": 2.2181, "mean_token_accuracy": 0.441379314661026, "step": 62795 }, { "epoch": 0.06325282698841003, "grad_norm": 11.953459418662739, "learning_rate": 4.9978398158779914e-05, "loss": 2.5272, "mean_token_accuracy": 0.41034482717514037, "step": 62800 }, { "epoch": 0.0632578630415142, "grad_norm": 10.55252555515734, "learning_rate": 4.9978381739774585e-05, "loss": 2.5238, "mean_token_accuracy": 0.458620685338974, "step": 62805 }, { "epoch": 0.06326289909461837, "grad_norm": 12.656446890749, "learning_rate": 4.997836531453479e-05, "loss": 2.6859, "mean_token_accuracy": 0.3310344755649567, "step": 62810 }, { "epoch": 0.06326793514772254, "grad_norm": 10.21942769802935, "learning_rate": 4.997834888306054e-05, "loss": 2.5237, "mean_token_accuracy": 0.4068965494632721, "step": 62815 }, { "epoch": 0.06327297120082671, "grad_norm": 10.12716900947491, "learning_rate": 4.9978332445351834e-05, "loss": 2.1677, "mean_token_accuracy": 0.4819116711616516, "step": 62820 }, { "epoch": 0.06327800725393089, "grad_norm": 11.776979181085718, "learning_rate": 4.997831600140869e-05, "loss": 2.9502, "mean_token_accuracy": 0.3551724076271057, "step": 62825 }, { "epoch": 0.06328304330703506, "grad_norm": 10.794313497532576, "learning_rate": 4.997829955123109e-05, "loss": 2.5536, "mean_token_accuracy": 0.4220810651779175, "step": 62830 }, { "epoch": 0.06328807936013923, "grad_norm": 10.836106390048661, "learning_rate": 4.997828309481905e-05, "loss": 2.3873, "mean_token_accuracy": 0.42758620381355283, "step": 62835 }, { "epoch": 0.06329311541324341, "grad_norm": 21.372688150776362, "learning_rate": 4.9978266632172573e-05, "loss": 2.7892, "mean_token_accuracy": 0.45722927451133727, "step": 62840 }, { "epoch": 0.06329815146634758, "grad_norm": 10.962490477211139, "learning_rate": 4.997825016329167e-05, "loss": 2.6139, "mean_token_accuracy": 0.42758620977401735, "step": 62845 }, { "epoch": 0.06330318751945176, "grad_norm": 11.831796561043065, "learning_rate": 4.997823368817634e-05, "loss": 2.7147, "mean_token_accuracy": 0.3827586233615875, "step": 62850 }, { "epoch": 0.06330822357255593, "grad_norm": 11.027178918930792, "learning_rate": 4.997821720682658e-05, "loss": 2.5693, "mean_token_accuracy": 0.38965516686439516, "step": 62855 }, { "epoch": 0.0633132596256601, "grad_norm": 14.645262125735309, "learning_rate": 4.997820071924241e-05, "loss": 2.5093, "mean_token_accuracy": 0.38965516686439516, "step": 62860 }, { "epoch": 0.06331829567876428, "grad_norm": 11.806101436397247, "learning_rate": 4.997818422542384e-05, "loss": 2.5099, "mean_token_accuracy": 0.4172413766384125, "step": 62865 }, { "epoch": 0.06332333173186845, "grad_norm": 11.735624725176145, "learning_rate": 4.997816772537084e-05, "loss": 2.881, "mean_token_accuracy": 0.36896551549434664, "step": 62870 }, { "epoch": 0.06332836778497263, "grad_norm": 10.974194335934163, "learning_rate": 4.997815121908344e-05, "loss": 2.4363, "mean_token_accuracy": 0.42758620977401735, "step": 62875 }, { "epoch": 0.06333340383807678, "grad_norm": 15.37668527091801, "learning_rate": 4.997813470656165e-05, "loss": 2.3278, "mean_token_accuracy": 0.4068965494632721, "step": 62880 }, { "epoch": 0.06333843989118096, "grad_norm": 9.995425936929692, "learning_rate": 4.9978118187805464e-05, "loss": 2.6105, "mean_token_accuracy": 0.45862069725990295, "step": 62885 }, { "epoch": 0.06334347594428513, "grad_norm": 11.345111963198455, "learning_rate": 4.997810166281488e-05, "loss": 2.3433, "mean_token_accuracy": 0.4517241299152374, "step": 62890 }, { "epoch": 0.0633485119973893, "grad_norm": 13.867665584125604, "learning_rate": 4.9978085131589917e-05, "loss": 2.7356, "mean_token_accuracy": 0.3482758641242981, "step": 62895 }, { "epoch": 0.06335354805049348, "grad_norm": 10.51114361338589, "learning_rate": 4.9978068594130574e-05, "loss": 2.5965, "mean_token_accuracy": 0.3896551728248596, "step": 62900 }, { "epoch": 0.06335858410359765, "grad_norm": 12.408525903786222, "learning_rate": 4.9978052050436845e-05, "loss": 2.4405, "mean_token_accuracy": 0.39655172228813174, "step": 62905 }, { "epoch": 0.06336362015670183, "grad_norm": 11.164442797086085, "learning_rate": 4.997803550050876e-05, "loss": 2.4056, "mean_token_accuracy": 0.4551724135875702, "step": 62910 }, { "epoch": 0.063368656209806, "grad_norm": 13.51298160588357, "learning_rate": 4.99780189443463e-05, "loss": 2.6889, "mean_token_accuracy": 0.3793103456497192, "step": 62915 }, { "epoch": 0.06337369226291018, "grad_norm": 11.619742638909166, "learning_rate": 4.997800238194947e-05, "loss": 2.5925, "mean_token_accuracy": 0.38275861740112305, "step": 62920 }, { "epoch": 0.06337872831601435, "grad_norm": 13.307964549359367, "learning_rate": 4.997798581331829e-05, "loss": 2.2547, "mean_token_accuracy": 0.4344827592372894, "step": 62925 }, { "epoch": 0.06338376436911852, "grad_norm": 14.397365160648638, "learning_rate": 4.997796923845276e-05, "loss": 2.7711, "mean_token_accuracy": 0.38275861740112305, "step": 62930 }, { "epoch": 0.0633888004222227, "grad_norm": 10.497841330405317, "learning_rate": 4.997795265735288e-05, "loss": 2.5143, "mean_token_accuracy": 0.43103448748588563, "step": 62935 }, { "epoch": 0.06339383647532687, "grad_norm": 10.823227285467338, "learning_rate": 4.997793607001865e-05, "loss": 2.9231, "mean_token_accuracy": 0.37241379618644715, "step": 62940 }, { "epoch": 0.06339887252843104, "grad_norm": 12.947377640832107, "learning_rate": 4.997791947645009e-05, "loss": 2.5871, "mean_token_accuracy": 0.38620689809322356, "step": 62945 }, { "epoch": 0.0634039085815352, "grad_norm": 12.817206924961173, "learning_rate": 4.9977902876647184e-05, "loss": 2.5038, "mean_token_accuracy": 0.3862068891525269, "step": 62950 }, { "epoch": 0.06340894463463938, "grad_norm": 12.508946739520635, "learning_rate": 4.9977886270609954e-05, "loss": 2.8962, "mean_token_accuracy": 0.35862069129943847, "step": 62955 }, { "epoch": 0.06341398068774355, "grad_norm": 11.33053567371234, "learning_rate": 4.99778696583384e-05, "loss": 2.3936, "mean_token_accuracy": 0.38620689511299133, "step": 62960 }, { "epoch": 0.06341901674084773, "grad_norm": 10.486351483419694, "learning_rate": 4.997785303983252e-05, "loss": 2.4282, "mean_token_accuracy": 0.3965517163276672, "step": 62965 }, { "epoch": 0.0634240527939519, "grad_norm": 13.012336525853955, "learning_rate": 4.9977836415092324e-05, "loss": 2.9493, "mean_token_accuracy": 0.3172413736581802, "step": 62970 }, { "epoch": 0.06342908884705607, "grad_norm": 15.176180764010281, "learning_rate": 4.997781978411782e-05, "loss": 2.3287, "mean_token_accuracy": 0.3931034505367279, "step": 62975 }, { "epoch": 0.06343412490016025, "grad_norm": 12.208617826001646, "learning_rate": 4.997780314690901e-05, "loss": 2.1636, "mean_token_accuracy": 0.4620689630508423, "step": 62980 }, { "epoch": 0.06343916095326442, "grad_norm": 12.202523748047229, "learning_rate": 4.99777865034659e-05, "loss": 2.4651, "mean_token_accuracy": 0.4068965494632721, "step": 62985 }, { "epoch": 0.0634441970063686, "grad_norm": 10.509024103551493, "learning_rate": 4.997776985378848e-05, "loss": 2.7396, "mean_token_accuracy": 0.3862069070339203, "step": 62990 }, { "epoch": 0.06344923305947277, "grad_norm": 12.697886661582267, "learning_rate": 4.997775319787678e-05, "loss": 2.3262, "mean_token_accuracy": 0.38275861740112305, "step": 62995 }, { "epoch": 0.06345426911257694, "grad_norm": 9.792681006966529, "learning_rate": 4.997773653573078e-05, "loss": 2.6428, "mean_token_accuracy": 0.3931034505367279, "step": 63000 }, { "epoch": 0.06345930516568112, "grad_norm": 10.925680548089185, "learning_rate": 4.99777198673505e-05, "loss": 2.3808, "mean_token_accuracy": 0.41379310488700866, "step": 63005 }, { "epoch": 0.06346434121878529, "grad_norm": 11.889808588793308, "learning_rate": 4.997770319273594e-05, "loss": 2.6302, "mean_token_accuracy": 0.37931033968925476, "step": 63010 }, { "epoch": 0.06346937727188946, "grad_norm": 11.566248975214169, "learning_rate": 4.997768651188711e-05, "loss": 2.5148, "mean_token_accuracy": 0.3931034505367279, "step": 63015 }, { "epoch": 0.06347441332499362, "grad_norm": 10.211655840622063, "learning_rate": 4.9977669824804e-05, "loss": 2.5806, "mean_token_accuracy": 0.4137930989265442, "step": 63020 }, { "epoch": 0.0634794493780978, "grad_norm": 10.87467976315671, "learning_rate": 4.997765313148664e-05, "loss": 2.3792, "mean_token_accuracy": 0.4551724135875702, "step": 63025 }, { "epoch": 0.06348448543120197, "grad_norm": 12.49875647957717, "learning_rate": 4.9977636431935005e-05, "loss": 2.168, "mean_token_accuracy": 0.4517241299152374, "step": 63030 }, { "epoch": 0.06348952148430614, "grad_norm": 13.348785850842898, "learning_rate": 4.997761972614912e-05, "loss": 2.3538, "mean_token_accuracy": 0.37241379022598264, "step": 63035 }, { "epoch": 0.06349455753741032, "grad_norm": 11.006129740225912, "learning_rate": 4.997760301412898e-05, "loss": 2.2569, "mean_token_accuracy": 0.44682395458221436, "step": 63040 }, { "epoch": 0.06349959359051449, "grad_norm": 17.709199345690887, "learning_rate": 4.99775862958746e-05, "loss": 2.4971, "mean_token_accuracy": 0.37586206793785093, "step": 63045 }, { "epoch": 0.06350462964361867, "grad_norm": 10.296251984232336, "learning_rate": 4.9977569571385966e-05, "loss": 2.3312, "mean_token_accuracy": 0.45862067937850953, "step": 63050 }, { "epoch": 0.06350966569672284, "grad_norm": 12.590140769430512, "learning_rate": 4.9977552840663106e-05, "loss": 2.512, "mean_token_accuracy": 0.42413793206214906, "step": 63055 }, { "epoch": 0.06351470174982701, "grad_norm": 11.295289432710671, "learning_rate": 4.997753610370601e-05, "loss": 2.235, "mean_token_accuracy": 0.47931034564971925, "step": 63060 }, { "epoch": 0.06351973780293119, "grad_norm": 17.41531609938963, "learning_rate": 4.997751936051468e-05, "loss": 2.8108, "mean_token_accuracy": 0.3862069010734558, "step": 63065 }, { "epoch": 0.06352477385603536, "grad_norm": 11.526873377100701, "learning_rate": 4.997750261108913e-05, "loss": 2.4722, "mean_token_accuracy": 0.4413793087005615, "step": 63070 }, { "epoch": 0.06352980990913953, "grad_norm": 10.80343095231409, "learning_rate": 4.997748585542936e-05, "loss": 2.7652, "mean_token_accuracy": 0.3827586233615875, "step": 63075 }, { "epoch": 0.06353484596224371, "grad_norm": 12.391753481395607, "learning_rate": 4.997746909353538e-05, "loss": 2.6906, "mean_token_accuracy": 0.36896551847457887, "step": 63080 }, { "epoch": 0.06353988201534788, "grad_norm": 10.849596166029507, "learning_rate": 4.9977452325407184e-05, "loss": 2.3387, "mean_token_accuracy": 0.4344827592372894, "step": 63085 }, { "epoch": 0.06354491806845204, "grad_norm": 11.34962919939225, "learning_rate": 4.997743555104478e-05, "loss": 2.8666, "mean_token_accuracy": 0.38620689511299133, "step": 63090 }, { "epoch": 0.06354995412155622, "grad_norm": 9.645527777748962, "learning_rate": 4.997741877044818e-05, "loss": 2.6885, "mean_token_accuracy": 0.3965517163276672, "step": 63095 }, { "epoch": 0.06355499017466039, "grad_norm": 9.765620035514612, "learning_rate": 4.9977401983617386e-05, "loss": 2.8432, "mean_token_accuracy": 0.47241379618644713, "step": 63100 }, { "epoch": 0.06356002622776456, "grad_norm": 9.516551623707082, "learning_rate": 4.9977385190552394e-05, "loss": 2.6742, "mean_token_accuracy": 0.3655172437429428, "step": 63105 }, { "epoch": 0.06356506228086874, "grad_norm": 9.295572835471232, "learning_rate": 4.9977368391253225e-05, "loss": 2.3065, "mean_token_accuracy": 0.41560798287391665, "step": 63110 }, { "epoch": 0.06357009833397291, "grad_norm": 10.821879250169616, "learning_rate": 4.997735158571986e-05, "loss": 2.7486, "mean_token_accuracy": 0.3844525098800659, "step": 63115 }, { "epoch": 0.06357513438707708, "grad_norm": 13.628662211805263, "learning_rate": 4.9977334773952333e-05, "loss": 3.116, "mean_token_accuracy": 0.4, "step": 63120 }, { "epoch": 0.06358017044018126, "grad_norm": 21.20882584696481, "learning_rate": 4.9977317955950623e-05, "loss": 2.3007, "mean_token_accuracy": 0.3965517282485962, "step": 63125 }, { "epoch": 0.06358520649328543, "grad_norm": 14.21690545699418, "learning_rate": 4.997730113171475e-05, "loss": 2.3823, "mean_token_accuracy": 0.458620685338974, "step": 63130 }, { "epoch": 0.0635902425463896, "grad_norm": 14.38705054726303, "learning_rate": 4.9977284301244716e-05, "loss": 2.5231, "mean_token_accuracy": 0.41379310488700866, "step": 63135 }, { "epoch": 0.06359527859949378, "grad_norm": 11.278973915636076, "learning_rate": 4.997726746454052e-05, "loss": 2.562, "mean_token_accuracy": 0.41034482717514037, "step": 63140 }, { "epoch": 0.06360031465259795, "grad_norm": 11.427734078640018, "learning_rate": 4.9977250621602165e-05, "loss": 2.5195, "mean_token_accuracy": 0.39655172228813174, "step": 63145 }, { "epoch": 0.06360535070570213, "grad_norm": 12.187791017923425, "learning_rate": 4.997723377242966e-05, "loss": 3.1222, "mean_token_accuracy": 0.334482753276825, "step": 63150 }, { "epoch": 0.0636103867588063, "grad_norm": 11.539350390152805, "learning_rate": 4.9977216917023026e-05, "loss": 2.644, "mean_token_accuracy": 0.3793103456497192, "step": 63155 }, { "epoch": 0.06361542281191046, "grad_norm": 11.24816336109071, "learning_rate": 4.997720005538224e-05, "loss": 2.6427, "mean_token_accuracy": 0.3999999940395355, "step": 63160 }, { "epoch": 0.06362045886501463, "grad_norm": 12.509004571078448, "learning_rate": 4.997718318750732e-05, "loss": 2.7501, "mean_token_accuracy": 0.3275862067937851, "step": 63165 }, { "epoch": 0.06362549491811881, "grad_norm": 11.598917291669215, "learning_rate": 4.9977166313398274e-05, "loss": 2.8979, "mean_token_accuracy": 0.3620689630508423, "step": 63170 }, { "epoch": 0.06363053097122298, "grad_norm": 11.166131181654826, "learning_rate": 4.9977149433055095e-05, "loss": 2.3234, "mean_token_accuracy": 0.43448275327682495, "step": 63175 }, { "epoch": 0.06363556702432716, "grad_norm": 13.551153532101033, "learning_rate": 4.9977132546477794e-05, "loss": 2.5216, "mean_token_accuracy": 0.3931034505367279, "step": 63180 }, { "epoch": 0.06364060307743133, "grad_norm": 18.9003953210268, "learning_rate": 4.9977115653666385e-05, "loss": 3.004, "mean_token_accuracy": 0.3344827651977539, "step": 63185 }, { "epoch": 0.0636456391305355, "grad_norm": 11.158137401513866, "learning_rate": 4.9977098754620854e-05, "loss": 2.556, "mean_token_accuracy": 0.43103448748588563, "step": 63190 }, { "epoch": 0.06365067518363968, "grad_norm": 23.87861349720783, "learning_rate": 4.997708184934123e-05, "loss": 2.5001, "mean_token_accuracy": 0.41724138259887694, "step": 63195 }, { "epoch": 0.06365571123674385, "grad_norm": 12.204114721710244, "learning_rate": 4.997706493782749e-05, "loss": 2.6374, "mean_token_accuracy": 0.38275861740112305, "step": 63200 }, { "epoch": 0.06366074728984802, "grad_norm": 13.214232527625812, "learning_rate": 4.997704802007965e-05, "loss": 2.2976, "mean_token_accuracy": 0.4034482777118683, "step": 63205 }, { "epoch": 0.0636657833429522, "grad_norm": 11.084630459788958, "learning_rate": 4.997703109609773e-05, "loss": 2.3995, "mean_token_accuracy": 0.3827586233615875, "step": 63210 }, { "epoch": 0.06367081939605637, "grad_norm": 10.238150474875157, "learning_rate": 4.997701416588172e-05, "loss": 2.5116, "mean_token_accuracy": 0.39177253246307375, "step": 63215 }, { "epoch": 0.06367585544916055, "grad_norm": 10.594671521227854, "learning_rate": 4.997699722943162e-05, "loss": 2.4187, "mean_token_accuracy": 0.4517241418361664, "step": 63220 }, { "epoch": 0.06368089150226472, "grad_norm": 13.080216595447656, "learning_rate": 4.9976980286747444e-05, "loss": 2.9102, "mean_token_accuracy": 0.358620685338974, "step": 63225 }, { "epoch": 0.06368592755536888, "grad_norm": 12.23305530208788, "learning_rate": 4.997696333782919e-05, "loss": 2.7292, "mean_token_accuracy": 0.3793103456497192, "step": 63230 }, { "epoch": 0.06369096360847305, "grad_norm": 11.94714586330912, "learning_rate": 4.997694638267687e-05, "loss": 2.6121, "mean_token_accuracy": 0.358620685338974, "step": 63235 }, { "epoch": 0.06369599966157723, "grad_norm": 12.489467614460153, "learning_rate": 4.997692942129048e-05, "loss": 2.6647, "mean_token_accuracy": 0.4172413766384125, "step": 63240 }, { "epoch": 0.0637010357146814, "grad_norm": 9.919376558515882, "learning_rate": 4.997691245367004e-05, "loss": 2.5829, "mean_token_accuracy": 0.42413793206214906, "step": 63245 }, { "epoch": 0.06370607176778557, "grad_norm": 13.804107657013484, "learning_rate": 4.9976895479815534e-05, "loss": 2.9093, "mean_token_accuracy": 0.36896551251411436, "step": 63250 }, { "epoch": 0.06371110782088975, "grad_norm": 11.990478593465738, "learning_rate": 4.997687849972698e-05, "loss": 2.7383, "mean_token_accuracy": 0.3931034505367279, "step": 63255 }, { "epoch": 0.06371614387399392, "grad_norm": 15.679260867060291, "learning_rate": 4.997686151340438e-05, "loss": 2.6363, "mean_token_accuracy": 0.42413793206214906, "step": 63260 }, { "epoch": 0.0637211799270981, "grad_norm": 8.256280967282471, "learning_rate": 4.9976844520847735e-05, "loss": 2.2399, "mean_token_accuracy": 0.4551724076271057, "step": 63265 }, { "epoch": 0.06372621598020227, "grad_norm": 12.107231461333452, "learning_rate": 4.997682752205706e-05, "loss": 2.453, "mean_token_accuracy": 0.42758620381355283, "step": 63270 }, { "epoch": 0.06373125203330644, "grad_norm": 12.659418740034596, "learning_rate": 4.997681051703236e-05, "loss": 2.5193, "mean_token_accuracy": 0.4482758641242981, "step": 63275 }, { "epoch": 0.06373628808641062, "grad_norm": 13.315742665187408, "learning_rate": 4.997679350577362e-05, "loss": 3.1609, "mean_token_accuracy": 0.3068965464830399, "step": 63280 }, { "epoch": 0.06374132413951479, "grad_norm": 11.20398462130214, "learning_rate": 4.997677648828086e-05, "loss": 2.545, "mean_token_accuracy": 0.3931034505367279, "step": 63285 }, { "epoch": 0.06374636019261896, "grad_norm": 10.878976319003169, "learning_rate": 4.997675946455408e-05, "loss": 2.2835, "mean_token_accuracy": 0.4310344815254211, "step": 63290 }, { "epoch": 0.06375139624572314, "grad_norm": 11.127145562508073, "learning_rate": 4.997674243459329e-05, "loss": 2.4556, "mean_token_accuracy": 0.3999999940395355, "step": 63295 }, { "epoch": 0.0637564322988273, "grad_norm": 16.925077194729635, "learning_rate": 4.997672539839849e-05, "loss": 2.8662, "mean_token_accuracy": 0.3655172407627106, "step": 63300 }, { "epoch": 0.06376146835193147, "grad_norm": 14.375031439210662, "learning_rate": 4.9976708355969696e-05, "loss": 2.9231, "mean_token_accuracy": 0.38620689511299133, "step": 63305 }, { "epoch": 0.06376650440503565, "grad_norm": 11.408587774770599, "learning_rate": 4.9976691307306895e-05, "loss": 2.436, "mean_token_accuracy": 0.3551724076271057, "step": 63310 }, { "epoch": 0.06377154045813982, "grad_norm": 10.83959831649001, "learning_rate": 4.99766742524101e-05, "loss": 2.3578, "mean_token_accuracy": 0.4517241358757019, "step": 63315 }, { "epoch": 0.063776576511244, "grad_norm": 13.484847677117825, "learning_rate": 4.997665719127931e-05, "loss": 2.7433, "mean_token_accuracy": 0.3655172407627106, "step": 63320 }, { "epoch": 0.06378161256434817, "grad_norm": 11.77695444858873, "learning_rate": 4.9976640123914545e-05, "loss": 2.602, "mean_token_accuracy": 0.3931034505367279, "step": 63325 }, { "epoch": 0.06378664861745234, "grad_norm": 11.811662870956198, "learning_rate": 4.99766230503158e-05, "loss": 2.3361, "mean_token_accuracy": 0.4517241418361664, "step": 63330 }, { "epoch": 0.06379168467055651, "grad_norm": 10.276126145341447, "learning_rate": 4.997660597048307e-05, "loss": 2.5772, "mean_token_accuracy": 0.3620689630508423, "step": 63335 }, { "epoch": 0.06379672072366069, "grad_norm": 12.198763836826688, "learning_rate": 4.997658888441637e-05, "loss": 2.6502, "mean_token_accuracy": 0.3896551728248596, "step": 63340 }, { "epoch": 0.06380175677676486, "grad_norm": 11.661362387885186, "learning_rate": 4.997657179211572e-05, "loss": 2.2321, "mean_token_accuracy": 0.48965518474578856, "step": 63345 }, { "epoch": 0.06380679282986904, "grad_norm": 12.647521332896089, "learning_rate": 4.997655469358109e-05, "loss": 2.6739, "mean_token_accuracy": 0.3655172407627106, "step": 63350 }, { "epoch": 0.06381182888297321, "grad_norm": 13.875023624712433, "learning_rate": 4.997653758881252e-05, "loss": 2.6246, "mean_token_accuracy": 0.3999999940395355, "step": 63355 }, { "epoch": 0.06381686493607738, "grad_norm": 10.24518492284466, "learning_rate": 4.9976520477809984e-05, "loss": 2.3814, "mean_token_accuracy": 0.4206896543502808, "step": 63360 }, { "epoch": 0.06382190098918156, "grad_norm": 10.964505185797076, "learning_rate": 4.997650336057351e-05, "loss": 2.4232, "mean_token_accuracy": 0.4103448212146759, "step": 63365 }, { "epoch": 0.06382693704228572, "grad_norm": 12.912499536411739, "learning_rate": 4.9976486237103096e-05, "loss": 2.5719, "mean_token_accuracy": 0.4034482717514038, "step": 63370 }, { "epoch": 0.06383197309538989, "grad_norm": 11.440229352750475, "learning_rate": 4.997646910739874e-05, "loss": 2.9031, "mean_token_accuracy": 0.3241379290819168, "step": 63375 }, { "epoch": 0.06383700914849406, "grad_norm": 12.906632040609026, "learning_rate": 4.997645197146045e-05, "loss": 2.7181, "mean_token_accuracy": 0.38275861740112305, "step": 63380 }, { "epoch": 0.06384204520159824, "grad_norm": 11.715984917397773, "learning_rate": 4.9976434829288235e-05, "loss": 2.6308, "mean_token_accuracy": 0.3827586233615875, "step": 63385 }, { "epoch": 0.06384708125470241, "grad_norm": 13.657477189207045, "learning_rate": 4.99764176808821e-05, "loss": 2.8588, "mean_token_accuracy": 0.34137930274009703, "step": 63390 }, { "epoch": 0.06385211730780659, "grad_norm": 11.755087163734018, "learning_rate": 4.997640052624204e-05, "loss": 2.4587, "mean_token_accuracy": 0.43793103098869324, "step": 63395 }, { "epoch": 0.06385715336091076, "grad_norm": 11.367460200987733, "learning_rate": 4.997638336536807e-05, "loss": 2.2045, "mean_token_accuracy": 0.4310344815254211, "step": 63400 }, { "epoch": 0.06386218941401493, "grad_norm": 11.591915284504195, "learning_rate": 4.997636619826019e-05, "loss": 2.7151, "mean_token_accuracy": 0.42413792610168455, "step": 63405 }, { "epoch": 0.06386722546711911, "grad_norm": 10.571675202571237, "learning_rate": 4.997634902491841e-05, "loss": 2.1931, "mean_token_accuracy": 0.40865094065666197, "step": 63410 }, { "epoch": 0.06387226152022328, "grad_norm": 11.07506351615169, "learning_rate": 4.997633184534272e-05, "loss": 2.4622, "mean_token_accuracy": 0.4413793206214905, "step": 63415 }, { "epoch": 0.06387729757332745, "grad_norm": 13.903265912634309, "learning_rate": 4.997631465953315e-05, "loss": 2.5838, "mean_token_accuracy": 0.3965517282485962, "step": 63420 }, { "epoch": 0.06388233362643163, "grad_norm": 10.819228478310656, "learning_rate": 4.997629746748968e-05, "loss": 2.5452, "mean_token_accuracy": 0.3965517282485962, "step": 63425 }, { "epoch": 0.0638873696795358, "grad_norm": 12.73018680328224, "learning_rate": 4.997628026921232e-05, "loss": 2.3459, "mean_token_accuracy": 0.46364185214042664, "step": 63430 }, { "epoch": 0.06389240573263998, "grad_norm": 10.221986991331171, "learning_rate": 4.9976263064701096e-05, "loss": 2.8595, "mean_token_accuracy": 0.4068965494632721, "step": 63435 }, { "epoch": 0.06389744178574414, "grad_norm": 10.529447617098226, "learning_rate": 4.9976245853955985e-05, "loss": 2.4286, "mean_token_accuracy": 0.4620689630508423, "step": 63440 }, { "epoch": 0.06390247783884831, "grad_norm": 10.191378591498585, "learning_rate": 4.997622863697701e-05, "loss": 2.4441, "mean_token_accuracy": 0.47108287215232847, "step": 63445 }, { "epoch": 0.06390751389195248, "grad_norm": 13.20666845510418, "learning_rate": 4.9976211413764164e-05, "loss": 2.7672, "mean_token_accuracy": 0.36551723480224607, "step": 63450 }, { "epoch": 0.06391254994505666, "grad_norm": 10.932915369588255, "learning_rate": 4.997619418431746e-05, "loss": 2.6027, "mean_token_accuracy": 0.3793103456497192, "step": 63455 }, { "epoch": 0.06391758599816083, "grad_norm": 15.673128488812408, "learning_rate": 4.99761769486369e-05, "loss": 2.8893, "mean_token_accuracy": 0.324137932062149, "step": 63460 }, { "epoch": 0.063922622051265, "grad_norm": 12.211686268731789, "learning_rate": 4.997615970672249e-05, "loss": 2.4525, "mean_token_accuracy": 0.3827586233615875, "step": 63465 }, { "epoch": 0.06392765810436918, "grad_norm": 8.739707962963923, "learning_rate": 4.9976142458574224e-05, "loss": 3.1536, "mean_token_accuracy": 0.3655172407627106, "step": 63470 }, { "epoch": 0.06393269415747335, "grad_norm": 11.208363184433805, "learning_rate": 4.9976125204192124e-05, "loss": 2.5098, "mean_token_accuracy": 0.43103448748588563, "step": 63475 }, { "epoch": 0.06393773021057753, "grad_norm": 9.79950717846871, "learning_rate": 4.997610794357618e-05, "loss": 2.4275, "mean_token_accuracy": 0.41034482717514037, "step": 63480 }, { "epoch": 0.0639427662636817, "grad_norm": 15.209781113691305, "learning_rate": 4.997609067672641e-05, "loss": 2.5033, "mean_token_accuracy": 0.4310344815254211, "step": 63485 }, { "epoch": 0.06394780231678587, "grad_norm": 14.167752256467516, "learning_rate": 4.997607340364282e-05, "loss": 2.6225, "mean_token_accuracy": 0.41712037920951844, "step": 63490 }, { "epoch": 0.06395283836989005, "grad_norm": 11.100999586764857, "learning_rate": 4.997605612432539e-05, "loss": 2.2545, "mean_token_accuracy": 0.4448275864124298, "step": 63495 }, { "epoch": 0.06395787442299422, "grad_norm": 10.966504768297337, "learning_rate": 4.9976038838774154e-05, "loss": 2.2442, "mean_token_accuracy": 0.4620689630508423, "step": 63500 }, { "epoch": 0.0639629104760984, "grad_norm": 14.388775829550012, "learning_rate": 4.9976021546989105e-05, "loss": 2.8611, "mean_token_accuracy": 0.3999999940395355, "step": 63505 }, { "epoch": 0.06396794652920255, "grad_norm": 12.387886631425452, "learning_rate": 4.9976004248970246e-05, "loss": 3.1523, "mean_token_accuracy": 0.32068965435028074, "step": 63510 }, { "epoch": 0.06397298258230673, "grad_norm": 11.362997099574981, "learning_rate": 4.9975986944717576e-05, "loss": 2.5575, "mean_token_accuracy": 0.36896551251411436, "step": 63515 }, { "epoch": 0.0639780186354109, "grad_norm": 16.192976984623527, "learning_rate": 4.997596963423112e-05, "loss": 2.6553, "mean_token_accuracy": 0.3999999940395355, "step": 63520 }, { "epoch": 0.06398305468851508, "grad_norm": 9.875343457876722, "learning_rate": 4.997595231751086e-05, "loss": 2.3841, "mean_token_accuracy": 0.4448275864124298, "step": 63525 }, { "epoch": 0.06398809074161925, "grad_norm": 13.665571163569808, "learning_rate": 4.99759349945568e-05, "loss": 2.4868, "mean_token_accuracy": 0.42758620381355283, "step": 63530 }, { "epoch": 0.06399312679472342, "grad_norm": 11.1912015876693, "learning_rate": 4.9975917665368975e-05, "loss": 2.5395, "mean_token_accuracy": 0.39310344457626345, "step": 63535 }, { "epoch": 0.0639981628478276, "grad_norm": 9.120064874876928, "learning_rate": 4.997590032994736e-05, "loss": 2.3697, "mean_token_accuracy": 0.42758620977401735, "step": 63540 }, { "epoch": 0.06400319890093177, "grad_norm": 9.272633248285443, "learning_rate": 4.997588298829198e-05, "loss": 2.0672, "mean_token_accuracy": 0.5103448271751404, "step": 63545 }, { "epoch": 0.06400823495403594, "grad_norm": 12.948368051972265, "learning_rate": 4.9975865640402825e-05, "loss": 2.6744, "mean_token_accuracy": 0.39310344457626345, "step": 63550 }, { "epoch": 0.06401327100714012, "grad_norm": 12.423413399008998, "learning_rate": 4.9975848286279906e-05, "loss": 2.4794, "mean_token_accuracy": 0.43103448748588563, "step": 63555 }, { "epoch": 0.06401830706024429, "grad_norm": 11.395164187350844, "learning_rate": 4.997583092592322e-05, "loss": 2.4467, "mean_token_accuracy": 0.42758620381355283, "step": 63560 }, { "epoch": 0.06402334311334847, "grad_norm": 10.13471431021179, "learning_rate": 4.9975813559332785e-05, "loss": 2.9657, "mean_token_accuracy": 0.37586206793785093, "step": 63565 }, { "epoch": 0.06402837916645264, "grad_norm": 11.549504181070393, "learning_rate": 4.99757961865086e-05, "loss": 2.3976, "mean_token_accuracy": 0.44618227481842043, "step": 63570 }, { "epoch": 0.06403341521955681, "grad_norm": 11.536189460612412, "learning_rate": 4.997577880745066e-05, "loss": 2.6871, "mean_token_accuracy": 0.4206896543502808, "step": 63575 }, { "epoch": 0.06403845127266097, "grad_norm": 10.38922256890761, "learning_rate": 4.997576142215899e-05, "loss": 2.8363, "mean_token_accuracy": 0.35862069129943847, "step": 63580 }, { "epoch": 0.06404348732576515, "grad_norm": 10.317430042015749, "learning_rate": 4.997574403063358e-05, "loss": 2.4315, "mean_token_accuracy": 0.42413792610168455, "step": 63585 }, { "epoch": 0.06404852337886932, "grad_norm": 10.544819944891124, "learning_rate": 4.997572663287444e-05, "loss": 2.5318, "mean_token_accuracy": 0.37931033670902253, "step": 63590 }, { "epoch": 0.0640535594319735, "grad_norm": 14.657058336835519, "learning_rate": 4.9975709228881565e-05, "loss": 2.4867, "mean_token_accuracy": 0.39310344457626345, "step": 63595 }, { "epoch": 0.06405859548507767, "grad_norm": 11.280024663806302, "learning_rate": 4.997569181865498e-05, "loss": 2.571, "mean_token_accuracy": 0.4310344815254211, "step": 63600 }, { "epoch": 0.06406363153818184, "grad_norm": 12.156751533400804, "learning_rate": 4.997567440219467e-05, "loss": 2.6509, "mean_token_accuracy": 0.3689655244350433, "step": 63605 }, { "epoch": 0.06406866759128602, "grad_norm": 12.979973551279228, "learning_rate": 4.997565697950065e-05, "loss": 2.2349, "mean_token_accuracy": 0.44664678573608396, "step": 63610 }, { "epoch": 0.06407370364439019, "grad_norm": 14.28149695784813, "learning_rate": 4.9975639550572916e-05, "loss": 2.4403, "mean_token_accuracy": 0.42068964838981626, "step": 63615 }, { "epoch": 0.06407873969749436, "grad_norm": 10.78705531233412, "learning_rate": 4.9975622115411495e-05, "loss": 2.3318, "mean_token_accuracy": 0.42413793206214906, "step": 63620 }, { "epoch": 0.06408377575059854, "grad_norm": 12.448957464403424, "learning_rate": 4.997560467401637e-05, "loss": 1.9822, "mean_token_accuracy": 0.5277072012424469, "step": 63625 }, { "epoch": 0.06408881180370271, "grad_norm": 13.446651787861287, "learning_rate": 4.997558722638754e-05, "loss": 2.6381, "mean_token_accuracy": 0.4226255267858505, "step": 63630 }, { "epoch": 0.06409384785680688, "grad_norm": 11.679342005142507, "learning_rate": 4.997556977252504e-05, "loss": 2.5566, "mean_token_accuracy": 0.4310344815254211, "step": 63635 }, { "epoch": 0.06409888390991106, "grad_norm": 13.121698477220587, "learning_rate": 4.9975552312428846e-05, "loss": 2.5438, "mean_token_accuracy": 0.42928009629249575, "step": 63640 }, { "epoch": 0.06410391996301523, "grad_norm": 12.01048748425883, "learning_rate": 4.997553484609898e-05, "loss": 2.5324, "mean_token_accuracy": 0.4034482717514038, "step": 63645 }, { "epoch": 0.06410895601611939, "grad_norm": 13.442931495953765, "learning_rate": 4.997551737353543e-05, "loss": 2.7197, "mean_token_accuracy": 0.35862069129943847, "step": 63650 }, { "epoch": 0.06411399206922357, "grad_norm": 10.99846432018099, "learning_rate": 4.997549989473822e-05, "loss": 2.3362, "mean_token_accuracy": 0.4068965554237366, "step": 63655 }, { "epoch": 0.06411902812232774, "grad_norm": 11.674528634927146, "learning_rate": 4.997548240970735e-05, "loss": 2.5813, "mean_token_accuracy": 0.3896551728248596, "step": 63660 }, { "epoch": 0.06412406417543191, "grad_norm": 10.737567429673202, "learning_rate": 4.997546491844281e-05, "loss": 2.4706, "mean_token_accuracy": 0.4034482717514038, "step": 63665 }, { "epoch": 0.06412910022853609, "grad_norm": 11.479922823042527, "learning_rate": 4.997544742094462e-05, "loss": 2.7219, "mean_token_accuracy": 0.3965517282485962, "step": 63670 }, { "epoch": 0.06413413628164026, "grad_norm": 13.827038573206677, "learning_rate": 4.9975429917212793e-05, "loss": 2.5697, "mean_token_accuracy": 0.38620689511299133, "step": 63675 }, { "epoch": 0.06413917233474443, "grad_norm": 13.153667121994358, "learning_rate": 4.997541240724731e-05, "loss": 2.5, "mean_token_accuracy": 0.4448275864124298, "step": 63680 }, { "epoch": 0.06414420838784861, "grad_norm": 11.549180428583616, "learning_rate": 4.997539489104818e-05, "loss": 2.3835, "mean_token_accuracy": 0.4310344815254211, "step": 63685 }, { "epoch": 0.06414924444095278, "grad_norm": 10.731964296006106, "learning_rate": 4.997537736861543e-05, "loss": 2.568, "mean_token_accuracy": 0.41724138259887694, "step": 63690 }, { "epoch": 0.06415428049405696, "grad_norm": 15.682129388935925, "learning_rate": 4.997535983994905e-05, "loss": 2.6059, "mean_token_accuracy": 0.43448275327682495, "step": 63695 }, { "epoch": 0.06415931654716113, "grad_norm": 10.530591610677469, "learning_rate": 4.9975342305049033e-05, "loss": 2.7889, "mean_token_accuracy": 0.3620689630508423, "step": 63700 }, { "epoch": 0.0641643526002653, "grad_norm": 9.193220562309554, "learning_rate": 4.99753247639154e-05, "loss": 2.1755, "mean_token_accuracy": 0.4689655125141144, "step": 63705 }, { "epoch": 0.06416938865336948, "grad_norm": 11.853488898559288, "learning_rate": 4.997530721654816e-05, "loss": 2.7762, "mean_token_accuracy": 0.37586206793785093, "step": 63710 }, { "epoch": 0.06417442470647365, "grad_norm": 13.090050233671361, "learning_rate": 4.99752896629473e-05, "loss": 2.8838, "mean_token_accuracy": 0.40689654648303986, "step": 63715 }, { "epoch": 0.06417946075957781, "grad_norm": 13.012997227267867, "learning_rate": 4.9975272103112843e-05, "loss": 3.1407, "mean_token_accuracy": 0.34137930274009703, "step": 63720 }, { "epoch": 0.06418449681268198, "grad_norm": 11.670499843261506, "learning_rate": 4.997525453704478e-05, "loss": 2.7562, "mean_token_accuracy": 0.3551724135875702, "step": 63725 }, { "epoch": 0.06418953286578616, "grad_norm": 10.741735803707135, "learning_rate": 4.997523696474312e-05, "loss": 2.5863, "mean_token_accuracy": 0.4517241299152374, "step": 63730 }, { "epoch": 0.06419456891889033, "grad_norm": 12.298533556515801, "learning_rate": 4.9975219386207865e-05, "loss": 2.8177, "mean_token_accuracy": 0.36896551847457887, "step": 63735 }, { "epoch": 0.0641996049719945, "grad_norm": 10.04131896935388, "learning_rate": 4.997520180143904e-05, "loss": 2.1383, "mean_token_accuracy": 0.4896551609039307, "step": 63740 }, { "epoch": 0.06420464102509868, "grad_norm": 10.379943756188952, "learning_rate": 4.997518421043662e-05, "loss": 2.7052, "mean_token_accuracy": 0.38275861740112305, "step": 63745 }, { "epoch": 0.06420967707820285, "grad_norm": 11.9223669739286, "learning_rate": 4.997516661320063e-05, "loss": 2.5935, "mean_token_accuracy": 0.38620689511299133, "step": 63750 }, { "epoch": 0.06421471313130703, "grad_norm": 9.966227697130536, "learning_rate": 4.997514900973106e-05, "loss": 2.6824, "mean_token_accuracy": 0.37241379618644715, "step": 63755 }, { "epoch": 0.0642197491844112, "grad_norm": 13.030955730356617, "learning_rate": 4.997513140002794e-05, "loss": 2.4712, "mean_token_accuracy": 0.47586206197738645, "step": 63760 }, { "epoch": 0.06422478523751537, "grad_norm": 11.495548804624207, "learning_rate": 4.997511378409125e-05, "loss": 2.1722, "mean_token_accuracy": 0.42758620381355283, "step": 63765 }, { "epoch": 0.06422982129061955, "grad_norm": 15.074710887003405, "learning_rate": 4.9975096161921e-05, "loss": 2.2432, "mean_token_accuracy": 0.4, "step": 63770 }, { "epoch": 0.06423485734372372, "grad_norm": 11.335977363717813, "learning_rate": 4.99750785335172e-05, "loss": 2.5278, "mean_token_accuracy": 0.37931033968925476, "step": 63775 }, { "epoch": 0.0642398933968279, "grad_norm": 14.239503286117822, "learning_rate": 4.9975060898879853e-05, "loss": 2.9525, "mean_token_accuracy": 0.38620689511299133, "step": 63780 }, { "epoch": 0.06424492944993207, "grad_norm": 11.719927453144036, "learning_rate": 4.9975043258008966e-05, "loss": 2.6005, "mean_token_accuracy": 0.4034482717514038, "step": 63785 }, { "epoch": 0.06424996550303623, "grad_norm": 12.309092734026658, "learning_rate": 4.997502561090454e-05, "loss": 2.7066, "mean_token_accuracy": 0.37392619252204895, "step": 63790 }, { "epoch": 0.0642550015561404, "grad_norm": 13.876190564420307, "learning_rate": 4.997500795756658e-05, "loss": 2.7105, "mean_token_accuracy": 0.3827586203813553, "step": 63795 }, { "epoch": 0.06426003760924458, "grad_norm": 12.36681760661571, "learning_rate": 4.99749902979951e-05, "loss": 2.8244, "mean_token_accuracy": 0.3896551787853241, "step": 63800 }, { "epoch": 0.06426507366234875, "grad_norm": 12.480970715421119, "learning_rate": 4.997497263219009e-05, "loss": 2.7236, "mean_token_accuracy": 0.37586206793785093, "step": 63805 }, { "epoch": 0.06427010971545292, "grad_norm": 13.317197676434864, "learning_rate": 4.9974954960151564e-05, "loss": 2.753, "mean_token_accuracy": 0.358620685338974, "step": 63810 }, { "epoch": 0.0642751457685571, "grad_norm": 12.630853561344956, "learning_rate": 4.997493728187953e-05, "loss": 2.3891, "mean_token_accuracy": 0.4103448212146759, "step": 63815 }, { "epoch": 0.06428018182166127, "grad_norm": 12.664991632455454, "learning_rate": 4.9974919597373984e-05, "loss": 2.4977, "mean_token_accuracy": 0.37586206793785093, "step": 63820 }, { "epoch": 0.06428521787476545, "grad_norm": 12.81571130296138, "learning_rate": 4.997490190663493e-05, "loss": 2.7666, "mean_token_accuracy": 0.34482758641242983, "step": 63825 }, { "epoch": 0.06429025392786962, "grad_norm": 14.564564405837809, "learning_rate": 4.997488420966239e-05, "loss": 2.8642, "mean_token_accuracy": 0.3379310339689255, "step": 63830 }, { "epoch": 0.0642952899809738, "grad_norm": 11.95610424880165, "learning_rate": 4.997486650645635e-05, "loss": 2.401, "mean_token_accuracy": 0.43448275327682495, "step": 63835 }, { "epoch": 0.06430032603407797, "grad_norm": 11.263058632023299, "learning_rate": 4.997484879701682e-05, "loss": 2.3107, "mean_token_accuracy": 0.4467029690742493, "step": 63840 }, { "epoch": 0.06430536208718214, "grad_norm": 11.773886463553232, "learning_rate": 4.997483108134381e-05, "loss": 2.5012, "mean_token_accuracy": 0.43103448748588563, "step": 63845 }, { "epoch": 0.06431039814028632, "grad_norm": 14.744681009582063, "learning_rate": 4.997481335943733e-05, "loss": 2.6286, "mean_token_accuracy": 0.38027827739715575, "step": 63850 }, { "epoch": 0.06431543419339049, "grad_norm": 10.496723461057591, "learning_rate": 4.997479563129736e-05, "loss": 2.2493, "mean_token_accuracy": 0.5009852170944213, "step": 63855 }, { "epoch": 0.06432047024649465, "grad_norm": 11.804996611979675, "learning_rate": 4.997477789692394e-05, "loss": 2.6289, "mean_token_accuracy": 0.36551723480224607, "step": 63860 }, { "epoch": 0.06432550629959882, "grad_norm": 14.075878930327566, "learning_rate": 4.997476015631705e-05, "loss": 2.4714, "mean_token_accuracy": 0.4381773293018341, "step": 63865 }, { "epoch": 0.064330542352703, "grad_norm": 10.657117037528382, "learning_rate": 4.9974742409476695e-05, "loss": 2.5214, "mean_token_accuracy": 0.37586206793785093, "step": 63870 }, { "epoch": 0.06433557840580717, "grad_norm": 22.491929351378598, "learning_rate": 4.997472465640289e-05, "loss": 2.9506, "mean_token_accuracy": 0.37241379022598264, "step": 63875 }, { "epoch": 0.06434061445891134, "grad_norm": 17.16744040100882, "learning_rate": 4.997470689709563e-05, "loss": 3.0854, "mean_token_accuracy": 0.33103448152542114, "step": 63880 }, { "epoch": 0.06434565051201552, "grad_norm": 12.262910492123531, "learning_rate": 4.997468913155493e-05, "loss": 2.6062, "mean_token_accuracy": 0.4344827592372894, "step": 63885 }, { "epoch": 0.06435068656511969, "grad_norm": 10.098699416163296, "learning_rate": 4.99746713597808e-05, "loss": 2.3469, "mean_token_accuracy": 0.4137930989265442, "step": 63890 }, { "epoch": 0.06435572261822387, "grad_norm": 13.73743601938842, "learning_rate": 4.997465358177322e-05, "loss": 2.2336, "mean_token_accuracy": 0.4034482777118683, "step": 63895 }, { "epoch": 0.06436075867132804, "grad_norm": 11.466730222345506, "learning_rate": 4.997463579753222e-05, "loss": 2.5123, "mean_token_accuracy": 0.3862069010734558, "step": 63900 }, { "epoch": 0.06436579472443221, "grad_norm": 11.486469043796623, "learning_rate": 4.99746180070578e-05, "loss": 2.5861, "mean_token_accuracy": 0.42068964838981626, "step": 63905 }, { "epoch": 0.06437083077753639, "grad_norm": 13.782417808476762, "learning_rate": 4.997460021034996e-05, "loss": 3.013, "mean_token_accuracy": 0.358620685338974, "step": 63910 }, { "epoch": 0.06437586683064056, "grad_norm": 11.167547901522223, "learning_rate": 4.9974582407408704e-05, "loss": 2.6748, "mean_token_accuracy": 0.42894089221954346, "step": 63915 }, { "epoch": 0.06438090288374473, "grad_norm": 10.26441809185597, "learning_rate": 4.997456459823404e-05, "loss": 2.9932, "mean_token_accuracy": 0.35517241060733795, "step": 63920 }, { "epoch": 0.06438593893684891, "grad_norm": 11.051865409669025, "learning_rate": 4.997454678282597e-05, "loss": 2.4311, "mean_token_accuracy": 0.3965517282485962, "step": 63925 }, { "epoch": 0.06439097498995307, "grad_norm": 10.704890159256053, "learning_rate": 4.99745289611845e-05, "loss": 2.6075, "mean_token_accuracy": 0.41034482717514037, "step": 63930 }, { "epoch": 0.06439601104305724, "grad_norm": 13.649647681719939, "learning_rate": 4.9974511133309634e-05, "loss": 2.4808, "mean_token_accuracy": 0.4137930989265442, "step": 63935 }, { "epoch": 0.06440104709616142, "grad_norm": 11.241760095239865, "learning_rate": 4.997449329920137e-05, "loss": 2.6612, "mean_token_accuracy": 0.38130671381950376, "step": 63940 }, { "epoch": 0.06440608314926559, "grad_norm": 22.272813939689495, "learning_rate": 4.9974475458859735e-05, "loss": 2.8221, "mean_token_accuracy": 0.34482758641242983, "step": 63945 }, { "epoch": 0.06441111920236976, "grad_norm": 9.712746697748715, "learning_rate": 4.997445761228472e-05, "loss": 2.57, "mean_token_accuracy": 0.4034482717514038, "step": 63950 }, { "epoch": 0.06441615525547394, "grad_norm": 11.658141622732634, "learning_rate": 4.9974439759476324e-05, "loss": 2.9764, "mean_token_accuracy": 0.36896551847457887, "step": 63955 }, { "epoch": 0.06442119130857811, "grad_norm": 10.526798421273325, "learning_rate": 4.997442190043456e-05, "loss": 2.0067, "mean_token_accuracy": 0.4310344815254211, "step": 63960 }, { "epoch": 0.06442622736168228, "grad_norm": 12.159988197425585, "learning_rate": 4.9974404035159434e-05, "loss": 2.4717, "mean_token_accuracy": 0.37931033968925476, "step": 63965 }, { "epoch": 0.06443126341478646, "grad_norm": 12.69663521520186, "learning_rate": 4.997438616365095e-05, "loss": 3.2534, "mean_token_accuracy": 0.3805202662944794, "step": 63970 }, { "epoch": 0.06443629946789063, "grad_norm": 9.810880034009662, "learning_rate": 4.9974368285909107e-05, "loss": 2.5654, "mean_token_accuracy": 0.4034482777118683, "step": 63975 }, { "epoch": 0.0644413355209948, "grad_norm": 10.755751722861335, "learning_rate": 4.997435040193391e-05, "loss": 2.3036, "mean_token_accuracy": 0.4103448212146759, "step": 63980 }, { "epoch": 0.06444637157409898, "grad_norm": 10.757302081398992, "learning_rate": 4.9974332511725375e-05, "loss": 2.2935, "mean_token_accuracy": 0.4103448212146759, "step": 63985 }, { "epoch": 0.06445140762720315, "grad_norm": 13.433487368100982, "learning_rate": 4.99743146152835e-05, "loss": 2.5781, "mean_token_accuracy": 0.39310344457626345, "step": 63990 }, { "epoch": 0.06445644368030733, "grad_norm": 14.246225424916659, "learning_rate": 4.9974296712608286e-05, "loss": 2.8529, "mean_token_accuracy": 0.36551723480224607, "step": 63995 }, { "epoch": 0.06446147973341149, "grad_norm": 14.434841834711976, "learning_rate": 4.9974278803699745e-05, "loss": 2.7772, "mean_token_accuracy": 0.39310344457626345, "step": 64000 }, { "epoch": 0.06446651578651566, "grad_norm": 13.191140429067788, "learning_rate": 4.997426088855788e-05, "loss": 2.4506, "mean_token_accuracy": 0.4103448212146759, "step": 64005 }, { "epoch": 0.06447155183961983, "grad_norm": 12.195452131685462, "learning_rate": 4.997424296718269e-05, "loss": 2.5895, "mean_token_accuracy": 0.40344826579093934, "step": 64010 }, { "epoch": 0.06447658789272401, "grad_norm": 11.848964311693365, "learning_rate": 4.997422503957419e-05, "loss": 2.6724, "mean_token_accuracy": 0.38275861740112305, "step": 64015 }, { "epoch": 0.06448162394582818, "grad_norm": 8.019860897280749, "learning_rate": 4.997420710573237e-05, "loss": 2.126, "mean_token_accuracy": 0.5246478259563446, "step": 64020 }, { "epoch": 0.06448665999893236, "grad_norm": 11.007923007798906, "learning_rate": 4.997418916565726e-05, "loss": 2.664, "mean_token_accuracy": 0.39655172228813174, "step": 64025 }, { "epoch": 0.06449169605203653, "grad_norm": 11.074576514163029, "learning_rate": 4.9974171219348845e-05, "loss": 2.5075, "mean_token_accuracy": 0.4344827592372894, "step": 64030 }, { "epoch": 0.0644967321051407, "grad_norm": 10.096199345785292, "learning_rate": 4.997415326680713e-05, "loss": 2.2276, "mean_token_accuracy": 0.43448275327682495, "step": 64035 }, { "epoch": 0.06450176815824488, "grad_norm": 26.574250836800967, "learning_rate": 4.9974135308032126e-05, "loss": 2.5144, "mean_token_accuracy": 0.4689655125141144, "step": 64040 }, { "epoch": 0.06450680421134905, "grad_norm": 10.624223084262825, "learning_rate": 4.997411734302384e-05, "loss": 2.4923, "mean_token_accuracy": 0.41379310488700866, "step": 64045 }, { "epoch": 0.06451184026445322, "grad_norm": 12.178128824958737, "learning_rate": 4.997409937178226e-05, "loss": 2.3801, "mean_token_accuracy": 0.4275861978530884, "step": 64050 }, { "epoch": 0.0645168763175574, "grad_norm": 13.353332345662182, "learning_rate": 4.9974081394307425e-05, "loss": 3.0235, "mean_token_accuracy": 0.3689655244350433, "step": 64055 }, { "epoch": 0.06452191237066157, "grad_norm": 14.606862493042199, "learning_rate": 4.997406341059931e-05, "loss": 2.4179, "mean_token_accuracy": 0.4482758641242981, "step": 64060 }, { "epoch": 0.06452694842376575, "grad_norm": 12.499269107363759, "learning_rate": 4.997404542065793e-05, "loss": 2.6466, "mean_token_accuracy": 0.4103448331356049, "step": 64065 }, { "epoch": 0.0645319844768699, "grad_norm": 13.151894696237415, "learning_rate": 4.997402742448329e-05, "loss": 3.0328, "mean_token_accuracy": 0.3517241358757019, "step": 64070 }, { "epoch": 0.06453702052997408, "grad_norm": 11.175666403720752, "learning_rate": 4.997400942207539e-05, "loss": 2.5056, "mean_token_accuracy": 0.36896551251411436, "step": 64075 }, { "epoch": 0.06454205658307825, "grad_norm": 11.921136064374776, "learning_rate": 4.997399141343425e-05, "loss": 2.5301, "mean_token_accuracy": 0.4034482777118683, "step": 64080 }, { "epoch": 0.06454709263618243, "grad_norm": 12.235957248222135, "learning_rate": 4.997397339855985e-05, "loss": 2.4466, "mean_token_accuracy": 0.4517241299152374, "step": 64085 }, { "epoch": 0.0645521286892866, "grad_norm": 10.848690601300751, "learning_rate": 4.997395537745222e-05, "loss": 2.8101, "mean_token_accuracy": 0.3793103456497192, "step": 64090 }, { "epoch": 0.06455716474239077, "grad_norm": 12.377799695251019, "learning_rate": 4.997393735011135e-05, "loss": 2.439, "mean_token_accuracy": 0.441379314661026, "step": 64095 }, { "epoch": 0.06456220079549495, "grad_norm": 10.836704037683083, "learning_rate": 4.997391931653726e-05, "loss": 2.7065, "mean_token_accuracy": 0.3620689630508423, "step": 64100 }, { "epoch": 0.06456723684859912, "grad_norm": 10.302030178970655, "learning_rate": 4.997390127672994e-05, "loss": 2.5574, "mean_token_accuracy": 0.39655172228813174, "step": 64105 }, { "epoch": 0.0645722729017033, "grad_norm": 15.68610649148647, "learning_rate": 4.9973883230689396e-05, "loss": 2.4479, "mean_token_accuracy": 0.39310344457626345, "step": 64110 }, { "epoch": 0.06457730895480747, "grad_norm": 11.027718873307434, "learning_rate": 4.9973865178415636e-05, "loss": 2.4574, "mean_token_accuracy": 0.4137930989265442, "step": 64115 }, { "epoch": 0.06458234500791164, "grad_norm": 13.810414138721175, "learning_rate": 4.997384711990867e-05, "loss": 2.2749, "mean_token_accuracy": 0.4493842363357544, "step": 64120 }, { "epoch": 0.06458738106101582, "grad_norm": 11.030816720393899, "learning_rate": 4.9973829055168494e-05, "loss": 2.4186, "mean_token_accuracy": 0.44646098017692565, "step": 64125 }, { "epoch": 0.06459241711411999, "grad_norm": 13.172110285785811, "learning_rate": 4.997381098419512e-05, "loss": 2.3714, "mean_token_accuracy": 0.4172413766384125, "step": 64130 }, { "epoch": 0.06459745316722416, "grad_norm": 15.170130103537646, "learning_rate": 4.997379290698855e-05, "loss": 2.7812, "mean_token_accuracy": 0.3999999940395355, "step": 64135 }, { "epoch": 0.06460248922032832, "grad_norm": 11.776145195231349, "learning_rate": 4.9973774823548784e-05, "loss": 2.3001, "mean_token_accuracy": 0.4448275864124298, "step": 64140 }, { "epoch": 0.0646075252734325, "grad_norm": 11.464090806591756, "learning_rate": 4.997375673387585e-05, "loss": 2.2881, "mean_token_accuracy": 0.41034482717514037, "step": 64145 }, { "epoch": 0.06461256132653667, "grad_norm": 13.79262832385389, "learning_rate": 4.997373863796972e-05, "loss": 2.6135, "mean_token_accuracy": 0.4379310369491577, "step": 64150 }, { "epoch": 0.06461759737964085, "grad_norm": 10.412831287317363, "learning_rate": 4.9973720535830414e-05, "loss": 2.3245, "mean_token_accuracy": 0.41222020983695984, "step": 64155 }, { "epoch": 0.06462263343274502, "grad_norm": 11.201051804504871, "learning_rate": 4.9973702427457946e-05, "loss": 2.6355, "mean_token_accuracy": 0.4344827473163605, "step": 64160 }, { "epoch": 0.06462766948584919, "grad_norm": 10.752423248207196, "learning_rate": 4.997368431285231e-05, "loss": 2.4431, "mean_token_accuracy": 0.3655172407627106, "step": 64165 }, { "epoch": 0.06463270553895337, "grad_norm": 13.283866287842937, "learning_rate": 4.997366619201351e-05, "loss": 3.2648, "mean_token_accuracy": 0.3103448271751404, "step": 64170 }, { "epoch": 0.06463774159205754, "grad_norm": 16.365355752037107, "learning_rate": 4.9973648064941565e-05, "loss": 3.0095, "mean_token_accuracy": 0.39310343861579894, "step": 64175 }, { "epoch": 0.06464277764516171, "grad_norm": 14.612515784138765, "learning_rate": 4.997362993163646e-05, "loss": 3.2296, "mean_token_accuracy": 0.3379310369491577, "step": 64180 }, { "epoch": 0.06464781369826589, "grad_norm": 15.517731967393404, "learning_rate": 4.997361179209822e-05, "loss": 2.2093, "mean_token_accuracy": 0.43793103098869324, "step": 64185 }, { "epoch": 0.06465284975137006, "grad_norm": 17.446604047988114, "learning_rate": 4.997359364632683e-05, "loss": 2.3404, "mean_token_accuracy": 0.43103448748588563, "step": 64190 }, { "epoch": 0.06465788580447424, "grad_norm": 13.684150084020356, "learning_rate": 4.9973575494322306e-05, "loss": 2.9885, "mean_token_accuracy": 0.3793103456497192, "step": 64195 }, { "epoch": 0.06466292185757841, "grad_norm": 11.57862017360859, "learning_rate": 4.9973557336084657e-05, "loss": 2.2945, "mean_token_accuracy": 0.4379310429096222, "step": 64200 }, { "epoch": 0.06466795791068258, "grad_norm": 14.760826999245856, "learning_rate": 4.9973539171613875e-05, "loss": 2.7184, "mean_token_accuracy": 0.4034482717514038, "step": 64205 }, { "epoch": 0.06467299396378674, "grad_norm": 11.298026787926146, "learning_rate": 4.997352100090998e-05, "loss": 2.3735, "mean_token_accuracy": 0.4482758641242981, "step": 64210 }, { "epoch": 0.06467803001689092, "grad_norm": 10.68761647316672, "learning_rate": 4.997350282397297e-05, "loss": 2.2346, "mean_token_accuracy": 0.4724137902259827, "step": 64215 }, { "epoch": 0.06468306606999509, "grad_norm": 16.821699326825687, "learning_rate": 4.997348464080284e-05, "loss": 2.8541, "mean_token_accuracy": 0.4018148809671402, "step": 64220 }, { "epoch": 0.06468810212309926, "grad_norm": 13.340399189198891, "learning_rate": 4.997346645139962e-05, "loss": 2.593, "mean_token_accuracy": 0.334482753276825, "step": 64225 }, { "epoch": 0.06469313817620344, "grad_norm": 13.54095100715977, "learning_rate": 4.9973448255763295e-05, "loss": 2.4762, "mean_token_accuracy": 0.358620685338974, "step": 64230 }, { "epoch": 0.06469817422930761, "grad_norm": 9.772481957326963, "learning_rate": 4.9973430053893874e-05, "loss": 2.6568, "mean_token_accuracy": 0.39655172228813174, "step": 64235 }, { "epoch": 0.06470321028241179, "grad_norm": 12.339049224399552, "learning_rate": 4.997341184579137e-05, "loss": 2.9468, "mean_token_accuracy": 0.3310344874858856, "step": 64240 }, { "epoch": 0.06470824633551596, "grad_norm": 11.451027552109528, "learning_rate": 4.997339363145577e-05, "loss": 2.4428, "mean_token_accuracy": 0.42068964838981626, "step": 64245 }, { "epoch": 0.06471328238862013, "grad_norm": 11.974183421025108, "learning_rate": 4.9973375410887094e-05, "loss": 2.4077, "mean_token_accuracy": 0.4034482777118683, "step": 64250 }, { "epoch": 0.0647183184417243, "grad_norm": 11.316537646700016, "learning_rate": 4.9973357184085346e-05, "loss": 2.907, "mean_token_accuracy": 0.3620689570903778, "step": 64255 }, { "epoch": 0.06472335449482848, "grad_norm": 13.282958298747062, "learning_rate": 4.9973338951050526e-05, "loss": 2.4081, "mean_token_accuracy": 0.4344827592372894, "step": 64260 }, { "epoch": 0.06472839054793265, "grad_norm": 9.693364631375667, "learning_rate": 4.997332071178264e-05, "loss": 2.5132, "mean_token_accuracy": 0.44827585816383364, "step": 64265 }, { "epoch": 0.06473342660103683, "grad_norm": 9.083962245428125, "learning_rate": 4.99733024662817e-05, "loss": 2.4691, "mean_token_accuracy": 0.39655172228813174, "step": 64270 }, { "epoch": 0.064738462654141, "grad_norm": 10.869289258292717, "learning_rate": 4.99732842145477e-05, "loss": 2.2227, "mean_token_accuracy": 0.41379311084747317, "step": 64275 }, { "epoch": 0.06474349870724516, "grad_norm": 11.512431001819683, "learning_rate": 4.9973265956580656e-05, "loss": 2.1794, "mean_token_accuracy": 0.46551724076271056, "step": 64280 }, { "epoch": 0.06474853476034934, "grad_norm": 12.428739851892022, "learning_rate": 4.997324769238057e-05, "loss": 2.15, "mean_token_accuracy": 0.4379310369491577, "step": 64285 }, { "epoch": 0.06475357081345351, "grad_norm": 11.465309843589882, "learning_rate": 4.997322942194744e-05, "loss": 2.3877, "mean_token_accuracy": 0.4655172348022461, "step": 64290 }, { "epoch": 0.06475860686655768, "grad_norm": 12.587870477992764, "learning_rate": 4.9973211145281276e-05, "loss": 2.9417, "mean_token_accuracy": 0.358620685338974, "step": 64295 }, { "epoch": 0.06476364291966186, "grad_norm": 13.525054158896475, "learning_rate": 4.997319286238208e-05, "loss": 2.9354, "mean_token_accuracy": 0.37241379022598264, "step": 64300 }, { "epoch": 0.06476867897276603, "grad_norm": 10.729778690187633, "learning_rate": 4.997317457324987e-05, "loss": 2.6319, "mean_token_accuracy": 0.4194192409515381, "step": 64305 }, { "epoch": 0.0647737150258702, "grad_norm": 10.235201793442, "learning_rate": 4.997315627788463e-05, "loss": 2.8422, "mean_token_accuracy": 0.37241379618644715, "step": 64310 }, { "epoch": 0.06477875107897438, "grad_norm": 16.11368499215886, "learning_rate": 4.9973137976286386e-05, "loss": 2.2916, "mean_token_accuracy": 0.4257713258266449, "step": 64315 }, { "epoch": 0.06478378713207855, "grad_norm": 10.050932784300809, "learning_rate": 4.997311966845513e-05, "loss": 2.4409, "mean_token_accuracy": 0.379310342669487, "step": 64320 }, { "epoch": 0.06478882318518273, "grad_norm": 12.382238606793473, "learning_rate": 4.997310135439087e-05, "loss": 2.8203, "mean_token_accuracy": 0.34827586114406583, "step": 64325 }, { "epoch": 0.0647938592382869, "grad_norm": 10.46937191893655, "learning_rate": 4.99730830340936e-05, "loss": 2.3171, "mean_token_accuracy": 0.42413792610168455, "step": 64330 }, { "epoch": 0.06479889529139107, "grad_norm": 10.954808292286012, "learning_rate": 4.9973064707563354e-05, "loss": 2.3764, "mean_token_accuracy": 0.42068966031074523, "step": 64335 }, { "epoch": 0.06480393134449525, "grad_norm": 11.477447268386127, "learning_rate": 4.9973046374800116e-05, "loss": 2.8465, "mean_token_accuracy": 0.36896551251411436, "step": 64340 }, { "epoch": 0.06480896739759942, "grad_norm": 12.046964173549153, "learning_rate": 4.9973028035803894e-05, "loss": 2.749, "mean_token_accuracy": 0.39310344457626345, "step": 64345 }, { "epoch": 0.06481400345070358, "grad_norm": 14.043353924031237, "learning_rate": 4.997300969057469e-05, "loss": 2.675, "mean_token_accuracy": 0.37241379022598264, "step": 64350 }, { "epoch": 0.06481903950380775, "grad_norm": 13.49394776275331, "learning_rate": 4.997299133911251e-05, "loss": 2.4688, "mean_token_accuracy": 0.4379310369491577, "step": 64355 }, { "epoch": 0.06482407555691193, "grad_norm": 10.682503590506679, "learning_rate": 4.9972972981417375e-05, "loss": 2.7388, "mean_token_accuracy": 0.3517241358757019, "step": 64360 }, { "epoch": 0.0648291116100161, "grad_norm": 13.65735895072366, "learning_rate": 4.9972954617489267e-05, "loss": 2.9424, "mean_token_accuracy": 0.37586207389831544, "step": 64365 }, { "epoch": 0.06483414766312028, "grad_norm": 18.995142843425135, "learning_rate": 4.997293624732821e-05, "loss": 3.043, "mean_token_accuracy": 0.3896551728248596, "step": 64370 }, { "epoch": 0.06483918371622445, "grad_norm": 12.649926974781994, "learning_rate": 4.997291787093419e-05, "loss": 2.5788, "mean_token_accuracy": 0.3896551728248596, "step": 64375 }, { "epoch": 0.06484421976932862, "grad_norm": 10.552278067844838, "learning_rate": 4.997289948830722e-05, "loss": 2.5708, "mean_token_accuracy": 0.324137932062149, "step": 64380 }, { "epoch": 0.0648492558224328, "grad_norm": 10.303347633902186, "learning_rate": 4.997288109944731e-05, "loss": 2.2614, "mean_token_accuracy": 0.44295220375061034, "step": 64385 }, { "epoch": 0.06485429187553697, "grad_norm": 14.705594623507556, "learning_rate": 4.997286270435447e-05, "loss": 2.4055, "mean_token_accuracy": 0.37586206793785093, "step": 64390 }, { "epoch": 0.06485932792864114, "grad_norm": 11.729385059518776, "learning_rate": 4.997284430302869e-05, "loss": 2.7863, "mean_token_accuracy": 0.4103448212146759, "step": 64395 }, { "epoch": 0.06486436398174532, "grad_norm": 12.803408239644686, "learning_rate": 4.997282589546999e-05, "loss": 2.5585, "mean_token_accuracy": 0.4241379380226135, "step": 64400 }, { "epoch": 0.06486940003484949, "grad_norm": 11.201385453311024, "learning_rate": 4.997280748167836e-05, "loss": 2.42, "mean_token_accuracy": 0.4482758641242981, "step": 64405 }, { "epoch": 0.06487443608795367, "grad_norm": 10.986594973708124, "learning_rate": 4.997278906165382e-05, "loss": 2.8192, "mean_token_accuracy": 0.38275861740112305, "step": 64410 }, { "epoch": 0.06487947214105784, "grad_norm": 18.701416465142216, "learning_rate": 4.9972770635396366e-05, "loss": 2.6144, "mean_token_accuracy": 0.40889292657375337, "step": 64415 }, { "epoch": 0.064884508194162, "grad_norm": 44.61634841175035, "learning_rate": 4.9972752202906e-05, "loss": 2.9275, "mean_token_accuracy": 0.3724137932062149, "step": 64420 }, { "epoch": 0.06488954424726617, "grad_norm": 13.105377296466584, "learning_rate": 4.9972733764182744e-05, "loss": 2.1343, "mean_token_accuracy": 0.4344827651977539, "step": 64425 }, { "epoch": 0.06489458030037035, "grad_norm": 14.182019272883563, "learning_rate": 4.997271531922658e-05, "loss": 2.454, "mean_token_accuracy": 0.3482758581638336, "step": 64430 }, { "epoch": 0.06489961635347452, "grad_norm": 62.06514582154275, "learning_rate": 4.997269686803753e-05, "loss": 3.3458, "mean_token_accuracy": 0.3448275804519653, "step": 64435 }, { "epoch": 0.0649046524065787, "grad_norm": 10.233137165283622, "learning_rate": 4.997267841061559e-05, "loss": 2.5675, "mean_token_accuracy": 0.38275861740112305, "step": 64440 }, { "epoch": 0.06490968845968287, "grad_norm": 11.549209858309151, "learning_rate": 4.997265994696077e-05, "loss": 2.2648, "mean_token_accuracy": 0.44482758045196535, "step": 64445 }, { "epoch": 0.06491472451278704, "grad_norm": 10.074304624642464, "learning_rate": 4.997264147707308e-05, "loss": 2.4121, "mean_token_accuracy": 0.4448275864124298, "step": 64450 }, { "epoch": 0.06491976056589122, "grad_norm": 13.79346876029164, "learning_rate": 4.997262300095252e-05, "loss": 2.3976, "mean_token_accuracy": 0.42068964838981626, "step": 64455 }, { "epoch": 0.06492479661899539, "grad_norm": 9.903401345435258, "learning_rate": 4.9972604518599074e-05, "loss": 2.9169, "mean_token_accuracy": 0.32068965435028074, "step": 64460 }, { "epoch": 0.06492983267209956, "grad_norm": 15.028279366823238, "learning_rate": 4.997258603001279e-05, "loss": 2.2728, "mean_token_accuracy": 0.4379310250282288, "step": 64465 }, { "epoch": 0.06493486872520374, "grad_norm": 13.807539647029355, "learning_rate": 4.997256753519364e-05, "loss": 2.5347, "mean_token_accuracy": 0.43448275327682495, "step": 64470 }, { "epoch": 0.06493990477830791, "grad_norm": 11.056730389574923, "learning_rate": 4.997254903414164e-05, "loss": 2.3145, "mean_token_accuracy": 0.4221415579319, "step": 64475 }, { "epoch": 0.06494494083141208, "grad_norm": 14.311660939163673, "learning_rate": 4.9972530526856796e-05, "loss": 2.7011, "mean_token_accuracy": 0.37350272536277773, "step": 64480 }, { "epoch": 0.06494997688451626, "grad_norm": 12.843706402846506, "learning_rate": 4.997251201333912e-05, "loss": 2.7279, "mean_token_accuracy": 0.3586206793785095, "step": 64485 }, { "epoch": 0.06495501293762042, "grad_norm": 15.875209786489487, "learning_rate": 4.997249349358859e-05, "loss": 2.6704, "mean_token_accuracy": 0.3724137872457504, "step": 64490 }, { "epoch": 0.06496004899072459, "grad_norm": 12.037182070395405, "learning_rate": 4.997247496760524e-05, "loss": 2.6475, "mean_token_accuracy": 0.39999999701976774, "step": 64495 }, { "epoch": 0.06496508504382877, "grad_norm": 11.449280414327877, "learning_rate": 4.997245643538907e-05, "loss": 2.4058, "mean_token_accuracy": 0.44482759237289426, "step": 64500 }, { "epoch": 0.06497012109693294, "grad_norm": 12.037660169357627, "learning_rate": 4.9972437896940085e-05, "loss": 2.9194, "mean_token_accuracy": 0.37586207389831544, "step": 64505 }, { "epoch": 0.06497515715003711, "grad_norm": 10.891206822157644, "learning_rate": 4.997241935225828e-05, "loss": 3.0671, "mean_token_accuracy": 0.35862068831920624, "step": 64510 }, { "epoch": 0.06498019320314129, "grad_norm": 11.028936144512006, "learning_rate": 4.9972400801343654e-05, "loss": 2.691, "mean_token_accuracy": 0.4034482717514038, "step": 64515 }, { "epoch": 0.06498522925624546, "grad_norm": 10.888050699110789, "learning_rate": 4.997238224419624e-05, "loss": 2.2876, "mean_token_accuracy": 0.4620689630508423, "step": 64520 }, { "epoch": 0.06499026530934963, "grad_norm": 11.247672566663509, "learning_rate": 4.9972363680816014e-05, "loss": 2.5056, "mean_token_accuracy": 0.4, "step": 64525 }, { "epoch": 0.06499530136245381, "grad_norm": 11.122148665001722, "learning_rate": 4.9972345111203004e-05, "loss": 2.5485, "mean_token_accuracy": 0.4310344815254211, "step": 64530 }, { "epoch": 0.06500033741555798, "grad_norm": 11.764745549335244, "learning_rate": 4.99723265353572e-05, "loss": 2.5666, "mean_token_accuracy": 0.3724137932062149, "step": 64535 }, { "epoch": 0.06500537346866216, "grad_norm": 12.839580403082458, "learning_rate": 4.9972307953278616e-05, "loss": 2.6142, "mean_token_accuracy": 0.3879612863063812, "step": 64540 }, { "epoch": 0.06501040952176633, "grad_norm": 12.899165506140532, "learning_rate": 4.9972289364967245e-05, "loss": 2.5436, "mean_token_accuracy": 0.4034482777118683, "step": 64545 }, { "epoch": 0.0650154455748705, "grad_norm": 10.70216559570666, "learning_rate": 4.9972270770423114e-05, "loss": 2.1912, "mean_token_accuracy": 0.43103447556495667, "step": 64550 }, { "epoch": 0.06502048162797468, "grad_norm": 10.856771326405397, "learning_rate": 4.99722521696462e-05, "loss": 2.6818, "mean_token_accuracy": 0.39999999701976774, "step": 64555 }, { "epoch": 0.06502551768107884, "grad_norm": 11.158353802554075, "learning_rate": 4.997223356263653e-05, "loss": 2.7908, "mean_token_accuracy": 0.3827586233615875, "step": 64560 }, { "epoch": 0.06503055373418301, "grad_norm": 17.972207128376052, "learning_rate": 4.997221494939411e-05, "loss": 2.8989, "mean_token_accuracy": 0.3709013879299164, "step": 64565 }, { "epoch": 0.06503558978728718, "grad_norm": 14.059351678273316, "learning_rate": 4.997219632991893e-05, "loss": 2.9316, "mean_token_accuracy": 0.36896551251411436, "step": 64570 }, { "epoch": 0.06504062584039136, "grad_norm": 12.055380770889936, "learning_rate": 4.9972177704211e-05, "loss": 2.7269, "mean_token_accuracy": 0.36551724970340727, "step": 64575 }, { "epoch": 0.06504566189349553, "grad_norm": 14.312793556288577, "learning_rate": 4.997215907227034e-05, "loss": 2.4403, "mean_token_accuracy": 0.41379310488700866, "step": 64580 }, { "epoch": 0.0650506979465997, "grad_norm": 12.09510453877924, "learning_rate": 4.997214043409692e-05, "loss": 2.5542, "mean_token_accuracy": 0.4413793087005615, "step": 64585 }, { "epoch": 0.06505573399970388, "grad_norm": 12.104201672350323, "learning_rate": 4.997212178969078e-05, "loss": 2.3494, "mean_token_accuracy": 0.4310344815254211, "step": 64590 }, { "epoch": 0.06506077005280805, "grad_norm": 10.61952417803201, "learning_rate": 4.997210313905192e-05, "loss": 2.4235, "mean_token_accuracy": 0.4206896543502808, "step": 64595 }, { "epoch": 0.06506580610591223, "grad_norm": 11.559627814047222, "learning_rate": 4.9972084482180334e-05, "loss": 2.5661, "mean_token_accuracy": 0.4258923172950745, "step": 64600 }, { "epoch": 0.0650708421590164, "grad_norm": 12.723862637075733, "learning_rate": 4.997206581907603e-05, "loss": 2.2586, "mean_token_accuracy": 0.47931033968925474, "step": 64605 }, { "epoch": 0.06507587821212057, "grad_norm": 14.42859616275144, "learning_rate": 4.997204714973902e-05, "loss": 2.6249, "mean_token_accuracy": 0.35172414481639863, "step": 64610 }, { "epoch": 0.06508091426522475, "grad_norm": 10.472781295105523, "learning_rate": 4.99720284741693e-05, "loss": 2.4711, "mean_token_accuracy": 0.4413793087005615, "step": 64615 }, { "epoch": 0.06508595031832892, "grad_norm": 11.302363272003406, "learning_rate": 4.997200979236688e-05, "loss": 2.6958, "mean_token_accuracy": 0.3896551728248596, "step": 64620 }, { "epoch": 0.0650909863714331, "grad_norm": 11.587146672952281, "learning_rate": 4.9971991104331756e-05, "loss": 2.2381, "mean_token_accuracy": 0.43448275327682495, "step": 64625 }, { "epoch": 0.06509602242453726, "grad_norm": 11.823615056675573, "learning_rate": 4.997197241006395e-05, "loss": 2.3815, "mean_token_accuracy": 0.42413792610168455, "step": 64630 }, { "epoch": 0.06510105847764143, "grad_norm": 12.373576628380274, "learning_rate": 4.997195370956346e-05, "loss": 2.9002, "mean_token_accuracy": 0.32413792312145234, "step": 64635 }, { "epoch": 0.0651060945307456, "grad_norm": 12.005467761941809, "learning_rate": 4.997193500283029e-05, "loss": 2.6778, "mean_token_accuracy": 0.3655172407627106, "step": 64640 }, { "epoch": 0.06511113058384978, "grad_norm": 15.477387324848285, "learning_rate": 4.997191628986443e-05, "loss": 3.0425, "mean_token_accuracy": 0.3206896483898163, "step": 64645 }, { "epoch": 0.06511616663695395, "grad_norm": 11.672775474684398, "learning_rate": 4.997189757066592e-05, "loss": 2.3548, "mean_token_accuracy": 0.40344828367233276, "step": 64650 }, { "epoch": 0.06512120269005812, "grad_norm": 10.23840963476569, "learning_rate": 4.9971878845234725e-05, "loss": 1.9522, "mean_token_accuracy": 0.4965517222881317, "step": 64655 }, { "epoch": 0.0651262387431623, "grad_norm": 13.3825730346537, "learning_rate": 4.997186011357089e-05, "loss": 2.4851, "mean_token_accuracy": 0.441379314661026, "step": 64660 }, { "epoch": 0.06513127479626647, "grad_norm": 13.764024830346779, "learning_rate": 4.997184137567439e-05, "loss": 2.4183, "mean_token_accuracy": 0.4034482717514038, "step": 64665 }, { "epoch": 0.06513631084937065, "grad_norm": 19.518019082262786, "learning_rate": 4.9971822631545246e-05, "loss": 2.8121, "mean_token_accuracy": 0.3551724076271057, "step": 64670 }, { "epoch": 0.06514134690247482, "grad_norm": 14.415682710026498, "learning_rate": 4.9971803881183455e-05, "loss": 3.0653, "mean_token_accuracy": 0.3482758581638336, "step": 64675 }, { "epoch": 0.065146382955579, "grad_norm": 15.619678696416816, "learning_rate": 4.9971785124589026e-05, "loss": 2.5638, "mean_token_accuracy": 0.38620689511299133, "step": 64680 }, { "epoch": 0.06515141900868317, "grad_norm": 9.323511561960446, "learning_rate": 4.997176636176196e-05, "loss": 2.2029, "mean_token_accuracy": 0.4551724135875702, "step": 64685 }, { "epoch": 0.06515645506178734, "grad_norm": 10.850596250930028, "learning_rate": 4.9971747592702275e-05, "loss": 2.4466, "mean_token_accuracy": 0.41530550122261045, "step": 64690 }, { "epoch": 0.06516149111489151, "grad_norm": 15.080409907641867, "learning_rate": 4.997172881740996e-05, "loss": 2.8381, "mean_token_accuracy": 0.3774954617023468, "step": 64695 }, { "epoch": 0.06516652716799567, "grad_norm": 12.034546684761619, "learning_rate": 4.997171003588503e-05, "loss": 2.7027, "mean_token_accuracy": 0.4137930989265442, "step": 64700 }, { "epoch": 0.06517156322109985, "grad_norm": 13.431052317163712, "learning_rate": 4.997169124812749e-05, "loss": 2.5817, "mean_token_accuracy": 0.4034482717514038, "step": 64705 }, { "epoch": 0.06517659927420402, "grad_norm": 11.802645776078489, "learning_rate": 4.9971672454137336e-05, "loss": 2.4169, "mean_token_accuracy": 0.40344828367233276, "step": 64710 }, { "epoch": 0.0651816353273082, "grad_norm": 11.408052577280777, "learning_rate": 4.9971653653914585e-05, "loss": 2.6236, "mean_token_accuracy": 0.42413792610168455, "step": 64715 }, { "epoch": 0.06518667138041237, "grad_norm": 11.298046432800817, "learning_rate": 4.9971634847459236e-05, "loss": 3.0018, "mean_token_accuracy": 0.3310344785451889, "step": 64720 }, { "epoch": 0.06519170743351654, "grad_norm": 12.73069033944942, "learning_rate": 4.997161603477129e-05, "loss": 2.8028, "mean_token_accuracy": 0.33793102502822875, "step": 64725 }, { "epoch": 0.06519674348662072, "grad_norm": 14.931542357445197, "learning_rate": 4.997159721585076e-05, "loss": 2.6301, "mean_token_accuracy": 0.3793103456497192, "step": 64730 }, { "epoch": 0.06520177953972489, "grad_norm": 15.815234975068162, "learning_rate": 4.997157839069766e-05, "loss": 2.4818, "mean_token_accuracy": 0.38620689511299133, "step": 64735 }, { "epoch": 0.06520681559282906, "grad_norm": 11.480762283574364, "learning_rate": 4.997155955931197e-05, "loss": 2.6949, "mean_token_accuracy": 0.38965516686439516, "step": 64740 }, { "epoch": 0.06521185164593324, "grad_norm": 11.356607353407735, "learning_rate": 4.997154072169371e-05, "loss": 2.2604, "mean_token_accuracy": 0.4379310429096222, "step": 64745 }, { "epoch": 0.06521688769903741, "grad_norm": 10.353515464197038, "learning_rate": 4.9971521877842895e-05, "loss": 2.4588, "mean_token_accuracy": 0.38275861740112305, "step": 64750 }, { "epoch": 0.06522192375214159, "grad_norm": 10.921828868284166, "learning_rate": 4.997150302775951e-05, "loss": 2.4878, "mean_token_accuracy": 0.4191772550344467, "step": 64755 }, { "epoch": 0.06522695980524576, "grad_norm": 11.504221921652743, "learning_rate": 4.997148417144357e-05, "loss": 2.6078, "mean_token_accuracy": 0.3793103456497192, "step": 64760 }, { "epoch": 0.06523199585834993, "grad_norm": 8.862244972105684, "learning_rate": 4.997146530889508e-05, "loss": 1.9771, "mean_token_accuracy": 0.5093163907527923, "step": 64765 }, { "epoch": 0.0652370319114541, "grad_norm": 10.023737836310328, "learning_rate": 4.997144644011405e-05, "loss": 2.5176, "mean_token_accuracy": 0.4068965494632721, "step": 64770 }, { "epoch": 0.06524206796455827, "grad_norm": 12.031062247578312, "learning_rate": 4.997142756510048e-05, "loss": 2.3024, "mean_token_accuracy": 0.45172414779663084, "step": 64775 }, { "epoch": 0.06524710401766244, "grad_norm": 11.197543566436527, "learning_rate": 4.997140868385437e-05, "loss": 2.114, "mean_token_accuracy": 0.4655172288417816, "step": 64780 }, { "epoch": 0.06525214007076661, "grad_norm": 10.872040225264275, "learning_rate": 4.997138979637574e-05, "loss": 2.7369, "mean_token_accuracy": 0.36896551251411436, "step": 64785 }, { "epoch": 0.06525717612387079, "grad_norm": 17.21210975966897, "learning_rate": 4.997137090266457e-05, "loss": 3.0848, "mean_token_accuracy": 0.3413793116807938, "step": 64790 }, { "epoch": 0.06526221217697496, "grad_norm": 18.5246921478657, "learning_rate": 4.99713520027209e-05, "loss": 2.8659, "mean_token_accuracy": 0.3517241388559341, "step": 64795 }, { "epoch": 0.06526724823007914, "grad_norm": 11.264843091788103, "learning_rate": 4.997133309654471e-05, "loss": 2.6731, "mean_token_accuracy": 0.35517241060733795, "step": 64800 }, { "epoch": 0.06527228428318331, "grad_norm": 10.178931708263343, "learning_rate": 4.997131418413601e-05, "loss": 2.2645, "mean_token_accuracy": 0.4482758641242981, "step": 64805 }, { "epoch": 0.06527732033628748, "grad_norm": 15.86252835577339, "learning_rate": 4.997129526549481e-05, "loss": 2.7594, "mean_token_accuracy": 0.38052027225494384, "step": 64810 }, { "epoch": 0.06528235638939166, "grad_norm": 14.624945036599803, "learning_rate": 4.997127634062111e-05, "loss": 2.6795, "mean_token_accuracy": 0.334482753276825, "step": 64815 }, { "epoch": 0.06528739244249583, "grad_norm": 10.318307007651585, "learning_rate": 4.9971257409514913e-05, "loss": 2.4694, "mean_token_accuracy": 0.4103448331356049, "step": 64820 }, { "epoch": 0.0652924284956, "grad_norm": 9.831507872199152, "learning_rate": 4.9971238472176244e-05, "loss": 2.4667, "mean_token_accuracy": 0.46896551847457885, "step": 64825 }, { "epoch": 0.06529746454870418, "grad_norm": 11.424650355093075, "learning_rate": 4.997121952860508e-05, "loss": 2.7934, "mean_token_accuracy": 0.3551724076271057, "step": 64830 }, { "epoch": 0.06530250060180835, "grad_norm": 10.191790828262855, "learning_rate": 4.997120057880144e-05, "loss": 2.5271, "mean_token_accuracy": 0.41379310488700866, "step": 64835 }, { "epoch": 0.06530753665491251, "grad_norm": 13.391319344101985, "learning_rate": 4.997118162276533e-05, "loss": 2.7077, "mean_token_accuracy": 0.43793103098869324, "step": 64840 }, { "epoch": 0.06531257270801669, "grad_norm": 38.5238216736616, "learning_rate": 4.997116266049676e-05, "loss": 2.4215, "mean_token_accuracy": 0.38965517580509185, "step": 64845 }, { "epoch": 0.06531760876112086, "grad_norm": 10.117411369956777, "learning_rate": 4.997114369199573e-05, "loss": 2.2416, "mean_token_accuracy": 0.46551724076271056, "step": 64850 }, { "epoch": 0.06532264481422503, "grad_norm": 10.528389456067643, "learning_rate": 4.997112471726223e-05, "loss": 2.578, "mean_token_accuracy": 0.4724137902259827, "step": 64855 }, { "epoch": 0.06532768086732921, "grad_norm": 11.571369150529442, "learning_rate": 4.997110573629629e-05, "loss": 2.7088, "mean_token_accuracy": 0.4191772609949112, "step": 64860 }, { "epoch": 0.06533271692043338, "grad_norm": 11.775724665888097, "learning_rate": 4.99710867490979e-05, "loss": 2.7593, "mean_token_accuracy": 0.3793103456497192, "step": 64865 }, { "epoch": 0.06533775297353756, "grad_norm": 10.749331150639613, "learning_rate": 4.997106775566708e-05, "loss": 2.4014, "mean_token_accuracy": 0.43534483313560485, "step": 64870 }, { "epoch": 0.06534278902664173, "grad_norm": 13.15106900743184, "learning_rate": 4.997104875600382e-05, "loss": 2.6002, "mean_token_accuracy": 0.42758620381355283, "step": 64875 }, { "epoch": 0.0653478250797459, "grad_norm": 14.305468702104733, "learning_rate": 4.9971029750108125e-05, "loss": 2.5057, "mean_token_accuracy": 0.44343616962432864, "step": 64880 }, { "epoch": 0.06535286113285008, "grad_norm": 10.558288104573247, "learning_rate": 4.997101073798002e-05, "loss": 2.3549, "mean_token_accuracy": 0.4502722263336182, "step": 64885 }, { "epoch": 0.06535789718595425, "grad_norm": 11.781244081846133, "learning_rate": 4.997099171961948e-05, "loss": 2.4368, "mean_token_accuracy": 0.4103448331356049, "step": 64890 }, { "epoch": 0.06536293323905842, "grad_norm": 10.389559056637578, "learning_rate": 4.997097269502654e-05, "loss": 2.8408, "mean_token_accuracy": 0.4068965554237366, "step": 64895 }, { "epoch": 0.0653679692921626, "grad_norm": 13.37008570775885, "learning_rate": 4.997095366420119e-05, "loss": 2.6028, "mean_token_accuracy": 0.3793103456497192, "step": 64900 }, { "epoch": 0.06537300534526677, "grad_norm": 12.484963620782418, "learning_rate": 4.9970934627143424e-05, "loss": 2.597, "mean_token_accuracy": 0.39655172228813174, "step": 64905 }, { "epoch": 0.06537804139837093, "grad_norm": 12.145935788478827, "learning_rate": 4.9970915583853265e-05, "loss": 2.6924, "mean_token_accuracy": 0.39310344457626345, "step": 64910 }, { "epoch": 0.0653830774514751, "grad_norm": 11.072032557913573, "learning_rate": 4.9970896534330726e-05, "loss": 2.5457, "mean_token_accuracy": 0.3551724165678024, "step": 64915 }, { "epoch": 0.06538811350457928, "grad_norm": 10.557541119590413, "learning_rate": 4.9970877478575794e-05, "loss": 2.3734, "mean_token_accuracy": 0.44482758045196535, "step": 64920 }, { "epoch": 0.06539314955768345, "grad_norm": 10.044485034200772, "learning_rate": 4.997085841658848e-05, "loss": 2.5603, "mean_token_accuracy": 0.37586206793785093, "step": 64925 }, { "epoch": 0.06539818561078763, "grad_norm": 10.303432992622986, "learning_rate": 4.997083934836879e-05, "loss": 2.2256, "mean_token_accuracy": 0.5017543911933899, "step": 64930 }, { "epoch": 0.0654032216638918, "grad_norm": 12.56077878271298, "learning_rate": 4.997082027391673e-05, "loss": 2.7891, "mean_token_accuracy": 0.3827586233615875, "step": 64935 }, { "epoch": 0.06540825771699597, "grad_norm": 10.12361899449248, "learning_rate": 4.9970801193232293e-05, "loss": 2.7714, "mean_token_accuracy": 0.37241379022598264, "step": 64940 }, { "epoch": 0.06541329377010015, "grad_norm": 10.296899232489508, "learning_rate": 4.9970782106315507e-05, "loss": 2.4054, "mean_token_accuracy": 0.4413793087005615, "step": 64945 }, { "epoch": 0.06541832982320432, "grad_norm": 12.346419620284966, "learning_rate": 4.9970763013166366e-05, "loss": 2.5035, "mean_token_accuracy": 0.41034482717514037, "step": 64950 }, { "epoch": 0.0654233658763085, "grad_norm": 12.84787002862828, "learning_rate": 4.9970743913784866e-05, "loss": 2.7812, "mean_token_accuracy": 0.3551724135875702, "step": 64955 }, { "epoch": 0.06542840192941267, "grad_norm": 11.163481770167147, "learning_rate": 4.9970724808171026e-05, "loss": 2.533, "mean_token_accuracy": 0.44029037952423095, "step": 64960 }, { "epoch": 0.06543343798251684, "grad_norm": 9.986359020432651, "learning_rate": 4.9970705696324854e-05, "loss": 2.6214, "mean_token_accuracy": 0.3965517282485962, "step": 64965 }, { "epoch": 0.06543847403562102, "grad_norm": 14.888717559987086, "learning_rate": 4.997068657824634e-05, "loss": 2.7874, "mean_token_accuracy": 0.3896551728248596, "step": 64970 }, { "epoch": 0.06544351008872519, "grad_norm": 11.885474923123517, "learning_rate": 4.9970667453935496e-05, "loss": 2.5644, "mean_token_accuracy": 0.41724138259887694, "step": 64975 }, { "epoch": 0.06544854614182935, "grad_norm": 10.508784083928356, "learning_rate": 4.997064832339234e-05, "loss": 2.3807, "mean_token_accuracy": 0.4034482777118683, "step": 64980 }, { "epoch": 0.06545358219493352, "grad_norm": 12.24653912273854, "learning_rate": 4.9970629186616854e-05, "loss": 2.6336, "mean_token_accuracy": 0.4344827592372894, "step": 64985 }, { "epoch": 0.0654586182480377, "grad_norm": 9.388594060783554, "learning_rate": 4.997061004360906e-05, "loss": 2.4518, "mean_token_accuracy": 0.47586206197738645, "step": 64990 }, { "epoch": 0.06546365430114187, "grad_norm": 13.320067058419248, "learning_rate": 4.9970590894368955e-05, "loss": 2.6827, "mean_token_accuracy": 0.36551723778247835, "step": 64995 }, { "epoch": 0.06546869035424605, "grad_norm": 13.264288908506451, "learning_rate": 4.9970571738896553e-05, "loss": 2.1275, "mean_token_accuracy": 0.46551724076271056, "step": 65000 }, { "epoch": 0.06547372640735022, "grad_norm": 11.209152338039866, "learning_rate": 4.9970552577191846e-05, "loss": 2.4391, "mean_token_accuracy": 0.47931033968925474, "step": 65005 }, { "epoch": 0.06547876246045439, "grad_norm": 11.47444929782627, "learning_rate": 4.997053340925485e-05, "loss": 2.5459, "mean_token_accuracy": 0.3896551728248596, "step": 65010 }, { "epoch": 0.06548379851355857, "grad_norm": 14.30823744507397, "learning_rate": 4.9970514235085575e-05, "loss": 2.8399, "mean_token_accuracy": 0.36206896901130675, "step": 65015 }, { "epoch": 0.06548883456666274, "grad_norm": 10.838760144550397, "learning_rate": 4.997049505468401e-05, "loss": 2.5964, "mean_token_accuracy": 0.36896551251411436, "step": 65020 }, { "epoch": 0.06549387061976691, "grad_norm": 10.417844340793048, "learning_rate": 4.9970475868050176e-05, "loss": 2.3465, "mean_token_accuracy": 0.4586207032203674, "step": 65025 }, { "epoch": 0.06549890667287109, "grad_norm": 16.602607201217918, "learning_rate": 4.997045667518407e-05, "loss": 3.3829, "mean_token_accuracy": 0.3043557196855545, "step": 65030 }, { "epoch": 0.06550394272597526, "grad_norm": 12.241008812288587, "learning_rate": 4.9970437476085696e-05, "loss": 2.6415, "mean_token_accuracy": 0.3931034505367279, "step": 65035 }, { "epoch": 0.06550897877907944, "grad_norm": 14.920428789402699, "learning_rate": 4.9970418270755065e-05, "loss": 2.4694, "mean_token_accuracy": 0.4310344815254211, "step": 65040 }, { "epoch": 0.06551401483218361, "grad_norm": 12.836177164438281, "learning_rate": 4.997039905919218e-05, "loss": 2.5929, "mean_token_accuracy": 0.37586206793785093, "step": 65045 }, { "epoch": 0.06551905088528777, "grad_norm": 10.950852650712818, "learning_rate": 4.997037984139705e-05, "loss": 2.1905, "mean_token_accuracy": 0.4640048384666443, "step": 65050 }, { "epoch": 0.06552408693839194, "grad_norm": 11.217662110234622, "learning_rate": 4.997036061736967e-05, "loss": 2.696, "mean_token_accuracy": 0.439443439245224, "step": 65055 }, { "epoch": 0.06552912299149612, "grad_norm": 11.032539848109701, "learning_rate": 4.997034138711005e-05, "loss": 2.4892, "mean_token_accuracy": 0.3620689660310745, "step": 65060 }, { "epoch": 0.06553415904460029, "grad_norm": 12.149552392652938, "learning_rate": 4.9970322150618205e-05, "loss": 2.6778, "mean_token_accuracy": 0.37931033968925476, "step": 65065 }, { "epoch": 0.06553919509770446, "grad_norm": 13.784001695228483, "learning_rate": 4.9970302907894125e-05, "loss": 2.5073, "mean_token_accuracy": 0.4103448212146759, "step": 65070 }, { "epoch": 0.06554423115080864, "grad_norm": 13.425647604919858, "learning_rate": 4.997028365893782e-05, "loss": 2.5335, "mean_token_accuracy": 0.42413793206214906, "step": 65075 }, { "epoch": 0.06554926720391281, "grad_norm": 12.41897223403293, "learning_rate": 4.997026440374931e-05, "loss": 2.835, "mean_token_accuracy": 0.3413792967796326, "step": 65080 }, { "epoch": 0.06555430325701699, "grad_norm": 15.404729861662698, "learning_rate": 4.997024514232858e-05, "loss": 2.3594, "mean_token_accuracy": 0.458620685338974, "step": 65085 }, { "epoch": 0.06555933931012116, "grad_norm": 10.711907536267494, "learning_rate": 4.9970225874675644e-05, "loss": 2.7029, "mean_token_accuracy": 0.33793103098869326, "step": 65090 }, { "epoch": 0.06556437536322533, "grad_norm": 16.269545982721876, "learning_rate": 4.997020660079051e-05, "loss": 2.5518, "mean_token_accuracy": 0.4034482777118683, "step": 65095 }, { "epoch": 0.0655694114163295, "grad_norm": 10.219959538209208, "learning_rate": 4.997018732067318e-05, "loss": 2.4329, "mean_token_accuracy": 0.3896551728248596, "step": 65100 }, { "epoch": 0.06557444746943368, "grad_norm": 15.75559532341415, "learning_rate": 4.9970168034323656e-05, "loss": 2.5308, "mean_token_accuracy": 0.42758620977401735, "step": 65105 }, { "epoch": 0.06557948352253785, "grad_norm": 11.025504952672485, "learning_rate": 4.997014874174195e-05, "loss": 2.2578, "mean_token_accuracy": 0.44482759237289426, "step": 65110 }, { "epoch": 0.06558451957564203, "grad_norm": 22.84973417952192, "learning_rate": 4.997012944292806e-05, "loss": 2.8426, "mean_token_accuracy": 0.3275862127542496, "step": 65115 }, { "epoch": 0.06558955562874619, "grad_norm": 9.87374959651685, "learning_rate": 4.9970110137882e-05, "loss": 2.2776, "mean_token_accuracy": 0.44827585816383364, "step": 65120 }, { "epoch": 0.06559459168185036, "grad_norm": 13.443132294185327, "learning_rate": 4.997009082660377e-05, "loss": 3.1697, "mean_token_accuracy": 0.34827585220336915, "step": 65125 }, { "epoch": 0.06559962773495454, "grad_norm": 12.677891725561125, "learning_rate": 4.997007150909338e-05, "loss": 2.9893, "mean_token_accuracy": 0.35517241060733795, "step": 65130 }, { "epoch": 0.06560466378805871, "grad_norm": 19.026056424161254, "learning_rate": 4.997005218535083e-05, "loss": 2.7172, "mean_token_accuracy": 0.41724138557910917, "step": 65135 }, { "epoch": 0.06560969984116288, "grad_norm": 9.275741226191961, "learning_rate": 4.9970032855376125e-05, "loss": 2.2838, "mean_token_accuracy": 0.45366001725196836, "step": 65140 }, { "epoch": 0.06561473589426706, "grad_norm": 13.323789479767306, "learning_rate": 4.997001351916928e-05, "loss": 3.0301, "mean_token_accuracy": 0.3758620619773865, "step": 65145 }, { "epoch": 0.06561977194737123, "grad_norm": 12.065905908725158, "learning_rate": 4.996999417673028e-05, "loss": 3.113, "mean_token_accuracy": 0.3655172407627106, "step": 65150 }, { "epoch": 0.0656248080004754, "grad_norm": 10.920392079088483, "learning_rate": 4.996997482805915e-05, "loss": 2.475, "mean_token_accuracy": 0.41379311084747317, "step": 65155 }, { "epoch": 0.06562984405357958, "grad_norm": 11.795404252943495, "learning_rate": 4.996995547315589e-05, "loss": 2.6588, "mean_token_accuracy": 0.3655172407627106, "step": 65160 }, { "epoch": 0.06563488010668375, "grad_norm": 11.979996194782537, "learning_rate": 4.996993611202051e-05, "loss": 2.9733, "mean_token_accuracy": 0.37241379022598264, "step": 65165 }, { "epoch": 0.06563991615978793, "grad_norm": 11.804553602510314, "learning_rate": 4.9969916744653e-05, "loss": 2.4313, "mean_token_accuracy": 0.4137930989265442, "step": 65170 }, { "epoch": 0.0656449522128921, "grad_norm": 10.229640365191054, "learning_rate": 4.9969897371053376e-05, "loss": 2.4493, "mean_token_accuracy": 0.43278887271881106, "step": 65175 }, { "epoch": 0.06564998826599627, "grad_norm": 10.613681559636753, "learning_rate": 4.9969877991221646e-05, "loss": 2.3053, "mean_token_accuracy": 0.42758620977401735, "step": 65180 }, { "epoch": 0.06565502431910045, "grad_norm": 9.993974881358307, "learning_rate": 4.9969858605157806e-05, "loss": 2.0233, "mean_token_accuracy": 0.5034482657909394, "step": 65185 }, { "epoch": 0.0656600603722046, "grad_norm": 9.00498623604641, "learning_rate": 4.996983921286188e-05, "loss": 2.3311, "mean_token_accuracy": 0.46551724672317507, "step": 65190 }, { "epoch": 0.06566509642530878, "grad_norm": 12.07519456171486, "learning_rate": 4.996981981433384e-05, "loss": 2.4227, "mean_token_accuracy": 0.4103448212146759, "step": 65195 }, { "epoch": 0.06567013247841295, "grad_norm": 17.766870765365134, "learning_rate": 4.996980040957372e-05, "loss": 2.396, "mean_token_accuracy": 0.4034482717514038, "step": 65200 }, { "epoch": 0.06567516853151713, "grad_norm": 14.790583946375987, "learning_rate": 4.9969780998581517e-05, "loss": 2.7338, "mean_token_accuracy": 0.3379310369491577, "step": 65205 }, { "epoch": 0.0656802045846213, "grad_norm": 11.209361809518665, "learning_rate": 4.996976158135724e-05, "loss": 2.5497, "mean_token_accuracy": 0.41379310488700866, "step": 65210 }, { "epoch": 0.06568524063772548, "grad_norm": 9.694014627628961, "learning_rate": 4.996974215790089e-05, "loss": 2.6238, "mean_token_accuracy": 0.3896551728248596, "step": 65215 }, { "epoch": 0.06569027669082965, "grad_norm": 12.97105282522778, "learning_rate": 4.996972272821247e-05, "loss": 2.253, "mean_token_accuracy": 0.441379314661026, "step": 65220 }, { "epoch": 0.06569531274393382, "grad_norm": 12.645206393031733, "learning_rate": 4.9969703292291993e-05, "loss": 2.4302, "mean_token_accuracy": 0.4294615864753723, "step": 65225 }, { "epoch": 0.065700348797038, "grad_norm": 10.642080425667514, "learning_rate": 4.9969683850139455e-05, "loss": 2.5925, "mean_token_accuracy": 0.4172413766384125, "step": 65230 }, { "epoch": 0.06570538485014217, "grad_norm": 12.48197100135752, "learning_rate": 4.9969664401754874e-05, "loss": 2.5984, "mean_token_accuracy": 0.38275861740112305, "step": 65235 }, { "epoch": 0.06571042090324634, "grad_norm": 10.855704386081841, "learning_rate": 4.996964494713823e-05, "loss": 2.4823, "mean_token_accuracy": 0.3999999940395355, "step": 65240 }, { "epoch": 0.06571545695635052, "grad_norm": 14.556694010523788, "learning_rate": 4.996962548628957e-05, "loss": 2.9918, "mean_token_accuracy": 0.2965517222881317, "step": 65245 }, { "epoch": 0.06572049300945469, "grad_norm": 12.821300500706608, "learning_rate": 4.9969606019208864e-05, "loss": 2.9859, "mean_token_accuracy": 0.3551724076271057, "step": 65250 }, { "epoch": 0.06572552906255887, "grad_norm": 12.962174296129136, "learning_rate": 4.9969586545896124e-05, "loss": 2.6853, "mean_token_accuracy": 0.4, "step": 65255 }, { "epoch": 0.06573056511566303, "grad_norm": 8.571240969110962, "learning_rate": 4.9969567066351364e-05, "loss": 2.2543, "mean_token_accuracy": 0.4241379380226135, "step": 65260 }, { "epoch": 0.0657356011687672, "grad_norm": 12.993517324652368, "learning_rate": 4.9969547580574595e-05, "loss": 3.0137, "mean_token_accuracy": 0.3482758581638336, "step": 65265 }, { "epoch": 0.06574063722187137, "grad_norm": 13.705610152611374, "learning_rate": 4.99695280885658e-05, "loss": 2.4454, "mean_token_accuracy": 0.3896551728248596, "step": 65270 }, { "epoch": 0.06574567327497555, "grad_norm": 9.862894972909425, "learning_rate": 4.9969508590325e-05, "loss": 2.6734, "mean_token_accuracy": 0.4068965494632721, "step": 65275 }, { "epoch": 0.06575070932807972, "grad_norm": 9.802660144385916, "learning_rate": 4.99694890858522e-05, "loss": 2.3862, "mean_token_accuracy": 0.4517241358757019, "step": 65280 }, { "epoch": 0.0657557453811839, "grad_norm": 12.977800842000624, "learning_rate": 4.996946957514741e-05, "loss": 2.8093, "mean_token_accuracy": 0.4068965554237366, "step": 65285 }, { "epoch": 0.06576078143428807, "grad_norm": 12.468434318598709, "learning_rate": 4.996945005821062e-05, "loss": 2.5929, "mean_token_accuracy": 0.41379310488700866, "step": 65290 }, { "epoch": 0.06576581748739224, "grad_norm": 11.783458630643858, "learning_rate": 4.996943053504185e-05, "loss": 2.7317, "mean_token_accuracy": 0.3896551728248596, "step": 65295 }, { "epoch": 0.06577085354049642, "grad_norm": 12.366942193288237, "learning_rate": 4.99694110056411e-05, "loss": 2.3928, "mean_token_accuracy": 0.42758620977401735, "step": 65300 }, { "epoch": 0.06577588959360059, "grad_norm": 11.922713344887724, "learning_rate": 4.996939147000837e-05, "loss": 2.3201, "mean_token_accuracy": 0.3999999940395355, "step": 65305 }, { "epoch": 0.06578092564670476, "grad_norm": 11.681194711248034, "learning_rate": 4.9969371928143674e-05, "loss": 2.7589, "mean_token_accuracy": 0.38965516686439516, "step": 65310 }, { "epoch": 0.06578596169980894, "grad_norm": 12.562881920801395, "learning_rate": 4.996935238004701e-05, "loss": 2.4379, "mean_token_accuracy": 0.38275861740112305, "step": 65315 }, { "epoch": 0.06579099775291311, "grad_norm": 11.79460403787884, "learning_rate": 4.996933282571839e-05, "loss": 2.4015, "mean_token_accuracy": 0.4586206912994385, "step": 65320 }, { "epoch": 0.06579603380601728, "grad_norm": 10.899699593934638, "learning_rate": 4.9969313265157815e-05, "loss": 2.6911, "mean_token_accuracy": 0.37586206793785093, "step": 65325 }, { "epoch": 0.06580106985912144, "grad_norm": 15.153869071725529, "learning_rate": 4.99692936983653e-05, "loss": 2.722, "mean_token_accuracy": 0.38620689511299133, "step": 65330 }, { "epoch": 0.06580610591222562, "grad_norm": 13.434080964390509, "learning_rate": 4.996927412534083e-05, "loss": 2.414, "mean_token_accuracy": 0.4034482777118683, "step": 65335 }, { "epoch": 0.06581114196532979, "grad_norm": 12.710116680022304, "learning_rate": 4.9969254546084435e-05, "loss": 2.3528, "mean_token_accuracy": 0.45033273100852966, "step": 65340 }, { "epoch": 0.06581617801843397, "grad_norm": 12.42083563790262, "learning_rate": 4.99692349605961e-05, "loss": 2.4836, "mean_token_accuracy": 0.43260738253593445, "step": 65345 }, { "epoch": 0.06582121407153814, "grad_norm": 11.562562996137192, "learning_rate": 4.9969215368875846e-05, "loss": 2.5185, "mean_token_accuracy": 0.42068964838981626, "step": 65350 }, { "epoch": 0.06582625012464231, "grad_norm": 12.232805064529998, "learning_rate": 4.996919577092366e-05, "loss": 2.8474, "mean_token_accuracy": 0.3620689630508423, "step": 65355 }, { "epoch": 0.06583128617774649, "grad_norm": 14.135918759739035, "learning_rate": 4.9969176166739564e-05, "loss": 2.4846, "mean_token_accuracy": 0.4, "step": 65360 }, { "epoch": 0.06583632223085066, "grad_norm": 11.041238749026546, "learning_rate": 4.9969156556323565e-05, "loss": 2.3784, "mean_token_accuracy": 0.41724138259887694, "step": 65365 }, { "epoch": 0.06584135828395483, "grad_norm": 10.679499426768874, "learning_rate": 4.996913693967565e-05, "loss": 2.4045, "mean_token_accuracy": 0.44827585816383364, "step": 65370 }, { "epoch": 0.06584639433705901, "grad_norm": 12.11767092220549, "learning_rate": 4.996911731679585e-05, "loss": 2.6773, "mean_token_accuracy": 0.39310343861579894, "step": 65375 }, { "epoch": 0.06585143039016318, "grad_norm": 11.275364754430525, "learning_rate": 4.996909768768414e-05, "loss": 2.6749, "mean_token_accuracy": 0.36896551251411436, "step": 65380 }, { "epoch": 0.06585646644326736, "grad_norm": 12.49622211374794, "learning_rate": 4.996907805234055e-05, "loss": 2.6873, "mean_token_accuracy": 0.3655172407627106, "step": 65385 }, { "epoch": 0.06586150249637153, "grad_norm": 10.923790030390217, "learning_rate": 4.9969058410765085e-05, "loss": 2.554, "mean_token_accuracy": 0.39655172228813174, "step": 65390 }, { "epoch": 0.0658665385494757, "grad_norm": 23.347232922109853, "learning_rate": 4.996903876295773e-05, "loss": 2.3908, "mean_token_accuracy": 0.4586206912994385, "step": 65395 }, { "epoch": 0.06587157460257986, "grad_norm": 10.662415067270413, "learning_rate": 4.996901910891851e-05, "loss": 2.4954, "mean_token_accuracy": 0.4068965494632721, "step": 65400 }, { "epoch": 0.06587661065568404, "grad_norm": 11.253295135655188, "learning_rate": 4.9968999448647426e-05, "loss": 2.2926, "mean_token_accuracy": 0.4689655065536499, "step": 65405 }, { "epoch": 0.06588164670878821, "grad_norm": 15.634729814723372, "learning_rate": 4.996897978214448e-05, "loss": 2.435, "mean_token_accuracy": 0.4310344815254211, "step": 65410 }, { "epoch": 0.06588668276189238, "grad_norm": 11.351158233786778, "learning_rate": 4.996896010940967e-05, "loss": 2.2554, "mean_token_accuracy": 0.43968542814254763, "step": 65415 }, { "epoch": 0.06589171881499656, "grad_norm": 12.56871335377824, "learning_rate": 4.9968940430443026e-05, "loss": 3.0543, "mean_token_accuracy": 0.3999999940395355, "step": 65420 }, { "epoch": 0.06589675486810073, "grad_norm": 12.460619345925506, "learning_rate": 4.996892074524452e-05, "loss": 2.7554, "mean_token_accuracy": 0.3999999940395355, "step": 65425 }, { "epoch": 0.0659017909212049, "grad_norm": 13.48566835728215, "learning_rate": 4.996890105381418e-05, "loss": 2.4997, "mean_token_accuracy": 0.39310344457626345, "step": 65430 }, { "epoch": 0.06590682697430908, "grad_norm": 10.569177780371906, "learning_rate": 4.9968881356152014e-05, "loss": 3.0132, "mean_token_accuracy": 0.35172412991523744, "step": 65435 }, { "epoch": 0.06591186302741325, "grad_norm": 10.595327235477416, "learning_rate": 4.9968861652258014e-05, "loss": 2.3642, "mean_token_accuracy": 0.4103448212146759, "step": 65440 }, { "epoch": 0.06591689908051743, "grad_norm": 14.203039817953197, "learning_rate": 4.9968841942132196e-05, "loss": 2.6731, "mean_token_accuracy": 0.4172413766384125, "step": 65445 }, { "epoch": 0.0659219351336216, "grad_norm": 10.823733679587228, "learning_rate": 4.996882222577456e-05, "loss": 2.2352, "mean_token_accuracy": 0.42758620977401735, "step": 65450 }, { "epoch": 0.06592697118672577, "grad_norm": 10.998957445125669, "learning_rate": 4.996880250318511e-05, "loss": 2.2451, "mean_token_accuracy": 0.46551724076271056, "step": 65455 }, { "epoch": 0.06593200723982995, "grad_norm": 9.91620354828304, "learning_rate": 4.996878277436385e-05, "loss": 2.7678, "mean_token_accuracy": 0.4172413766384125, "step": 65460 }, { "epoch": 0.06593704329293412, "grad_norm": 13.26655773789606, "learning_rate": 4.99687630393108e-05, "loss": 2.4446, "mean_token_accuracy": 0.4379310369491577, "step": 65465 }, { "epoch": 0.06594207934603828, "grad_norm": 9.875355402548797, "learning_rate": 4.9968743298025946e-05, "loss": 2.8613, "mean_token_accuracy": 0.3551724135875702, "step": 65470 }, { "epoch": 0.06594711539914246, "grad_norm": 11.998546982321045, "learning_rate": 4.99687235505093e-05, "loss": 2.6614, "mean_token_accuracy": 0.3827586203813553, "step": 65475 }, { "epoch": 0.06595215145224663, "grad_norm": 11.852341240306094, "learning_rate": 4.996870379676088e-05, "loss": 2.5705, "mean_token_accuracy": 0.40689654350280763, "step": 65480 }, { "epoch": 0.0659571875053508, "grad_norm": 12.049928221307734, "learning_rate": 4.996868403678067e-05, "loss": 2.3508, "mean_token_accuracy": 0.4517241358757019, "step": 65485 }, { "epoch": 0.06596222355845498, "grad_norm": 9.60336556484452, "learning_rate": 4.996866427056869e-05, "loss": 2.2918, "mean_token_accuracy": 0.44670296311378477, "step": 65490 }, { "epoch": 0.06596725961155915, "grad_norm": 15.722390574896556, "learning_rate": 4.996864449812495e-05, "loss": 2.4369, "mean_token_accuracy": 0.3896551728248596, "step": 65495 }, { "epoch": 0.06597229566466332, "grad_norm": 11.625071153538663, "learning_rate": 4.996862471944944e-05, "loss": 2.5663, "mean_token_accuracy": 0.4103448301553726, "step": 65500 }, { "epoch": 0.0659773317177675, "grad_norm": 11.568527243962391, "learning_rate": 4.9968604934542176e-05, "loss": 2.6505, "mean_token_accuracy": 0.3896551728248596, "step": 65505 }, { "epoch": 0.06598236777087167, "grad_norm": 11.63023585917754, "learning_rate": 4.996858514340316e-05, "loss": 2.8583, "mean_token_accuracy": 0.3827586233615875, "step": 65510 }, { "epoch": 0.06598740382397585, "grad_norm": 11.305279730670007, "learning_rate": 4.9968565346032406e-05, "loss": 2.4131, "mean_token_accuracy": 0.41724138259887694, "step": 65515 }, { "epoch": 0.06599243987708002, "grad_norm": 12.009779761898915, "learning_rate": 4.9968545542429895e-05, "loss": 2.365, "mean_token_accuracy": 0.41379310488700866, "step": 65520 }, { "epoch": 0.0659974759301842, "grad_norm": 11.518594019424027, "learning_rate": 4.996852573259566e-05, "loss": 2.6181, "mean_token_accuracy": 0.42068964838981626, "step": 65525 }, { "epoch": 0.06600251198328837, "grad_norm": 10.441983443094456, "learning_rate": 4.996850591652969e-05, "loss": 2.532, "mean_token_accuracy": 0.3551724135875702, "step": 65530 }, { "epoch": 0.06600754803639254, "grad_norm": 7.5885939350469265, "learning_rate": 4.9968486094232e-05, "loss": 2.0881, "mean_token_accuracy": 0.4758620738983154, "step": 65535 }, { "epoch": 0.0660125840894967, "grad_norm": 11.380359667745816, "learning_rate": 4.996846626570259e-05, "loss": 2.579, "mean_token_accuracy": 0.42601331472396853, "step": 65540 }, { "epoch": 0.06601762014260087, "grad_norm": 10.108514869349683, "learning_rate": 4.996844643094147e-05, "loss": 2.7189, "mean_token_accuracy": 0.4413793087005615, "step": 65545 }, { "epoch": 0.06602265619570505, "grad_norm": 10.655178481754222, "learning_rate": 4.9968426589948634e-05, "loss": 2.2949, "mean_token_accuracy": 0.45172414779663084, "step": 65550 }, { "epoch": 0.06602769224880922, "grad_norm": 10.923417970271302, "learning_rate": 4.99684067427241e-05, "loss": 2.3969, "mean_token_accuracy": 0.4586206912994385, "step": 65555 }, { "epoch": 0.0660327283019134, "grad_norm": 15.948263248874573, "learning_rate": 4.9968386889267876e-05, "loss": 2.3435, "mean_token_accuracy": 0.45517241954803467, "step": 65560 }, { "epoch": 0.06603776435501757, "grad_norm": 11.670395371206665, "learning_rate": 4.996836702957995e-05, "loss": 2.3725, "mean_token_accuracy": 0.458620685338974, "step": 65565 }, { "epoch": 0.06604280040812174, "grad_norm": 15.271588626473754, "learning_rate": 4.996834716366035e-05, "loss": 2.6224, "mean_token_accuracy": 0.3793103456497192, "step": 65570 }, { "epoch": 0.06604783646122592, "grad_norm": 10.170640835993577, "learning_rate": 4.9968327291509067e-05, "loss": 2.5474, "mean_token_accuracy": 0.3655172437429428, "step": 65575 }, { "epoch": 0.06605287251433009, "grad_norm": 12.238107989862943, "learning_rate": 4.996830741312611e-05, "loss": 2.6937, "mean_token_accuracy": 0.37586207389831544, "step": 65580 }, { "epoch": 0.06605790856743426, "grad_norm": 13.92690058202146, "learning_rate": 4.9968287528511485e-05, "loss": 2.8264, "mean_token_accuracy": 0.37586206793785093, "step": 65585 }, { "epoch": 0.06606294462053844, "grad_norm": 11.698076546974281, "learning_rate": 4.99682676376652e-05, "loss": 2.786, "mean_token_accuracy": 0.36551723480224607, "step": 65590 }, { "epoch": 0.06606798067364261, "grad_norm": 14.217071781944645, "learning_rate": 4.996824774058724e-05, "loss": 3.0245, "mean_token_accuracy": 0.3241379290819168, "step": 65595 }, { "epoch": 0.06607301672674679, "grad_norm": 13.444287330945965, "learning_rate": 4.9968227837277636e-05, "loss": 2.7208, "mean_token_accuracy": 0.39655172228813174, "step": 65600 }, { "epoch": 0.06607805277985096, "grad_norm": 11.201321278333081, "learning_rate": 4.9968207927736396e-05, "loss": 2.8515, "mean_token_accuracy": 0.41724138259887694, "step": 65605 }, { "epoch": 0.06608308883295512, "grad_norm": 10.66057533373802, "learning_rate": 4.99681880119635e-05, "loss": 2.5016, "mean_token_accuracy": 0.4, "step": 65610 }, { "epoch": 0.0660881248860593, "grad_norm": 12.160024654406152, "learning_rate": 4.996816808995897e-05, "loss": 2.8296, "mean_token_accuracy": 0.358620685338974, "step": 65615 }, { "epoch": 0.06609316093916347, "grad_norm": 11.441429474045075, "learning_rate": 4.996814816172282e-05, "loss": 2.6658, "mean_token_accuracy": 0.36206896901130675, "step": 65620 }, { "epoch": 0.06609819699226764, "grad_norm": 10.888089839160742, "learning_rate": 4.996812822725504e-05, "loss": 2.3375, "mean_token_accuracy": 0.4551724135875702, "step": 65625 }, { "epoch": 0.06610323304537181, "grad_norm": 10.75715846620542, "learning_rate": 4.9968108286555646e-05, "loss": 2.6154, "mean_token_accuracy": 0.40344826579093934, "step": 65630 }, { "epoch": 0.06610826909847599, "grad_norm": 11.476308576044449, "learning_rate": 4.996808833962463e-05, "loss": 2.2108, "mean_token_accuracy": 0.4379310250282288, "step": 65635 }, { "epoch": 0.06611330515158016, "grad_norm": 12.2273121829413, "learning_rate": 4.996806838646201e-05, "loss": 2.4679, "mean_token_accuracy": 0.40344828367233276, "step": 65640 }, { "epoch": 0.06611834120468434, "grad_norm": 12.313745232382882, "learning_rate": 4.996804842706778e-05, "loss": 2.4771, "mean_token_accuracy": 0.44700544476509096, "step": 65645 }, { "epoch": 0.06612337725778851, "grad_norm": 11.600091502920273, "learning_rate": 4.996802846144197e-05, "loss": 2.4729, "mean_token_accuracy": 0.42068964838981626, "step": 65650 }, { "epoch": 0.06612841331089268, "grad_norm": 10.953615196343153, "learning_rate": 4.996800848958455e-05, "loss": 2.5514, "mean_token_accuracy": 0.3793103486299515, "step": 65655 }, { "epoch": 0.06613344936399686, "grad_norm": 11.125559930841042, "learning_rate": 4.996798851149556e-05, "loss": 2.5017, "mean_token_accuracy": 0.394313371181488, "step": 65660 }, { "epoch": 0.06613848541710103, "grad_norm": 81.94796238135298, "learning_rate": 4.9967968527174976e-05, "loss": 2.6867, "mean_token_accuracy": 0.3931034505367279, "step": 65665 }, { "epoch": 0.0661435214702052, "grad_norm": 11.984488202314138, "learning_rate": 4.996794853662282e-05, "loss": 2.7475, "mean_token_accuracy": 0.38620689511299133, "step": 65670 }, { "epoch": 0.06614855752330938, "grad_norm": 11.524644538598913, "learning_rate": 4.99679285398391e-05, "loss": 2.7002, "mean_token_accuracy": 0.36206896007061007, "step": 65675 }, { "epoch": 0.06615359357641354, "grad_norm": 12.395259635343379, "learning_rate": 4.996790853682381e-05, "loss": 3.0995, "mean_token_accuracy": 0.3103448212146759, "step": 65680 }, { "epoch": 0.06615862962951771, "grad_norm": 12.251259793762424, "learning_rate": 4.9967888527576965e-05, "loss": 2.6303, "mean_token_accuracy": 0.44150031208992, "step": 65685 }, { "epoch": 0.06616366568262189, "grad_norm": 12.563306247392003, "learning_rate": 4.996786851209856e-05, "loss": 2.3918, "mean_token_accuracy": 0.42413792610168455, "step": 65690 }, { "epoch": 0.06616870173572606, "grad_norm": 11.160389155646747, "learning_rate": 4.9967848490388625e-05, "loss": 2.9554, "mean_token_accuracy": 0.36206896901130675, "step": 65695 }, { "epoch": 0.06617373778883023, "grad_norm": 10.799258024802246, "learning_rate": 4.996782846244714e-05, "loss": 3.0101, "mean_token_accuracy": 0.3413793116807938, "step": 65700 }, { "epoch": 0.06617877384193441, "grad_norm": 11.974176905365884, "learning_rate": 4.996780842827412e-05, "loss": 2.9939, "mean_token_accuracy": 0.36896551847457887, "step": 65705 }, { "epoch": 0.06618380989503858, "grad_norm": 11.663029508529444, "learning_rate": 4.996778838786956e-05, "loss": 2.8163, "mean_token_accuracy": 0.3896551728248596, "step": 65710 }, { "epoch": 0.06618884594814275, "grad_norm": 10.644703937730162, "learning_rate": 4.996776834123348e-05, "loss": 2.6914, "mean_token_accuracy": 0.37731397449970244, "step": 65715 }, { "epoch": 0.06619388200124693, "grad_norm": 11.927257307523423, "learning_rate": 4.9967748288365886e-05, "loss": 2.3882, "mean_token_accuracy": 0.46551724076271056, "step": 65720 }, { "epoch": 0.0661989180543511, "grad_norm": 10.737075136749617, "learning_rate": 4.9967728229266776e-05, "loss": 3.602, "mean_token_accuracy": 0.279310342669487, "step": 65725 }, { "epoch": 0.06620395410745528, "grad_norm": 10.917126384689633, "learning_rate": 4.9967708163936154e-05, "loss": 2.5821, "mean_token_accuracy": 0.4068965494632721, "step": 65730 }, { "epoch": 0.06620899016055945, "grad_norm": 11.006160497014983, "learning_rate": 4.996768809237404e-05, "loss": 2.6702, "mean_token_accuracy": 0.43103448748588563, "step": 65735 }, { "epoch": 0.06621402621366362, "grad_norm": 17.033276816009113, "learning_rate": 4.996766801458042e-05, "loss": 2.6449, "mean_token_accuracy": 0.358620685338974, "step": 65740 }, { "epoch": 0.0662190622667678, "grad_norm": 14.323147274450582, "learning_rate": 4.99676479305553e-05, "loss": 2.6891, "mean_token_accuracy": 0.358620685338974, "step": 65745 }, { "epoch": 0.06622409831987196, "grad_norm": 12.353574921814861, "learning_rate": 4.996762784029871e-05, "loss": 2.7006, "mean_token_accuracy": 0.3620689630508423, "step": 65750 }, { "epoch": 0.06622913437297613, "grad_norm": 13.647310163335069, "learning_rate": 4.996760774381064e-05, "loss": 2.6207, "mean_token_accuracy": 0.4, "step": 65755 }, { "epoch": 0.0662341704260803, "grad_norm": 10.859201021842837, "learning_rate": 4.9967587641091086e-05, "loss": 2.6595, "mean_token_accuracy": 0.4310344696044922, "step": 65760 }, { "epoch": 0.06623920647918448, "grad_norm": 12.52026717261555, "learning_rate": 4.9967567532140056e-05, "loss": 2.5308, "mean_token_accuracy": 0.4068965554237366, "step": 65765 }, { "epoch": 0.06624424253228865, "grad_norm": 13.471069302938337, "learning_rate": 4.996754741695757e-05, "loss": 2.5951, "mean_token_accuracy": 0.37241379618644715, "step": 65770 }, { "epoch": 0.06624927858539283, "grad_norm": 10.489732797141977, "learning_rate": 4.996752729554364e-05, "loss": 2.5884, "mean_token_accuracy": 0.4172413766384125, "step": 65775 }, { "epoch": 0.066254314638497, "grad_norm": 9.12093494395487, "learning_rate": 4.996750716789824e-05, "loss": 2.3982, "mean_token_accuracy": 0.42413792610168455, "step": 65780 }, { "epoch": 0.06625935069160117, "grad_norm": 12.375699342483061, "learning_rate": 4.9967487034021393e-05, "loss": 2.356, "mean_token_accuracy": 0.42758620381355283, "step": 65785 }, { "epoch": 0.06626438674470535, "grad_norm": 11.694837723731352, "learning_rate": 4.99674668939131e-05, "loss": 2.24, "mean_token_accuracy": 0.4344827473163605, "step": 65790 }, { "epoch": 0.06626942279780952, "grad_norm": 12.398774857071942, "learning_rate": 4.9967446747573395e-05, "loss": 2.5127, "mean_token_accuracy": 0.40992135405540464, "step": 65795 }, { "epoch": 0.0662744588509137, "grad_norm": 10.890134969853603, "learning_rate": 4.9967426595002234e-05, "loss": 2.3732, "mean_token_accuracy": 0.3965517282485962, "step": 65800 }, { "epoch": 0.06627949490401787, "grad_norm": 12.002805868490261, "learning_rate": 4.996740643619967e-05, "loss": 2.4098, "mean_token_accuracy": 0.4517241418361664, "step": 65805 }, { "epoch": 0.06628453095712204, "grad_norm": 13.332440332637358, "learning_rate": 4.996738627116567e-05, "loss": 2.6436, "mean_token_accuracy": 0.38965516686439516, "step": 65810 }, { "epoch": 0.06628956701022622, "grad_norm": 10.309022170250916, "learning_rate": 4.996736609990027e-05, "loss": 2.9232, "mean_token_accuracy": 0.3862069010734558, "step": 65815 }, { "epoch": 0.06629460306333038, "grad_norm": 13.323531585741378, "learning_rate": 4.996734592240346e-05, "loss": 2.7114, "mean_token_accuracy": 0.39310344457626345, "step": 65820 }, { "epoch": 0.06629963911643455, "grad_norm": 10.925388578891596, "learning_rate": 4.996732573867524e-05, "loss": 2.2188, "mean_token_accuracy": 0.4586206912994385, "step": 65825 }, { "epoch": 0.06630467516953872, "grad_norm": 11.204509515647834, "learning_rate": 4.9967305548715626e-05, "loss": 2.6651, "mean_token_accuracy": 0.38965516686439516, "step": 65830 }, { "epoch": 0.0663097112226429, "grad_norm": 10.618147065907145, "learning_rate": 4.996728535252462e-05, "loss": 2.6012, "mean_token_accuracy": 0.42413793206214906, "step": 65835 }, { "epoch": 0.06631474727574707, "grad_norm": 11.183392959370348, "learning_rate": 4.996726515010223e-05, "loss": 2.4456, "mean_token_accuracy": 0.42413793206214906, "step": 65840 }, { "epoch": 0.06631978332885125, "grad_norm": 13.765901532246167, "learning_rate": 4.996724494144846e-05, "loss": 2.5826, "mean_token_accuracy": 0.4103448212146759, "step": 65845 }, { "epoch": 0.06632481938195542, "grad_norm": 11.380528605699602, "learning_rate": 4.996722472656332e-05, "loss": 2.1582, "mean_token_accuracy": 0.4896551728248596, "step": 65850 }, { "epoch": 0.06632985543505959, "grad_norm": 14.630692242046269, "learning_rate": 4.99672045054468e-05, "loss": 2.5606, "mean_token_accuracy": 0.4551724135875702, "step": 65855 }, { "epoch": 0.06633489148816377, "grad_norm": 10.774751331395201, "learning_rate": 4.9967184278098936e-05, "loss": 2.7397, "mean_token_accuracy": 0.38965516686439516, "step": 65860 }, { "epoch": 0.06633992754126794, "grad_norm": 15.087119219723876, "learning_rate": 4.99671640445197e-05, "loss": 2.7007, "mean_token_accuracy": 0.3931034505367279, "step": 65865 }, { "epoch": 0.06634496359437211, "grad_norm": 11.557235178390245, "learning_rate": 4.996714380470912e-05, "loss": 2.5228, "mean_token_accuracy": 0.4034482777118683, "step": 65870 }, { "epoch": 0.06634999964747629, "grad_norm": 13.472117220627453, "learning_rate": 4.996712355866718e-05, "loss": 2.2327, "mean_token_accuracy": 0.47084088921546935, "step": 65875 }, { "epoch": 0.06635503570058046, "grad_norm": 13.387657730666263, "learning_rate": 4.9967103306393914e-05, "loss": 2.6246, "mean_token_accuracy": 0.3896551728248596, "step": 65880 }, { "epoch": 0.06636007175368464, "grad_norm": 11.799704346306692, "learning_rate": 4.996708304788931e-05, "loss": 2.617, "mean_token_accuracy": 0.3827586233615875, "step": 65885 }, { "epoch": 0.0663651078067888, "grad_norm": 10.943174417674227, "learning_rate": 4.9967062783153376e-05, "loss": 2.5643, "mean_token_accuracy": 0.44137930274009707, "step": 65890 }, { "epoch": 0.06637014385989297, "grad_norm": 9.93561905902741, "learning_rate": 4.996704251218612e-05, "loss": 2.5709, "mean_token_accuracy": 0.36206896901130675, "step": 65895 }, { "epoch": 0.06637517991299714, "grad_norm": 11.33470499025213, "learning_rate": 4.996702223498754e-05, "loss": 2.4371, "mean_token_accuracy": 0.43103447556495667, "step": 65900 }, { "epoch": 0.06638021596610132, "grad_norm": 13.04370995573622, "learning_rate": 4.9967001951557654e-05, "loss": 2.6165, "mean_token_accuracy": 0.3620689630508423, "step": 65905 }, { "epoch": 0.06638525201920549, "grad_norm": 18.48936608924017, "learning_rate": 4.9966981661896454e-05, "loss": 2.6098, "mean_token_accuracy": 0.4517241418361664, "step": 65910 }, { "epoch": 0.06639028807230966, "grad_norm": 14.3094350939331, "learning_rate": 4.996696136600396e-05, "loss": 2.4373, "mean_token_accuracy": 0.4034482777118683, "step": 65915 }, { "epoch": 0.06639532412541384, "grad_norm": 12.793646615269228, "learning_rate": 4.9966941063880165e-05, "loss": 2.2073, "mean_token_accuracy": 0.4551724076271057, "step": 65920 }, { "epoch": 0.06640036017851801, "grad_norm": 13.487501468303014, "learning_rate": 4.996692075552508e-05, "loss": 2.5007, "mean_token_accuracy": 0.42758620381355283, "step": 65925 }, { "epoch": 0.06640539623162219, "grad_norm": 11.656551067846463, "learning_rate": 4.996690044093871e-05, "loss": 2.6905, "mean_token_accuracy": 0.36206896901130675, "step": 65930 }, { "epoch": 0.06641043228472636, "grad_norm": 11.114755804889356, "learning_rate": 4.9966880120121066e-05, "loss": 2.4705, "mean_token_accuracy": 0.37931033968925476, "step": 65935 }, { "epoch": 0.06641546833783053, "grad_norm": 9.980562101917032, "learning_rate": 4.996685979307214e-05, "loss": 2.4551, "mean_token_accuracy": 0.4309113323688507, "step": 65940 }, { "epoch": 0.0664205043909347, "grad_norm": 10.90274938301233, "learning_rate": 4.996683945979196e-05, "loss": 2.4641, "mean_token_accuracy": 0.4413793087005615, "step": 65945 }, { "epoch": 0.06642554044403888, "grad_norm": 10.398303642904503, "learning_rate": 4.996681912028051e-05, "loss": 2.2057, "mean_token_accuracy": 0.4379310250282288, "step": 65950 }, { "epoch": 0.06643057649714304, "grad_norm": 10.7211467870642, "learning_rate": 4.9966798774537815e-05, "loss": 2.1404, "mean_token_accuracy": 0.4310344815254211, "step": 65955 }, { "epoch": 0.06643561255024721, "grad_norm": 13.147318041123757, "learning_rate": 4.996677842256386e-05, "loss": 2.5036, "mean_token_accuracy": 0.403448274731636, "step": 65960 }, { "epoch": 0.06644064860335139, "grad_norm": 13.727520547328762, "learning_rate": 4.996675806435866e-05, "loss": 2.3904, "mean_token_accuracy": 0.4363581418991089, "step": 65965 }, { "epoch": 0.06644568465645556, "grad_norm": 9.955144829553648, "learning_rate": 4.9966737699922215e-05, "loss": 2.4317, "mean_token_accuracy": 0.4137930989265442, "step": 65970 }, { "epoch": 0.06645072070955974, "grad_norm": 12.658262905199003, "learning_rate": 4.9966717329254545e-05, "loss": 2.3449, "mean_token_accuracy": 0.42413793206214906, "step": 65975 }, { "epoch": 0.06645575676266391, "grad_norm": 11.025214466267453, "learning_rate": 4.9966696952355646e-05, "loss": 2.8289, "mean_token_accuracy": 0.3655172437429428, "step": 65980 }, { "epoch": 0.06646079281576808, "grad_norm": 14.432668410039296, "learning_rate": 4.9966676569225526e-05, "loss": 2.545, "mean_token_accuracy": 0.4344827592372894, "step": 65985 }, { "epoch": 0.06646582886887226, "grad_norm": 10.853109190891788, "learning_rate": 4.9966656179864184e-05, "loss": 2.3786, "mean_token_accuracy": 0.4, "step": 65990 }, { "epoch": 0.06647086492197643, "grad_norm": 12.032013677507848, "learning_rate": 4.9966635784271634e-05, "loss": 2.9064, "mean_token_accuracy": 0.3793103456497192, "step": 65995 }, { "epoch": 0.0664759009750806, "grad_norm": 12.127793305778987, "learning_rate": 4.996661538244788e-05, "loss": 3.0199, "mean_token_accuracy": 0.32758620381355286, "step": 66000 }, { "epoch": 0.06648093702818478, "grad_norm": 11.304633570665953, "learning_rate": 4.996659497439293e-05, "loss": 2.3188, "mean_token_accuracy": 0.45033273100852966, "step": 66005 }, { "epoch": 0.06648597308128895, "grad_norm": 12.688480046994256, "learning_rate": 4.996657456010678e-05, "loss": 2.5242, "mean_token_accuracy": 0.4137930989265442, "step": 66010 }, { "epoch": 0.06649100913439313, "grad_norm": 13.583451222265722, "learning_rate": 4.996655413958944e-05, "loss": 2.609, "mean_token_accuracy": 0.3931034505367279, "step": 66015 }, { "epoch": 0.0664960451874973, "grad_norm": 10.364487326804996, "learning_rate": 4.9966533712840926e-05, "loss": 2.2668, "mean_token_accuracy": 0.4344827592372894, "step": 66020 }, { "epoch": 0.06650108124060146, "grad_norm": 13.992668736757036, "learning_rate": 4.996651327986123e-05, "loss": 2.7204, "mean_token_accuracy": 0.37241379022598264, "step": 66025 }, { "epoch": 0.06650611729370563, "grad_norm": 11.473504692004495, "learning_rate": 4.996649284065036e-05, "loss": 2.934, "mean_token_accuracy": 0.3586206793785095, "step": 66030 }, { "epoch": 0.0665111533468098, "grad_norm": 9.878126237242672, "learning_rate": 4.996647239520833e-05, "loss": 2.1597, "mean_token_accuracy": 0.48275862336158754, "step": 66035 }, { "epoch": 0.06651618939991398, "grad_norm": 10.292787453798343, "learning_rate": 4.9966451943535136e-05, "loss": 2.9654, "mean_token_accuracy": 0.34827586114406583, "step": 66040 }, { "epoch": 0.06652122545301815, "grad_norm": 9.83307573248238, "learning_rate": 4.996643148563079e-05, "loss": 2.3712, "mean_token_accuracy": 0.44827585816383364, "step": 66045 }, { "epoch": 0.06652626150612233, "grad_norm": 14.316953108209084, "learning_rate": 4.99664110214953e-05, "loss": 2.5468, "mean_token_accuracy": 0.382758629322052, "step": 66050 }, { "epoch": 0.0665312975592265, "grad_norm": 13.42775053662216, "learning_rate": 4.996639055112866e-05, "loss": 2.4322, "mean_token_accuracy": 0.39310344457626345, "step": 66055 }, { "epoch": 0.06653633361233068, "grad_norm": 10.224744925443344, "learning_rate": 4.996637007453088e-05, "loss": 2.3002, "mean_token_accuracy": 0.42758620381355283, "step": 66060 }, { "epoch": 0.06654136966543485, "grad_norm": 9.919176088618395, "learning_rate": 4.996634959170198e-05, "loss": 2.4556, "mean_token_accuracy": 0.4068965554237366, "step": 66065 }, { "epoch": 0.06654640571853902, "grad_norm": 10.939818057880961, "learning_rate": 4.996632910264194e-05, "loss": 2.3508, "mean_token_accuracy": 0.42413793206214906, "step": 66070 }, { "epoch": 0.0665514417716432, "grad_norm": 10.177822217083444, "learning_rate": 4.9966308607350784e-05, "loss": 2.3622, "mean_token_accuracy": 0.4448275864124298, "step": 66075 }, { "epoch": 0.06655647782474737, "grad_norm": 12.486180782006812, "learning_rate": 4.996628810582852e-05, "loss": 2.4, "mean_token_accuracy": 0.42220205068588257, "step": 66080 }, { "epoch": 0.06656151387785154, "grad_norm": 11.43461385909063, "learning_rate": 4.9966267598075144e-05, "loss": 2.6108, "mean_token_accuracy": 0.4206896543502808, "step": 66085 }, { "epoch": 0.06656654993095572, "grad_norm": 18.720697106714304, "learning_rate": 4.9966247084090664e-05, "loss": 2.7222, "mean_token_accuracy": 0.42068964838981626, "step": 66090 }, { "epoch": 0.06657158598405988, "grad_norm": 13.76627800060079, "learning_rate": 4.9966226563875085e-05, "loss": 2.788, "mean_token_accuracy": 0.3827586233615875, "step": 66095 }, { "epoch": 0.06657662203716405, "grad_norm": 10.800804864620961, "learning_rate": 4.996620603742842e-05, "loss": 2.6495, "mean_token_accuracy": 0.3931034505367279, "step": 66100 }, { "epoch": 0.06658165809026823, "grad_norm": 10.338340693017049, "learning_rate": 4.996618550475066e-05, "loss": 2.2443, "mean_token_accuracy": 0.4379310369491577, "step": 66105 }, { "epoch": 0.0665866941433724, "grad_norm": 10.496507351196147, "learning_rate": 4.9966164965841825e-05, "loss": 2.3545, "mean_token_accuracy": 0.4533013582229614, "step": 66110 }, { "epoch": 0.06659173019647657, "grad_norm": 11.211391242286485, "learning_rate": 4.9966144420701915e-05, "loss": 2.6675, "mean_token_accuracy": 0.4068965554237366, "step": 66115 }, { "epoch": 0.06659676624958075, "grad_norm": 10.654729424699223, "learning_rate": 4.996612386933094e-05, "loss": 2.4456, "mean_token_accuracy": 0.41034482717514037, "step": 66120 }, { "epoch": 0.06660180230268492, "grad_norm": 12.127180584455239, "learning_rate": 4.99661033117289e-05, "loss": 3.0584, "mean_token_accuracy": 0.3310344755649567, "step": 66125 }, { "epoch": 0.0666068383557891, "grad_norm": 13.424407290936784, "learning_rate": 4.996608274789579e-05, "loss": 2.7523, "mean_token_accuracy": 0.3793103456497192, "step": 66130 }, { "epoch": 0.06661187440889327, "grad_norm": 10.42645562022817, "learning_rate": 4.996606217783164e-05, "loss": 2.8638, "mean_token_accuracy": 0.41724138259887694, "step": 66135 }, { "epoch": 0.06661691046199744, "grad_norm": 10.311070855724283, "learning_rate": 4.996604160153645e-05, "loss": 2.4923, "mean_token_accuracy": 0.4551724076271057, "step": 66140 }, { "epoch": 0.06662194651510162, "grad_norm": 11.742212835403546, "learning_rate": 4.996602101901021e-05, "loss": 2.6908, "mean_token_accuracy": 0.4379310429096222, "step": 66145 }, { "epoch": 0.06662698256820579, "grad_norm": 10.567838622035188, "learning_rate": 4.9966000430252936e-05, "loss": 2.1252, "mean_token_accuracy": 0.49655172824859617, "step": 66150 }, { "epoch": 0.06663201862130996, "grad_norm": 10.782181528480487, "learning_rate": 4.996597983526463e-05, "loss": 2.4665, "mean_token_accuracy": 0.3862069010734558, "step": 66155 }, { "epoch": 0.06663705467441414, "grad_norm": 12.511544159229862, "learning_rate": 4.9965959234045304e-05, "loss": 2.3473, "mean_token_accuracy": 0.3931034505367279, "step": 66160 }, { "epoch": 0.0666420907275183, "grad_norm": 17.61815705317741, "learning_rate": 4.996593862659497e-05, "loss": 3.5867, "mean_token_accuracy": 0.28965516984462736, "step": 66165 }, { "epoch": 0.06664712678062247, "grad_norm": 11.248825848364577, "learning_rate": 4.996591801291361e-05, "loss": 2.3255, "mean_token_accuracy": 0.38965516686439516, "step": 66170 }, { "epoch": 0.06665216283372664, "grad_norm": 11.311981066189935, "learning_rate": 4.996589739300125e-05, "loss": 2.4769, "mean_token_accuracy": 0.3841500341892242, "step": 66175 }, { "epoch": 0.06665719888683082, "grad_norm": 11.221232385959333, "learning_rate": 4.996587676685789e-05, "loss": 2.2453, "mean_token_accuracy": 0.4379310369491577, "step": 66180 }, { "epoch": 0.06666223493993499, "grad_norm": 11.94160116416949, "learning_rate": 4.996585613448353e-05, "loss": 2.4141, "mean_token_accuracy": 0.43448275327682495, "step": 66185 }, { "epoch": 0.06666727099303917, "grad_norm": 10.786340701858007, "learning_rate": 4.9965835495878184e-05, "loss": 2.4601, "mean_token_accuracy": 0.41034482717514037, "step": 66190 }, { "epoch": 0.06667230704614334, "grad_norm": 11.053909988506982, "learning_rate": 4.996581485104185e-05, "loss": 2.5627, "mean_token_accuracy": 0.42758620977401735, "step": 66195 }, { "epoch": 0.06667734309924751, "grad_norm": 10.981913019405885, "learning_rate": 4.996579419997455e-05, "loss": 2.602, "mean_token_accuracy": 0.42068964838981626, "step": 66200 }, { "epoch": 0.06668237915235169, "grad_norm": 12.293638032973194, "learning_rate": 4.996577354267628e-05, "loss": 2.5349, "mean_token_accuracy": 0.4068965554237366, "step": 66205 }, { "epoch": 0.06668741520545586, "grad_norm": 13.557173867472162, "learning_rate": 4.9965752879147026e-05, "loss": 2.7926, "mean_token_accuracy": 0.37241379022598264, "step": 66210 }, { "epoch": 0.06669245125856003, "grad_norm": 11.377244170566494, "learning_rate": 4.9965732209386825e-05, "loss": 2.6472, "mean_token_accuracy": 0.39310344457626345, "step": 66215 }, { "epoch": 0.06669748731166421, "grad_norm": 11.347822761402815, "learning_rate": 4.9965711533395666e-05, "loss": 2.4998, "mean_token_accuracy": 0.3862068891525269, "step": 66220 }, { "epoch": 0.06670252336476838, "grad_norm": 11.890712740848503, "learning_rate": 4.9965690851173564e-05, "loss": 2.5737, "mean_token_accuracy": 0.4, "step": 66225 }, { "epoch": 0.06670755941787256, "grad_norm": 16.643008433513096, "learning_rate": 4.996567016272051e-05, "loss": 2.4979, "mean_token_accuracy": 0.34827586114406583, "step": 66230 }, { "epoch": 0.06671259547097672, "grad_norm": 10.197193658359037, "learning_rate": 4.996564946803652e-05, "loss": 2.1559, "mean_token_accuracy": 0.4482758641242981, "step": 66235 }, { "epoch": 0.06671763152408089, "grad_norm": 12.737640907976207, "learning_rate": 4.99656287671216e-05, "loss": 2.4784, "mean_token_accuracy": 0.45517241954803467, "step": 66240 }, { "epoch": 0.06672266757718506, "grad_norm": 12.8756444598531, "learning_rate": 4.996560805997576e-05, "loss": 2.4792, "mean_token_accuracy": 0.46896551847457885, "step": 66245 }, { "epoch": 0.06672770363028924, "grad_norm": 15.651209179632009, "learning_rate": 4.996558734659899e-05, "loss": 2.3762, "mean_token_accuracy": 0.4068965554237366, "step": 66250 }, { "epoch": 0.06673273968339341, "grad_norm": 13.677928631192895, "learning_rate": 4.9965566626991306e-05, "loss": 2.8615, "mean_token_accuracy": 0.404839688539505, "step": 66255 }, { "epoch": 0.06673777573649758, "grad_norm": 9.645667872175167, "learning_rate": 4.996554590115272e-05, "loss": 2.436, "mean_token_accuracy": 0.4121597111225128, "step": 66260 }, { "epoch": 0.06674281178960176, "grad_norm": 9.708286297232284, "learning_rate": 4.996552516908322e-05, "loss": 2.4455, "mean_token_accuracy": 0.47352216243743894, "step": 66265 }, { "epoch": 0.06674784784270593, "grad_norm": 10.765000699298964, "learning_rate": 4.9965504430782836e-05, "loss": 2.4801, "mean_token_accuracy": 0.4379310369491577, "step": 66270 }, { "epoch": 0.0667528838958101, "grad_norm": 13.248726752845258, "learning_rate": 4.9965483686251553e-05, "loss": 2.9254, "mean_token_accuracy": 0.37931033968925476, "step": 66275 }, { "epoch": 0.06675791994891428, "grad_norm": 12.099536990807216, "learning_rate": 4.996546293548939e-05, "loss": 2.6196, "mean_token_accuracy": 0.38620689511299133, "step": 66280 }, { "epoch": 0.06676295600201845, "grad_norm": 11.33384583693756, "learning_rate": 4.996544217849634e-05, "loss": 2.4398, "mean_token_accuracy": 0.39310344457626345, "step": 66285 }, { "epoch": 0.06676799205512263, "grad_norm": 10.929501978455228, "learning_rate": 4.996542141527242e-05, "loss": 2.0183, "mean_token_accuracy": 0.46206897497177124, "step": 66290 }, { "epoch": 0.0667730281082268, "grad_norm": 10.74936858062237, "learning_rate": 4.996540064581763e-05, "loss": 2.4025, "mean_token_accuracy": 0.4586206912994385, "step": 66295 }, { "epoch": 0.06677806416133097, "grad_norm": 11.388001539917504, "learning_rate": 4.996537987013198e-05, "loss": 2.4409, "mean_token_accuracy": 0.41554749608039854, "step": 66300 }, { "epoch": 0.06678310021443513, "grad_norm": 13.040340411719935, "learning_rate": 4.996535908821546e-05, "loss": 2.5849, "mean_token_accuracy": 0.38965516686439516, "step": 66305 }, { "epoch": 0.06678813626753931, "grad_norm": 10.07712603774365, "learning_rate": 4.996533830006811e-05, "loss": 2.6262, "mean_token_accuracy": 0.43793103098869324, "step": 66310 }, { "epoch": 0.06679317232064348, "grad_norm": 14.504661389094485, "learning_rate": 4.99653175056899e-05, "loss": 2.3005, "mean_token_accuracy": 0.43103448748588563, "step": 66315 }, { "epoch": 0.06679820837374766, "grad_norm": 12.810916479184415, "learning_rate": 4.9965296705080854e-05, "loss": 2.6076, "mean_token_accuracy": 0.39310344457626345, "step": 66320 }, { "epoch": 0.06680324442685183, "grad_norm": 12.430741989260607, "learning_rate": 4.9965275898240974e-05, "loss": 2.271, "mean_token_accuracy": 0.4517241358757019, "step": 66325 }, { "epoch": 0.066808280479956, "grad_norm": 10.913284868433577, "learning_rate": 4.996525508517027e-05, "loss": 2.6121, "mean_token_accuracy": 0.43793103098869324, "step": 66330 }, { "epoch": 0.06681331653306018, "grad_norm": 10.560775268068141, "learning_rate": 4.996523426586874e-05, "loss": 2.7409, "mean_token_accuracy": 0.34482758939266206, "step": 66335 }, { "epoch": 0.06681835258616435, "grad_norm": 11.953360432909303, "learning_rate": 4.99652134403364e-05, "loss": 2.4478, "mean_token_accuracy": 0.39655172228813174, "step": 66340 }, { "epoch": 0.06682338863926852, "grad_norm": 11.00197151173896, "learning_rate": 4.996519260857324e-05, "loss": 2.0902, "mean_token_accuracy": 0.45904416441917417, "step": 66345 }, { "epoch": 0.0668284246923727, "grad_norm": 12.781750319426335, "learning_rate": 4.996517177057928e-05, "loss": 2.6293, "mean_token_accuracy": 0.36551723480224607, "step": 66350 }, { "epoch": 0.06683346074547687, "grad_norm": 14.00670925132052, "learning_rate": 4.996515092635452e-05, "loss": 2.3557, "mean_token_accuracy": 0.42758620381355283, "step": 66355 }, { "epoch": 0.06683849679858105, "grad_norm": 12.518401487653934, "learning_rate": 4.9965130075898966e-05, "loss": 2.4733, "mean_token_accuracy": 0.38620689511299133, "step": 66360 }, { "epoch": 0.06684353285168522, "grad_norm": 11.078366028732237, "learning_rate": 4.996510921921262e-05, "loss": 2.2695, "mean_token_accuracy": 0.4655172348022461, "step": 66365 }, { "epoch": 0.0668485689047894, "grad_norm": 12.282512262483078, "learning_rate": 4.99650883562955e-05, "loss": 2.4037, "mean_token_accuracy": 0.4068965494632721, "step": 66370 }, { "epoch": 0.06685360495789355, "grad_norm": 12.267136676654756, "learning_rate": 4.99650674871476e-05, "loss": 2.8638, "mean_token_accuracy": 0.34482758343219755, "step": 66375 }, { "epoch": 0.06685864101099773, "grad_norm": 12.057096154087924, "learning_rate": 4.996504661176893e-05, "loss": 2.6268, "mean_token_accuracy": 0.43448275327682495, "step": 66380 }, { "epoch": 0.0668636770641019, "grad_norm": 21.9234800827101, "learning_rate": 4.99650257301595e-05, "loss": 3.0769, "mean_token_accuracy": 0.36551723778247835, "step": 66385 }, { "epoch": 0.06686871311720607, "grad_norm": 12.252119370924136, "learning_rate": 4.9965004842319295e-05, "loss": 2.4356, "mean_token_accuracy": 0.4, "step": 66390 }, { "epoch": 0.06687374917031025, "grad_norm": 10.288089197958827, "learning_rate": 4.996498394824836e-05, "loss": 2.3763, "mean_token_accuracy": 0.4241379380226135, "step": 66395 }, { "epoch": 0.06687878522341442, "grad_norm": 10.0161835604596, "learning_rate": 4.996496304794666e-05, "loss": 2.4996, "mean_token_accuracy": 0.42068966031074523, "step": 66400 }, { "epoch": 0.0668838212765186, "grad_norm": 11.704628900208423, "learning_rate": 4.9964942141414226e-05, "loss": 2.6656, "mean_token_accuracy": 0.3862068891525269, "step": 66405 }, { "epoch": 0.06688885732962277, "grad_norm": 12.267536675916393, "learning_rate": 4.996492122865106e-05, "loss": 3.003, "mean_token_accuracy": 0.37241379618644715, "step": 66410 }, { "epoch": 0.06689389338272694, "grad_norm": 13.519538905654098, "learning_rate": 4.996490030965716e-05, "loss": 2.6373, "mean_token_accuracy": 0.3793103456497192, "step": 66415 }, { "epoch": 0.06689892943583112, "grad_norm": 15.616310525688995, "learning_rate": 4.996487938443253e-05, "loss": 2.8172, "mean_token_accuracy": 0.37241379618644715, "step": 66420 }, { "epoch": 0.06690396548893529, "grad_norm": 12.845896272408565, "learning_rate": 4.996485845297719e-05, "loss": 3.042, "mean_token_accuracy": 0.4068965494632721, "step": 66425 }, { "epoch": 0.06690900154203946, "grad_norm": 14.504624316100706, "learning_rate": 4.996483751529113e-05, "loss": 3.115, "mean_token_accuracy": 0.2931034415960312, "step": 66430 }, { "epoch": 0.06691403759514364, "grad_norm": 15.229301324578802, "learning_rate": 4.996481657137437e-05, "loss": 2.5941, "mean_token_accuracy": 0.42413792610168455, "step": 66435 }, { "epoch": 0.06691907364824781, "grad_norm": 13.10067009100101, "learning_rate": 4.996479562122692e-05, "loss": 2.5921, "mean_token_accuracy": 0.42413792610168455, "step": 66440 }, { "epoch": 0.06692410970135197, "grad_norm": 11.70337096649754, "learning_rate": 4.996477466484876e-05, "loss": 2.4948, "mean_token_accuracy": 0.4748422861099243, "step": 66445 }, { "epoch": 0.06692914575445615, "grad_norm": 12.453526372118437, "learning_rate": 4.996475370223991e-05, "loss": 2.7174, "mean_token_accuracy": 0.37931033968925476, "step": 66450 }, { "epoch": 0.06693418180756032, "grad_norm": 11.38317314700908, "learning_rate": 4.996473273340039e-05, "loss": 2.5292, "mean_token_accuracy": 0.42758620977401735, "step": 66455 }, { "epoch": 0.0669392178606645, "grad_norm": 10.416058309859338, "learning_rate": 4.9964711758330176e-05, "loss": 2.4109, "mean_token_accuracy": 0.4172413766384125, "step": 66460 }, { "epoch": 0.06694425391376867, "grad_norm": 14.971705524172318, "learning_rate": 4.9964690777029305e-05, "loss": 2.4989, "mean_token_accuracy": 0.41379310488700866, "step": 66465 }, { "epoch": 0.06694928996687284, "grad_norm": 11.108388651801024, "learning_rate": 4.996466978949776e-05, "loss": 2.2532, "mean_token_accuracy": 0.47586206197738645, "step": 66470 }, { "epoch": 0.06695432601997701, "grad_norm": 10.237561659161601, "learning_rate": 4.996464879573555e-05, "loss": 2.7172, "mean_token_accuracy": 0.4137930989265442, "step": 66475 }, { "epoch": 0.06695936207308119, "grad_norm": 17.021811522446626, "learning_rate": 4.9964627795742697e-05, "loss": 2.3248, "mean_token_accuracy": 0.4189352750778198, "step": 66480 }, { "epoch": 0.06696439812618536, "grad_norm": 9.494988672117511, "learning_rate": 4.996460678951919e-05, "loss": 2.4786, "mean_token_accuracy": 0.4379310369491577, "step": 66485 }, { "epoch": 0.06696943417928954, "grad_norm": 10.190426798938667, "learning_rate": 4.996458577706504e-05, "loss": 2.2563, "mean_token_accuracy": 0.44482758045196535, "step": 66490 }, { "epoch": 0.06697447023239371, "grad_norm": 10.32396335679019, "learning_rate": 4.996456475838026e-05, "loss": 2.3576, "mean_token_accuracy": 0.4068965494632721, "step": 66495 }, { "epoch": 0.06697950628549788, "grad_norm": 12.246758404889505, "learning_rate": 4.996454373346484e-05, "loss": 2.6409, "mean_token_accuracy": 0.4517241418361664, "step": 66500 }, { "epoch": 0.06698454233860206, "grad_norm": 12.211494968754495, "learning_rate": 4.9964522702318804e-05, "loss": 2.4704, "mean_token_accuracy": 0.39310343861579894, "step": 66505 }, { "epoch": 0.06698957839170623, "grad_norm": 10.81631141158981, "learning_rate": 4.996450166494214e-05, "loss": 2.3867, "mean_token_accuracy": 0.4379310369491577, "step": 66510 }, { "epoch": 0.06699461444481039, "grad_norm": 12.97370473461869, "learning_rate": 4.996448062133487e-05, "loss": 2.7514, "mean_token_accuracy": 0.3931034505367279, "step": 66515 }, { "epoch": 0.06699965049791456, "grad_norm": 13.109195365246542, "learning_rate": 4.996445957149699e-05, "loss": 2.3204, "mean_token_accuracy": 0.4034482717514038, "step": 66520 }, { "epoch": 0.06700468655101874, "grad_norm": 14.052975182372732, "learning_rate": 4.996443851542851e-05, "loss": 2.6518, "mean_token_accuracy": 0.4344827592372894, "step": 66525 }, { "epoch": 0.06700972260412291, "grad_norm": 10.71664872937317, "learning_rate": 4.996441745312943e-05, "loss": 2.8489, "mean_token_accuracy": 0.3931034475564957, "step": 66530 }, { "epoch": 0.06701475865722709, "grad_norm": 11.155514931099361, "learning_rate": 4.996439638459976e-05, "loss": 2.3655, "mean_token_accuracy": 0.4, "step": 66535 }, { "epoch": 0.06701979471033126, "grad_norm": 12.101380216637022, "learning_rate": 4.9964375309839506e-05, "loss": 2.5079, "mean_token_accuracy": 0.4344827592372894, "step": 66540 }, { "epoch": 0.06702483076343543, "grad_norm": 10.833667075683776, "learning_rate": 4.996435422884868e-05, "loss": 2.9275, "mean_token_accuracy": 0.37931033968925476, "step": 66545 }, { "epoch": 0.06702986681653961, "grad_norm": 11.787569472731738, "learning_rate": 4.996433314162727e-05, "loss": 2.245, "mean_token_accuracy": 0.4551724076271057, "step": 66550 }, { "epoch": 0.06703490286964378, "grad_norm": 9.736350720648918, "learning_rate": 4.99643120481753e-05, "loss": 2.5392, "mean_token_accuracy": 0.39310344457626345, "step": 66555 }, { "epoch": 0.06703993892274795, "grad_norm": 9.950342161125144, "learning_rate": 4.9964290948492774e-05, "loss": 2.5868, "mean_token_accuracy": 0.4, "step": 66560 }, { "epoch": 0.06704497497585213, "grad_norm": 9.860318320968823, "learning_rate": 4.996426984257969e-05, "loss": 2.4557, "mean_token_accuracy": 0.4034482717514038, "step": 66565 }, { "epoch": 0.0670500110289563, "grad_norm": 9.906724235485221, "learning_rate": 4.996424873043605e-05, "loss": 2.453, "mean_token_accuracy": 0.43793103098869324, "step": 66570 }, { "epoch": 0.06705504708206048, "grad_norm": 11.00668052403052, "learning_rate": 4.996422761206187e-05, "loss": 2.7181, "mean_token_accuracy": 0.3827586114406586, "step": 66575 }, { "epoch": 0.06706008313516465, "grad_norm": 11.180226648486332, "learning_rate": 4.996420648745716e-05, "loss": 2.7871, "mean_token_accuracy": 0.41246217489242554, "step": 66580 }, { "epoch": 0.06706511918826881, "grad_norm": 10.75446951369793, "learning_rate": 4.9964185356621915e-05, "loss": 2.5292, "mean_token_accuracy": 0.36896551549434664, "step": 66585 }, { "epoch": 0.06707015524137298, "grad_norm": 11.80038901779949, "learning_rate": 4.9964164219556144e-05, "loss": 2.6273, "mean_token_accuracy": 0.3862069010734558, "step": 66590 }, { "epoch": 0.06707519129447716, "grad_norm": 10.243490797371122, "learning_rate": 4.9964143076259856e-05, "loss": 2.5812, "mean_token_accuracy": 0.4068965554237366, "step": 66595 }, { "epoch": 0.06708022734758133, "grad_norm": 12.626895299069702, "learning_rate": 4.996412192673305e-05, "loss": 2.3503, "mean_token_accuracy": 0.39854809641838074, "step": 66600 }, { "epoch": 0.0670852634006855, "grad_norm": 15.847229053232997, "learning_rate": 4.996410077097573e-05, "loss": 2.5768, "mean_token_accuracy": 0.41465516686439513, "step": 66605 }, { "epoch": 0.06709029945378968, "grad_norm": 12.822353886604263, "learning_rate": 4.996407960898792e-05, "loss": 2.3201, "mean_token_accuracy": 0.44482758045196535, "step": 66610 }, { "epoch": 0.06709533550689385, "grad_norm": 12.513690605522882, "learning_rate": 4.996405844076961e-05, "loss": 2.5141, "mean_token_accuracy": 0.4310344934463501, "step": 66615 }, { "epoch": 0.06710037155999803, "grad_norm": 9.548230026074894, "learning_rate": 4.9964037266320805e-05, "loss": 2.3701, "mean_token_accuracy": 0.41724138259887694, "step": 66620 }, { "epoch": 0.0671054076131022, "grad_norm": 14.078674194656493, "learning_rate": 4.996401608564151e-05, "loss": 2.989, "mean_token_accuracy": 0.4310344815254211, "step": 66625 }, { "epoch": 0.06711044366620637, "grad_norm": 10.81586433670949, "learning_rate": 4.996399489873175e-05, "loss": 2.1178, "mean_token_accuracy": 0.48275861144065857, "step": 66630 }, { "epoch": 0.06711547971931055, "grad_norm": 10.175057358263796, "learning_rate": 4.996397370559151e-05, "loss": 2.6213, "mean_token_accuracy": 0.39999999701976774, "step": 66635 }, { "epoch": 0.06712051577241472, "grad_norm": 11.448417905865412, "learning_rate": 4.9963952506220805e-05, "loss": 2.4597, "mean_token_accuracy": 0.42758620977401735, "step": 66640 }, { "epoch": 0.0671255518255189, "grad_norm": 11.49240490608862, "learning_rate": 4.9963931300619646e-05, "loss": 2.723, "mean_token_accuracy": 0.4103448212146759, "step": 66645 }, { "epoch": 0.06713058787862307, "grad_norm": 10.53276629750285, "learning_rate": 4.996391008878802e-05, "loss": 2.7349, "mean_token_accuracy": 0.37586206793785093, "step": 66650 }, { "epoch": 0.06713562393172723, "grad_norm": 11.263428559437706, "learning_rate": 4.996388887072595e-05, "loss": 2.9117, "mean_token_accuracy": 0.36551724672317504, "step": 66655 }, { "epoch": 0.0671406599848314, "grad_norm": 12.21368948762846, "learning_rate": 4.9963867646433435e-05, "loss": 2.6873, "mean_token_accuracy": 0.3689655214548111, "step": 66660 }, { "epoch": 0.06714569603793558, "grad_norm": 13.145141395210068, "learning_rate": 4.996384641591048e-05, "loss": 2.4459, "mean_token_accuracy": 0.45741077065467833, "step": 66665 }, { "epoch": 0.06715073209103975, "grad_norm": 11.066067423473065, "learning_rate": 4.99638251791571e-05, "loss": 2.0474, "mean_token_accuracy": 0.493103438615799, "step": 66670 }, { "epoch": 0.06715576814414392, "grad_norm": 11.58968755524119, "learning_rate": 4.996380393617329e-05, "loss": 2.8409, "mean_token_accuracy": 0.33448275923728943, "step": 66675 }, { "epoch": 0.0671608041972481, "grad_norm": 12.538509447157185, "learning_rate": 4.9963782686959063e-05, "loss": 2.4654, "mean_token_accuracy": 0.403448274731636, "step": 66680 }, { "epoch": 0.06716584025035227, "grad_norm": 10.789055215632521, "learning_rate": 4.996376143151442e-05, "loss": 2.2391, "mean_token_accuracy": 0.4448275864124298, "step": 66685 }, { "epoch": 0.06717087630345644, "grad_norm": 12.251237474054077, "learning_rate": 4.996374016983937e-05, "loss": 2.5378, "mean_token_accuracy": 0.4068965554237366, "step": 66690 }, { "epoch": 0.06717591235656062, "grad_norm": 12.392354397685262, "learning_rate": 4.9963718901933916e-05, "loss": 2.408, "mean_token_accuracy": 0.4206896543502808, "step": 66695 }, { "epoch": 0.06718094840966479, "grad_norm": 11.359841077316215, "learning_rate": 4.996369762779806e-05, "loss": 2.8094, "mean_token_accuracy": 0.37241379618644715, "step": 66700 }, { "epoch": 0.06718598446276897, "grad_norm": 11.907759345750748, "learning_rate": 4.9963676347431826e-05, "loss": 2.8483, "mean_token_accuracy": 0.3517241358757019, "step": 66705 }, { "epoch": 0.06719102051587314, "grad_norm": 13.269003915590288, "learning_rate": 4.996365506083521e-05, "loss": 2.5665, "mean_token_accuracy": 0.34137930870056155, "step": 66710 }, { "epoch": 0.06719605656897731, "grad_norm": 10.588641798089172, "learning_rate": 4.99636337680082e-05, "loss": 2.5978, "mean_token_accuracy": 0.41724138259887694, "step": 66715 }, { "epoch": 0.06720109262208149, "grad_norm": 13.209726674173757, "learning_rate": 4.996361246895083e-05, "loss": 2.3437, "mean_token_accuracy": 0.47586206793785096, "step": 66720 }, { "epoch": 0.06720612867518565, "grad_norm": 10.45269197069057, "learning_rate": 4.996359116366308e-05, "loss": 3.0883, "mean_token_accuracy": 0.38620689511299133, "step": 66725 }, { "epoch": 0.06721116472828982, "grad_norm": 15.104066406467888, "learning_rate": 4.996356985214498e-05, "loss": 2.9087, "mean_token_accuracy": 0.358620685338974, "step": 66730 }, { "epoch": 0.067216200781394, "grad_norm": 10.568726535010004, "learning_rate": 4.9963548534396524e-05, "loss": 2.296, "mean_token_accuracy": 0.42068966031074523, "step": 66735 }, { "epoch": 0.06722123683449817, "grad_norm": 15.407399788282733, "learning_rate": 4.996352721041771e-05, "loss": 2.4913, "mean_token_accuracy": 0.36551724672317504, "step": 66740 }, { "epoch": 0.06722627288760234, "grad_norm": 11.563819310911443, "learning_rate": 4.996350588020856e-05, "loss": 3.1188, "mean_token_accuracy": 0.37586207389831544, "step": 66745 }, { "epoch": 0.06723130894070652, "grad_norm": 8.280932933580422, "learning_rate": 4.996348454376908e-05, "loss": 2.0372, "mean_token_accuracy": 0.5119177162647247, "step": 66750 }, { "epoch": 0.06723634499381069, "grad_norm": 10.056772689633416, "learning_rate": 4.996346320109926e-05, "loss": 2.6026, "mean_token_accuracy": 0.3448275804519653, "step": 66755 }, { "epoch": 0.06724138104691486, "grad_norm": 17.317440650067866, "learning_rate": 4.996344185219912e-05, "loss": 2.6029, "mean_token_accuracy": 0.49122806191444396, "step": 66760 }, { "epoch": 0.06724641710001904, "grad_norm": 14.310388164368542, "learning_rate": 4.996342049706865e-05, "loss": 2.6977, "mean_token_accuracy": 0.4, "step": 66765 }, { "epoch": 0.06725145315312321, "grad_norm": 11.475248519525076, "learning_rate": 4.996339913570788e-05, "loss": 2.6605, "mean_token_accuracy": 0.324137932062149, "step": 66770 }, { "epoch": 0.06725648920622739, "grad_norm": 11.154919777060767, "learning_rate": 4.99633777681168e-05, "loss": 2.3222, "mean_token_accuracy": 0.35172412991523744, "step": 66775 }, { "epoch": 0.06726152525933156, "grad_norm": 10.471266830729128, "learning_rate": 4.9963356394295405e-05, "loss": 2.4061, "mean_token_accuracy": 0.4034482777118683, "step": 66780 }, { "epoch": 0.06726656131243573, "grad_norm": 11.769411481946213, "learning_rate": 4.9963335014243725e-05, "loss": 2.8421, "mean_token_accuracy": 0.36896551847457887, "step": 66785 }, { "epoch": 0.0672715973655399, "grad_norm": 14.786294494172749, "learning_rate": 4.996331362796176e-05, "loss": 3.183, "mean_token_accuracy": 0.37241379022598264, "step": 66790 }, { "epoch": 0.06727663341864407, "grad_norm": 12.393007473662973, "learning_rate": 4.9963292235449506e-05, "loss": 2.6695, "mean_token_accuracy": 0.41379311084747317, "step": 66795 }, { "epoch": 0.06728166947174824, "grad_norm": 9.854176477470542, "learning_rate": 4.996327083670697e-05, "loss": 2.3081, "mean_token_accuracy": 0.42068966031074523, "step": 66800 }, { "epoch": 0.06728670552485241, "grad_norm": 12.416715328291085, "learning_rate": 4.996324943173417e-05, "loss": 2.2217, "mean_token_accuracy": 0.4862068951129913, "step": 66805 }, { "epoch": 0.06729174157795659, "grad_norm": 14.681472221336847, "learning_rate": 4.9963228020531096e-05, "loss": 3.0274, "mean_token_accuracy": 0.3758620619773865, "step": 66810 }, { "epoch": 0.06729677763106076, "grad_norm": 18.141202716235053, "learning_rate": 4.9963206603097765e-05, "loss": 2.747, "mean_token_accuracy": 0.4068965494632721, "step": 66815 }, { "epoch": 0.06730181368416494, "grad_norm": 12.707231134026733, "learning_rate": 4.996318517943418e-05, "loss": 2.4685, "mean_token_accuracy": 0.3827586233615875, "step": 66820 }, { "epoch": 0.06730684973726911, "grad_norm": 10.896778525483665, "learning_rate": 4.996316374954035e-05, "loss": 2.6503, "mean_token_accuracy": 0.37241379618644715, "step": 66825 }, { "epoch": 0.06731188579037328, "grad_norm": 10.054716391023703, "learning_rate": 4.996314231341627e-05, "loss": 2.6947, "mean_token_accuracy": 0.37931033968925476, "step": 66830 }, { "epoch": 0.06731692184347746, "grad_norm": 12.300484822991066, "learning_rate": 4.9963120871061955e-05, "loss": 2.0499, "mean_token_accuracy": 0.4693349778652191, "step": 66835 }, { "epoch": 0.06732195789658163, "grad_norm": 14.69458882845443, "learning_rate": 4.996309942247742e-05, "loss": 2.682, "mean_token_accuracy": 0.417241370677948, "step": 66840 }, { "epoch": 0.0673269939496858, "grad_norm": 8.385039260412185, "learning_rate": 4.9963077967662656e-05, "loss": 2.6782, "mean_token_accuracy": 0.423048996925354, "step": 66845 }, { "epoch": 0.06733203000278998, "grad_norm": 12.11948566499924, "learning_rate": 4.9963056506617666e-05, "loss": 2.5017, "mean_token_accuracy": 0.42413792610168455, "step": 66850 }, { "epoch": 0.06733706605589415, "grad_norm": 9.867621708876474, "learning_rate": 4.996303503934246e-05, "loss": 2.2286, "mean_token_accuracy": 0.46049606800079346, "step": 66855 }, { "epoch": 0.06734210210899833, "grad_norm": 11.639781115886299, "learning_rate": 4.996301356583706e-05, "loss": 2.3547, "mean_token_accuracy": 0.48275862336158754, "step": 66860 }, { "epoch": 0.06734713816210249, "grad_norm": 10.70618330954457, "learning_rate": 4.996299208610146e-05, "loss": 2.8635, "mean_token_accuracy": 0.39655172228813174, "step": 66865 }, { "epoch": 0.06735217421520666, "grad_norm": 14.469362975647158, "learning_rate": 4.9962970600135664e-05, "loss": 2.7166, "mean_token_accuracy": 0.42068966031074523, "step": 66870 }, { "epoch": 0.06735721026831083, "grad_norm": 15.67853168768889, "learning_rate": 4.9962949107939675e-05, "loss": 2.8365, "mean_token_accuracy": 0.36896551847457887, "step": 66875 }, { "epoch": 0.067362246321415, "grad_norm": 14.190450839355602, "learning_rate": 4.996292760951351e-05, "loss": 2.675, "mean_token_accuracy": 0.37374470829963685, "step": 66880 }, { "epoch": 0.06736728237451918, "grad_norm": 12.119532691076238, "learning_rate": 4.996290610485716e-05, "loss": 2.2003, "mean_token_accuracy": 0.441379314661026, "step": 66885 }, { "epoch": 0.06737231842762335, "grad_norm": 9.517649264280843, "learning_rate": 4.996288459397064e-05, "loss": 2.7633, "mean_token_accuracy": 0.42413792610168455, "step": 66890 }, { "epoch": 0.06737735448072753, "grad_norm": 17.23643415957157, "learning_rate": 4.9962863076853964e-05, "loss": 2.7324, "mean_token_accuracy": 0.37241379618644715, "step": 66895 }, { "epoch": 0.0673823905338317, "grad_norm": 10.233719374832946, "learning_rate": 4.996284155350711e-05, "loss": 2.0902, "mean_token_accuracy": 0.4689655125141144, "step": 66900 }, { "epoch": 0.06738742658693588, "grad_norm": 16.110485862541438, "learning_rate": 4.996282002393012e-05, "loss": 3.0173, "mean_token_accuracy": 0.3448275804519653, "step": 66905 }, { "epoch": 0.06739246264004005, "grad_norm": 11.482682621697949, "learning_rate": 4.996279848812297e-05, "loss": 3.3195, "mean_token_accuracy": 0.2862068980932236, "step": 66910 }, { "epoch": 0.06739749869314422, "grad_norm": 11.966290626313109, "learning_rate": 4.9962776946085695e-05, "loss": 2.7014, "mean_token_accuracy": 0.36206896901130675, "step": 66915 }, { "epoch": 0.0674025347462484, "grad_norm": 10.83398369102492, "learning_rate": 4.996275539781827e-05, "loss": 2.5769, "mean_token_accuracy": 0.3793103456497192, "step": 66920 }, { "epoch": 0.06740757079935257, "grad_norm": 11.373673803082418, "learning_rate": 4.996273384332073e-05, "loss": 2.6891, "mean_token_accuracy": 0.37241379618644715, "step": 66925 }, { "epoch": 0.06741260685245674, "grad_norm": 16.64877335301928, "learning_rate": 4.996271228259306e-05, "loss": 2.6717, "mean_token_accuracy": 0.3999999940395355, "step": 66930 }, { "epoch": 0.0674176429055609, "grad_norm": 9.868919142182797, "learning_rate": 4.996269071563527e-05, "loss": 2.4018, "mean_token_accuracy": 0.4178571432828903, "step": 66935 }, { "epoch": 0.06742267895866508, "grad_norm": 10.737975148707518, "learning_rate": 4.996266914244738e-05, "loss": 2.5896, "mean_token_accuracy": 0.42758620977401735, "step": 66940 }, { "epoch": 0.06742771501176925, "grad_norm": 12.712598410094635, "learning_rate": 4.996264756302938e-05, "loss": 2.5282, "mean_token_accuracy": 0.39655171930789945, "step": 66945 }, { "epoch": 0.06743275106487343, "grad_norm": 11.47336961800442, "learning_rate": 4.996262597738127e-05, "loss": 3.1264, "mean_token_accuracy": 0.34137930870056155, "step": 66950 }, { "epoch": 0.0674377871179776, "grad_norm": 10.806450502524452, "learning_rate": 4.996260438550308e-05, "loss": 2.2778, "mean_token_accuracy": 0.41034482717514037, "step": 66955 }, { "epoch": 0.06744282317108177, "grad_norm": 12.390950227154349, "learning_rate": 4.996258278739479e-05, "loss": 2.6376, "mean_token_accuracy": 0.38620689511299133, "step": 66960 }, { "epoch": 0.06744785922418595, "grad_norm": 14.657661411706721, "learning_rate": 4.9962561183056425e-05, "loss": 2.8026, "mean_token_accuracy": 0.4206896543502808, "step": 66965 }, { "epoch": 0.06745289527729012, "grad_norm": 11.739278033800597, "learning_rate": 4.9962539572487995e-05, "loss": 2.4325, "mean_token_accuracy": 0.4275861978530884, "step": 66970 }, { "epoch": 0.0674579313303943, "grad_norm": 10.500762657481578, "learning_rate": 4.996251795568948e-05, "loss": 2.367, "mean_token_accuracy": 0.458620685338974, "step": 66975 }, { "epoch": 0.06746296738349847, "grad_norm": 12.998466386042733, "learning_rate": 4.996249633266091e-05, "loss": 2.8257, "mean_token_accuracy": 0.35862069129943847, "step": 66980 }, { "epoch": 0.06746800343660264, "grad_norm": 13.00314995925543, "learning_rate": 4.9962474703402286e-05, "loss": 2.562, "mean_token_accuracy": 0.4137930989265442, "step": 66985 }, { "epoch": 0.06747303948970682, "grad_norm": 15.29670955926325, "learning_rate": 4.996245306791361e-05, "loss": 2.3516, "mean_token_accuracy": 0.41724138259887694, "step": 66990 }, { "epoch": 0.06747807554281099, "grad_norm": 10.441484115550292, "learning_rate": 4.9962431426194886e-05, "loss": 2.3945, "mean_token_accuracy": 0.4068965494632721, "step": 66995 }, { "epoch": 0.06748311159591516, "grad_norm": 13.218105124046247, "learning_rate": 4.996240977824612e-05, "loss": 2.4046, "mean_token_accuracy": 0.41379310488700866, "step": 67000 }, { "epoch": 0.06748814764901932, "grad_norm": 25.754788445555292, "learning_rate": 4.9962388124067316e-05, "loss": 2.8148, "mean_token_accuracy": 0.3793103456497192, "step": 67005 }, { "epoch": 0.0674931837021235, "grad_norm": 11.208079478097277, "learning_rate": 4.996236646365849e-05, "loss": 2.3225, "mean_token_accuracy": 0.4517241358757019, "step": 67010 }, { "epoch": 0.06749821975522767, "grad_norm": 11.868801826532009, "learning_rate": 4.996234479701966e-05, "loss": 2.6438, "mean_token_accuracy": 0.3620689630508423, "step": 67015 }, { "epoch": 0.06750325580833184, "grad_norm": 15.23628387592478, "learning_rate": 4.996232312415079e-05, "loss": 2.8674, "mean_token_accuracy": 0.36896551251411436, "step": 67020 }, { "epoch": 0.06750829186143602, "grad_norm": 10.393696434561733, "learning_rate": 4.9962301445051924e-05, "loss": 2.3269, "mean_token_accuracy": 0.43103448748588563, "step": 67025 }, { "epoch": 0.06751332791454019, "grad_norm": 11.294359285267616, "learning_rate": 4.996227975972305e-05, "loss": 2.2557, "mean_token_accuracy": 0.43998790383338926, "step": 67030 }, { "epoch": 0.06751836396764437, "grad_norm": 22.42339184567104, "learning_rate": 4.996225806816418e-05, "loss": 2.4931, "mean_token_accuracy": 0.4261947929859161, "step": 67035 }, { "epoch": 0.06752340002074854, "grad_norm": 11.993048303486567, "learning_rate": 4.996223637037532e-05, "loss": 2.5109, "mean_token_accuracy": 0.42413792610168455, "step": 67040 }, { "epoch": 0.06752843607385271, "grad_norm": 12.485389464593876, "learning_rate": 4.996221466635647e-05, "loss": 2.5414, "mean_token_accuracy": 0.43103447556495667, "step": 67045 }, { "epoch": 0.06753347212695689, "grad_norm": 11.795725234915455, "learning_rate": 4.9962192956107656e-05, "loss": 2.9844, "mean_token_accuracy": 0.3551724135875702, "step": 67050 }, { "epoch": 0.06753850818006106, "grad_norm": 11.955884163332714, "learning_rate": 4.9962171239628855e-05, "loss": 2.7221, "mean_token_accuracy": 0.36551724970340727, "step": 67055 }, { "epoch": 0.06754354423316523, "grad_norm": 10.463737775556934, "learning_rate": 4.9962149516920086e-05, "loss": 2.3867, "mean_token_accuracy": 0.41724138259887694, "step": 67060 }, { "epoch": 0.06754858028626941, "grad_norm": 13.144162083535685, "learning_rate": 4.996212778798137e-05, "loss": 2.5106, "mean_token_accuracy": 0.36206896901130675, "step": 67065 }, { "epoch": 0.06755361633937358, "grad_norm": 10.245347764621835, "learning_rate": 4.9962106052812684e-05, "loss": 2.4242, "mean_token_accuracy": 0.4294010877609253, "step": 67070 }, { "epoch": 0.06755865239247774, "grad_norm": 12.986900879162782, "learning_rate": 4.996208431141405e-05, "loss": 2.5525, "mean_token_accuracy": 0.3724137932062149, "step": 67075 }, { "epoch": 0.06756368844558192, "grad_norm": 10.445739522836554, "learning_rate": 4.9962062563785485e-05, "loss": 2.6275, "mean_token_accuracy": 0.36896551549434664, "step": 67080 }, { "epoch": 0.06756872449868609, "grad_norm": 10.98501891754874, "learning_rate": 4.996204080992697e-05, "loss": 2.567, "mean_token_accuracy": 0.37241379022598264, "step": 67085 }, { "epoch": 0.06757376055179026, "grad_norm": 11.497465995822587, "learning_rate": 4.996201904983853e-05, "loss": 2.1825, "mean_token_accuracy": 0.4482758641242981, "step": 67090 }, { "epoch": 0.06757879660489444, "grad_norm": 12.039617411411594, "learning_rate": 4.9961997283520175e-05, "loss": 2.5203, "mean_token_accuracy": 0.3758620619773865, "step": 67095 }, { "epoch": 0.06758383265799861, "grad_norm": 10.015699039604872, "learning_rate": 4.9961975510971885e-05, "loss": 2.3675, "mean_token_accuracy": 0.46551724672317507, "step": 67100 }, { "epoch": 0.06758886871110278, "grad_norm": 14.855815885476023, "learning_rate": 4.996195373219369e-05, "loss": 2.8582, "mean_token_accuracy": 0.39999999701976774, "step": 67105 }, { "epoch": 0.06759390476420696, "grad_norm": 10.466077040514117, "learning_rate": 4.996193194718559e-05, "loss": 2.1597, "mean_token_accuracy": 0.5000000059604645, "step": 67110 }, { "epoch": 0.06759894081731113, "grad_norm": 12.736028576252895, "learning_rate": 4.996191015594758e-05, "loss": 2.5993, "mean_token_accuracy": 0.39310344457626345, "step": 67115 }, { "epoch": 0.0676039768704153, "grad_norm": 12.21883910705969, "learning_rate": 4.9961888358479685e-05, "loss": 2.5403, "mean_token_accuracy": 0.43103448748588563, "step": 67120 }, { "epoch": 0.06760901292351948, "grad_norm": 10.240365041317796, "learning_rate": 4.99618665547819e-05, "loss": 2.7566, "mean_token_accuracy": 0.3655172407627106, "step": 67125 }, { "epoch": 0.06761404897662365, "grad_norm": 11.458247817225393, "learning_rate": 4.996184474485424e-05, "loss": 2.3849, "mean_token_accuracy": 0.42413793206214906, "step": 67130 }, { "epoch": 0.06761908502972783, "grad_norm": 11.61828536219056, "learning_rate": 4.996182292869668e-05, "loss": 2.8068, "mean_token_accuracy": 0.30344827473163605, "step": 67135 }, { "epoch": 0.067624121082832, "grad_norm": 10.157948890184533, "learning_rate": 4.996180110630927e-05, "loss": 2.2953, "mean_token_accuracy": 0.4103448212146759, "step": 67140 }, { "epoch": 0.06762915713593616, "grad_norm": 9.89068279978254, "learning_rate": 4.9961779277691995e-05, "loss": 2.2129, "mean_token_accuracy": 0.44827585816383364, "step": 67145 }, { "epoch": 0.06763419318904033, "grad_norm": 13.10096930667577, "learning_rate": 4.996175744284485e-05, "loss": 2.6437, "mean_token_accuracy": 0.4068965554237366, "step": 67150 }, { "epoch": 0.06763922924214451, "grad_norm": 11.481394154649427, "learning_rate": 4.9961735601767865e-05, "loss": 2.8022, "mean_token_accuracy": 0.42413792610168455, "step": 67155 }, { "epoch": 0.06764426529524868, "grad_norm": 10.358445070618533, "learning_rate": 4.996171375446102e-05, "loss": 2.4989, "mean_token_accuracy": 0.43248639106750486, "step": 67160 }, { "epoch": 0.06764930134835286, "grad_norm": 10.436242424246117, "learning_rate": 4.996169190092435e-05, "loss": 2.8865, "mean_token_accuracy": 0.4103448212146759, "step": 67165 }, { "epoch": 0.06765433740145703, "grad_norm": 14.839277919618446, "learning_rate": 4.996167004115783e-05, "loss": 2.7692, "mean_token_accuracy": 0.33793103098869326, "step": 67170 }, { "epoch": 0.0676593734545612, "grad_norm": 13.492378378118119, "learning_rate": 4.9961648175161505e-05, "loss": 1.984, "mean_token_accuracy": 0.483620685338974, "step": 67175 }, { "epoch": 0.06766440950766538, "grad_norm": 12.651399260909276, "learning_rate": 4.996162630293534e-05, "loss": 2.437, "mean_token_accuracy": 0.4413793087005615, "step": 67180 }, { "epoch": 0.06766944556076955, "grad_norm": 10.568344586148733, "learning_rate": 4.9961604424479364e-05, "loss": 2.3853, "mean_token_accuracy": 0.39310344457626345, "step": 67185 }, { "epoch": 0.06767448161387372, "grad_norm": 9.965870755168254, "learning_rate": 4.996158253979357e-05, "loss": 2.2008, "mean_token_accuracy": 0.47241379618644713, "step": 67190 }, { "epoch": 0.0676795176669779, "grad_norm": 10.024144281175825, "learning_rate": 4.996156064887799e-05, "loss": 2.1957, "mean_token_accuracy": 0.4413793087005615, "step": 67195 }, { "epoch": 0.06768455372008207, "grad_norm": 12.001781084237187, "learning_rate": 4.9961538751732596e-05, "loss": 2.7533, "mean_token_accuracy": 0.3482758581638336, "step": 67200 }, { "epoch": 0.06768958977318625, "grad_norm": 11.501462439530142, "learning_rate": 4.996151684835742e-05, "loss": 2.7381, "mean_token_accuracy": 0.3655172407627106, "step": 67205 }, { "epoch": 0.06769462582629042, "grad_norm": 12.549317992865358, "learning_rate": 4.996149493875245e-05, "loss": 2.6685, "mean_token_accuracy": 0.4034482777118683, "step": 67210 }, { "epoch": 0.06769966187939458, "grad_norm": 9.512701605408347, "learning_rate": 4.996147302291771e-05, "loss": 2.2965, "mean_token_accuracy": 0.44827585816383364, "step": 67215 }, { "epoch": 0.06770469793249875, "grad_norm": 13.342850571823028, "learning_rate": 4.996145110085319e-05, "loss": 2.3302, "mean_token_accuracy": 0.40205685794353485, "step": 67220 }, { "epoch": 0.06770973398560293, "grad_norm": 12.060401212528726, "learning_rate": 4.996142917255891e-05, "loss": 2.6294, "mean_token_accuracy": 0.36896551251411436, "step": 67225 }, { "epoch": 0.0677147700387071, "grad_norm": 16.510702017442465, "learning_rate": 4.9961407238034864e-05, "loss": 2.9967, "mean_token_accuracy": 0.33793103098869326, "step": 67230 }, { "epoch": 0.06771980609181127, "grad_norm": 11.549027162827482, "learning_rate": 4.996138529728106e-05, "loss": 2.7201, "mean_token_accuracy": 0.3827586114406586, "step": 67235 }, { "epoch": 0.06772484214491545, "grad_norm": 13.61727168939817, "learning_rate": 4.996136335029751e-05, "loss": 2.7156, "mean_token_accuracy": 0.42413793206214906, "step": 67240 }, { "epoch": 0.06772987819801962, "grad_norm": 18.617417639402902, "learning_rate": 4.996134139708422e-05, "loss": 2.9854, "mean_token_accuracy": 0.3931034505367279, "step": 67245 }, { "epoch": 0.0677349142511238, "grad_norm": 13.68848994459133, "learning_rate": 4.9961319437641186e-05, "loss": 2.5168, "mean_token_accuracy": 0.4068965554237366, "step": 67250 }, { "epoch": 0.06773995030422797, "grad_norm": 9.575553070203956, "learning_rate": 4.996129747196843e-05, "loss": 2.2442, "mean_token_accuracy": 0.4344827473163605, "step": 67255 }, { "epoch": 0.06774498635733214, "grad_norm": 10.388905567732426, "learning_rate": 4.9961275500065935e-05, "loss": 2.5333, "mean_token_accuracy": 0.41724138259887694, "step": 67260 }, { "epoch": 0.06775002241043632, "grad_norm": 11.939926059464282, "learning_rate": 4.996125352193374e-05, "loss": 2.2168, "mean_token_accuracy": 0.4297035664319992, "step": 67265 }, { "epoch": 0.06775505846354049, "grad_norm": 15.299713297974325, "learning_rate": 4.996123153757182e-05, "loss": 2.5924, "mean_token_accuracy": 0.42413793206214906, "step": 67270 }, { "epoch": 0.06776009451664466, "grad_norm": 11.662170775456454, "learning_rate": 4.996120954698019e-05, "loss": 2.7956, "mean_token_accuracy": 0.382758629322052, "step": 67275 }, { "epoch": 0.06776513056974884, "grad_norm": 10.895639590356428, "learning_rate": 4.996118755015887e-05, "loss": 2.8838, "mean_token_accuracy": 0.35862069129943847, "step": 67280 }, { "epoch": 0.067770166622853, "grad_norm": 11.85989149287716, "learning_rate": 4.996116554710786e-05, "loss": 2.2879, "mean_token_accuracy": 0.4724137902259827, "step": 67285 }, { "epoch": 0.06777520267595717, "grad_norm": 9.950000819257724, "learning_rate": 4.996114353782715e-05, "loss": 2.5777, "mean_token_accuracy": 0.37755595743656156, "step": 67290 }, { "epoch": 0.06778023872906135, "grad_norm": 13.544972622163318, "learning_rate": 4.9961121522316764e-05, "loss": 2.5555, "mean_token_accuracy": 0.3896551728248596, "step": 67295 }, { "epoch": 0.06778527478216552, "grad_norm": 11.92808480532343, "learning_rate": 4.99610995005767e-05, "loss": 2.5163, "mean_token_accuracy": 0.38620689511299133, "step": 67300 }, { "epoch": 0.0677903108352697, "grad_norm": 12.48316811856645, "learning_rate": 4.996107747260697e-05, "loss": 2.7119, "mean_token_accuracy": 0.3965517282485962, "step": 67305 }, { "epoch": 0.06779534688837387, "grad_norm": 9.645120743568606, "learning_rate": 4.996105543840757e-05, "loss": 2.9179, "mean_token_accuracy": 0.39655172228813174, "step": 67310 }, { "epoch": 0.06780038294147804, "grad_norm": 12.64863669903991, "learning_rate": 4.996103339797852e-05, "loss": 2.3396, "mean_token_accuracy": 0.40139141082763674, "step": 67315 }, { "epoch": 0.06780541899458221, "grad_norm": 12.135480346179358, "learning_rate": 4.996101135131981e-05, "loss": 2.2305, "mean_token_accuracy": 0.42068966031074523, "step": 67320 }, { "epoch": 0.06781045504768639, "grad_norm": 11.27921527237365, "learning_rate": 4.996098929843147e-05, "loss": 2.24, "mean_token_accuracy": 0.4068965494632721, "step": 67325 }, { "epoch": 0.06781549110079056, "grad_norm": 12.57431877673111, "learning_rate": 4.9960967239313474e-05, "loss": 2.7581, "mean_token_accuracy": 0.37241379618644715, "step": 67330 }, { "epoch": 0.06782052715389474, "grad_norm": 13.574854446208409, "learning_rate": 4.996094517396586e-05, "loss": 2.2787, "mean_token_accuracy": 0.48275862336158754, "step": 67335 }, { "epoch": 0.06782556320699891, "grad_norm": 10.59474355774425, "learning_rate": 4.99609231023886e-05, "loss": 2.209, "mean_token_accuracy": 0.41379310488700866, "step": 67340 }, { "epoch": 0.06783059926010308, "grad_norm": 11.367021752139053, "learning_rate": 4.996090102458174e-05, "loss": 2.206, "mean_token_accuracy": 0.441379314661026, "step": 67345 }, { "epoch": 0.06783563531320726, "grad_norm": 11.081263619138939, "learning_rate": 4.996087894054525e-05, "loss": 2.3788, "mean_token_accuracy": 0.39836660623550413, "step": 67350 }, { "epoch": 0.06784067136631142, "grad_norm": 11.626939441149753, "learning_rate": 4.9960856850279155e-05, "loss": 2.7242, "mean_token_accuracy": 0.3413793116807938, "step": 67355 }, { "epoch": 0.06784570741941559, "grad_norm": 10.581893445234016, "learning_rate": 4.996083475378346e-05, "loss": 2.7101, "mean_token_accuracy": 0.37241379022598264, "step": 67360 }, { "epoch": 0.06785074347251976, "grad_norm": 11.68098749250743, "learning_rate": 4.9960812651058164e-05, "loss": 2.3089, "mean_token_accuracy": 0.39310343861579894, "step": 67365 }, { "epoch": 0.06785577952562394, "grad_norm": 12.12890635528658, "learning_rate": 4.9960790542103286e-05, "loss": 2.5668, "mean_token_accuracy": 0.38965516686439516, "step": 67370 }, { "epoch": 0.06786081557872811, "grad_norm": 10.161050837069254, "learning_rate": 4.996076842691881e-05, "loss": 2.2435, "mean_token_accuracy": 0.4275862067937851, "step": 67375 }, { "epoch": 0.06786585163183229, "grad_norm": 11.53014823268052, "learning_rate": 4.996074630550478e-05, "loss": 2.6079, "mean_token_accuracy": 0.4310344815254211, "step": 67380 }, { "epoch": 0.06787088768493646, "grad_norm": 12.063003575458287, "learning_rate": 4.9960724177861154e-05, "loss": 2.563, "mean_token_accuracy": 0.4, "step": 67385 }, { "epoch": 0.06787592373804063, "grad_norm": 11.264524635845152, "learning_rate": 4.9960702043987976e-05, "loss": 2.5012, "mean_token_accuracy": 0.4344827592372894, "step": 67390 }, { "epoch": 0.06788095979114481, "grad_norm": 11.444576010868468, "learning_rate": 4.996067990388523e-05, "loss": 2.99, "mean_token_accuracy": 0.3517241388559341, "step": 67395 }, { "epoch": 0.06788599584424898, "grad_norm": 11.418281410115434, "learning_rate": 4.9960657757552936e-05, "loss": 2.4291, "mean_token_accuracy": 0.3655172437429428, "step": 67400 }, { "epoch": 0.06789103189735315, "grad_norm": 11.890321702629741, "learning_rate": 4.99606356049911e-05, "loss": 2.0241, "mean_token_accuracy": 0.4793103516101837, "step": 67405 }, { "epoch": 0.06789606795045733, "grad_norm": 12.243551617707503, "learning_rate": 4.996061344619971e-05, "loss": 2.5762, "mean_token_accuracy": 0.38275861740112305, "step": 67410 }, { "epoch": 0.0679011040035615, "grad_norm": 21.381524704358892, "learning_rate": 4.996059128117879e-05, "loss": 2.5058, "mean_token_accuracy": 0.4275861978530884, "step": 67415 }, { "epoch": 0.06790614005666568, "grad_norm": 14.060916445705557, "learning_rate": 4.996056910992835e-05, "loss": 2.4795, "mean_token_accuracy": 0.39842710494995115, "step": 67420 }, { "epoch": 0.06791117610976984, "grad_norm": 11.178324643286626, "learning_rate": 4.996054693244838e-05, "loss": 2.5733, "mean_token_accuracy": 0.4344827592372894, "step": 67425 }, { "epoch": 0.06791621216287401, "grad_norm": 20.75714070717515, "learning_rate": 4.996052474873889e-05, "loss": 2.869, "mean_token_accuracy": 0.38620689511299133, "step": 67430 }, { "epoch": 0.06792124821597818, "grad_norm": 17.44797667877228, "learning_rate": 4.9960502558799896e-05, "loss": 2.7087, "mean_token_accuracy": 0.4279556632041931, "step": 67435 }, { "epoch": 0.06792628426908236, "grad_norm": 9.870783310204883, "learning_rate": 4.9960480362631395e-05, "loss": 2.7272, "mean_token_accuracy": 0.41724138259887694, "step": 67440 }, { "epoch": 0.06793132032218653, "grad_norm": 9.962690673683175, "learning_rate": 4.9960458160233394e-05, "loss": 2.2792, "mean_token_accuracy": 0.43623715043067934, "step": 67445 }, { "epoch": 0.0679363563752907, "grad_norm": 9.905796679745507, "learning_rate": 4.9960435951605906e-05, "loss": 2.0922, "mean_token_accuracy": 0.4724137902259827, "step": 67450 }, { "epoch": 0.06794139242839488, "grad_norm": 15.651557683541277, "learning_rate": 4.996041373674893e-05, "loss": 2.5725, "mean_token_accuracy": 0.4034482717514038, "step": 67455 }, { "epoch": 0.06794642848149905, "grad_norm": 11.14024475231413, "learning_rate": 4.9960391515662476e-05, "loss": 2.7269, "mean_token_accuracy": 0.37241379022598264, "step": 67460 }, { "epoch": 0.06795146453460323, "grad_norm": 12.45319230666954, "learning_rate": 4.996036928834655e-05, "loss": 2.6676, "mean_token_accuracy": 0.35862069129943847, "step": 67465 }, { "epoch": 0.0679565005877074, "grad_norm": 7.461908081910165, "learning_rate": 4.9960347054801146e-05, "loss": 2.3315, "mean_token_accuracy": 0.44047186374664304, "step": 67470 }, { "epoch": 0.06796153664081157, "grad_norm": 11.096509855502292, "learning_rate": 4.996032481502629e-05, "loss": 2.3765, "mean_token_accuracy": 0.4068965554237366, "step": 67475 }, { "epoch": 0.06796657269391575, "grad_norm": 11.41034978158626, "learning_rate": 4.9960302569021984e-05, "loss": 2.8313, "mean_token_accuracy": 0.36206896901130675, "step": 67480 }, { "epoch": 0.06797160874701992, "grad_norm": 12.83777653911419, "learning_rate": 4.9960280316788224e-05, "loss": 2.6273, "mean_token_accuracy": 0.44482759237289426, "step": 67485 }, { "epoch": 0.0679766448001241, "grad_norm": 10.848310144283532, "learning_rate": 4.9960258058325024e-05, "loss": 2.3365, "mean_token_accuracy": 0.4931034445762634, "step": 67490 }, { "epoch": 0.06798168085322825, "grad_norm": 11.078363954704422, "learning_rate": 4.9960235793632384e-05, "loss": 2.8658, "mean_token_accuracy": 0.3793103516101837, "step": 67495 }, { "epoch": 0.06798671690633243, "grad_norm": 12.177495073962945, "learning_rate": 4.9960213522710305e-05, "loss": 2.3426, "mean_token_accuracy": 0.42758620381355283, "step": 67500 }, { "epoch": 0.0679917529594366, "grad_norm": 9.622854364499236, "learning_rate": 4.9960191245558814e-05, "loss": 2.4772, "mean_token_accuracy": 0.3862069010734558, "step": 67505 }, { "epoch": 0.06799678901254078, "grad_norm": 10.369879005177918, "learning_rate": 4.9960168962177904e-05, "loss": 2.2492, "mean_token_accuracy": 0.4206896543502808, "step": 67510 }, { "epoch": 0.06800182506564495, "grad_norm": 11.608792595384573, "learning_rate": 4.996014667256758e-05, "loss": 2.3711, "mean_token_accuracy": 0.4034482777118683, "step": 67515 }, { "epoch": 0.06800686111874912, "grad_norm": 10.70929953467183, "learning_rate": 4.996012437672785e-05, "loss": 2.2026, "mean_token_accuracy": 0.4620689630508423, "step": 67520 }, { "epoch": 0.0680118971718533, "grad_norm": 12.01475489016206, "learning_rate": 4.9960102074658726e-05, "loss": 2.5589, "mean_token_accuracy": 0.4034482717514038, "step": 67525 }, { "epoch": 0.06801693322495747, "grad_norm": 12.613384642969377, "learning_rate": 4.99600797663602e-05, "loss": 2.306, "mean_token_accuracy": 0.4517241358757019, "step": 67530 }, { "epoch": 0.06802196927806164, "grad_norm": 11.085014845943247, "learning_rate": 4.9960057451832296e-05, "loss": 2.3998, "mean_token_accuracy": 0.4034482717514038, "step": 67535 }, { "epoch": 0.06802700533116582, "grad_norm": 13.0302800298626, "learning_rate": 4.9960035131075006e-05, "loss": 2.5572, "mean_token_accuracy": 0.41379310488700866, "step": 67540 }, { "epoch": 0.06803204138426999, "grad_norm": 13.95461241458804, "learning_rate": 4.996001280408834e-05, "loss": 2.2505, "mean_token_accuracy": 0.43793103098869324, "step": 67545 }, { "epoch": 0.06803707743737417, "grad_norm": 15.891194375730802, "learning_rate": 4.995999047087231e-05, "loss": 2.4138, "mean_token_accuracy": 0.4413793087005615, "step": 67550 }, { "epoch": 0.06804211349047834, "grad_norm": 10.391275457994764, "learning_rate": 4.995996813142692e-05, "loss": 2.3541, "mean_token_accuracy": 0.4206896543502808, "step": 67555 }, { "epoch": 0.06804714954358251, "grad_norm": 10.731958739480316, "learning_rate": 4.995994578575217e-05, "loss": 2.5376, "mean_token_accuracy": 0.39999999701976774, "step": 67560 }, { "epoch": 0.06805218559668667, "grad_norm": 11.592575490023139, "learning_rate": 4.9959923433848065e-05, "loss": 2.3071, "mean_token_accuracy": 0.43103448748588563, "step": 67565 }, { "epoch": 0.06805722164979085, "grad_norm": 11.409479831506632, "learning_rate": 4.995990107571463e-05, "loss": 2.4719, "mean_token_accuracy": 0.40828797221183777, "step": 67570 }, { "epoch": 0.06806225770289502, "grad_norm": 10.563076503681634, "learning_rate": 4.9959878711351844e-05, "loss": 2.5372, "mean_token_accuracy": 0.38620689511299133, "step": 67575 }, { "epoch": 0.0680672937559992, "grad_norm": 11.866089400411754, "learning_rate": 4.995985634075973e-05, "loss": 2.7538, "mean_token_accuracy": 0.379310342669487, "step": 67580 }, { "epoch": 0.06807232980910337, "grad_norm": 11.653004661676793, "learning_rate": 4.99598339639383e-05, "loss": 2.5502, "mean_token_accuracy": 0.39655172228813174, "step": 67585 }, { "epoch": 0.06807736586220754, "grad_norm": 12.518365995849269, "learning_rate": 4.995981158088754e-05, "loss": 2.3229, "mean_token_accuracy": 0.482758629322052, "step": 67590 }, { "epoch": 0.06808240191531172, "grad_norm": 11.438220508897544, "learning_rate": 4.995978919160747e-05, "loss": 2.8819, "mean_token_accuracy": 0.3034482657909393, "step": 67595 }, { "epoch": 0.06808743796841589, "grad_norm": 11.136927770625919, "learning_rate": 4.99597667960981e-05, "loss": 3.0726, "mean_token_accuracy": 0.3448275804519653, "step": 67600 }, { "epoch": 0.06809247402152006, "grad_norm": 10.506544096031416, "learning_rate": 4.995974439435943e-05, "loss": 2.33, "mean_token_accuracy": 0.42758620977401735, "step": 67605 }, { "epoch": 0.06809751007462424, "grad_norm": 13.550237585257541, "learning_rate": 4.9959721986391456e-05, "loss": 2.4761, "mean_token_accuracy": 0.42413792610168455, "step": 67610 }, { "epoch": 0.06810254612772841, "grad_norm": 12.572156595164099, "learning_rate": 4.9959699572194194e-05, "loss": 2.6315, "mean_token_accuracy": 0.38965516686439516, "step": 67615 }, { "epoch": 0.06810758218083258, "grad_norm": 12.230959063692618, "learning_rate": 4.995967715176766e-05, "loss": 2.2209, "mean_token_accuracy": 0.47241379618644713, "step": 67620 }, { "epoch": 0.06811261823393676, "grad_norm": 13.281003520680764, "learning_rate": 4.995965472511185e-05, "loss": 2.9073, "mean_token_accuracy": 0.34137930274009703, "step": 67625 }, { "epoch": 0.06811765428704093, "grad_norm": 11.759880574911456, "learning_rate": 4.995963229222677e-05, "loss": 2.555, "mean_token_accuracy": 0.4103448331356049, "step": 67630 }, { "epoch": 0.06812269034014509, "grad_norm": 13.669642152052498, "learning_rate": 4.995960985311242e-05, "loss": 2.4342, "mean_token_accuracy": 0.38965516686439516, "step": 67635 }, { "epoch": 0.06812772639324927, "grad_norm": 10.198767347032152, "learning_rate": 4.995958740776883e-05, "loss": 2.4611, "mean_token_accuracy": 0.3620689570903778, "step": 67640 }, { "epoch": 0.06813276244635344, "grad_norm": 17.182630498137463, "learning_rate": 4.9959564956195975e-05, "loss": 2.4019, "mean_token_accuracy": 0.44482757449150084, "step": 67645 }, { "epoch": 0.06813779849945761, "grad_norm": 12.112798067206478, "learning_rate": 4.995954249839388e-05, "loss": 2.6985, "mean_token_accuracy": 0.39655172228813174, "step": 67650 }, { "epoch": 0.06814283455256179, "grad_norm": 11.63531380206764, "learning_rate": 4.9959520034362544e-05, "loss": 2.5345, "mean_token_accuracy": 0.4, "step": 67655 }, { "epoch": 0.06814787060566596, "grad_norm": 10.33939208886827, "learning_rate": 4.9959497564101974e-05, "loss": 2.5698, "mean_token_accuracy": 0.43103448748588563, "step": 67660 }, { "epoch": 0.06815290665877013, "grad_norm": 10.27792208520894, "learning_rate": 4.995947508761219e-05, "loss": 2.1212, "mean_token_accuracy": 0.45517241954803467, "step": 67665 }, { "epoch": 0.06815794271187431, "grad_norm": 10.160824333528534, "learning_rate": 4.9959452604893174e-05, "loss": 2.5278, "mean_token_accuracy": 0.40689654350280763, "step": 67670 }, { "epoch": 0.06816297876497848, "grad_norm": 12.775698051615551, "learning_rate": 4.9959430115944957e-05, "loss": 2.511, "mean_token_accuracy": 0.37241379022598264, "step": 67675 }, { "epoch": 0.06816801481808266, "grad_norm": 14.194501544052104, "learning_rate": 4.995940762076753e-05, "loss": 2.6071, "mean_token_accuracy": 0.3517241358757019, "step": 67680 }, { "epoch": 0.06817305087118683, "grad_norm": 14.31971292179627, "learning_rate": 4.9959385119360895e-05, "loss": 2.503, "mean_token_accuracy": 0.45741077661514284, "step": 67685 }, { "epoch": 0.068178086924291, "grad_norm": 11.341180389497971, "learning_rate": 4.995936261172507e-05, "loss": 2.4121, "mean_token_accuracy": 0.3931034505367279, "step": 67690 }, { "epoch": 0.06818312297739518, "grad_norm": 11.053927591732855, "learning_rate": 4.9959340097860065e-05, "loss": 2.8424, "mean_token_accuracy": 0.3827586233615875, "step": 67695 }, { "epoch": 0.06818815903049935, "grad_norm": 10.439632239773552, "learning_rate": 4.995931757776587e-05, "loss": 2.3849, "mean_token_accuracy": 0.37586206793785093, "step": 67700 }, { "epoch": 0.06819319508360351, "grad_norm": 12.20094578112412, "learning_rate": 4.99592950514425e-05, "loss": 2.5721, "mean_token_accuracy": 0.4103448212146759, "step": 67705 }, { "epoch": 0.06819823113670768, "grad_norm": 10.041808514940353, "learning_rate": 4.995927251888996e-05, "loss": 2.2557, "mean_token_accuracy": 0.4103448331356049, "step": 67710 }, { "epoch": 0.06820326718981186, "grad_norm": 11.625215221839394, "learning_rate": 4.9959249980108256e-05, "loss": 2.8524, "mean_token_accuracy": 0.3620689630508423, "step": 67715 }, { "epoch": 0.06820830324291603, "grad_norm": 13.636346638878388, "learning_rate": 4.99592274350974e-05, "loss": 2.2736, "mean_token_accuracy": 0.47241380214691164, "step": 67720 }, { "epoch": 0.0682133392960202, "grad_norm": 14.760031215533257, "learning_rate": 4.995920488385739e-05, "loss": 2.4786, "mean_token_accuracy": 0.42413793206214906, "step": 67725 }, { "epoch": 0.06821837534912438, "grad_norm": 12.678838320150144, "learning_rate": 4.995918232638824e-05, "loss": 2.4347, "mean_token_accuracy": 0.3793103456497192, "step": 67730 }, { "epoch": 0.06822341140222855, "grad_norm": 12.053792102128776, "learning_rate": 4.995915976268995e-05, "loss": 2.3119, "mean_token_accuracy": 0.48275862336158754, "step": 67735 }, { "epoch": 0.06822844745533273, "grad_norm": 22.061651031457604, "learning_rate": 4.995913719276252e-05, "loss": 2.9199, "mean_token_accuracy": 0.3862069010734558, "step": 67740 }, { "epoch": 0.0682334835084369, "grad_norm": 9.855669142905063, "learning_rate": 4.995911461660598e-05, "loss": 2.7263, "mean_token_accuracy": 0.358620685338974, "step": 67745 }, { "epoch": 0.06823851956154108, "grad_norm": 10.984783747030557, "learning_rate": 4.9959092034220306e-05, "loss": 2.1303, "mean_token_accuracy": 0.47931034564971925, "step": 67750 }, { "epoch": 0.06824355561464525, "grad_norm": 11.553586965491832, "learning_rate": 4.995906944560553e-05, "loss": 2.2412, "mean_token_accuracy": 0.41724138259887694, "step": 67755 }, { "epoch": 0.06824859166774942, "grad_norm": 14.007340867442336, "learning_rate": 4.995904685076164e-05, "loss": 2.3331, "mean_token_accuracy": 0.4467029690742493, "step": 67760 }, { "epoch": 0.0682536277208536, "grad_norm": 13.047073571210488, "learning_rate": 4.995902424968865e-05, "loss": 2.2024, "mean_token_accuracy": 0.44137930274009707, "step": 67765 }, { "epoch": 0.06825866377395777, "grad_norm": 12.38055620311324, "learning_rate": 4.995900164238657e-05, "loss": 2.8741, "mean_token_accuracy": 0.3724138021469116, "step": 67770 }, { "epoch": 0.06826369982706193, "grad_norm": 11.916159724152177, "learning_rate": 4.99589790288554e-05, "loss": 2.6477, "mean_token_accuracy": 0.4172413796186447, "step": 67775 }, { "epoch": 0.0682687358801661, "grad_norm": 11.294999070671961, "learning_rate": 4.995895640909515e-05, "loss": 2.5906, "mean_token_accuracy": 0.36896551251411436, "step": 67780 }, { "epoch": 0.06827377193327028, "grad_norm": 10.755489763010823, "learning_rate": 4.995893378310583e-05, "loss": 2.4273, "mean_token_accuracy": 0.4206896543502808, "step": 67785 }, { "epoch": 0.06827880798637445, "grad_norm": 11.726088754582799, "learning_rate": 4.9958911150887424e-05, "loss": 2.8993, "mean_token_accuracy": 0.36896551251411436, "step": 67790 }, { "epoch": 0.06828384403947863, "grad_norm": 9.97170460780227, "learning_rate": 4.995888851243997e-05, "loss": 2.3642, "mean_token_accuracy": 0.44137930274009707, "step": 67795 }, { "epoch": 0.0682888800925828, "grad_norm": 11.5999564998719, "learning_rate": 4.995886586776345e-05, "loss": 2.6157, "mean_token_accuracy": 0.46206897497177124, "step": 67800 }, { "epoch": 0.06829391614568697, "grad_norm": 11.451137283201733, "learning_rate": 4.995884321685789e-05, "loss": 2.3552, "mean_token_accuracy": 0.42238354682922363, "step": 67805 }, { "epoch": 0.06829895219879115, "grad_norm": 11.045433382768143, "learning_rate": 4.9958820559723276e-05, "loss": 2.3819, "mean_token_accuracy": 0.41034482717514037, "step": 67810 }, { "epoch": 0.06830398825189532, "grad_norm": 10.39142777362513, "learning_rate": 4.995879789635963e-05, "loss": 2.7999, "mean_token_accuracy": 0.4000000089406967, "step": 67815 }, { "epoch": 0.0683090243049995, "grad_norm": 12.374181197035556, "learning_rate": 4.9958775226766954e-05, "loss": 3.5747, "mean_token_accuracy": 0.2896551787853241, "step": 67820 }, { "epoch": 0.06831406035810367, "grad_norm": 12.506992164076818, "learning_rate": 4.995875255094525e-05, "loss": 2.4624, "mean_token_accuracy": 0.39310344457626345, "step": 67825 }, { "epoch": 0.06831909641120784, "grad_norm": 10.777338236416748, "learning_rate": 4.995872986889453e-05, "loss": 2.3829, "mean_token_accuracy": 0.4086509466171265, "step": 67830 }, { "epoch": 0.06832413246431202, "grad_norm": 7.047548656024399, "learning_rate": 4.995870718061479e-05, "loss": 2.1891, "mean_token_accuracy": 0.4912561535835266, "step": 67835 }, { "epoch": 0.06832916851741619, "grad_norm": 10.548905972028468, "learning_rate": 4.995868448610605e-05, "loss": 2.4246, "mean_token_accuracy": 0.43793103098869324, "step": 67840 }, { "epoch": 0.06833420457052035, "grad_norm": 11.131581273911983, "learning_rate": 4.995866178536831e-05, "loss": 2.5971, "mean_token_accuracy": 0.4137930989265442, "step": 67845 }, { "epoch": 0.06833924062362452, "grad_norm": 15.116506786151675, "learning_rate": 4.995863907840158e-05, "loss": 2.3582, "mean_token_accuracy": 0.4379310369491577, "step": 67850 }, { "epoch": 0.0683442766767287, "grad_norm": 9.175100172438112, "learning_rate": 4.995861636520586e-05, "loss": 2.354, "mean_token_accuracy": 0.4551724076271057, "step": 67855 }, { "epoch": 0.06834931272983287, "grad_norm": 12.5592146316937, "learning_rate": 4.995859364578115e-05, "loss": 2.3818, "mean_token_accuracy": 0.42758620381355283, "step": 67860 }, { "epoch": 0.06835434878293704, "grad_norm": 18.79740162845886, "learning_rate": 4.995857092012748e-05, "loss": 2.5061, "mean_token_accuracy": 0.4275862067937851, "step": 67865 }, { "epoch": 0.06835938483604122, "grad_norm": 14.57022183557931, "learning_rate": 4.995854818824484e-05, "loss": 2.5396, "mean_token_accuracy": 0.441379314661026, "step": 67870 }, { "epoch": 0.06836442088914539, "grad_norm": 15.352158693312692, "learning_rate": 4.995852545013322e-05, "loss": 2.8335, "mean_token_accuracy": 0.3793103456497192, "step": 67875 }, { "epoch": 0.06836945694224957, "grad_norm": 17.923213263395787, "learning_rate": 4.995850270579266e-05, "loss": 2.7775, "mean_token_accuracy": 0.38620689511299133, "step": 67880 }, { "epoch": 0.06837449299535374, "grad_norm": 11.88786806200433, "learning_rate": 4.9958479955223146e-05, "loss": 2.4185, "mean_token_accuracy": 0.4103448212146759, "step": 67885 }, { "epoch": 0.06837952904845791, "grad_norm": 11.57498501957785, "learning_rate": 4.9958457198424687e-05, "loss": 2.2756, "mean_token_accuracy": 0.40490018725395205, "step": 67890 }, { "epoch": 0.06838456510156209, "grad_norm": 9.935760864532636, "learning_rate": 4.9958434435397295e-05, "loss": 3.0267, "mean_token_accuracy": 0.3172413736581802, "step": 67895 }, { "epoch": 0.06838960115466626, "grad_norm": 12.066123936091081, "learning_rate": 4.995841166614097e-05, "loss": 2.4891, "mean_token_accuracy": 0.39655172228813174, "step": 67900 }, { "epoch": 0.06839463720777043, "grad_norm": 13.218251543035977, "learning_rate": 4.9958388890655727e-05, "loss": 2.3807, "mean_token_accuracy": 0.41724138259887694, "step": 67905 }, { "epoch": 0.06839967326087461, "grad_norm": 9.76936640312093, "learning_rate": 4.995836610894156e-05, "loss": 2.0089, "mean_token_accuracy": 0.4620689630508423, "step": 67910 }, { "epoch": 0.06840470931397877, "grad_norm": 14.56657002513992, "learning_rate": 4.995834332099848e-05, "loss": 2.5509, "mean_token_accuracy": 0.42068964838981626, "step": 67915 }, { "epoch": 0.06840974536708294, "grad_norm": 11.758562428182222, "learning_rate": 4.9958320526826494e-05, "loss": 2.4621, "mean_token_accuracy": 0.4034482777118683, "step": 67920 }, { "epoch": 0.06841478142018712, "grad_norm": 11.254236143449344, "learning_rate": 4.995829772642561e-05, "loss": 2.5021, "mean_token_accuracy": 0.41034482717514037, "step": 67925 }, { "epoch": 0.06841981747329129, "grad_norm": 10.170986982691067, "learning_rate": 4.995827491979584e-05, "loss": 2.3279, "mean_token_accuracy": 0.4620689630508423, "step": 67930 }, { "epoch": 0.06842485352639546, "grad_norm": 11.98224232358821, "learning_rate": 4.995825210693718e-05, "loss": 2.7845, "mean_token_accuracy": 0.4137930989265442, "step": 67935 }, { "epoch": 0.06842988957949964, "grad_norm": 14.614397198425685, "learning_rate": 4.995822928784964e-05, "loss": 2.5635, "mean_token_accuracy": 0.3862068921327591, "step": 67940 }, { "epoch": 0.06843492563260381, "grad_norm": 10.303188806157545, "learning_rate": 4.9958206462533225e-05, "loss": 2.1692, "mean_token_accuracy": 0.4551724135875702, "step": 67945 }, { "epoch": 0.06843996168570798, "grad_norm": 12.048774232995173, "learning_rate": 4.9958183630987943e-05, "loss": 2.2902, "mean_token_accuracy": 0.4034482777118683, "step": 67950 }, { "epoch": 0.06844499773881216, "grad_norm": 12.657010052946985, "learning_rate": 4.99581607932138e-05, "loss": 2.9604, "mean_token_accuracy": 0.36206896901130675, "step": 67955 }, { "epoch": 0.06845003379191633, "grad_norm": 13.6825145832331, "learning_rate": 4.99581379492108e-05, "loss": 2.5835, "mean_token_accuracy": 0.420689657330513, "step": 67960 }, { "epoch": 0.0684550698450205, "grad_norm": 11.003620092788015, "learning_rate": 4.995811509897895e-05, "loss": 2.462, "mean_token_accuracy": 0.4172413766384125, "step": 67965 }, { "epoch": 0.06846010589812468, "grad_norm": 15.19296718060808, "learning_rate": 4.995809224251827e-05, "loss": 2.8776, "mean_token_accuracy": 0.36551723480224607, "step": 67970 }, { "epoch": 0.06846514195122885, "grad_norm": 13.945249586069592, "learning_rate": 4.995806937982874e-05, "loss": 2.9054, "mean_token_accuracy": 0.36896551251411436, "step": 67975 }, { "epoch": 0.06847017800433303, "grad_norm": 11.244731370560537, "learning_rate": 4.995804651091039e-05, "loss": 2.7528, "mean_token_accuracy": 0.3689655244350433, "step": 67980 }, { "epoch": 0.06847521405743719, "grad_norm": 22.110342838687064, "learning_rate": 4.995802363576321e-05, "loss": 2.7398, "mean_token_accuracy": 0.4113300502300262, "step": 67985 }, { "epoch": 0.06848025011054136, "grad_norm": 19.26432173437627, "learning_rate": 4.995800075438722e-05, "loss": 2.9468, "mean_token_accuracy": 0.3793103456497192, "step": 67990 }, { "epoch": 0.06848528616364553, "grad_norm": 11.276852072657423, "learning_rate": 4.995797786678242e-05, "loss": 2.6102, "mean_token_accuracy": 0.3689655065536499, "step": 67995 }, { "epoch": 0.06849032221674971, "grad_norm": 11.090639232304385, "learning_rate": 4.995795497294881e-05, "loss": 2.4122, "mean_token_accuracy": 0.39655172228813174, "step": 68000 }, { "epoch": 0.06849535826985388, "grad_norm": 12.562089443373422, "learning_rate": 4.9957932072886405e-05, "loss": 2.4198, "mean_token_accuracy": 0.43448275327682495, "step": 68005 }, { "epoch": 0.06850039432295806, "grad_norm": 11.527090376631108, "learning_rate": 4.995790916659521e-05, "loss": 2.4738, "mean_token_accuracy": 0.3862069010734558, "step": 68010 }, { "epoch": 0.06850543037606223, "grad_norm": 10.722513711569446, "learning_rate": 4.9957886254075223e-05, "loss": 2.3529, "mean_token_accuracy": 0.4206896543502808, "step": 68015 }, { "epoch": 0.0685104664291664, "grad_norm": 12.398642602053979, "learning_rate": 4.995786333532647e-05, "loss": 2.5458, "mean_token_accuracy": 0.3999999940395355, "step": 68020 }, { "epoch": 0.06851550248227058, "grad_norm": 10.456643458396158, "learning_rate": 4.995784041034894e-05, "loss": 2.5577, "mean_token_accuracy": 0.41034482717514037, "step": 68025 }, { "epoch": 0.06852053853537475, "grad_norm": 12.887406245332665, "learning_rate": 4.9957817479142636e-05, "loss": 2.8691, "mean_token_accuracy": 0.3774349570274353, "step": 68030 }, { "epoch": 0.06852557458847892, "grad_norm": 14.174355464637435, "learning_rate": 4.9957794541707583e-05, "loss": 2.8707, "mean_token_accuracy": 0.4103448331356049, "step": 68035 }, { "epoch": 0.0685306106415831, "grad_norm": 13.144835040752119, "learning_rate": 4.9957771598043776e-05, "loss": 2.3696, "mean_token_accuracy": 0.4068965494632721, "step": 68040 }, { "epoch": 0.06853564669468727, "grad_norm": 11.246789224224312, "learning_rate": 4.995774864815121e-05, "loss": 2.0803, "mean_token_accuracy": 0.458620685338974, "step": 68045 }, { "epoch": 0.06854068274779145, "grad_norm": 13.394944572588896, "learning_rate": 4.9957725692029915e-05, "loss": 2.8871, "mean_token_accuracy": 0.35335753560066224, "step": 68050 }, { "epoch": 0.0685457188008956, "grad_norm": 12.257848607816527, "learning_rate": 4.995770272967989e-05, "loss": 2.4439, "mean_token_accuracy": 0.42413792610168455, "step": 68055 }, { "epoch": 0.06855075485399978, "grad_norm": 10.090734651197305, "learning_rate": 4.9957679761101137e-05, "loss": 3.3997, "mean_token_accuracy": 0.3413793116807938, "step": 68060 }, { "epoch": 0.06855579090710395, "grad_norm": 11.942178948826568, "learning_rate": 4.995765678629365e-05, "loss": 2.4048, "mean_token_accuracy": 0.4275862157344818, "step": 68065 }, { "epoch": 0.06856082696020813, "grad_norm": 10.453425893756625, "learning_rate": 4.995763380525746e-05, "loss": 2.3919, "mean_token_accuracy": 0.44482758045196535, "step": 68070 }, { "epoch": 0.0685658630133123, "grad_norm": 13.436341964765807, "learning_rate": 4.9957610817992556e-05, "loss": 2.7575, "mean_token_accuracy": 0.37931033968925476, "step": 68075 }, { "epoch": 0.06857089906641647, "grad_norm": 14.202240067666892, "learning_rate": 4.9957587824498946e-05, "loss": 2.4885, "mean_token_accuracy": 0.42068964838981626, "step": 68080 }, { "epoch": 0.06857593511952065, "grad_norm": 10.061776810203515, "learning_rate": 4.995756482477665e-05, "loss": 2.2977, "mean_token_accuracy": 0.4379310369491577, "step": 68085 }, { "epoch": 0.06858097117262482, "grad_norm": 10.701233608231643, "learning_rate": 4.9957541818825655e-05, "loss": 2.6224, "mean_token_accuracy": 0.4310344815254211, "step": 68090 }, { "epoch": 0.068586007225729, "grad_norm": 11.140484726397732, "learning_rate": 4.995751880664599e-05, "loss": 2.7755, "mean_token_accuracy": 0.33103448152542114, "step": 68095 }, { "epoch": 0.06859104327883317, "grad_norm": 11.254314639549678, "learning_rate": 4.9957495788237636e-05, "loss": 2.5011, "mean_token_accuracy": 0.4034482777118683, "step": 68100 }, { "epoch": 0.06859607933193734, "grad_norm": 12.035333883603743, "learning_rate": 4.995747276360061e-05, "loss": 2.5645, "mean_token_accuracy": 0.4172413766384125, "step": 68105 }, { "epoch": 0.06860111538504152, "grad_norm": 9.458219513973852, "learning_rate": 4.9957449732734936e-05, "loss": 2.369, "mean_token_accuracy": 0.45517241954803467, "step": 68110 }, { "epoch": 0.06860615143814569, "grad_norm": 15.328357994755201, "learning_rate": 4.995742669564059e-05, "loss": 2.5749, "mean_token_accuracy": 0.44482758045196535, "step": 68115 }, { "epoch": 0.06861118749124986, "grad_norm": 12.214712588989148, "learning_rate": 4.9957403652317595e-05, "loss": 2.3947, "mean_token_accuracy": 0.4344827592372894, "step": 68120 }, { "epoch": 0.06861622354435402, "grad_norm": 12.128713044444613, "learning_rate": 4.9957380602765954e-05, "loss": 2.6187, "mean_token_accuracy": 0.4103448301553726, "step": 68125 }, { "epoch": 0.0686212595974582, "grad_norm": 10.459589655249898, "learning_rate": 4.995735754698568e-05, "loss": 2.1151, "mean_token_accuracy": 0.4655172288417816, "step": 68130 }, { "epoch": 0.06862629565056237, "grad_norm": 10.085066977629412, "learning_rate": 4.9957334484976774e-05, "loss": 2.3608, "mean_token_accuracy": 0.42758620977401735, "step": 68135 }, { "epoch": 0.06863133170366655, "grad_norm": 10.470334883663183, "learning_rate": 4.995731141673924e-05, "loss": 2.3027, "mean_token_accuracy": 0.4122807025909424, "step": 68140 }, { "epoch": 0.06863636775677072, "grad_norm": 11.819030230479202, "learning_rate": 4.995728834227309e-05, "loss": 2.05, "mean_token_accuracy": 0.47586206197738645, "step": 68145 }, { "epoch": 0.06864140380987489, "grad_norm": 10.985890241995858, "learning_rate": 4.9957265261578324e-05, "loss": 2.5963, "mean_token_accuracy": 0.39310343861579894, "step": 68150 }, { "epoch": 0.06864643986297907, "grad_norm": 12.03026032241558, "learning_rate": 4.995724217465495e-05, "loss": 2.1793, "mean_token_accuracy": 0.443254691362381, "step": 68155 }, { "epoch": 0.06865147591608324, "grad_norm": 8.53095996252538, "learning_rate": 4.995721908150298e-05, "loss": 2.415, "mean_token_accuracy": 0.44248768091201784, "step": 68160 }, { "epoch": 0.06865651196918741, "grad_norm": 11.409718350880656, "learning_rate": 4.995719598212242e-05, "loss": 2.4775, "mean_token_accuracy": 0.4034482717514038, "step": 68165 }, { "epoch": 0.06866154802229159, "grad_norm": 9.7141628757598, "learning_rate": 4.9957172876513265e-05, "loss": 2.0407, "mean_token_accuracy": 0.4827586054801941, "step": 68170 }, { "epoch": 0.06866658407539576, "grad_norm": 10.851265392690806, "learning_rate": 4.995714976467553e-05, "loss": 2.4063, "mean_token_accuracy": 0.3896551728248596, "step": 68175 }, { "epoch": 0.06867162012849994, "grad_norm": 13.489016763291328, "learning_rate": 4.995712664660923e-05, "loss": 2.2229, "mean_token_accuracy": 0.42068964838981626, "step": 68180 }, { "epoch": 0.06867665618160411, "grad_norm": 11.157386216273943, "learning_rate": 4.9957103522314354e-05, "loss": 2.9373, "mean_token_accuracy": 0.34137931317090986, "step": 68185 }, { "epoch": 0.06868169223470828, "grad_norm": 12.671492426012367, "learning_rate": 4.995708039179092e-05, "loss": 2.7379, "mean_token_accuracy": 0.39655172526836396, "step": 68190 }, { "epoch": 0.06868672828781244, "grad_norm": 12.22262975315247, "learning_rate": 4.995705725503893e-05, "loss": 2.5968, "mean_token_accuracy": 0.4034482717514038, "step": 68195 }, { "epoch": 0.06869176434091662, "grad_norm": 13.270736734349299, "learning_rate": 4.995703411205839e-05, "loss": 1.9786, "mean_token_accuracy": 0.4965517222881317, "step": 68200 }, { "epoch": 0.06869680039402079, "grad_norm": 10.561188892829515, "learning_rate": 4.995701096284931e-05, "loss": 2.657, "mean_token_accuracy": 0.3034482717514038, "step": 68205 }, { "epoch": 0.06870183644712496, "grad_norm": 11.405297136851035, "learning_rate": 4.9956987807411694e-05, "loss": 3.0318, "mean_token_accuracy": 0.34482758641242983, "step": 68210 }, { "epoch": 0.06870687250022914, "grad_norm": 10.807959589992901, "learning_rate": 4.995696464574554e-05, "loss": 2.2939, "mean_token_accuracy": 0.4551724076271057, "step": 68215 }, { "epoch": 0.06871190855333331, "grad_norm": 11.183639038708124, "learning_rate": 4.9956941477850874e-05, "loss": 2.3126, "mean_token_accuracy": 0.4448275864124298, "step": 68220 }, { "epoch": 0.06871694460643749, "grad_norm": 11.341184527072784, "learning_rate": 4.9956918303727684e-05, "loss": 2.43, "mean_token_accuracy": 0.4034482777118683, "step": 68225 }, { "epoch": 0.06872198065954166, "grad_norm": 11.867086148677712, "learning_rate": 4.995689512337599e-05, "loss": 2.8162, "mean_token_accuracy": 0.35862069129943847, "step": 68230 }, { "epoch": 0.06872701671264583, "grad_norm": 11.63069911886928, "learning_rate": 4.995687193679579e-05, "loss": 2.4218, "mean_token_accuracy": 0.4103448331356049, "step": 68235 }, { "epoch": 0.06873205276575, "grad_norm": 7.837334287424601, "learning_rate": 4.9956848743987094e-05, "loss": 2.3421, "mean_token_accuracy": 0.46098004579544066, "step": 68240 }, { "epoch": 0.06873708881885418, "grad_norm": 9.923742170854597, "learning_rate": 4.9956825544949906e-05, "loss": 2.1277, "mean_token_accuracy": 0.4206896424293518, "step": 68245 }, { "epoch": 0.06874212487195835, "grad_norm": 12.650398859564838, "learning_rate": 4.9956802339684235e-05, "loss": 2.6756, "mean_token_accuracy": 0.34482758343219755, "step": 68250 }, { "epoch": 0.06874716092506253, "grad_norm": 11.424958710758034, "learning_rate": 4.995677912819008e-05, "loss": 3.1312, "mean_token_accuracy": 0.32758620381355286, "step": 68255 }, { "epoch": 0.0687521969781667, "grad_norm": 10.719106329382566, "learning_rate": 4.995675591046746e-05, "loss": 2.655, "mean_token_accuracy": 0.41724138259887694, "step": 68260 }, { "epoch": 0.06875723303127086, "grad_norm": 14.989340031481367, "learning_rate": 4.995673268651637e-05, "loss": 3.1029, "mean_token_accuracy": 0.3551724076271057, "step": 68265 }, { "epoch": 0.06876226908437504, "grad_norm": 11.097026851255778, "learning_rate": 4.995670945633683e-05, "loss": 2.3953, "mean_token_accuracy": 0.4206896543502808, "step": 68270 }, { "epoch": 0.06876730513747921, "grad_norm": 11.672875333645576, "learning_rate": 4.9956686219928836e-05, "loss": 2.3799, "mean_token_accuracy": 0.42413792610168455, "step": 68275 }, { "epoch": 0.06877234119058338, "grad_norm": 9.769184310215607, "learning_rate": 4.995666297729239e-05, "loss": 2.2208, "mean_token_accuracy": 0.4758620738983154, "step": 68280 }, { "epoch": 0.06877737724368756, "grad_norm": 10.75739287361541, "learning_rate": 4.9956639728427507e-05, "loss": 2.391, "mean_token_accuracy": 0.4774349570274353, "step": 68285 }, { "epoch": 0.06878241329679173, "grad_norm": 13.394639760364598, "learning_rate": 4.9956616473334194e-05, "loss": 2.6972, "mean_token_accuracy": 0.3862068891525269, "step": 68290 }, { "epoch": 0.0687874493498959, "grad_norm": 11.041523905372237, "learning_rate": 4.995659321201245e-05, "loss": 2.7111, "mean_token_accuracy": 0.4, "step": 68295 }, { "epoch": 0.06879248540300008, "grad_norm": 8.625332400582879, "learning_rate": 4.995656994446229e-05, "loss": 2.6796, "mean_token_accuracy": 0.4068965554237366, "step": 68300 }, { "epoch": 0.06879752145610425, "grad_norm": 9.720938484340916, "learning_rate": 4.995654667068371e-05, "loss": 1.9673, "mean_token_accuracy": 0.46995073556900024, "step": 68305 }, { "epoch": 0.06880255750920843, "grad_norm": 10.52488012676982, "learning_rate": 4.995652339067673e-05, "loss": 2.3777, "mean_token_accuracy": 0.42758620977401735, "step": 68310 }, { "epoch": 0.0688075935623126, "grad_norm": 13.01065089626831, "learning_rate": 4.995650010444135e-05, "loss": 2.1194, "mean_token_accuracy": 0.4620689630508423, "step": 68315 }, { "epoch": 0.06881262961541677, "grad_norm": 14.07132543863634, "learning_rate": 4.995647681197758e-05, "loss": 2.4308, "mean_token_accuracy": 0.42915910482406616, "step": 68320 }, { "epoch": 0.06881766566852095, "grad_norm": 12.946774338917402, "learning_rate": 4.995645351328541e-05, "loss": 2.5139, "mean_token_accuracy": 0.4034482777118683, "step": 68325 }, { "epoch": 0.06882270172162512, "grad_norm": 14.721830659351117, "learning_rate": 4.995643020836487e-05, "loss": 2.7721, "mean_token_accuracy": 0.37241379618644715, "step": 68330 }, { "epoch": 0.06882773777472928, "grad_norm": 12.811521538146035, "learning_rate": 4.995640689721595e-05, "loss": 2.637, "mean_token_accuracy": 0.41034482717514037, "step": 68335 }, { "epoch": 0.06883277382783345, "grad_norm": 12.188066228182917, "learning_rate": 4.9956383579838656e-05, "loss": 3.0205, "mean_token_accuracy": 0.3379310339689255, "step": 68340 }, { "epoch": 0.06883780988093763, "grad_norm": 10.189306275715216, "learning_rate": 4.995636025623301e-05, "loss": 2.3359, "mean_token_accuracy": 0.42413793206214906, "step": 68345 }, { "epoch": 0.0688428459340418, "grad_norm": 11.552688370595554, "learning_rate": 4.9956336926399006e-05, "loss": 2.6752, "mean_token_accuracy": 0.36896551251411436, "step": 68350 }, { "epoch": 0.06884788198714598, "grad_norm": 10.64053065552082, "learning_rate": 4.995631359033665e-05, "loss": 2.3537, "mean_token_accuracy": 0.42014518976211546, "step": 68355 }, { "epoch": 0.06885291804025015, "grad_norm": 13.825055121473975, "learning_rate": 4.995629024804595e-05, "loss": 2.2019, "mean_token_accuracy": 0.4848154723644257, "step": 68360 }, { "epoch": 0.06885795409335432, "grad_norm": 11.848816493515631, "learning_rate": 4.9956266899526915e-05, "loss": 2.4217, "mean_token_accuracy": 0.40344826579093934, "step": 68365 }, { "epoch": 0.0688629901464585, "grad_norm": 11.377183686229952, "learning_rate": 4.995624354477956e-05, "loss": 2.1334, "mean_token_accuracy": 0.4931034445762634, "step": 68370 }, { "epoch": 0.06886802619956267, "grad_norm": 9.698027321301755, "learning_rate": 4.9956220183803875e-05, "loss": 2.3202, "mean_token_accuracy": 0.4034482777118683, "step": 68375 }, { "epoch": 0.06887306225266684, "grad_norm": 11.509564562701797, "learning_rate": 4.995619681659987e-05, "loss": 2.5417, "mean_token_accuracy": 0.38275861740112305, "step": 68380 }, { "epoch": 0.06887809830577102, "grad_norm": 10.040928349171919, "learning_rate": 4.9956173443167564e-05, "loss": 2.401, "mean_token_accuracy": 0.4586206912994385, "step": 68385 }, { "epoch": 0.06888313435887519, "grad_norm": 19.906730386696204, "learning_rate": 4.9956150063506943e-05, "loss": 2.7352, "mean_token_accuracy": 0.3896551728248596, "step": 68390 }, { "epoch": 0.06888817041197937, "grad_norm": 12.616991827631823, "learning_rate": 4.9956126677618035e-05, "loss": 2.435, "mean_token_accuracy": 0.38620689809322356, "step": 68395 }, { "epoch": 0.06889320646508354, "grad_norm": 10.26895841272872, "learning_rate": 4.995610328550084e-05, "loss": 2.3165, "mean_token_accuracy": 0.443254691362381, "step": 68400 }, { "epoch": 0.0688982425181877, "grad_norm": 9.993477608082141, "learning_rate": 4.995607988715534e-05, "loss": 2.4207, "mean_token_accuracy": 0.40689654350280763, "step": 68405 }, { "epoch": 0.06890327857129187, "grad_norm": 15.538352396027378, "learning_rate": 4.9956056482581586e-05, "loss": 2.7349, "mean_token_accuracy": 0.358620685338974, "step": 68410 }, { "epoch": 0.06890831462439605, "grad_norm": 17.254591896865257, "learning_rate": 4.995603307177955e-05, "loss": 2.7628, "mean_token_accuracy": 0.41724138259887694, "step": 68415 }, { "epoch": 0.06891335067750022, "grad_norm": 11.561106690820623, "learning_rate": 4.995600965474925e-05, "loss": 2.4659, "mean_token_accuracy": 0.42413792610168455, "step": 68420 }, { "epoch": 0.0689183867306044, "grad_norm": 10.916660391260852, "learning_rate": 4.995598623149069e-05, "loss": 3.2277, "mean_token_accuracy": 0.39310344457626345, "step": 68425 }, { "epoch": 0.06892342278370857, "grad_norm": 10.310989828371996, "learning_rate": 4.9955962802003887e-05, "loss": 2.3647, "mean_token_accuracy": 0.3999999940395355, "step": 68430 }, { "epoch": 0.06892845883681274, "grad_norm": 11.032759181497525, "learning_rate": 4.9955939366288834e-05, "loss": 2.8766, "mean_token_accuracy": 0.4103448331356049, "step": 68435 }, { "epoch": 0.06893349488991692, "grad_norm": 10.017227522900951, "learning_rate": 4.995591592434554e-05, "loss": 2.4006, "mean_token_accuracy": 0.39310344457626345, "step": 68440 }, { "epoch": 0.06893853094302109, "grad_norm": 14.614787203511902, "learning_rate": 4.995589247617402e-05, "loss": 2.464, "mean_token_accuracy": 0.4034482717514038, "step": 68445 }, { "epoch": 0.06894356699612526, "grad_norm": 11.21538102570232, "learning_rate": 4.995586902177427e-05, "loss": 2.5783, "mean_token_accuracy": 0.36896550953388213, "step": 68450 }, { "epoch": 0.06894860304922944, "grad_norm": 11.130687187389741, "learning_rate": 4.99558455611463e-05, "loss": 2.9655, "mean_token_accuracy": 0.3482758551836014, "step": 68455 }, { "epoch": 0.06895363910233361, "grad_norm": 11.980783784234031, "learning_rate": 4.9955822094290116e-05, "loss": 2.3263, "mean_token_accuracy": 0.4275861978530884, "step": 68460 }, { "epoch": 0.06895867515543778, "grad_norm": 11.627874988993845, "learning_rate": 4.995579862120573e-05, "loss": 2.7726, "mean_token_accuracy": 0.37586206793785093, "step": 68465 }, { "epoch": 0.06896371120854196, "grad_norm": 11.993794892656815, "learning_rate": 4.995577514189314e-05, "loss": 2.3595, "mean_token_accuracy": 0.4, "step": 68470 }, { "epoch": 0.06896874726164612, "grad_norm": 16.163800529366693, "learning_rate": 4.995575165635236e-05, "loss": 2.4382, "mean_token_accuracy": 0.3999999940395355, "step": 68475 }, { "epoch": 0.06897378331475029, "grad_norm": 11.085237128187229, "learning_rate": 4.99557281645834e-05, "loss": 2.7072, "mean_token_accuracy": 0.37241379022598264, "step": 68480 }, { "epoch": 0.06897881936785447, "grad_norm": 14.76984410113925, "learning_rate": 4.995570466658625e-05, "loss": 2.8102, "mean_token_accuracy": 0.3574107691645622, "step": 68485 }, { "epoch": 0.06898385542095864, "grad_norm": 10.53966114243923, "learning_rate": 4.995568116236093e-05, "loss": 2.2478, "mean_token_accuracy": 0.4344827592372894, "step": 68490 }, { "epoch": 0.06898889147406281, "grad_norm": 10.72842197344829, "learning_rate": 4.995565765190744e-05, "loss": 2.5779, "mean_token_accuracy": 0.4137930929660797, "step": 68495 }, { "epoch": 0.06899392752716699, "grad_norm": 12.145113604289183, "learning_rate": 4.99556341352258e-05, "loss": 2.3373, "mean_token_accuracy": 0.42413793206214906, "step": 68500 }, { "epoch": 0.06899896358027116, "grad_norm": 14.661452125856623, "learning_rate": 4.995561061231599e-05, "loss": 2.3491, "mean_token_accuracy": 0.401875376701355, "step": 68505 }, { "epoch": 0.06900399963337533, "grad_norm": 11.43938343223494, "learning_rate": 4.9955587083178044e-05, "loss": 2.2729, "mean_token_accuracy": 0.46031457781791685, "step": 68510 }, { "epoch": 0.06900903568647951, "grad_norm": 11.947915279705231, "learning_rate": 4.995556354781195e-05, "loss": 2.6768, "mean_token_accuracy": 0.3896551728248596, "step": 68515 }, { "epoch": 0.06901407173958368, "grad_norm": 12.195475639202975, "learning_rate": 4.9955540006217726e-05, "loss": 2.6219, "mean_token_accuracy": 0.3655172407627106, "step": 68520 }, { "epoch": 0.06901910779268786, "grad_norm": 11.328282163588128, "learning_rate": 4.9955516458395376e-05, "loss": 2.4307, "mean_token_accuracy": 0.4206896543502808, "step": 68525 }, { "epoch": 0.06902414384579203, "grad_norm": 12.1180032101301, "learning_rate": 4.995549290434491e-05, "loss": 3.143, "mean_token_accuracy": 0.37241379022598264, "step": 68530 }, { "epoch": 0.0690291798988962, "grad_norm": 10.425640153208771, "learning_rate": 4.9955469344066316e-05, "loss": 2.2673, "mean_token_accuracy": 0.41379310488700866, "step": 68535 }, { "epoch": 0.06903421595200038, "grad_norm": 12.139066728407032, "learning_rate": 4.9955445777559626e-05, "loss": 2.6509, "mean_token_accuracy": 0.33103448152542114, "step": 68540 }, { "epoch": 0.06903925200510454, "grad_norm": 10.515487193559398, "learning_rate": 4.995542220482483e-05, "loss": 2.5458, "mean_token_accuracy": 0.42413792610168455, "step": 68545 }, { "epoch": 0.06904428805820871, "grad_norm": 10.28783048709047, "learning_rate": 4.995539862586194e-05, "loss": 2.4284, "mean_token_accuracy": 0.3896551728248596, "step": 68550 }, { "epoch": 0.06904932411131288, "grad_norm": 10.3329281536301, "learning_rate": 4.9955375040670956e-05, "loss": 2.3182, "mean_token_accuracy": 0.42413792610168455, "step": 68555 }, { "epoch": 0.06905436016441706, "grad_norm": 13.747447509805834, "learning_rate": 4.995535144925189e-05, "loss": 2.8041, "mean_token_accuracy": 0.36896551251411436, "step": 68560 }, { "epoch": 0.06905939621752123, "grad_norm": 13.445670757314042, "learning_rate": 4.995532785160475e-05, "loss": 2.5272, "mean_token_accuracy": 0.3827586233615875, "step": 68565 }, { "epoch": 0.0690644322706254, "grad_norm": 9.686006047570606, "learning_rate": 4.995530424772955e-05, "loss": 2.4205, "mean_token_accuracy": 0.3724137932062149, "step": 68570 }, { "epoch": 0.06906946832372958, "grad_norm": 10.238154704688407, "learning_rate": 4.9955280637626275e-05, "loss": 2.6881, "mean_token_accuracy": 0.4000000059604645, "step": 68575 }, { "epoch": 0.06907450437683375, "grad_norm": 10.636075214163679, "learning_rate": 4.995525702129495e-05, "loss": 2.3645, "mean_token_accuracy": 0.3999999940395355, "step": 68580 }, { "epoch": 0.06907954042993793, "grad_norm": 13.234580379916961, "learning_rate": 4.995523339873557e-05, "loss": 2.6993, "mean_token_accuracy": 0.3862069010734558, "step": 68585 }, { "epoch": 0.0690845764830421, "grad_norm": 13.167188487716343, "learning_rate": 4.9955209769948155e-05, "loss": 2.3473, "mean_token_accuracy": 0.4310344815254211, "step": 68590 }, { "epoch": 0.06908961253614627, "grad_norm": 11.08968255253257, "learning_rate": 4.99551861349327e-05, "loss": 2.8021, "mean_token_accuracy": 0.37586207389831544, "step": 68595 }, { "epoch": 0.06909464858925045, "grad_norm": 10.957593880194658, "learning_rate": 4.9955162493689225e-05, "loss": 2.5719, "mean_token_accuracy": 0.3620689630508423, "step": 68600 }, { "epoch": 0.06909968464235462, "grad_norm": 9.444082987842588, "learning_rate": 4.995513884621771e-05, "loss": 2.6809, "mean_token_accuracy": 0.3880822777748108, "step": 68605 }, { "epoch": 0.0691047206954588, "grad_norm": 10.452925842264674, "learning_rate": 4.9955115192518195e-05, "loss": 2.607, "mean_token_accuracy": 0.3931034505367279, "step": 68610 }, { "epoch": 0.06910975674856296, "grad_norm": 11.284994723468822, "learning_rate": 4.9955091532590655e-05, "loss": 2.6513, "mean_token_accuracy": 0.3862069010734558, "step": 68615 }, { "epoch": 0.06911479280166713, "grad_norm": 11.455402326753806, "learning_rate": 4.995506786643512e-05, "loss": 2.5495, "mean_token_accuracy": 0.43448275327682495, "step": 68620 }, { "epoch": 0.0691198288547713, "grad_norm": 12.156602715431191, "learning_rate": 4.995504419405159e-05, "loss": 2.1219, "mean_token_accuracy": 0.4310344815254211, "step": 68625 }, { "epoch": 0.06912486490787548, "grad_norm": 14.44076571141514, "learning_rate": 4.995502051544007e-05, "loss": 2.6554, "mean_token_accuracy": 0.3931034505367279, "step": 68630 }, { "epoch": 0.06912990096097965, "grad_norm": 9.331211057562246, "learning_rate": 4.9954996830600557e-05, "loss": 2.6234, "mean_token_accuracy": 0.4137930989265442, "step": 68635 }, { "epoch": 0.06913493701408382, "grad_norm": 13.387125275798537, "learning_rate": 4.995497313953307e-05, "loss": 2.5179, "mean_token_accuracy": 0.3896551728248596, "step": 68640 }, { "epoch": 0.069139973067188, "grad_norm": 11.614357486543609, "learning_rate": 4.995494944223761e-05, "loss": 2.8698, "mean_token_accuracy": 0.37586206793785093, "step": 68645 }, { "epoch": 0.06914500912029217, "grad_norm": 10.2091869982369, "learning_rate": 4.99549257387142e-05, "loss": 2.3818, "mean_token_accuracy": 0.47586206197738645, "step": 68650 }, { "epoch": 0.06915004517339635, "grad_norm": 12.294449936209256, "learning_rate": 4.995490202896281e-05, "loss": 2.1653, "mean_token_accuracy": 0.4620689690113068, "step": 68655 }, { "epoch": 0.06915508122650052, "grad_norm": 11.628665868012853, "learning_rate": 4.99548783129835e-05, "loss": 2.4145, "mean_token_accuracy": 0.3896551728248596, "step": 68660 }, { "epoch": 0.0691601172796047, "grad_norm": 10.41633871684809, "learning_rate": 4.995485459077622e-05, "loss": 2.4332, "mean_token_accuracy": 0.42758620977401735, "step": 68665 }, { "epoch": 0.06916515333270887, "grad_norm": 11.501497858255746, "learning_rate": 4.995483086234101e-05, "loss": 2.6771, "mean_token_accuracy": 0.38275861740112305, "step": 68670 }, { "epoch": 0.06917018938581304, "grad_norm": 10.792549804645974, "learning_rate": 4.995480712767787e-05, "loss": 2.1903, "mean_token_accuracy": 0.4593596041202545, "step": 68675 }, { "epoch": 0.06917522543891721, "grad_norm": 11.947470278700079, "learning_rate": 4.99547833867868e-05, "loss": 2.5797, "mean_token_accuracy": 0.40689656138420105, "step": 68680 }, { "epoch": 0.06918026149202137, "grad_norm": 14.180908310440948, "learning_rate": 4.995475963966782e-05, "loss": 2.8306, "mean_token_accuracy": 0.38620689511299133, "step": 68685 }, { "epoch": 0.06918529754512555, "grad_norm": 12.015600515394084, "learning_rate": 4.995473588632093e-05, "loss": 2.4433, "mean_token_accuracy": 0.41034482717514037, "step": 68690 }, { "epoch": 0.06919033359822972, "grad_norm": 10.326587450970427, "learning_rate": 4.9954712126746125e-05, "loss": 2.4806, "mean_token_accuracy": 0.4223230481147766, "step": 68695 }, { "epoch": 0.0691953696513339, "grad_norm": 10.907207101792315, "learning_rate": 4.9954688360943424e-05, "loss": 2.4136, "mean_token_accuracy": 0.3979431390762329, "step": 68700 }, { "epoch": 0.06920040570443807, "grad_norm": 12.443756143362155, "learning_rate": 4.995466458891284e-05, "loss": 2.4007, "mean_token_accuracy": 0.41034482717514037, "step": 68705 }, { "epoch": 0.06920544175754224, "grad_norm": 11.355663241090976, "learning_rate": 4.995464081065437e-05, "loss": 2.411, "mean_token_accuracy": 0.42758620381355283, "step": 68710 }, { "epoch": 0.06921047781064642, "grad_norm": 12.154822167650508, "learning_rate": 4.9954617026168017e-05, "loss": 2.5309, "mean_token_accuracy": 0.41034482717514037, "step": 68715 }, { "epoch": 0.06921551386375059, "grad_norm": 17.316687640054468, "learning_rate": 4.995459323545379e-05, "loss": 2.456, "mean_token_accuracy": 0.3793103456497192, "step": 68720 }, { "epoch": 0.06922054991685476, "grad_norm": 11.649141479761466, "learning_rate": 4.995456943851171e-05, "loss": 2.6445, "mean_token_accuracy": 0.3655172407627106, "step": 68725 }, { "epoch": 0.06922558596995894, "grad_norm": 16.368233549752873, "learning_rate": 4.9954545635341755e-05, "loss": 2.8127, "mean_token_accuracy": 0.38275861740112305, "step": 68730 }, { "epoch": 0.06923062202306311, "grad_norm": 13.01168856137935, "learning_rate": 4.9954521825943956e-05, "loss": 2.399, "mean_token_accuracy": 0.41724138259887694, "step": 68735 }, { "epoch": 0.06923565807616729, "grad_norm": 11.763145816658948, "learning_rate": 4.995449801031831e-05, "loss": 2.6363, "mean_token_accuracy": 0.40344826579093934, "step": 68740 }, { "epoch": 0.06924069412927146, "grad_norm": 12.234524227982364, "learning_rate": 4.9954474188464824e-05, "loss": 2.9011, "mean_token_accuracy": 0.3911070764064789, "step": 68745 }, { "epoch": 0.06924573018237563, "grad_norm": 10.43238559814186, "learning_rate": 4.995445036038351e-05, "loss": 2.4521, "mean_token_accuracy": 0.3896551728248596, "step": 68750 }, { "epoch": 0.0692507662354798, "grad_norm": 12.467078694985444, "learning_rate": 4.995442652607437e-05, "loss": 2.8374, "mean_token_accuracy": 0.358620685338974, "step": 68755 }, { "epoch": 0.06925580228858397, "grad_norm": 14.385823381623696, "learning_rate": 4.9954402685537404e-05, "loss": 2.8, "mean_token_accuracy": 0.38965516686439516, "step": 68760 }, { "epoch": 0.06926083834168814, "grad_norm": 15.04342781232573, "learning_rate": 4.995437883877263e-05, "loss": 2.4493, "mean_token_accuracy": 0.4517241358757019, "step": 68765 }, { "epoch": 0.06926587439479232, "grad_norm": 10.560124977734038, "learning_rate": 4.995435498578005e-05, "loss": 2.2557, "mean_token_accuracy": 0.45862069725990295, "step": 68770 }, { "epoch": 0.06927091044789649, "grad_norm": 10.450295484171619, "learning_rate": 4.995433112655968e-05, "loss": 2.6005, "mean_token_accuracy": 0.37586206793785093, "step": 68775 }, { "epoch": 0.06927594650100066, "grad_norm": 12.078299774100959, "learning_rate": 4.99543072611115e-05, "loss": 2.454, "mean_token_accuracy": 0.42758620381355283, "step": 68780 }, { "epoch": 0.06928098255410484, "grad_norm": 13.698929098045454, "learning_rate": 4.9954283389435545e-05, "loss": 2.2923, "mean_token_accuracy": 0.45051422119140627, "step": 68785 }, { "epoch": 0.06928601860720901, "grad_norm": 11.01010567253589, "learning_rate": 4.9954259511531815e-05, "loss": 2.6219, "mean_token_accuracy": 0.38620689511299133, "step": 68790 }, { "epoch": 0.06929105466031318, "grad_norm": 9.777383742958852, "learning_rate": 4.995423562740031e-05, "loss": 2.4177, "mean_token_accuracy": 0.41530550122261045, "step": 68795 }, { "epoch": 0.06929609071341736, "grad_norm": 9.150017781487204, "learning_rate": 4.995421173704103e-05, "loss": 2.2728, "mean_token_accuracy": 0.42413792610168455, "step": 68800 }, { "epoch": 0.06930112676652153, "grad_norm": 10.855660054710444, "learning_rate": 4.9954187840453996e-05, "loss": 2.4523, "mean_token_accuracy": 0.41034482717514037, "step": 68805 }, { "epoch": 0.0693061628196257, "grad_norm": 10.884763382822344, "learning_rate": 4.995416393763922e-05, "loss": 2.5689, "mean_token_accuracy": 0.3517241358757019, "step": 68810 }, { "epoch": 0.06931119887272988, "grad_norm": 10.917681794814634, "learning_rate": 4.995414002859668e-05, "loss": 2.4923, "mean_token_accuracy": 0.3827586263418198, "step": 68815 }, { "epoch": 0.06931623492583405, "grad_norm": 16.430117647235722, "learning_rate": 4.99541161133264e-05, "loss": 2.6375, "mean_token_accuracy": 0.35172414481639863, "step": 68820 }, { "epoch": 0.06932127097893821, "grad_norm": 12.930266203709143, "learning_rate": 4.99540921918284e-05, "loss": 2.4642, "mean_token_accuracy": 0.42068966031074523, "step": 68825 }, { "epoch": 0.06932630703204239, "grad_norm": 9.73922959309583, "learning_rate": 4.995406826410267e-05, "loss": 2.6668, "mean_token_accuracy": 0.4360556542873383, "step": 68830 }, { "epoch": 0.06933134308514656, "grad_norm": 11.414551492463593, "learning_rate": 4.995404433014922e-05, "loss": 2.2866, "mean_token_accuracy": 0.4206896543502808, "step": 68835 }, { "epoch": 0.06933637913825073, "grad_norm": 12.034953929478462, "learning_rate": 4.995402038996805e-05, "loss": 2.2224, "mean_token_accuracy": 0.42413792610168455, "step": 68840 }, { "epoch": 0.06934141519135491, "grad_norm": 11.870792432328185, "learning_rate": 4.995399644355919e-05, "loss": 2.4494, "mean_token_accuracy": 0.4068965554237366, "step": 68845 }, { "epoch": 0.06934645124445908, "grad_norm": 11.558940644956172, "learning_rate": 4.995397249092261e-05, "loss": 2.2503, "mean_token_accuracy": 0.47586206197738645, "step": 68850 }, { "epoch": 0.06935148729756326, "grad_norm": 10.759807408604575, "learning_rate": 4.995394853205835e-05, "loss": 2.6779, "mean_token_accuracy": 0.4151845157146454, "step": 68855 }, { "epoch": 0.06935652335066743, "grad_norm": 10.330986829314917, "learning_rate": 4.9953924566966404e-05, "loss": 2.4523, "mean_token_accuracy": 0.38620689511299133, "step": 68860 }, { "epoch": 0.0693615594037716, "grad_norm": 10.753541205291183, "learning_rate": 4.995390059564677e-05, "loss": 2.6479, "mean_token_accuracy": 0.4068965554237366, "step": 68865 }, { "epoch": 0.06936659545687578, "grad_norm": 11.073421025987594, "learning_rate": 4.9953876618099475e-05, "loss": 2.1975, "mean_token_accuracy": 0.42758620977401735, "step": 68870 }, { "epoch": 0.06937163150997995, "grad_norm": 11.213531552285495, "learning_rate": 4.9953852634324514e-05, "loss": 2.5785, "mean_token_accuracy": 0.4206896543502808, "step": 68875 }, { "epoch": 0.06937666756308412, "grad_norm": 11.725228726465163, "learning_rate": 4.9953828644321875e-05, "loss": 2.7197, "mean_token_accuracy": 0.36896551847457887, "step": 68880 }, { "epoch": 0.0693817036161883, "grad_norm": 11.068580729639695, "learning_rate": 4.995380464809159e-05, "loss": 2.5585, "mean_token_accuracy": 0.42068966031074523, "step": 68885 }, { "epoch": 0.06938673966929247, "grad_norm": 11.913123053052246, "learning_rate": 4.995378064563367e-05, "loss": 2.5933, "mean_token_accuracy": 0.41724138259887694, "step": 68890 }, { "epoch": 0.06939177572239663, "grad_norm": 10.090095709382323, "learning_rate": 4.9953756636948104e-05, "loss": 2.3276, "mean_token_accuracy": 0.42758620977401735, "step": 68895 }, { "epoch": 0.0693968117755008, "grad_norm": 11.615033413924445, "learning_rate": 4.9953732622034904e-05, "loss": 2.2036, "mean_token_accuracy": 0.4503327250480652, "step": 68900 }, { "epoch": 0.06940184782860498, "grad_norm": 16.901866636531842, "learning_rate": 4.995370860089408e-05, "loss": 2.8175, "mean_token_accuracy": 0.33448275923728943, "step": 68905 }, { "epoch": 0.06940688388170915, "grad_norm": 13.989519230748044, "learning_rate": 4.995368457352563e-05, "loss": 2.1626, "mean_token_accuracy": 0.44827585816383364, "step": 68910 }, { "epoch": 0.06941191993481333, "grad_norm": 11.210757881121143, "learning_rate": 4.995366053992957e-05, "loss": 2.7814, "mean_token_accuracy": 0.39993950724601746, "step": 68915 }, { "epoch": 0.0694169559879175, "grad_norm": 13.854054188760335, "learning_rate": 4.995363650010589e-05, "loss": 2.7845, "mean_token_accuracy": 0.358620685338974, "step": 68920 }, { "epoch": 0.06942199204102167, "grad_norm": 11.14041695564461, "learning_rate": 4.9953612454054636e-05, "loss": 2.4939, "mean_token_accuracy": 0.38620689511299133, "step": 68925 }, { "epoch": 0.06942702809412585, "grad_norm": 14.482412728840401, "learning_rate": 4.995358840177577e-05, "loss": 2.9452, "mean_token_accuracy": 0.3965517282485962, "step": 68930 }, { "epoch": 0.06943206414723002, "grad_norm": 11.06778317753632, "learning_rate": 4.995356434326933e-05, "loss": 2.3983, "mean_token_accuracy": 0.4379310429096222, "step": 68935 }, { "epoch": 0.0694371002003342, "grad_norm": 11.456746249681887, "learning_rate": 4.99535402785353e-05, "loss": 3.2143, "mean_token_accuracy": 0.317241370677948, "step": 68940 }, { "epoch": 0.06944213625343837, "grad_norm": 11.641652918004821, "learning_rate": 4.9953516207573703e-05, "loss": 2.6824, "mean_token_accuracy": 0.39655172228813174, "step": 68945 }, { "epoch": 0.06944717230654254, "grad_norm": 11.379517999043522, "learning_rate": 4.9953492130384534e-05, "loss": 2.4344, "mean_token_accuracy": 0.4000000059604645, "step": 68950 }, { "epoch": 0.06945220835964672, "grad_norm": 9.640750488301148, "learning_rate": 4.99534680469678e-05, "loss": 2.2563, "mean_token_accuracy": 0.458620685338974, "step": 68955 }, { "epoch": 0.06945724441275089, "grad_norm": 14.3951908657317, "learning_rate": 4.995344395732353e-05, "loss": 2.4955, "mean_token_accuracy": 0.3931034505367279, "step": 68960 }, { "epoch": 0.06946228046585505, "grad_norm": 12.58434918281384, "learning_rate": 4.9953419861451705e-05, "loss": 2.5236, "mean_token_accuracy": 0.44827585816383364, "step": 68965 }, { "epoch": 0.06946731651895922, "grad_norm": 17.206526493849143, "learning_rate": 4.995339575935234e-05, "loss": 2.9202, "mean_token_accuracy": 0.34137930870056155, "step": 68970 }, { "epoch": 0.0694723525720634, "grad_norm": 12.553841148156884, "learning_rate": 4.995337165102544e-05, "loss": 2.4965, "mean_token_accuracy": 0.37586206793785093, "step": 68975 }, { "epoch": 0.06947738862516757, "grad_norm": 13.399813701867968, "learning_rate": 4.995334753647102e-05, "loss": 2.2564, "mean_token_accuracy": 0.44150246381759645, "step": 68980 }, { "epoch": 0.06948242467827175, "grad_norm": 10.10172739297303, "learning_rate": 4.9953323415689074e-05, "loss": 2.4994, "mean_token_accuracy": 0.4241379380226135, "step": 68985 }, { "epoch": 0.06948746073137592, "grad_norm": 12.208975771978402, "learning_rate": 4.995329928867961e-05, "loss": 2.6605, "mean_token_accuracy": 0.42413793206214906, "step": 68990 }, { "epoch": 0.06949249678448009, "grad_norm": 16.570529539590613, "learning_rate": 4.9953275155442655e-05, "loss": 2.3356, "mean_token_accuracy": 0.4172413766384125, "step": 68995 }, { "epoch": 0.06949753283758427, "grad_norm": 10.433911929830472, "learning_rate": 4.9953251015978186e-05, "loss": 2.4537, "mean_token_accuracy": 0.44482758045196535, "step": 69000 }, { "epoch": 0.06950256889068844, "grad_norm": 13.250607140260652, "learning_rate": 4.995322687028623e-05, "loss": 2.6566, "mean_token_accuracy": 0.3862069010734558, "step": 69005 }, { "epoch": 0.06950760494379261, "grad_norm": 14.175663597725457, "learning_rate": 4.9953202718366786e-05, "loss": 2.5445, "mean_token_accuracy": 0.41034482717514037, "step": 69010 }, { "epoch": 0.06951264099689679, "grad_norm": 10.254019031143196, "learning_rate": 4.995317856021987e-05, "loss": 2.4162, "mean_token_accuracy": 0.46896551847457885, "step": 69015 }, { "epoch": 0.06951767705000096, "grad_norm": 14.30277014771815, "learning_rate": 4.9953154395845475e-05, "loss": 2.7011, "mean_token_accuracy": 0.37241379022598264, "step": 69020 }, { "epoch": 0.06952271310310514, "grad_norm": 11.70104131128868, "learning_rate": 4.995313022524361e-05, "loss": 2.7611, "mean_token_accuracy": 0.38620689511299133, "step": 69025 }, { "epoch": 0.06952774915620931, "grad_norm": 10.253303509416424, "learning_rate": 4.9953106048414293e-05, "loss": 2.584, "mean_token_accuracy": 0.4034482777118683, "step": 69030 }, { "epoch": 0.06953278520931347, "grad_norm": 11.874387052025757, "learning_rate": 4.995308186535752e-05, "loss": 2.5118, "mean_token_accuracy": 0.40344828367233276, "step": 69035 }, { "epoch": 0.06953782126241764, "grad_norm": 11.512632384894767, "learning_rate": 4.99530576760733e-05, "loss": 2.6066, "mean_token_accuracy": 0.4398064136505127, "step": 69040 }, { "epoch": 0.06954285731552182, "grad_norm": 11.610155950263424, "learning_rate": 4.995303348056164e-05, "loss": 2.4158, "mean_token_accuracy": 0.39655172228813174, "step": 69045 }, { "epoch": 0.06954789336862599, "grad_norm": 9.889454074803922, "learning_rate": 4.995300927882256e-05, "loss": 2.5037, "mean_token_accuracy": 0.43103448748588563, "step": 69050 }, { "epoch": 0.06955292942173016, "grad_norm": 11.308732888993708, "learning_rate": 4.9952985070856034e-05, "loss": 2.5873, "mean_token_accuracy": 0.4, "step": 69055 }, { "epoch": 0.06955796547483434, "grad_norm": 10.879324492961441, "learning_rate": 4.99529608566621e-05, "loss": 2.4889, "mean_token_accuracy": 0.44137930274009707, "step": 69060 }, { "epoch": 0.06956300152793851, "grad_norm": 12.151599275915636, "learning_rate": 4.995293663624075e-05, "loss": 2.7146, "mean_token_accuracy": 0.36896551847457887, "step": 69065 }, { "epoch": 0.06956803758104269, "grad_norm": 13.889498073531236, "learning_rate": 4.9952912409592e-05, "loss": 2.8549, "mean_token_accuracy": 0.37241379618644715, "step": 69070 }, { "epoch": 0.06957307363414686, "grad_norm": 12.177187852860959, "learning_rate": 4.995288817671584e-05, "loss": 2.7181, "mean_token_accuracy": 0.41724138259887694, "step": 69075 }, { "epoch": 0.06957810968725103, "grad_norm": 12.384731382360865, "learning_rate": 4.99528639376123e-05, "loss": 2.3653, "mean_token_accuracy": 0.42413793206214906, "step": 69080 }, { "epoch": 0.0695831457403552, "grad_norm": 10.976640050596247, "learning_rate": 4.995283969228137e-05, "loss": 3.1003, "mean_token_accuracy": 0.3655172407627106, "step": 69085 }, { "epoch": 0.06958818179345938, "grad_norm": 12.026789175310098, "learning_rate": 4.9952815440723055e-05, "loss": 2.5681, "mean_token_accuracy": 0.4344827592372894, "step": 69090 }, { "epoch": 0.06959321784656355, "grad_norm": 12.121541156799083, "learning_rate": 4.9952791182937374e-05, "loss": 2.5274, "mean_token_accuracy": 0.41034482717514037, "step": 69095 }, { "epoch": 0.06959825389966773, "grad_norm": 14.927715533335313, "learning_rate": 4.995276691892433e-05, "loss": 2.4328, "mean_token_accuracy": 0.38275861740112305, "step": 69100 }, { "epoch": 0.06960328995277189, "grad_norm": 10.265588983489966, "learning_rate": 4.995274264868392e-05, "loss": 2.5255, "mean_token_accuracy": 0.4310344815254211, "step": 69105 }, { "epoch": 0.06960832600587606, "grad_norm": 9.780438737432963, "learning_rate": 4.995271837221616e-05, "loss": 2.429, "mean_token_accuracy": 0.4329703629016876, "step": 69110 }, { "epoch": 0.06961336205898024, "grad_norm": 15.03281090190755, "learning_rate": 4.995269408952106e-05, "loss": 2.4947, "mean_token_accuracy": 0.38965516686439516, "step": 69115 }, { "epoch": 0.06961839811208441, "grad_norm": 9.484920805830832, "learning_rate": 4.995266980059862e-05, "loss": 2.5013, "mean_token_accuracy": 0.4034482717514038, "step": 69120 }, { "epoch": 0.06962343416518858, "grad_norm": 13.915142028456298, "learning_rate": 4.995264550544885e-05, "loss": 2.2544, "mean_token_accuracy": 0.4379310250282288, "step": 69125 }, { "epoch": 0.06962847021829276, "grad_norm": 13.229519668757016, "learning_rate": 4.995262120407175e-05, "loss": 2.4697, "mean_token_accuracy": 0.42499999403953553, "step": 69130 }, { "epoch": 0.06963350627139693, "grad_norm": 11.68384412914266, "learning_rate": 4.995259689646733e-05, "loss": 2.7728, "mean_token_accuracy": 0.3551724135875702, "step": 69135 }, { "epoch": 0.0696385423245011, "grad_norm": 15.215890104803373, "learning_rate": 4.99525725826356e-05, "loss": 3.0822, "mean_token_accuracy": 0.32758620083332063, "step": 69140 }, { "epoch": 0.06964357837760528, "grad_norm": 10.178072054726151, "learning_rate": 4.9952548262576566e-05, "loss": 2.1224, "mean_token_accuracy": 0.4862069010734558, "step": 69145 }, { "epoch": 0.06964861443070945, "grad_norm": 12.110112671056603, "learning_rate": 4.995252393629024e-05, "loss": 2.7521, "mean_token_accuracy": 0.3793103456497192, "step": 69150 }, { "epoch": 0.06965365048381363, "grad_norm": 14.311716667498578, "learning_rate": 4.9952499603776615e-05, "loss": 2.3517, "mean_token_accuracy": 0.4068965554237366, "step": 69155 }, { "epoch": 0.0696586865369178, "grad_norm": 10.50301247360131, "learning_rate": 4.9952475265035706e-05, "loss": 2.3775, "mean_token_accuracy": 0.44827585816383364, "step": 69160 }, { "epoch": 0.06966372259002197, "grad_norm": 10.910908198938966, "learning_rate": 4.9952450920067526e-05, "loss": 2.4826, "mean_token_accuracy": 0.3965517282485962, "step": 69165 }, { "epoch": 0.06966875864312615, "grad_norm": 12.543590583091008, "learning_rate": 4.9952426568872075e-05, "loss": 2.8059, "mean_token_accuracy": 0.37586207389831544, "step": 69170 }, { "epoch": 0.0696737946962303, "grad_norm": 10.17003951953099, "learning_rate": 4.995240221144935e-05, "loss": 2.6055, "mean_token_accuracy": 0.3931034505367279, "step": 69175 }, { "epoch": 0.06967883074933448, "grad_norm": 11.175095015564159, "learning_rate": 4.9952377847799366e-05, "loss": 2.4398, "mean_token_accuracy": 0.44827585220336913, "step": 69180 }, { "epoch": 0.06968386680243865, "grad_norm": 11.395416032224416, "learning_rate": 4.995235347792214e-05, "loss": 2.4825, "mean_token_accuracy": 0.41724138259887694, "step": 69185 }, { "epoch": 0.06968890285554283, "grad_norm": 12.152853753237443, "learning_rate": 4.9952329101817676e-05, "loss": 2.5999, "mean_token_accuracy": 0.42758620977401735, "step": 69190 }, { "epoch": 0.069693938908647, "grad_norm": 10.979722187268564, "learning_rate": 4.995230471948596e-05, "loss": 2.0078, "mean_token_accuracy": 0.48094373345375063, "step": 69195 }, { "epoch": 0.06969897496175118, "grad_norm": 11.643808166754644, "learning_rate": 4.995228033092703e-05, "loss": 2.7891, "mean_token_accuracy": 0.41034482717514037, "step": 69200 }, { "epoch": 0.06970401101485535, "grad_norm": 12.59336213559089, "learning_rate": 4.995225593614086e-05, "loss": 2.663, "mean_token_accuracy": 0.36206896007061007, "step": 69205 }, { "epoch": 0.06970904706795952, "grad_norm": 11.353422203876665, "learning_rate": 4.995223153512748e-05, "loss": 2.5689, "mean_token_accuracy": 0.4068965494632721, "step": 69210 }, { "epoch": 0.0697140831210637, "grad_norm": 11.891655055457258, "learning_rate": 4.995220712788689e-05, "loss": 2.6296, "mean_token_accuracy": 0.4034482717514038, "step": 69215 }, { "epoch": 0.06971911917416787, "grad_norm": 12.16609974850152, "learning_rate": 4.9952182714419094e-05, "loss": 2.693, "mean_token_accuracy": 0.4137930989265442, "step": 69220 }, { "epoch": 0.06972415522727204, "grad_norm": 10.697233800366224, "learning_rate": 4.99521582947241e-05, "loss": 2.3711, "mean_token_accuracy": 0.4241379380226135, "step": 69225 }, { "epoch": 0.06972919128037622, "grad_norm": 10.90460049265713, "learning_rate": 4.995213386880192e-05, "loss": 2.3015, "mean_token_accuracy": 0.4620689630508423, "step": 69230 }, { "epoch": 0.06973422733348039, "grad_norm": 10.046911844460293, "learning_rate": 4.9952109436652556e-05, "loss": 2.855, "mean_token_accuracy": 0.4068965554237366, "step": 69235 }, { "epoch": 0.06973926338658457, "grad_norm": 11.284415791745777, "learning_rate": 4.995208499827602e-05, "loss": 2.3464, "mean_token_accuracy": 0.4206896543502808, "step": 69240 }, { "epoch": 0.06974429943968873, "grad_norm": 13.568014567785893, "learning_rate": 4.995206055367231e-05, "loss": 2.6642, "mean_token_accuracy": 0.38620689511299133, "step": 69245 }, { "epoch": 0.0697493354927929, "grad_norm": 31.495450284490325, "learning_rate": 4.995203610284144e-05, "loss": 2.6996, "mean_token_accuracy": 0.43448275327682495, "step": 69250 }, { "epoch": 0.06975437154589707, "grad_norm": 11.330954547460001, "learning_rate": 4.995201164578341e-05, "loss": 2.7512, "mean_token_accuracy": 0.4137930989265442, "step": 69255 }, { "epoch": 0.06975940759900125, "grad_norm": 13.391847941845976, "learning_rate": 4.995198718249823e-05, "loss": 2.2322, "mean_token_accuracy": 0.443993604183197, "step": 69260 }, { "epoch": 0.06976444365210542, "grad_norm": 11.052345841105538, "learning_rate": 4.995196271298591e-05, "loss": 2.1857, "mean_token_accuracy": 0.46551724672317507, "step": 69265 }, { "epoch": 0.0697694797052096, "grad_norm": 9.670145462510254, "learning_rate": 4.995193823724646e-05, "loss": 2.4563, "mean_token_accuracy": 0.40098522007465365, "step": 69270 }, { "epoch": 0.06977451575831377, "grad_norm": 13.39834029234346, "learning_rate": 4.995191375527987e-05, "loss": 2.4028, "mean_token_accuracy": 0.43266787827014924, "step": 69275 }, { "epoch": 0.06977955181141794, "grad_norm": 12.509633988924094, "learning_rate": 4.995188926708617e-05, "loss": 2.9001, "mean_token_accuracy": 0.3448275804519653, "step": 69280 }, { "epoch": 0.06978458786452212, "grad_norm": 10.904922590481267, "learning_rate": 4.9951864772665345e-05, "loss": 2.5634, "mean_token_accuracy": 0.38965517580509185, "step": 69285 }, { "epoch": 0.06978962391762629, "grad_norm": 11.899745762346427, "learning_rate": 4.995184027201742e-05, "loss": 2.7106, "mean_token_accuracy": 0.36551724672317504, "step": 69290 }, { "epoch": 0.06979465997073046, "grad_norm": 13.124551297736753, "learning_rate": 4.995181576514239e-05, "loss": 2.4451, "mean_token_accuracy": 0.4689655125141144, "step": 69295 }, { "epoch": 0.06979969602383464, "grad_norm": 12.832785151138568, "learning_rate": 4.995179125204026e-05, "loss": 2.7844, "mean_token_accuracy": 0.3655172407627106, "step": 69300 }, { "epoch": 0.06980473207693881, "grad_norm": 11.834963449405778, "learning_rate": 4.9951766732711044e-05, "loss": 2.4623, "mean_token_accuracy": 0.441379314661026, "step": 69305 }, { "epoch": 0.06980976813004298, "grad_norm": 11.806922302441444, "learning_rate": 4.995174220715475e-05, "loss": 2.3533, "mean_token_accuracy": 0.4344827592372894, "step": 69310 }, { "epoch": 0.06981480418314714, "grad_norm": 14.02394715771266, "learning_rate": 4.9951717675371385e-05, "loss": 2.4274, "mean_token_accuracy": 0.42413792908191683, "step": 69315 }, { "epoch": 0.06981984023625132, "grad_norm": 9.243206114209983, "learning_rate": 4.995169313736094e-05, "loss": 2.0182, "mean_token_accuracy": 0.4965517222881317, "step": 69320 }, { "epoch": 0.06982487628935549, "grad_norm": 12.386063511544483, "learning_rate": 4.995166859312344e-05, "loss": 3.052, "mean_token_accuracy": 0.37241379618644715, "step": 69325 }, { "epoch": 0.06982991234245967, "grad_norm": 11.798422481080193, "learning_rate": 4.995164404265889e-05, "loss": 2.4693, "mean_token_accuracy": 0.42068966031074523, "step": 69330 }, { "epoch": 0.06983494839556384, "grad_norm": 12.73035962158462, "learning_rate": 4.995161948596729e-05, "loss": 2.5947, "mean_token_accuracy": 0.3172413736581802, "step": 69335 }, { "epoch": 0.06983998444866801, "grad_norm": 13.955569574120261, "learning_rate": 4.995159492304865e-05, "loss": 2.5831, "mean_token_accuracy": 0.39655172228813174, "step": 69340 }, { "epoch": 0.06984502050177219, "grad_norm": 13.94063524864331, "learning_rate": 4.995157035390298e-05, "loss": 2.728, "mean_token_accuracy": 0.38620689511299133, "step": 69345 }, { "epoch": 0.06985005655487636, "grad_norm": 9.72088114737023, "learning_rate": 4.995154577853027e-05, "loss": 2.4625, "mean_token_accuracy": 0.42196006774902345, "step": 69350 }, { "epoch": 0.06985509260798053, "grad_norm": 14.947868423690267, "learning_rate": 4.9951521196930554e-05, "loss": 2.8758, "mean_token_accuracy": 0.31724138259887696, "step": 69355 }, { "epoch": 0.06986012866108471, "grad_norm": 11.663484283600335, "learning_rate": 4.995149660910382e-05, "loss": 2.658, "mean_token_accuracy": 0.3843920111656189, "step": 69360 }, { "epoch": 0.06986516471418888, "grad_norm": 11.765140212739167, "learning_rate": 4.9951472015050076e-05, "loss": 2.3417, "mean_token_accuracy": 0.4413793087005615, "step": 69365 }, { "epoch": 0.06987020076729306, "grad_norm": 12.267805124783123, "learning_rate": 4.995144741476934e-05, "loss": 2.6408, "mean_token_accuracy": 0.3931034505367279, "step": 69370 }, { "epoch": 0.06987523682039723, "grad_norm": 12.304540046812004, "learning_rate": 4.9951422808261604e-05, "loss": 2.5695, "mean_token_accuracy": 0.38275861740112305, "step": 69375 }, { "epoch": 0.0698802728735014, "grad_norm": 11.243892863772443, "learning_rate": 4.995139819552689e-05, "loss": 2.3582, "mean_token_accuracy": 0.42413792610168455, "step": 69380 }, { "epoch": 0.06988530892660556, "grad_norm": 12.225143773666671, "learning_rate": 4.9951373576565194e-05, "loss": 2.4377, "mean_token_accuracy": 0.3965517163276672, "step": 69385 }, { "epoch": 0.06989034497970974, "grad_norm": 11.262541976423735, "learning_rate": 4.9951348951376526e-05, "loss": 2.2699, "mean_token_accuracy": 0.4344827592372894, "step": 69390 }, { "epoch": 0.06989538103281391, "grad_norm": 10.27466864976267, "learning_rate": 4.995132431996089e-05, "loss": 2.8771, "mean_token_accuracy": 0.33103448450565337, "step": 69395 }, { "epoch": 0.06990041708591808, "grad_norm": 11.977076679845837, "learning_rate": 4.995129968231829e-05, "loss": 2.6374, "mean_token_accuracy": 0.3655172407627106, "step": 69400 }, { "epoch": 0.06990545313902226, "grad_norm": 10.195777546517574, "learning_rate": 4.9951275038448745e-05, "loss": 2.2, "mean_token_accuracy": 0.44827585816383364, "step": 69405 }, { "epoch": 0.06991048919212643, "grad_norm": 10.907084953414586, "learning_rate": 4.9951250388352254e-05, "loss": 2.4091, "mean_token_accuracy": 0.3896551728248596, "step": 69410 }, { "epoch": 0.0699155252452306, "grad_norm": 16.30838130408925, "learning_rate": 4.995122573202883e-05, "loss": 2.8221, "mean_token_accuracy": 0.37241379022598264, "step": 69415 }, { "epoch": 0.06992056129833478, "grad_norm": 11.022869358334763, "learning_rate": 4.9951201069478474e-05, "loss": 2.3465, "mean_token_accuracy": 0.42601330280303956, "step": 69420 }, { "epoch": 0.06992559735143895, "grad_norm": 11.256428626658082, "learning_rate": 4.9951176400701185e-05, "loss": 2.4913, "mean_token_accuracy": 0.4294010907411575, "step": 69425 }, { "epoch": 0.06993063340454313, "grad_norm": 18.402258974283903, "learning_rate": 4.9951151725696985e-05, "loss": 2.6719, "mean_token_accuracy": 0.37241379618644715, "step": 69430 }, { "epoch": 0.0699356694576473, "grad_norm": 11.789977862035215, "learning_rate": 4.995112704446587e-05, "loss": 1.7707, "mean_token_accuracy": 0.5540834784507751, "step": 69435 }, { "epoch": 0.06994070551075147, "grad_norm": 13.78246144306605, "learning_rate": 4.995110235700786e-05, "loss": 2.3375, "mean_token_accuracy": 0.43793103098869324, "step": 69440 }, { "epoch": 0.06994574156385565, "grad_norm": 16.486755208211243, "learning_rate": 4.995107766332294e-05, "loss": 2.7606, "mean_token_accuracy": 0.34137930274009703, "step": 69445 }, { "epoch": 0.06995077761695982, "grad_norm": 11.060542696042813, "learning_rate": 4.995105296341114e-05, "loss": 2.726, "mean_token_accuracy": 0.4034482777118683, "step": 69450 }, { "epoch": 0.06995581367006398, "grad_norm": 12.35527957243082, "learning_rate": 4.9951028257272454e-05, "loss": 2.5467, "mean_token_accuracy": 0.41034482717514037, "step": 69455 }, { "epoch": 0.06996084972316816, "grad_norm": 14.938953210635248, "learning_rate": 4.995100354490689e-05, "loss": 2.6991, "mean_token_accuracy": 0.41034482717514037, "step": 69460 }, { "epoch": 0.06996588577627233, "grad_norm": 10.484548207854473, "learning_rate": 4.995097882631446e-05, "loss": 2.0337, "mean_token_accuracy": 0.4379310429096222, "step": 69465 }, { "epoch": 0.0699709218293765, "grad_norm": 17.450387784811692, "learning_rate": 4.9950954101495165e-05, "loss": 2.7992, "mean_token_accuracy": 0.3793103486299515, "step": 69470 }, { "epoch": 0.06997595788248068, "grad_norm": 11.210921256885348, "learning_rate": 4.9950929370449015e-05, "loss": 2.3836, "mean_token_accuracy": 0.44482759237289426, "step": 69475 }, { "epoch": 0.06998099393558485, "grad_norm": 13.366179210038005, "learning_rate": 4.9950904633176014e-05, "loss": 2.7474, "mean_token_accuracy": 0.3999999940395355, "step": 69480 }, { "epoch": 0.06998602998868902, "grad_norm": 11.12457493388408, "learning_rate": 4.995087988967617e-05, "loss": 2.803, "mean_token_accuracy": 0.3965517282485962, "step": 69485 }, { "epoch": 0.0699910660417932, "grad_norm": 16.008310915574448, "learning_rate": 4.99508551399495e-05, "loss": 2.5542, "mean_token_accuracy": 0.42262552976608275, "step": 69490 }, { "epoch": 0.06999610209489737, "grad_norm": 11.035513787228433, "learning_rate": 4.995083038399599e-05, "loss": 2.561, "mean_token_accuracy": 0.4050211727619171, "step": 69495 }, { "epoch": 0.07000113814800155, "grad_norm": 11.461473696640395, "learning_rate": 4.995080562181566e-05, "loss": 2.3698, "mean_token_accuracy": 0.41379310488700866, "step": 69500 }, { "epoch": 0.07000617420110572, "grad_norm": 12.745879256223596, "learning_rate": 4.995078085340852e-05, "loss": 2.6604, "mean_token_accuracy": 0.37241379618644715, "step": 69505 }, { "epoch": 0.0700112102542099, "grad_norm": 12.038720499588772, "learning_rate": 4.9950756078774575e-05, "loss": 2.6503, "mean_token_accuracy": 0.38421053290367124, "step": 69510 }, { "epoch": 0.07001624630731407, "grad_norm": 11.07098850780782, "learning_rate": 4.995073129791382e-05, "loss": 2.7462, "mean_token_accuracy": 0.39310344457626345, "step": 69515 }, { "epoch": 0.07002128236041824, "grad_norm": 10.13703713905326, "learning_rate": 4.995070651082628e-05, "loss": 2.472, "mean_token_accuracy": 0.41379310488700866, "step": 69520 }, { "epoch": 0.0700263184135224, "grad_norm": 10.500432430369951, "learning_rate": 4.9950681717511944e-05, "loss": 2.6223, "mean_token_accuracy": 0.42758620381355283, "step": 69525 }, { "epoch": 0.07003135446662657, "grad_norm": 10.043471758540381, "learning_rate": 4.995065691797083e-05, "loss": 3.0629, "mean_token_accuracy": 0.3965517282485962, "step": 69530 }, { "epoch": 0.07003639051973075, "grad_norm": 11.554672517968806, "learning_rate": 4.995063211220294e-05, "loss": 2.8337, "mean_token_accuracy": 0.38620689511299133, "step": 69535 }, { "epoch": 0.07004142657283492, "grad_norm": 12.432307810997044, "learning_rate": 4.995060730020829e-05, "loss": 2.9088, "mean_token_accuracy": 0.3103448212146759, "step": 69540 }, { "epoch": 0.0700464626259391, "grad_norm": 12.507465170038829, "learning_rate": 4.995058248198688e-05, "loss": 2.9678, "mean_token_accuracy": 0.36896551251411436, "step": 69545 }, { "epoch": 0.07005149867904327, "grad_norm": 14.783108963858476, "learning_rate": 4.995055765753871e-05, "loss": 2.7604, "mean_token_accuracy": 0.3632788896560669, "step": 69550 }, { "epoch": 0.07005653473214744, "grad_norm": 11.856391559304246, "learning_rate": 4.995053282686381e-05, "loss": 2.2951, "mean_token_accuracy": 0.41379310488700866, "step": 69555 }, { "epoch": 0.07006157078525162, "grad_norm": 12.418202274308635, "learning_rate": 4.9950507989962156e-05, "loss": 2.126, "mean_token_accuracy": 0.4034482717514038, "step": 69560 }, { "epoch": 0.07006660683835579, "grad_norm": 14.535076150447885, "learning_rate": 4.9950483146833774e-05, "loss": 2.6364, "mean_token_accuracy": 0.4172413766384125, "step": 69565 }, { "epoch": 0.07007164289145996, "grad_norm": 8.168030818246812, "learning_rate": 4.995045829747866e-05, "loss": 2.3793, "mean_token_accuracy": 0.4806650340557098, "step": 69570 }, { "epoch": 0.07007667894456414, "grad_norm": 11.494508623615944, "learning_rate": 4.995043344189684e-05, "loss": 2.6566, "mean_token_accuracy": 0.36896551549434664, "step": 69575 }, { "epoch": 0.07008171499766831, "grad_norm": 10.635212663559185, "learning_rate": 4.99504085800883e-05, "loss": 2.4652, "mean_token_accuracy": 0.4517241299152374, "step": 69580 }, { "epoch": 0.07008675105077249, "grad_norm": 21.654275351000837, "learning_rate": 4.995038371205306e-05, "loss": 2.7141, "mean_token_accuracy": 0.37931033968925476, "step": 69585 }, { "epoch": 0.07009178710387666, "grad_norm": 11.48067681033711, "learning_rate": 4.995035883779112e-05, "loss": 2.7844, "mean_token_accuracy": 0.35172414034605026, "step": 69590 }, { "epoch": 0.07009682315698082, "grad_norm": 9.999109430497885, "learning_rate": 4.9950333957302494e-05, "loss": 2.5422, "mean_token_accuracy": 0.3793103456497192, "step": 69595 }, { "epoch": 0.070101859210085, "grad_norm": 11.088837991756646, "learning_rate": 4.995030907058718e-05, "loss": 2.4798, "mean_token_accuracy": 0.41379310488700866, "step": 69600 }, { "epoch": 0.07010689526318917, "grad_norm": 10.009586206231806, "learning_rate": 4.995028417764518e-05, "loss": 2.5302, "mean_token_accuracy": 0.4068965554237366, "step": 69605 }, { "epoch": 0.07011193131629334, "grad_norm": 11.904666115757921, "learning_rate": 4.9950259278476525e-05, "loss": 2.6684, "mean_token_accuracy": 0.3620689630508423, "step": 69610 }, { "epoch": 0.07011696736939751, "grad_norm": 16.113996255392088, "learning_rate": 4.995023437308119e-05, "loss": 2.415, "mean_token_accuracy": 0.42068966031074523, "step": 69615 }, { "epoch": 0.07012200342250169, "grad_norm": 11.133348347879226, "learning_rate": 4.9950209461459226e-05, "loss": 2.1962, "mean_token_accuracy": 0.4620689570903778, "step": 69620 }, { "epoch": 0.07012703947560586, "grad_norm": 11.341987866646178, "learning_rate": 4.995018454361059e-05, "loss": 2.7539, "mean_token_accuracy": 0.3999999940395355, "step": 69625 }, { "epoch": 0.07013207552871004, "grad_norm": 18.13640625720166, "learning_rate": 4.9950159619535316e-05, "loss": 2.3494, "mean_token_accuracy": 0.46896551847457885, "step": 69630 }, { "epoch": 0.07013711158181421, "grad_norm": 12.59155015495586, "learning_rate": 4.9950134689233414e-05, "loss": 2.642, "mean_token_accuracy": 0.37241379618644715, "step": 69635 }, { "epoch": 0.07014214763491838, "grad_norm": 10.722239303318371, "learning_rate": 4.995010975270487e-05, "loss": 2.3089, "mean_token_accuracy": 0.3569872975349426, "step": 69640 }, { "epoch": 0.07014718368802256, "grad_norm": 10.279371909132163, "learning_rate": 4.9950084809949714e-05, "loss": 2.3933, "mean_token_accuracy": 0.4360556542873383, "step": 69645 }, { "epoch": 0.07015221974112673, "grad_norm": 10.071647993025032, "learning_rate": 4.995005986096794e-05, "loss": 2.2757, "mean_token_accuracy": 0.45862069725990295, "step": 69650 }, { "epoch": 0.0701572557942309, "grad_norm": 8.541142760606844, "learning_rate": 4.9950034905759565e-05, "loss": 2.4894, "mean_token_accuracy": 0.45396249294281005, "step": 69655 }, { "epoch": 0.07016229184733506, "grad_norm": 11.400823534331346, "learning_rate": 4.995000994432458e-05, "loss": 2.2971, "mean_token_accuracy": 0.4965517342090607, "step": 69660 }, { "epoch": 0.07016732790043924, "grad_norm": 10.875925949693752, "learning_rate": 4.9949984976663e-05, "loss": 2.6775, "mean_token_accuracy": 0.36551723480224607, "step": 69665 }, { "epoch": 0.07017236395354341, "grad_norm": 12.34677600713615, "learning_rate": 4.9949960002774844e-05, "loss": 2.3332, "mean_token_accuracy": 0.41034482717514037, "step": 69670 }, { "epoch": 0.07017740000664759, "grad_norm": 11.845025741998386, "learning_rate": 4.99499350226601e-05, "loss": 2.3622, "mean_token_accuracy": 0.441379314661026, "step": 69675 }, { "epoch": 0.07018243605975176, "grad_norm": 13.367114636321551, "learning_rate": 4.994991003631879e-05, "loss": 2.4081, "mean_token_accuracy": 0.41379310488700866, "step": 69680 }, { "epoch": 0.07018747211285593, "grad_norm": 12.978048580129567, "learning_rate": 4.9949885043750905e-05, "loss": 2.5635, "mean_token_accuracy": 0.42238354682922363, "step": 69685 }, { "epoch": 0.07019250816596011, "grad_norm": 10.44722846996975, "learning_rate": 4.994986004495646e-05, "loss": 2.4769, "mean_token_accuracy": 0.39310344457626345, "step": 69690 }, { "epoch": 0.07019754421906428, "grad_norm": 14.857527995963384, "learning_rate": 4.9949835039935465e-05, "loss": 2.5865, "mean_token_accuracy": 0.441379314661026, "step": 69695 }, { "epoch": 0.07020258027216845, "grad_norm": 11.039129814326783, "learning_rate": 4.994981002868792e-05, "loss": 2.1958, "mean_token_accuracy": 0.41724138855934145, "step": 69700 }, { "epoch": 0.07020761632527263, "grad_norm": 11.380242989337086, "learning_rate": 4.994978501121385e-05, "loss": 2.5089, "mean_token_accuracy": 0.4379310250282288, "step": 69705 }, { "epoch": 0.0702126523783768, "grad_norm": 11.855869925134776, "learning_rate": 4.994975998751324e-05, "loss": 2.489, "mean_token_accuracy": 0.43103448748588563, "step": 69710 }, { "epoch": 0.07021768843148098, "grad_norm": 11.151152909273577, "learning_rate": 4.99497349575861e-05, "loss": 2.6825, "mean_token_accuracy": 0.3931034505367279, "step": 69715 }, { "epoch": 0.07022272448458515, "grad_norm": 32.409307369269165, "learning_rate": 4.9949709921432444e-05, "loss": 2.672, "mean_token_accuracy": 0.4, "step": 69720 }, { "epoch": 0.07022776053768932, "grad_norm": 10.4068415668025, "learning_rate": 4.994968487905229e-05, "loss": 2.1966, "mean_token_accuracy": 0.4379310369491577, "step": 69725 }, { "epoch": 0.07023279659079348, "grad_norm": 11.31702510649938, "learning_rate": 4.9949659830445625e-05, "loss": 2.8858, "mean_token_accuracy": 0.41034482717514037, "step": 69730 }, { "epoch": 0.07023783264389766, "grad_norm": 11.271868322047075, "learning_rate": 4.994963477561246e-05, "loss": 2.4991, "mean_token_accuracy": 0.3655172407627106, "step": 69735 }, { "epoch": 0.07024286869700183, "grad_norm": 10.096947754148971, "learning_rate": 4.9949609714552805e-05, "loss": 2.165, "mean_token_accuracy": 0.4344827592372894, "step": 69740 }, { "epoch": 0.070247904750106, "grad_norm": 13.003698538652918, "learning_rate": 4.994958464726667e-05, "loss": 2.3747, "mean_token_accuracy": 0.42758620977401735, "step": 69745 }, { "epoch": 0.07025294080321018, "grad_norm": 11.654631156614874, "learning_rate": 4.994955957375405e-05, "loss": 2.3772, "mean_token_accuracy": 0.41034482717514037, "step": 69750 }, { "epoch": 0.07025797685631435, "grad_norm": 12.27998841863513, "learning_rate": 4.994953449401498e-05, "loss": 2.3903, "mean_token_accuracy": 0.44827585816383364, "step": 69755 }, { "epoch": 0.07026301290941853, "grad_norm": 12.424257644030815, "learning_rate": 4.994950940804943e-05, "loss": 2.6115, "mean_token_accuracy": 0.4034482777118683, "step": 69760 }, { "epoch": 0.0702680489625227, "grad_norm": 9.211236995586136, "learning_rate": 4.9949484315857424e-05, "loss": 2.2901, "mean_token_accuracy": 0.43793103098869324, "step": 69765 }, { "epoch": 0.07027308501562687, "grad_norm": 15.668940058768534, "learning_rate": 4.994945921743898e-05, "loss": 2.5867, "mean_token_accuracy": 0.4034482717514038, "step": 69770 }, { "epoch": 0.07027812106873105, "grad_norm": 10.809164413991063, "learning_rate": 4.994943411279409e-05, "loss": 2.6077, "mean_token_accuracy": 0.3517241358757019, "step": 69775 }, { "epoch": 0.07028315712183522, "grad_norm": 11.503470297408914, "learning_rate": 4.994940900192277e-05, "loss": 2.7308, "mean_token_accuracy": 0.3965517282485962, "step": 69780 }, { "epoch": 0.0702881931749394, "grad_norm": 10.622207724141463, "learning_rate": 4.9949383884825016e-05, "loss": 2.0582, "mean_token_accuracy": 0.4379310369491577, "step": 69785 }, { "epoch": 0.07029322922804357, "grad_norm": 10.376749624812218, "learning_rate": 4.9949358761500846e-05, "loss": 2.5192, "mean_token_accuracy": 0.3793103456497192, "step": 69790 }, { "epoch": 0.07029826528114774, "grad_norm": 11.737771743887164, "learning_rate": 4.9949333631950265e-05, "loss": 2.4925, "mean_token_accuracy": 0.4517241358757019, "step": 69795 }, { "epoch": 0.0703033013342519, "grad_norm": 10.767390804500213, "learning_rate": 4.9949308496173274e-05, "loss": 2.648, "mean_token_accuracy": 0.41379310488700866, "step": 69800 }, { "epoch": 0.07030833738735608, "grad_norm": 11.977953585204077, "learning_rate": 4.9949283354169886e-05, "loss": 2.6357, "mean_token_accuracy": 0.3620689660310745, "step": 69805 }, { "epoch": 0.07031337344046025, "grad_norm": 10.409118567773453, "learning_rate": 4.99492582059401e-05, "loss": 2.3679, "mean_token_accuracy": 0.4379310369491577, "step": 69810 }, { "epoch": 0.07031840949356442, "grad_norm": 12.729640270764301, "learning_rate": 4.994923305148393e-05, "loss": 2.4064, "mean_token_accuracy": 0.39655172228813174, "step": 69815 }, { "epoch": 0.0703234455466686, "grad_norm": 12.453183048204174, "learning_rate": 4.994920789080139e-05, "loss": 2.1585, "mean_token_accuracy": 0.46551724076271056, "step": 69820 }, { "epoch": 0.07032848159977277, "grad_norm": 11.056744969222756, "learning_rate": 4.9949182723892466e-05, "loss": 2.6957, "mean_token_accuracy": 0.40689656138420105, "step": 69825 }, { "epoch": 0.07033351765287695, "grad_norm": 9.15557981945039, "learning_rate": 4.994915755075719e-05, "loss": 2.6863, "mean_token_accuracy": 0.44482758045196535, "step": 69830 }, { "epoch": 0.07033855370598112, "grad_norm": 9.827789595049888, "learning_rate": 4.994913237139554e-05, "loss": 2.5802, "mean_token_accuracy": 0.3896551728248596, "step": 69835 }, { "epoch": 0.07034358975908529, "grad_norm": 12.680246767565977, "learning_rate": 4.994910718580756e-05, "loss": 2.6141, "mean_token_accuracy": 0.4172413766384125, "step": 69840 }, { "epoch": 0.07034862581218947, "grad_norm": 11.180447620889874, "learning_rate": 4.9949081993993225e-05, "loss": 2.2204, "mean_token_accuracy": 0.4517241358757019, "step": 69845 }, { "epoch": 0.07035366186529364, "grad_norm": 10.985769559063929, "learning_rate": 4.994905679595255e-05, "loss": 2.4519, "mean_token_accuracy": 0.42208107113838195, "step": 69850 }, { "epoch": 0.07035869791839781, "grad_norm": 9.83306237917322, "learning_rate": 4.9949031591685545e-05, "loss": 2.5348, "mean_token_accuracy": 0.40889292359352114, "step": 69855 }, { "epoch": 0.07036373397150199, "grad_norm": 12.458498794702932, "learning_rate": 4.994900638119222e-05, "loss": 2.5662, "mean_token_accuracy": 0.3758620619773865, "step": 69860 }, { "epoch": 0.07036877002460616, "grad_norm": 12.117661943442595, "learning_rate": 4.994898116447259e-05, "loss": 2.2766, "mean_token_accuracy": 0.4586206912994385, "step": 69865 }, { "epoch": 0.07037380607771032, "grad_norm": 11.935412359878029, "learning_rate": 4.994895594152664e-05, "loss": 2.5278, "mean_token_accuracy": 0.39655172228813174, "step": 69870 }, { "epoch": 0.0703788421308145, "grad_norm": 11.278127372248036, "learning_rate": 4.9948930712354384e-05, "loss": 2.4024, "mean_token_accuracy": 0.4224440336227417, "step": 69875 }, { "epoch": 0.07038387818391867, "grad_norm": 10.415047722213039, "learning_rate": 4.994890547695585e-05, "loss": 2.152, "mean_token_accuracy": 0.4620689570903778, "step": 69880 }, { "epoch": 0.07038891423702284, "grad_norm": 14.122167575289225, "learning_rate": 4.994888023533101e-05, "loss": 2.6162, "mean_token_accuracy": 0.4310344815254211, "step": 69885 }, { "epoch": 0.07039395029012702, "grad_norm": 12.02363446907806, "learning_rate": 4.99488549874799e-05, "loss": 2.6891, "mean_token_accuracy": 0.3758620649576187, "step": 69890 }, { "epoch": 0.07039898634323119, "grad_norm": 11.767719322244815, "learning_rate": 4.994882973340252e-05, "loss": 2.6346, "mean_token_accuracy": 0.39655172228813174, "step": 69895 }, { "epoch": 0.07040402239633536, "grad_norm": 9.838135125304456, "learning_rate": 4.994880447309886e-05, "loss": 2.4675, "mean_token_accuracy": 0.39310344457626345, "step": 69900 }, { "epoch": 0.07040905844943954, "grad_norm": 14.303024192707186, "learning_rate": 4.9948779206568955e-05, "loss": 2.4041, "mean_token_accuracy": 0.4034482717514038, "step": 69905 }, { "epoch": 0.07041409450254371, "grad_norm": 11.183068214689438, "learning_rate": 4.9948753933812784e-05, "loss": 2.5764, "mean_token_accuracy": 0.4172413766384125, "step": 69910 }, { "epoch": 0.07041913055564789, "grad_norm": 10.032529287705792, "learning_rate": 4.994872865483037e-05, "loss": 2.578, "mean_token_accuracy": 0.4551724076271057, "step": 69915 }, { "epoch": 0.07042416660875206, "grad_norm": 11.404396072358262, "learning_rate": 4.9948703369621725e-05, "loss": 2.5771, "mean_token_accuracy": 0.38275861740112305, "step": 69920 }, { "epoch": 0.07042920266185623, "grad_norm": 11.18417577804517, "learning_rate": 4.994867807818684e-05, "loss": 2.2584, "mean_token_accuracy": 0.43793103098869324, "step": 69925 }, { "epoch": 0.0704342387149604, "grad_norm": 13.19364626264909, "learning_rate": 4.994865278052574e-05, "loss": 2.1594, "mean_token_accuracy": 0.458620685338974, "step": 69930 }, { "epoch": 0.07043927476806458, "grad_norm": 14.321413778851447, "learning_rate": 4.994862747663841e-05, "loss": 2.2627, "mean_token_accuracy": 0.45662431716918944, "step": 69935 }, { "epoch": 0.07044431082116874, "grad_norm": 10.839431982956105, "learning_rate": 4.994860216652488e-05, "loss": 2.165, "mean_token_accuracy": 0.4620689570903778, "step": 69940 }, { "epoch": 0.07044934687427291, "grad_norm": 9.49059378089504, "learning_rate": 4.994857685018514e-05, "loss": 2.4656, "mean_token_accuracy": 0.37586206793785093, "step": 69945 }, { "epoch": 0.07045438292737709, "grad_norm": 11.347706602923962, "learning_rate": 4.9948551527619205e-05, "loss": 2.4309, "mean_token_accuracy": 0.4137930989265442, "step": 69950 }, { "epoch": 0.07045941898048126, "grad_norm": 14.523649271161622, "learning_rate": 4.994852619882708e-05, "loss": 2.7919, "mean_token_accuracy": 0.3843920141458511, "step": 69955 }, { "epoch": 0.07046445503358544, "grad_norm": 16.06267499555078, "learning_rate": 4.994850086380878e-05, "loss": 2.6834, "mean_token_accuracy": 0.3551724076271057, "step": 69960 }, { "epoch": 0.07046949108668961, "grad_norm": 12.782328249358063, "learning_rate": 4.9948475522564295e-05, "loss": 2.5631, "mean_token_accuracy": 0.441379314661026, "step": 69965 }, { "epoch": 0.07047452713979378, "grad_norm": 15.091712984578308, "learning_rate": 4.9948450175093644e-05, "loss": 2.778, "mean_token_accuracy": 0.3827586233615875, "step": 69970 }, { "epoch": 0.07047956319289796, "grad_norm": 11.012103182008387, "learning_rate": 4.994842482139684e-05, "loss": 2.3691, "mean_token_accuracy": 0.4227465093135834, "step": 69975 }, { "epoch": 0.07048459924600213, "grad_norm": 10.991870748391035, "learning_rate": 4.994839946147387e-05, "loss": 2.6774, "mean_token_accuracy": 0.44482758939266204, "step": 69980 }, { "epoch": 0.0704896352991063, "grad_norm": 10.586103667495937, "learning_rate": 4.9948374095324756e-05, "loss": 2.5506, "mean_token_accuracy": 0.4482758641242981, "step": 69985 }, { "epoch": 0.07049467135221048, "grad_norm": 9.59525358930973, "learning_rate": 4.99483487229495e-05, "loss": 2.8354, "mean_token_accuracy": 0.33448276221752166, "step": 69990 }, { "epoch": 0.07049970740531465, "grad_norm": 13.803506418756518, "learning_rate": 4.994832334434812e-05, "loss": 2.277, "mean_token_accuracy": 0.44706594944000244, "step": 69995 }, { "epoch": 0.07050474345841883, "grad_norm": 10.838546646167913, "learning_rate": 4.9948297959520606e-05, "loss": 2.9031, "mean_token_accuracy": 0.37931033968925476, "step": 70000 }, { "epoch": 0.070509779511523, "grad_norm": 11.094394206516935, "learning_rate": 4.994827256846697e-05, "loss": 2.2989, "mean_token_accuracy": 0.42758620977401735, "step": 70005 }, { "epoch": 0.07051481556462716, "grad_norm": 13.708120559395848, "learning_rate": 4.994824717118723e-05, "loss": 2.6848, "mean_token_accuracy": 0.3517241358757019, "step": 70010 }, { "epoch": 0.07051985161773133, "grad_norm": 12.191850113563884, "learning_rate": 4.994822176768138e-05, "loss": 2.443, "mean_token_accuracy": 0.4103448331356049, "step": 70015 }, { "epoch": 0.0705248876708355, "grad_norm": 9.034555752848037, "learning_rate": 4.9948196357949435e-05, "loss": 2.1186, "mean_token_accuracy": 0.4431941986083984, "step": 70020 }, { "epoch": 0.07052992372393968, "grad_norm": 11.59937010618008, "learning_rate": 4.9948170941991394e-05, "loss": 2.5496, "mean_token_accuracy": 0.4034482777118683, "step": 70025 }, { "epoch": 0.07053495977704385, "grad_norm": 13.363683074532052, "learning_rate": 4.9948145519807274e-05, "loss": 2.7358, "mean_token_accuracy": 0.4000000059604645, "step": 70030 }, { "epoch": 0.07053999583014803, "grad_norm": 12.655578160789627, "learning_rate": 4.994812009139707e-05, "loss": 3.0334, "mean_token_accuracy": 0.3551724135875702, "step": 70035 }, { "epoch": 0.0705450318832522, "grad_norm": 11.211074491146233, "learning_rate": 4.9948094656760806e-05, "loss": 2.5472, "mean_token_accuracy": 0.3896551728248596, "step": 70040 }, { "epoch": 0.07055006793635638, "grad_norm": 14.573713070832257, "learning_rate": 4.994806921589847e-05, "loss": 2.5855, "mean_token_accuracy": 0.4068965554237366, "step": 70045 }, { "epoch": 0.07055510398946055, "grad_norm": 11.296433024599299, "learning_rate": 4.9948043768810086e-05, "loss": 2.4483, "mean_token_accuracy": 0.4517241418361664, "step": 70050 }, { "epoch": 0.07056014004256472, "grad_norm": 12.220005260662424, "learning_rate": 4.9948018315495656e-05, "loss": 2.8256, "mean_token_accuracy": 0.3551724076271057, "step": 70055 }, { "epoch": 0.0705651760956689, "grad_norm": 11.51624445154521, "learning_rate": 4.9947992855955174e-05, "loss": 2.7386, "mean_token_accuracy": 0.3931034505367279, "step": 70060 }, { "epoch": 0.07057021214877307, "grad_norm": 10.75761686448214, "learning_rate": 4.994796739018866e-05, "loss": 2.4213, "mean_token_accuracy": 0.42758620381355283, "step": 70065 }, { "epoch": 0.07057524820187724, "grad_norm": 11.94413647642222, "learning_rate": 4.994794191819612e-05, "loss": 2.3364, "mean_token_accuracy": 0.44137930274009707, "step": 70070 }, { "epoch": 0.07058028425498142, "grad_norm": 11.2733463610061, "learning_rate": 4.994791643997756e-05, "loss": 2.3157, "mean_token_accuracy": 0.3896551728248596, "step": 70075 }, { "epoch": 0.07058532030808558, "grad_norm": 9.987565487335917, "learning_rate": 4.9947890955532985e-05, "loss": 2.4635, "mean_token_accuracy": 0.3965517282485962, "step": 70080 }, { "epoch": 0.07059035636118975, "grad_norm": 9.743064332823339, "learning_rate": 4.99478654648624e-05, "loss": 2.1529, "mean_token_accuracy": 0.4517241358757019, "step": 70085 }, { "epoch": 0.07059539241429393, "grad_norm": 13.113106224089798, "learning_rate": 4.994783996796582e-05, "loss": 3.0923, "mean_token_accuracy": 0.36551723182201384, "step": 70090 }, { "epoch": 0.0706004284673981, "grad_norm": 11.838157421329166, "learning_rate": 4.994781446484325e-05, "loss": 2.3477, "mean_token_accuracy": 0.4206896543502808, "step": 70095 }, { "epoch": 0.07060546452050227, "grad_norm": 11.603879568912266, "learning_rate": 4.99477889554947e-05, "loss": 2.4082, "mean_token_accuracy": 0.4206896543502808, "step": 70100 }, { "epoch": 0.07061050057360645, "grad_norm": 9.783486458554057, "learning_rate": 4.994776343992016e-05, "loss": 2.6037, "mean_token_accuracy": 0.38620689511299133, "step": 70105 }, { "epoch": 0.07061553662671062, "grad_norm": 10.902571260882754, "learning_rate": 4.9947737918119656e-05, "loss": 2.1717, "mean_token_accuracy": 0.44827585816383364, "step": 70110 }, { "epoch": 0.0706205726798148, "grad_norm": 13.502509125841813, "learning_rate": 4.994771239009319e-05, "loss": 2.5407, "mean_token_accuracy": 0.4172413766384125, "step": 70115 }, { "epoch": 0.07062560873291897, "grad_norm": 11.665694178564694, "learning_rate": 4.9947686855840766e-05, "loss": 2.321, "mean_token_accuracy": 0.4206896543502808, "step": 70120 }, { "epoch": 0.07063064478602314, "grad_norm": 14.663085770559602, "learning_rate": 4.994766131536239e-05, "loss": 2.3175, "mean_token_accuracy": 0.4103448331356049, "step": 70125 }, { "epoch": 0.07063568083912732, "grad_norm": 12.503811366774348, "learning_rate": 4.994763576865807e-05, "loss": 2.4005, "mean_token_accuracy": 0.4275862157344818, "step": 70130 }, { "epoch": 0.07064071689223149, "grad_norm": 12.20670408581205, "learning_rate": 4.9947610215727816e-05, "loss": 2.3608, "mean_token_accuracy": 0.4068965494632721, "step": 70135 }, { "epoch": 0.07064575294533566, "grad_norm": 15.73518047464689, "learning_rate": 4.994758465657163e-05, "loss": 2.9314, "mean_token_accuracy": 0.3724137842655182, "step": 70140 }, { "epoch": 0.07065078899843984, "grad_norm": 10.968998148164612, "learning_rate": 4.9947559091189525e-05, "loss": 2.6002, "mean_token_accuracy": 0.3931034505367279, "step": 70145 }, { "epoch": 0.070655825051544, "grad_norm": 11.486852583736642, "learning_rate": 4.9947533519581515e-05, "loss": 2.5346, "mean_token_accuracy": 0.41034482717514037, "step": 70150 }, { "epoch": 0.07066086110464817, "grad_norm": 11.2181640902374, "learning_rate": 4.994750794174759e-05, "loss": 2.5319, "mean_token_accuracy": 0.36551723778247835, "step": 70155 }, { "epoch": 0.07066589715775234, "grad_norm": 11.551371768768886, "learning_rate": 4.994748235768777e-05, "loss": 2.982, "mean_token_accuracy": 0.3655172437429428, "step": 70160 }, { "epoch": 0.07067093321085652, "grad_norm": 11.217070619224495, "learning_rate": 4.994745676740205e-05, "loss": 2.8034, "mean_token_accuracy": 0.37931033968925476, "step": 70165 }, { "epoch": 0.07067596926396069, "grad_norm": 12.825053661880082, "learning_rate": 4.994743117089044e-05, "loss": 2.5864, "mean_token_accuracy": 0.419237756729126, "step": 70170 }, { "epoch": 0.07068100531706487, "grad_norm": 12.334139789154278, "learning_rate": 4.9947405568152956e-05, "loss": 2.8432, "mean_token_accuracy": 0.3793103456497192, "step": 70175 }, { "epoch": 0.07068604137016904, "grad_norm": 13.006068381029577, "learning_rate": 4.994737995918961e-05, "loss": 2.8025, "mean_token_accuracy": 0.34482758343219755, "step": 70180 }, { "epoch": 0.07069107742327321, "grad_norm": 10.582250081420506, "learning_rate": 4.9947354344000385e-05, "loss": 2.8485, "mean_token_accuracy": 0.39310345351696013, "step": 70185 }, { "epoch": 0.07069611347637739, "grad_norm": 11.094040684807956, "learning_rate": 4.994732872258531e-05, "loss": 3.0127, "mean_token_accuracy": 0.32068965435028074, "step": 70190 }, { "epoch": 0.07070114952948156, "grad_norm": 11.38693825418227, "learning_rate": 4.994730309494438e-05, "loss": 2.2579, "mean_token_accuracy": 0.4931034505367279, "step": 70195 }, { "epoch": 0.07070618558258573, "grad_norm": 11.449289242783431, "learning_rate": 4.9947277461077613e-05, "loss": 2.4568, "mean_token_accuracy": 0.4344827592372894, "step": 70200 }, { "epoch": 0.07071122163568991, "grad_norm": 10.565394974660665, "learning_rate": 4.9947251820985005e-05, "loss": 2.6645, "mean_token_accuracy": 0.36206896901130675, "step": 70205 }, { "epoch": 0.07071625768879408, "grad_norm": 36.701769490279204, "learning_rate": 4.9947226174666576e-05, "loss": 2.9153, "mean_token_accuracy": 0.4413793087005615, "step": 70210 }, { "epoch": 0.07072129374189826, "grad_norm": 13.022516024978197, "learning_rate": 4.994720052212231e-05, "loss": 2.6438, "mean_token_accuracy": 0.3793103456497192, "step": 70215 }, { "epoch": 0.07072632979500242, "grad_norm": 10.870449841617692, "learning_rate": 4.994717486335224e-05, "loss": 2.4152, "mean_token_accuracy": 0.4551724135875702, "step": 70220 }, { "epoch": 0.07073136584810659, "grad_norm": 9.79890828638806, "learning_rate": 4.994714919835636e-05, "loss": 2.2415, "mean_token_accuracy": 0.46551724672317507, "step": 70225 }, { "epoch": 0.07073640190121076, "grad_norm": 16.586794358637526, "learning_rate": 4.994712352713469e-05, "loss": 2.6496, "mean_token_accuracy": 0.37586206793785093, "step": 70230 }, { "epoch": 0.07074143795431494, "grad_norm": 15.927491754098385, "learning_rate": 4.994709784968721e-05, "loss": 2.9354, "mean_token_accuracy": 0.36551723480224607, "step": 70235 }, { "epoch": 0.07074647400741911, "grad_norm": 11.458290820812287, "learning_rate": 4.994707216601395e-05, "loss": 2.1528, "mean_token_accuracy": 0.4770935893058777, "step": 70240 }, { "epoch": 0.07075151006052328, "grad_norm": 9.22030731304803, "learning_rate": 4.994704647611491e-05, "loss": 2.6861, "mean_token_accuracy": 0.35862069129943847, "step": 70245 }, { "epoch": 0.07075654611362746, "grad_norm": 12.130277285974119, "learning_rate": 4.99470207799901e-05, "loss": 2.318, "mean_token_accuracy": 0.4329098641872406, "step": 70250 }, { "epoch": 0.07076158216673163, "grad_norm": 11.614749385287697, "learning_rate": 4.994699507763952e-05, "loss": 2.2809, "mean_token_accuracy": 0.45862067937850953, "step": 70255 }, { "epoch": 0.0707666182198358, "grad_norm": 14.3490363492776, "learning_rate": 4.994696936906319e-05, "loss": 2.9985, "mean_token_accuracy": 0.34137930274009703, "step": 70260 }, { "epoch": 0.07077165427293998, "grad_norm": 11.092444764676017, "learning_rate": 4.99469436542611e-05, "loss": 2.563, "mean_token_accuracy": 0.3965517282485962, "step": 70265 }, { "epoch": 0.07077669032604415, "grad_norm": 11.703473651671526, "learning_rate": 4.9946917933233275e-05, "loss": 2.3834, "mean_token_accuracy": 0.4517241299152374, "step": 70270 }, { "epoch": 0.07078172637914833, "grad_norm": 12.571302385546824, "learning_rate": 4.9946892205979716e-05, "loss": 2.3563, "mean_token_accuracy": 0.4034482777118683, "step": 70275 }, { "epoch": 0.0707867624322525, "grad_norm": 11.626379970921274, "learning_rate": 4.994686647250042e-05, "loss": 2.7711, "mean_token_accuracy": 0.37586206793785093, "step": 70280 }, { "epoch": 0.07079179848535667, "grad_norm": 9.168076987963088, "learning_rate": 4.99468407327954e-05, "loss": 2.1858, "mean_token_accuracy": 0.4448275864124298, "step": 70285 }, { "epoch": 0.07079683453846083, "grad_norm": 22.22364781611157, "learning_rate": 4.994681498686467e-05, "loss": 2.3999, "mean_token_accuracy": 0.4172413766384125, "step": 70290 }, { "epoch": 0.07080187059156501, "grad_norm": 12.37357109785026, "learning_rate": 4.994678923470823e-05, "loss": 2.2884, "mean_token_accuracy": 0.4344827473163605, "step": 70295 }, { "epoch": 0.07080690664466918, "grad_norm": 15.551315688587673, "learning_rate": 4.9946763476326084e-05, "loss": 2.6999, "mean_token_accuracy": 0.3896551728248596, "step": 70300 }, { "epoch": 0.07081194269777336, "grad_norm": 15.739390909726957, "learning_rate": 4.9946737711718264e-05, "loss": 2.5253, "mean_token_accuracy": 0.4068965554237366, "step": 70305 }, { "epoch": 0.07081697875087753, "grad_norm": 10.78033626591525, "learning_rate": 4.994671194088474e-05, "loss": 2.6061, "mean_token_accuracy": 0.4068965554237366, "step": 70310 }, { "epoch": 0.0708220148039817, "grad_norm": 10.343567116164026, "learning_rate": 4.994668616382553e-05, "loss": 2.5544, "mean_token_accuracy": 0.42758620381355283, "step": 70315 }, { "epoch": 0.07082705085708588, "grad_norm": 10.26829496006489, "learning_rate": 4.994666038054067e-05, "loss": 2.631, "mean_token_accuracy": 0.3724137842655182, "step": 70320 }, { "epoch": 0.07083208691019005, "grad_norm": 10.482382703601703, "learning_rate": 4.9946634591030134e-05, "loss": 2.3862, "mean_token_accuracy": 0.42068966031074523, "step": 70325 }, { "epoch": 0.07083712296329422, "grad_norm": 11.361212344503103, "learning_rate": 4.9946608795293934e-05, "loss": 2.326, "mean_token_accuracy": 0.45862067937850953, "step": 70330 }, { "epoch": 0.0708421590163984, "grad_norm": 20.47980482561033, "learning_rate": 4.994658299333209e-05, "loss": 2.8915, "mean_token_accuracy": 0.3482758581638336, "step": 70335 }, { "epoch": 0.07084719506950257, "grad_norm": 11.251681891829271, "learning_rate": 4.994655718514461e-05, "loss": 2.4911, "mean_token_accuracy": 0.46896551847457885, "step": 70340 }, { "epoch": 0.07085223112260675, "grad_norm": 11.27873034566599, "learning_rate": 4.9946531370731486e-05, "loss": 2.4166, "mean_token_accuracy": 0.4068965554237366, "step": 70345 }, { "epoch": 0.07085726717571092, "grad_norm": 10.252753041237728, "learning_rate": 4.994650555009273e-05, "loss": 2.3593, "mean_token_accuracy": 0.4344827592372894, "step": 70350 }, { "epoch": 0.0708623032288151, "grad_norm": 10.835968553243204, "learning_rate": 4.994647972322835e-05, "loss": 2.6561, "mean_token_accuracy": 0.3979431390762329, "step": 70355 }, { "epoch": 0.07086733928191925, "grad_norm": 11.95764031271148, "learning_rate": 4.9946453890138364e-05, "loss": 2.6965, "mean_token_accuracy": 0.4310344815254211, "step": 70360 }, { "epoch": 0.07087237533502343, "grad_norm": 9.403211441988184, "learning_rate": 4.994642805082276e-05, "loss": 2.5574, "mean_token_accuracy": 0.42795565724372864, "step": 70365 }, { "epoch": 0.0708774113881276, "grad_norm": 12.897701822973511, "learning_rate": 4.9946402205281564e-05, "loss": 2.6712, "mean_token_accuracy": 0.4068965494632721, "step": 70370 }, { "epoch": 0.07088244744123177, "grad_norm": 12.999910278542746, "learning_rate": 4.994637635351477e-05, "loss": 2.5711, "mean_token_accuracy": 0.3620689570903778, "step": 70375 }, { "epoch": 0.07088748349433595, "grad_norm": 11.22917503418133, "learning_rate": 4.9946350495522395e-05, "loss": 2.4414, "mean_token_accuracy": 0.4034482777118683, "step": 70380 }, { "epoch": 0.07089251954744012, "grad_norm": 15.954125966379527, "learning_rate": 4.9946324631304435e-05, "loss": 2.2183, "mean_token_accuracy": 0.4261947929859161, "step": 70385 }, { "epoch": 0.0708975556005443, "grad_norm": 11.830939815228778, "learning_rate": 4.9946298760860905e-05, "loss": 1.9035, "mean_token_accuracy": 0.4689655125141144, "step": 70390 }, { "epoch": 0.07090259165364847, "grad_norm": 10.89484928053885, "learning_rate": 4.994627288419181e-05, "loss": 2.7462, "mean_token_accuracy": 0.3655172407627106, "step": 70395 }, { "epoch": 0.07090762770675264, "grad_norm": 11.572669114648221, "learning_rate": 4.994624700129716e-05, "loss": 2.8133, "mean_token_accuracy": 0.37241379022598264, "step": 70400 }, { "epoch": 0.07091266375985682, "grad_norm": 18.673855278837642, "learning_rate": 4.9946221112176954e-05, "loss": 2.4031, "mean_token_accuracy": 0.4360556542873383, "step": 70405 }, { "epoch": 0.07091769981296099, "grad_norm": 13.865598974918965, "learning_rate": 4.994619521683121e-05, "loss": 2.6168, "mean_token_accuracy": 0.39655172228813174, "step": 70410 }, { "epoch": 0.07092273586606516, "grad_norm": 14.78720936435485, "learning_rate": 4.994616931525993e-05, "loss": 2.3095, "mean_token_accuracy": 0.482758617401123, "step": 70415 }, { "epoch": 0.07092777191916934, "grad_norm": 11.287426048725141, "learning_rate": 4.994614340746312e-05, "loss": 2.3295, "mean_token_accuracy": 0.47241379618644713, "step": 70420 }, { "epoch": 0.07093280797227351, "grad_norm": 12.701047023927524, "learning_rate": 4.9946117493440784e-05, "loss": 2.0455, "mean_token_accuracy": 0.5034482717514038, "step": 70425 }, { "epoch": 0.07093784402537767, "grad_norm": 10.681017681772339, "learning_rate": 4.994609157319294e-05, "loss": 2.2949, "mean_token_accuracy": 0.46551724672317507, "step": 70430 }, { "epoch": 0.07094288007848185, "grad_norm": 11.203134653035066, "learning_rate": 4.9946065646719586e-05, "loss": 2.6983, "mean_token_accuracy": 0.4068965494632721, "step": 70435 }, { "epoch": 0.07094791613158602, "grad_norm": 9.551566658009277, "learning_rate": 4.994603971402074e-05, "loss": 2.5711, "mean_token_accuracy": 0.43641862869262693, "step": 70440 }, { "epoch": 0.0709529521846902, "grad_norm": 12.66403455105608, "learning_rate": 4.994601377509639e-05, "loss": 2.249, "mean_token_accuracy": 0.4551724076271057, "step": 70445 }, { "epoch": 0.07095798823779437, "grad_norm": 11.48321384635629, "learning_rate": 4.9945987829946555e-05, "loss": 2.4984, "mean_token_accuracy": 0.42413792610168455, "step": 70450 }, { "epoch": 0.07096302429089854, "grad_norm": 15.721868524296804, "learning_rate": 4.994596187857124e-05, "loss": 2.884, "mean_token_accuracy": 0.3931034505367279, "step": 70455 }, { "epoch": 0.07096806034400271, "grad_norm": 11.253078832531148, "learning_rate": 4.994593592097047e-05, "loss": 2.5807, "mean_token_accuracy": 0.42413793206214906, "step": 70460 }, { "epoch": 0.07097309639710689, "grad_norm": 10.999852698410633, "learning_rate": 4.994590995714422e-05, "loss": 2.3727, "mean_token_accuracy": 0.4172413766384125, "step": 70465 }, { "epoch": 0.07097813245021106, "grad_norm": 9.23632191408146, "learning_rate": 4.994588398709252e-05, "loss": 2.662, "mean_token_accuracy": 0.4034482717514038, "step": 70470 }, { "epoch": 0.07098316850331524, "grad_norm": 9.606709373726503, "learning_rate": 4.994585801081537e-05, "loss": 2.3544, "mean_token_accuracy": 0.4482758641242981, "step": 70475 }, { "epoch": 0.07098820455641941, "grad_norm": 13.974434235057336, "learning_rate": 4.994583202831277e-05, "loss": 2.4835, "mean_token_accuracy": 0.47931033968925474, "step": 70480 }, { "epoch": 0.07099324060952358, "grad_norm": 11.383371115003428, "learning_rate": 4.9945806039584745e-05, "loss": 2.8178, "mean_token_accuracy": 0.34482758641242983, "step": 70485 }, { "epoch": 0.07099827666262776, "grad_norm": 9.89035478607554, "learning_rate": 4.994578004463128e-05, "loss": 2.9994, "mean_token_accuracy": 0.36551724523305895, "step": 70490 }, { "epoch": 0.07100331271573193, "grad_norm": 12.684106307925722, "learning_rate": 4.9945754043452404e-05, "loss": 2.6003, "mean_token_accuracy": 0.4379310369491577, "step": 70495 }, { "epoch": 0.07100834876883609, "grad_norm": 10.973785199857181, "learning_rate": 4.994572803604811e-05, "loss": 2.4096, "mean_token_accuracy": 0.44640047550201417, "step": 70500 }, { "epoch": 0.07101338482194026, "grad_norm": 9.242831473521587, "learning_rate": 4.994570202241841e-05, "loss": 2.4515, "mean_token_accuracy": 0.4715668439865112, "step": 70505 }, { "epoch": 0.07101842087504444, "grad_norm": 10.626044452083827, "learning_rate": 4.994567600256331e-05, "loss": 2.7643, "mean_token_accuracy": 0.3793103456497192, "step": 70510 }, { "epoch": 0.07102345692814861, "grad_norm": 9.969247839922538, "learning_rate": 4.994564997648283e-05, "loss": 2.4395, "mean_token_accuracy": 0.4448275864124298, "step": 70515 }, { "epoch": 0.07102849298125279, "grad_norm": 13.360452960741611, "learning_rate": 4.994562394417694e-05, "loss": 2.6033, "mean_token_accuracy": 0.4103448212146759, "step": 70520 }, { "epoch": 0.07103352903435696, "grad_norm": 10.858572421287231, "learning_rate": 4.99455979056457e-05, "loss": 2.7581, "mean_token_accuracy": 0.34482758641242983, "step": 70525 }, { "epoch": 0.07103856508746113, "grad_norm": 15.027180309419482, "learning_rate": 4.994557186088907e-05, "loss": 2.5156, "mean_token_accuracy": 0.39310344457626345, "step": 70530 }, { "epoch": 0.07104360114056531, "grad_norm": 11.97993970181806, "learning_rate": 4.994554580990709e-05, "loss": 2.1166, "mean_token_accuracy": 0.5034482717514038, "step": 70535 }, { "epoch": 0.07104863719366948, "grad_norm": 11.761352999830295, "learning_rate": 4.994551975269975e-05, "loss": 3.0568, "mean_token_accuracy": 0.37586206793785093, "step": 70540 }, { "epoch": 0.07105367324677365, "grad_norm": 14.05508897304383, "learning_rate": 4.9945493689267056e-05, "loss": 2.3359, "mean_token_accuracy": 0.4689655125141144, "step": 70545 }, { "epoch": 0.07105870929987783, "grad_norm": 11.28660922907751, "learning_rate": 4.994546761960903e-05, "loss": 2.7629, "mean_token_accuracy": 0.3586206823587418, "step": 70550 }, { "epoch": 0.071063745352982, "grad_norm": 11.277862044307968, "learning_rate": 4.9945441543725665e-05, "loss": 2.372, "mean_token_accuracy": 0.38118572235107423, "step": 70555 }, { "epoch": 0.07106878140608618, "grad_norm": 10.897882154331445, "learning_rate": 4.994541546161698e-05, "loss": 2.6017, "mean_token_accuracy": 0.417241370677948, "step": 70560 }, { "epoch": 0.07107381745919035, "grad_norm": 12.28403797642235, "learning_rate": 4.9945389373282955e-05, "loss": 2.2271, "mean_token_accuracy": 0.3931034475564957, "step": 70565 }, { "epoch": 0.07107885351229451, "grad_norm": 9.681751996483266, "learning_rate": 4.9945363278723635e-05, "loss": 2.2388, "mean_token_accuracy": 0.4034482777118683, "step": 70570 }, { "epoch": 0.07108388956539868, "grad_norm": 12.330387989249559, "learning_rate": 4.9945337177939e-05, "loss": 2.5294, "mean_token_accuracy": 0.4482758641242981, "step": 70575 }, { "epoch": 0.07108892561850286, "grad_norm": 8.955662623094511, "learning_rate": 4.9945311070929076e-05, "loss": 2.292, "mean_token_accuracy": 0.41034482717514037, "step": 70580 }, { "epoch": 0.07109396167160703, "grad_norm": 11.172593592406988, "learning_rate": 4.994528495769386e-05, "loss": 2.5601, "mean_token_accuracy": 0.43103447556495667, "step": 70585 }, { "epoch": 0.0710989977247112, "grad_norm": 15.035465566728494, "learning_rate": 4.994525883823335e-05, "loss": 3.1064, "mean_token_accuracy": 0.34137930572032926, "step": 70590 }, { "epoch": 0.07110403377781538, "grad_norm": 11.739660563380184, "learning_rate": 4.994523271254757e-05, "loss": 2.5366, "mean_token_accuracy": 0.43793103098869324, "step": 70595 }, { "epoch": 0.07110906983091955, "grad_norm": 15.849692761176208, "learning_rate": 4.9945206580636525e-05, "loss": 2.5307, "mean_token_accuracy": 0.42068964838981626, "step": 70600 }, { "epoch": 0.07111410588402373, "grad_norm": 12.17764555566839, "learning_rate": 4.994518044250021e-05, "loss": 2.6912, "mean_token_accuracy": 0.35517241060733795, "step": 70605 }, { "epoch": 0.0711191419371279, "grad_norm": 10.391699855304093, "learning_rate": 4.9945154298138655e-05, "loss": 2.6097, "mean_token_accuracy": 0.3551724135875702, "step": 70610 }, { "epoch": 0.07112417799023207, "grad_norm": 14.296021921515297, "learning_rate": 4.994512814755184e-05, "loss": 2.452, "mean_token_accuracy": 0.4360556542873383, "step": 70615 }, { "epoch": 0.07112921404333625, "grad_norm": 11.724903376050053, "learning_rate": 4.994510199073979e-05, "loss": 2.3895, "mean_token_accuracy": 0.38620689511299133, "step": 70620 }, { "epoch": 0.07113425009644042, "grad_norm": 12.671271247381794, "learning_rate": 4.9945075827702506e-05, "loss": 2.4711, "mean_token_accuracy": 0.4000000059604645, "step": 70625 }, { "epoch": 0.0711392861495446, "grad_norm": 10.207370688475175, "learning_rate": 4.9945049658439995e-05, "loss": 2.6741, "mean_token_accuracy": 0.42758620977401735, "step": 70630 }, { "epoch": 0.07114432220264877, "grad_norm": 13.12782775187658, "learning_rate": 4.994502348295227e-05, "loss": 2.5277, "mean_token_accuracy": 0.43793103098869324, "step": 70635 }, { "epoch": 0.07114935825575293, "grad_norm": 11.105889321200712, "learning_rate": 4.994499730123933e-05, "loss": 2.5424, "mean_token_accuracy": 0.4068965554237366, "step": 70640 }, { "epoch": 0.0711543943088571, "grad_norm": 9.661949209135395, "learning_rate": 4.9944971113301185e-05, "loss": 2.4156, "mean_token_accuracy": 0.41379310488700866, "step": 70645 }, { "epoch": 0.07115943036196128, "grad_norm": 9.549024782555119, "learning_rate": 4.994494491913785e-05, "loss": 2.2862, "mean_token_accuracy": 0.4517241358757019, "step": 70650 }, { "epoch": 0.07116446641506545, "grad_norm": 9.406550492214441, "learning_rate": 4.9944918718749326e-05, "loss": 2.2725, "mean_token_accuracy": 0.44482759237289426, "step": 70655 }, { "epoch": 0.07116950246816962, "grad_norm": 11.826400912592916, "learning_rate": 4.994489251213561e-05, "loss": 2.7083, "mean_token_accuracy": 0.41034482717514037, "step": 70660 }, { "epoch": 0.0711745385212738, "grad_norm": 11.335575003963879, "learning_rate": 4.9944866299296724e-05, "loss": 2.386, "mean_token_accuracy": 0.43793103098869324, "step": 70665 }, { "epoch": 0.07117957457437797, "grad_norm": 10.268075162359352, "learning_rate": 4.9944840080232675e-05, "loss": 2.4905, "mean_token_accuracy": 0.38275861740112305, "step": 70670 }, { "epoch": 0.07118461062748214, "grad_norm": 10.25182759608856, "learning_rate": 4.994481385494346e-05, "loss": 2.6252, "mean_token_accuracy": 0.42577131986618044, "step": 70675 }, { "epoch": 0.07118964668058632, "grad_norm": 11.086095444721575, "learning_rate": 4.994478762342909e-05, "loss": 2.7612, "mean_token_accuracy": 0.3931034505367279, "step": 70680 }, { "epoch": 0.07119468273369049, "grad_norm": 13.501977265932828, "learning_rate": 4.994476138568958e-05, "loss": 2.4447, "mean_token_accuracy": 0.40344826579093934, "step": 70685 }, { "epoch": 0.07119971878679467, "grad_norm": 9.632548851095104, "learning_rate": 4.9944735141724935e-05, "loss": 2.5812, "mean_token_accuracy": 0.42413794100284574, "step": 70690 }, { "epoch": 0.07120475483989884, "grad_norm": 11.746044520824606, "learning_rate": 4.9944708891535155e-05, "loss": 2.7043, "mean_token_accuracy": 0.37241379022598264, "step": 70695 }, { "epoch": 0.07120979089300301, "grad_norm": 13.110302070609862, "learning_rate": 4.9944682635120246e-05, "loss": 2.3288, "mean_token_accuracy": 0.40344826579093934, "step": 70700 }, { "epoch": 0.07121482694610719, "grad_norm": 12.378242719394526, "learning_rate": 4.9944656372480234e-05, "loss": 2.7406, "mean_token_accuracy": 0.4103448331356049, "step": 70705 }, { "epoch": 0.07121986299921135, "grad_norm": 12.702405264020008, "learning_rate": 4.99446301036151e-05, "loss": 2.2296, "mean_token_accuracy": 0.4641863226890564, "step": 70710 }, { "epoch": 0.07122489905231552, "grad_norm": 12.30321329424009, "learning_rate": 4.9944603828524875e-05, "loss": 2.5986, "mean_token_accuracy": 0.38965516686439516, "step": 70715 }, { "epoch": 0.0712299351054197, "grad_norm": 13.58039097977116, "learning_rate": 4.994457754720954e-05, "loss": 2.4328, "mean_token_accuracy": 0.4034482717514038, "step": 70720 }, { "epoch": 0.07123497115852387, "grad_norm": 10.177707733071324, "learning_rate": 4.994455125966913e-05, "loss": 2.386, "mean_token_accuracy": 0.40163339376449586, "step": 70725 }, { "epoch": 0.07124000721162804, "grad_norm": 14.085057433278212, "learning_rate": 4.994452496590364e-05, "loss": 2.4562, "mean_token_accuracy": 0.41379310488700866, "step": 70730 }, { "epoch": 0.07124504326473222, "grad_norm": 12.290509395230686, "learning_rate": 4.994449866591307e-05, "loss": 2.3773, "mean_token_accuracy": 0.4413793087005615, "step": 70735 }, { "epoch": 0.07125007931783639, "grad_norm": 11.15151188212305, "learning_rate": 4.994447235969744e-05, "loss": 2.7459, "mean_token_accuracy": 0.37931033670902253, "step": 70740 }, { "epoch": 0.07125511537094056, "grad_norm": 11.125330115072433, "learning_rate": 4.994444604725675e-05, "loss": 2.6763, "mean_token_accuracy": 0.3724137991666794, "step": 70745 }, { "epoch": 0.07126015142404474, "grad_norm": 11.930243160710932, "learning_rate": 4.994441972859101e-05, "loss": 2.4483, "mean_token_accuracy": 0.4034482717514038, "step": 70750 }, { "epoch": 0.07126518747714891, "grad_norm": 9.576407649277836, "learning_rate": 4.994439340370022e-05, "loss": 2.2071, "mean_token_accuracy": 0.4310344815254211, "step": 70755 }, { "epoch": 0.07127022353025309, "grad_norm": 12.4216056083062, "learning_rate": 4.99443670725844e-05, "loss": 2.3091, "mean_token_accuracy": 0.4329098641872406, "step": 70760 }, { "epoch": 0.07127525958335726, "grad_norm": 9.884915579538406, "learning_rate": 4.9944340735243556e-05, "loss": 2.9061, "mean_token_accuracy": 0.37362371683120726, "step": 70765 }, { "epoch": 0.07128029563646143, "grad_norm": 14.157224059188993, "learning_rate": 4.9944314391677685e-05, "loss": 2.8259, "mean_token_accuracy": 0.3551724135875702, "step": 70770 }, { "epoch": 0.0712853316895656, "grad_norm": 10.323825554514238, "learning_rate": 4.99442880418868e-05, "loss": 2.6118, "mean_token_accuracy": 0.3882637619972229, "step": 70775 }, { "epoch": 0.07129036774266977, "grad_norm": 12.421319425485658, "learning_rate": 4.994426168587091e-05, "loss": 2.698, "mean_token_accuracy": 0.4172413766384125, "step": 70780 }, { "epoch": 0.07129540379577394, "grad_norm": 11.387986991483675, "learning_rate": 4.9944235323630016e-05, "loss": 2.5358, "mean_token_accuracy": 0.3965517282485962, "step": 70785 }, { "epoch": 0.07130043984887811, "grad_norm": 9.535196823406435, "learning_rate": 4.9944208955164134e-05, "loss": 2.1317, "mean_token_accuracy": 0.441379314661026, "step": 70790 }, { "epoch": 0.07130547590198229, "grad_norm": 14.409028249020343, "learning_rate": 4.9944182580473264e-05, "loss": 2.2947, "mean_token_accuracy": 0.43950392603874205, "step": 70795 }, { "epoch": 0.07131051195508646, "grad_norm": 14.52114121439506, "learning_rate": 4.994415619955742e-05, "loss": 2.9437, "mean_token_accuracy": 0.38965516686439516, "step": 70800 }, { "epoch": 0.07131554800819064, "grad_norm": 11.357618243921069, "learning_rate": 4.99441298124166e-05, "loss": 2.7781, "mean_token_accuracy": 0.33103448152542114, "step": 70805 }, { "epoch": 0.07132058406129481, "grad_norm": 24.778665969770362, "learning_rate": 4.994410341905082e-05, "loss": 2.3848, "mean_token_accuracy": 0.43793103098869324, "step": 70810 }, { "epoch": 0.07132562011439898, "grad_norm": 11.029616389724485, "learning_rate": 4.9944077019460086e-05, "loss": 2.3692, "mean_token_accuracy": 0.4206896543502808, "step": 70815 }, { "epoch": 0.07133065616750316, "grad_norm": 12.106278635067156, "learning_rate": 4.99440506136444e-05, "loss": 2.3405, "mean_token_accuracy": 0.44137930274009707, "step": 70820 }, { "epoch": 0.07133569222060733, "grad_norm": 12.420718495372768, "learning_rate": 4.9944024201603776e-05, "loss": 2.3677, "mean_token_accuracy": 0.4586206912994385, "step": 70825 }, { "epoch": 0.0713407282737115, "grad_norm": 11.229742365331946, "learning_rate": 4.994399778333821e-05, "loss": 2.9375, "mean_token_accuracy": 0.38965517580509185, "step": 70830 }, { "epoch": 0.07134576432681568, "grad_norm": 15.134934905351564, "learning_rate": 4.9943971358847726e-05, "loss": 2.8809, "mean_token_accuracy": 0.3793103516101837, "step": 70835 }, { "epoch": 0.07135080037991985, "grad_norm": 17.92020690498485, "learning_rate": 4.994394492813232e-05, "loss": 2.8504, "mean_token_accuracy": 0.3965517282485962, "step": 70840 }, { "epoch": 0.07135583643302403, "grad_norm": 10.241089256154567, "learning_rate": 4.9943918491192005e-05, "loss": 2.3839, "mean_token_accuracy": 0.422202056646347, "step": 70845 }, { "epoch": 0.07136087248612819, "grad_norm": 12.034115180949227, "learning_rate": 4.994389204802678e-05, "loss": 2.658, "mean_token_accuracy": 0.39310345649719236, "step": 70850 }, { "epoch": 0.07136590853923236, "grad_norm": 11.706252008332417, "learning_rate": 4.9943865598636665e-05, "loss": 2.5979, "mean_token_accuracy": 0.3793103456497192, "step": 70855 }, { "epoch": 0.07137094459233653, "grad_norm": 10.101452874172487, "learning_rate": 4.9943839143021657e-05, "loss": 2.4062, "mean_token_accuracy": 0.4206896424293518, "step": 70860 }, { "epoch": 0.0713759806454407, "grad_norm": 10.833255491465126, "learning_rate": 4.9943812681181756e-05, "loss": 2.2773, "mean_token_accuracy": 0.4448275864124298, "step": 70865 }, { "epoch": 0.07138101669854488, "grad_norm": 16.60778438387288, "learning_rate": 4.9943786213116996e-05, "loss": 2.9333, "mean_token_accuracy": 0.39655172228813174, "step": 70870 }, { "epoch": 0.07138605275164905, "grad_norm": 10.331685605483557, "learning_rate": 4.994375973882736e-05, "loss": 2.0486, "mean_token_accuracy": 0.5034482657909394, "step": 70875 }, { "epoch": 0.07139108880475323, "grad_norm": 11.627511842995375, "learning_rate": 4.9943733258312866e-05, "loss": 2.5898, "mean_token_accuracy": 0.44482758045196535, "step": 70880 }, { "epoch": 0.0713961248578574, "grad_norm": 11.421274344674455, "learning_rate": 4.994370677157352e-05, "loss": 2.5111, "mean_token_accuracy": 0.41724138855934145, "step": 70885 }, { "epoch": 0.07140116091096158, "grad_norm": 11.734189405984683, "learning_rate": 4.994368027860932e-05, "loss": 2.7348, "mean_token_accuracy": 0.40139141082763674, "step": 70890 }, { "epoch": 0.07140619696406575, "grad_norm": 13.021598803225036, "learning_rate": 4.994365377942029e-05, "loss": 2.0335, "mean_token_accuracy": 0.47973382472991943, "step": 70895 }, { "epoch": 0.07141123301716992, "grad_norm": 10.364770088464093, "learning_rate": 4.9943627274006425e-05, "loss": 1.9974, "mean_token_accuracy": 0.4689655125141144, "step": 70900 }, { "epoch": 0.0714162690702741, "grad_norm": 13.522544635575718, "learning_rate": 4.994360076236774e-05, "loss": 2.8219, "mean_token_accuracy": 0.4172413766384125, "step": 70905 }, { "epoch": 0.07142130512337827, "grad_norm": 11.70887946184419, "learning_rate": 4.9943574244504235e-05, "loss": 2.1361, "mean_token_accuracy": 0.44827585816383364, "step": 70910 }, { "epoch": 0.07142634117648244, "grad_norm": 14.234147069994952, "learning_rate": 4.9943547720415924e-05, "loss": 3.0879, "mean_token_accuracy": 0.341379314661026, "step": 70915 }, { "epoch": 0.0714313772295866, "grad_norm": 11.432731353608675, "learning_rate": 4.994352119010281e-05, "loss": 2.6016, "mean_token_accuracy": 0.4034482717514038, "step": 70920 }, { "epoch": 0.07143641328269078, "grad_norm": 13.259819065600094, "learning_rate": 4.99434946535649e-05, "loss": 2.4433, "mean_token_accuracy": 0.4137930989265442, "step": 70925 }, { "epoch": 0.07144144933579495, "grad_norm": 10.589519400769717, "learning_rate": 4.99434681108022e-05, "loss": 2.5367, "mean_token_accuracy": 0.42413793206214906, "step": 70930 }, { "epoch": 0.07144648538889913, "grad_norm": 10.075779622427428, "learning_rate": 4.994344156181473e-05, "loss": 1.9828, "mean_token_accuracy": 0.5122201979160309, "step": 70935 }, { "epoch": 0.0714515214420033, "grad_norm": 19.118272964405886, "learning_rate": 4.994341500660248e-05, "loss": 2.301, "mean_token_accuracy": 0.5, "step": 70940 }, { "epoch": 0.07145655749510747, "grad_norm": 11.220535284426372, "learning_rate": 4.994338844516547e-05, "loss": 2.6837, "mean_token_accuracy": 0.3896551728248596, "step": 70945 }, { "epoch": 0.07146159354821165, "grad_norm": 11.665190623666764, "learning_rate": 4.9943361877503704e-05, "loss": 2.8422, "mean_token_accuracy": 0.3896551728248596, "step": 70950 }, { "epoch": 0.07146662960131582, "grad_norm": 10.681579785810431, "learning_rate": 4.994333530361718e-05, "loss": 2.3563, "mean_token_accuracy": 0.42758620381355283, "step": 70955 }, { "epoch": 0.07147166565442, "grad_norm": 11.409474997479817, "learning_rate": 4.994330872350591e-05, "loss": 2.3415, "mean_token_accuracy": 0.441379314661026, "step": 70960 }, { "epoch": 0.07147670170752417, "grad_norm": 11.381029927339695, "learning_rate": 4.9943282137169916e-05, "loss": 2.1509, "mean_token_accuracy": 0.4776164650917053, "step": 70965 }, { "epoch": 0.07148173776062834, "grad_norm": 12.13730130404337, "learning_rate": 4.994325554460919e-05, "loss": 2.2033, "mean_token_accuracy": 0.4551724076271057, "step": 70970 }, { "epoch": 0.07148677381373252, "grad_norm": 10.478375160203383, "learning_rate": 4.9943228945823745e-05, "loss": 2.5283, "mean_token_accuracy": 0.4186932861804962, "step": 70975 }, { "epoch": 0.07149180986683669, "grad_norm": 12.937254344908167, "learning_rate": 4.9943202340813585e-05, "loss": 2.5381, "mean_token_accuracy": 0.4172413766384125, "step": 70980 }, { "epoch": 0.07149684591994086, "grad_norm": 10.97568586555971, "learning_rate": 4.9943175729578715e-05, "loss": 2.6197, "mean_token_accuracy": 0.3793103456497192, "step": 70985 }, { "epoch": 0.07150188197304502, "grad_norm": 11.684862273904743, "learning_rate": 4.9943149112119156e-05, "loss": 2.5448, "mean_token_accuracy": 0.42413793206214906, "step": 70990 }, { "epoch": 0.0715069180261492, "grad_norm": 13.791824517154222, "learning_rate": 4.9943122488434895e-05, "loss": 3.0424, "mean_token_accuracy": 0.35607985258102415, "step": 70995 }, { "epoch": 0.07151195407925337, "grad_norm": 10.227219959114166, "learning_rate": 4.994309585852596e-05, "loss": 2.5011, "mean_token_accuracy": 0.4068965494632721, "step": 71000 }, { "epoch": 0.07151699013235754, "grad_norm": 12.65714794969652, "learning_rate": 4.9943069222392344e-05, "loss": 2.6511, "mean_token_accuracy": 0.4000000059604645, "step": 71005 }, { "epoch": 0.07152202618546172, "grad_norm": 12.587031453739757, "learning_rate": 4.9943042580034055e-05, "loss": 2.7937, "mean_token_accuracy": 0.3655172407627106, "step": 71010 }, { "epoch": 0.07152706223856589, "grad_norm": 10.78725374731294, "learning_rate": 4.994301593145111e-05, "loss": 2.4543, "mean_token_accuracy": 0.3655172407627106, "step": 71015 }, { "epoch": 0.07153209829167007, "grad_norm": 11.986500068067201, "learning_rate": 4.994298927664351e-05, "loss": 2.4143, "mean_token_accuracy": 0.4137930989265442, "step": 71020 }, { "epoch": 0.07153713434477424, "grad_norm": 13.649569073006116, "learning_rate": 4.994296261561126e-05, "loss": 2.3044, "mean_token_accuracy": 0.3825166344642639, "step": 71025 }, { "epoch": 0.07154217039787841, "grad_norm": 9.276241223936314, "learning_rate": 4.994293594835437e-05, "loss": 2.5017, "mean_token_accuracy": 0.3655172407627106, "step": 71030 }, { "epoch": 0.07154720645098259, "grad_norm": 12.442769516306736, "learning_rate": 4.9942909274872854e-05, "loss": 2.4891, "mean_token_accuracy": 0.4517241358757019, "step": 71035 }, { "epoch": 0.07155224250408676, "grad_norm": 11.340496530123296, "learning_rate": 4.9942882595166714e-05, "loss": 2.6579, "mean_token_accuracy": 0.39310345649719236, "step": 71040 }, { "epoch": 0.07155727855719093, "grad_norm": 13.943519110063361, "learning_rate": 4.994285590923595e-05, "loss": 2.3911, "mean_token_accuracy": 0.458620685338974, "step": 71045 }, { "epoch": 0.07156231461029511, "grad_norm": 13.184404215926195, "learning_rate": 4.9942829217080584e-05, "loss": 2.5661, "mean_token_accuracy": 0.4137930989265442, "step": 71050 }, { "epoch": 0.07156735066339928, "grad_norm": 9.31016052864668, "learning_rate": 4.994280251870061e-05, "loss": 2.0516, "mean_token_accuracy": 0.4862068951129913, "step": 71055 }, { "epoch": 0.07157238671650344, "grad_norm": 10.08712730881105, "learning_rate": 4.9942775814096034e-05, "loss": 2.3825, "mean_token_accuracy": 0.43103447556495667, "step": 71060 }, { "epoch": 0.07157742276960762, "grad_norm": 9.979294918136405, "learning_rate": 4.994274910326689e-05, "loss": 2.6759, "mean_token_accuracy": 0.3931034505367279, "step": 71065 }, { "epoch": 0.07158245882271179, "grad_norm": 12.681070326031314, "learning_rate": 4.994272238621315e-05, "loss": 2.6941, "mean_token_accuracy": 0.41724138855934145, "step": 71070 }, { "epoch": 0.07158749487581596, "grad_norm": 11.360753952805513, "learning_rate": 4.994269566293484e-05, "loss": 2.3541, "mean_token_accuracy": 0.4572292804718018, "step": 71075 }, { "epoch": 0.07159253092892014, "grad_norm": 11.577007670442573, "learning_rate": 4.9942668933431966e-05, "loss": 2.5938, "mean_token_accuracy": 0.38275861740112305, "step": 71080 }, { "epoch": 0.07159756698202431, "grad_norm": 7.992803332204998, "learning_rate": 4.994264219770454e-05, "loss": 2.8432, "mean_token_accuracy": 0.3793103456497192, "step": 71085 }, { "epoch": 0.07160260303512848, "grad_norm": 10.838283024964884, "learning_rate": 4.9942615455752555e-05, "loss": 2.2356, "mean_token_accuracy": 0.43103448748588563, "step": 71090 }, { "epoch": 0.07160763908823266, "grad_norm": 11.49725009298919, "learning_rate": 4.994258870757603e-05, "loss": 2.9394, "mean_token_accuracy": 0.3517241418361664, "step": 71095 }, { "epoch": 0.07161267514133683, "grad_norm": 13.078473539226987, "learning_rate": 4.994256195317496e-05, "loss": 2.9376, "mean_token_accuracy": 0.37241379022598264, "step": 71100 }, { "epoch": 0.071617711194441, "grad_norm": 10.559175004260128, "learning_rate": 4.994253519254938e-05, "loss": 2.0873, "mean_token_accuracy": 0.4413793087005615, "step": 71105 }, { "epoch": 0.07162274724754518, "grad_norm": 9.903892844234257, "learning_rate": 4.994250842569927e-05, "loss": 2.398, "mean_token_accuracy": 0.4206896543502808, "step": 71110 }, { "epoch": 0.07162778330064935, "grad_norm": 11.44775330064501, "learning_rate": 4.994248165262465e-05, "loss": 2.7172, "mean_token_accuracy": 0.36896551847457887, "step": 71115 }, { "epoch": 0.07163281935375353, "grad_norm": 12.53561062053418, "learning_rate": 4.994245487332552e-05, "loss": 2.8782, "mean_token_accuracy": 0.3862068891525269, "step": 71120 }, { "epoch": 0.0716378554068577, "grad_norm": 10.485570941287847, "learning_rate": 4.994242808780189e-05, "loss": 2.4474, "mean_token_accuracy": 0.4344827651977539, "step": 71125 }, { "epoch": 0.07164289145996186, "grad_norm": 11.542010741540413, "learning_rate": 4.994240129605377e-05, "loss": 2.5578, "mean_token_accuracy": 0.39310344457626345, "step": 71130 }, { "epoch": 0.07164792751306603, "grad_norm": 15.285713704974636, "learning_rate": 4.9942374498081164e-05, "loss": 2.5681, "mean_token_accuracy": 0.42589232325553894, "step": 71135 }, { "epoch": 0.07165296356617021, "grad_norm": 10.05700926700659, "learning_rate": 4.9942347693884084e-05, "loss": 2.36, "mean_token_accuracy": 0.4379310369491577, "step": 71140 }, { "epoch": 0.07165799961927438, "grad_norm": 10.907575103528712, "learning_rate": 4.994232088346253e-05, "loss": 2.6808, "mean_token_accuracy": 0.3517241358757019, "step": 71145 }, { "epoch": 0.07166303567237856, "grad_norm": 11.229350665402336, "learning_rate": 4.994229406681653e-05, "loss": 2.4989, "mean_token_accuracy": 0.3931034475564957, "step": 71150 }, { "epoch": 0.07166807172548273, "grad_norm": 15.889429707573136, "learning_rate": 4.9942267243946066e-05, "loss": 2.8523, "mean_token_accuracy": 0.36896551251411436, "step": 71155 }, { "epoch": 0.0716731077785869, "grad_norm": 11.62771780136095, "learning_rate": 4.994224041485115e-05, "loss": 2.3217, "mean_token_accuracy": 0.36896551847457887, "step": 71160 }, { "epoch": 0.07167814383169108, "grad_norm": 12.14494475705175, "learning_rate": 4.9942213579531804e-05, "loss": 3.1062, "mean_token_accuracy": 0.3517241358757019, "step": 71165 }, { "epoch": 0.07168317988479525, "grad_norm": 9.08373474438411, "learning_rate": 4.9942186737988026e-05, "loss": 2.3713, "mean_token_accuracy": 0.4137930989265442, "step": 71170 }, { "epoch": 0.07168821593789942, "grad_norm": 10.565575481143629, "learning_rate": 4.9942159890219815e-05, "loss": 2.1803, "mean_token_accuracy": 0.44137930274009707, "step": 71175 }, { "epoch": 0.0716932519910036, "grad_norm": 13.1256046310764, "learning_rate": 4.9942133036227194e-05, "loss": 2.1523, "mean_token_accuracy": 0.441379314661026, "step": 71180 }, { "epoch": 0.07169828804410777, "grad_norm": 11.03369307458606, "learning_rate": 4.994210617601017e-05, "loss": 2.2698, "mean_token_accuracy": 0.4310344934463501, "step": 71185 }, { "epoch": 0.07170332409721195, "grad_norm": 12.688652136524157, "learning_rate": 4.9942079309568736e-05, "loss": 2.2658, "mean_token_accuracy": 0.48275861144065857, "step": 71190 }, { "epoch": 0.07170836015031612, "grad_norm": 11.776628233817998, "learning_rate": 4.99420524369029e-05, "loss": 2.204, "mean_token_accuracy": 0.43793103098869324, "step": 71195 }, { "epoch": 0.07171339620342028, "grad_norm": 9.008451077284565, "learning_rate": 4.994202555801269e-05, "loss": 2.271, "mean_token_accuracy": 0.5360837399959564, "step": 71200 }, { "epoch": 0.07171843225652445, "grad_norm": 11.893562401900843, "learning_rate": 4.9941998672898096e-05, "loss": 2.3008, "mean_token_accuracy": 0.39655172228813174, "step": 71205 }, { "epoch": 0.07172346830962863, "grad_norm": 12.40901272116406, "learning_rate": 4.994197178155913e-05, "loss": 2.3866, "mean_token_accuracy": 0.43103448748588563, "step": 71210 }, { "epoch": 0.0717285043627328, "grad_norm": 9.892954597035462, "learning_rate": 4.99419448839958e-05, "loss": 2.1194, "mean_token_accuracy": 0.47241380214691164, "step": 71215 }, { "epoch": 0.07173354041583697, "grad_norm": 10.882423306649848, "learning_rate": 4.994191798020811e-05, "loss": 2.4034, "mean_token_accuracy": 0.4344827651977539, "step": 71220 }, { "epoch": 0.07173857646894115, "grad_norm": 14.889056478861342, "learning_rate": 4.994189107019607e-05, "loss": 2.749, "mean_token_accuracy": 0.3635813593864441, "step": 71225 }, { "epoch": 0.07174361252204532, "grad_norm": 11.704562400583475, "learning_rate": 4.9941864153959695e-05, "loss": 2.252, "mean_token_accuracy": 0.45172414779663084, "step": 71230 }, { "epoch": 0.0717486485751495, "grad_norm": 13.005324690076995, "learning_rate": 4.9941837231498975e-05, "loss": 2.3403, "mean_token_accuracy": 0.39655172228813174, "step": 71235 }, { "epoch": 0.07175368462825367, "grad_norm": 16.08874533363667, "learning_rate": 4.994181030281394e-05, "loss": 2.5644, "mean_token_accuracy": 0.3965517163276672, "step": 71240 }, { "epoch": 0.07175872068135784, "grad_norm": 14.833879327567997, "learning_rate": 4.9941783367904577e-05, "loss": 2.3939, "mean_token_accuracy": 0.4206896543502808, "step": 71245 }, { "epoch": 0.07176375673446202, "grad_norm": 14.1276398175455, "learning_rate": 4.99417564267709e-05, "loss": 2.9489, "mean_token_accuracy": 0.3517241418361664, "step": 71250 }, { "epoch": 0.07176879278756619, "grad_norm": 12.579352639587158, "learning_rate": 4.994172947941292e-05, "loss": 2.4039, "mean_token_accuracy": 0.4344827711582184, "step": 71255 }, { "epoch": 0.07177382884067036, "grad_norm": 11.01249615252546, "learning_rate": 4.994170252583065e-05, "loss": 2.4503, "mean_token_accuracy": 0.44482758045196535, "step": 71260 }, { "epoch": 0.07177886489377454, "grad_norm": 11.645856211015182, "learning_rate": 4.9941675566024077e-05, "loss": 2.6739, "mean_token_accuracy": 0.3551724135875702, "step": 71265 }, { "epoch": 0.0717839009468787, "grad_norm": 11.937534664044211, "learning_rate": 4.994164859999323e-05, "loss": 2.3175, "mean_token_accuracy": 0.4379310369491577, "step": 71270 }, { "epoch": 0.07178893699998287, "grad_norm": 11.645731182485507, "learning_rate": 4.994162162773811e-05, "loss": 2.7318, "mean_token_accuracy": 0.38965516686439516, "step": 71275 }, { "epoch": 0.07179397305308705, "grad_norm": 19.20205931515992, "learning_rate": 4.994159464925872e-05, "loss": 3.6012, "mean_token_accuracy": 0.34930428564548494, "step": 71280 }, { "epoch": 0.07179900910619122, "grad_norm": 10.947561815680867, "learning_rate": 4.994156766455507e-05, "loss": 2.4425, "mean_token_accuracy": 0.41724138259887694, "step": 71285 }, { "epoch": 0.0718040451592954, "grad_norm": 11.203567658609044, "learning_rate": 4.994154067362716e-05, "loss": 2.1872, "mean_token_accuracy": 0.4517241418361664, "step": 71290 }, { "epoch": 0.07180908121239957, "grad_norm": 11.384364356111817, "learning_rate": 4.9941513676475015e-05, "loss": 2.706, "mean_token_accuracy": 0.3758620619773865, "step": 71295 }, { "epoch": 0.07181411726550374, "grad_norm": 15.956695767848043, "learning_rate": 4.9941486673098626e-05, "loss": 2.9547, "mean_token_accuracy": 0.3310344785451889, "step": 71300 }, { "epoch": 0.07181915331860791, "grad_norm": 13.431933856396167, "learning_rate": 4.994145966349801e-05, "loss": 2.3287, "mean_token_accuracy": 0.4570477962493896, "step": 71305 }, { "epoch": 0.07182418937171209, "grad_norm": 10.636052524922706, "learning_rate": 4.9941432647673176e-05, "loss": 2.2743, "mean_token_accuracy": 0.42758620977401735, "step": 71310 }, { "epoch": 0.07182922542481626, "grad_norm": 11.879931948821524, "learning_rate": 4.994140562562413e-05, "loss": 2.5965, "mean_token_accuracy": 0.40852994918823243, "step": 71315 }, { "epoch": 0.07183426147792044, "grad_norm": 11.926525612138873, "learning_rate": 4.994137859735086e-05, "loss": 2.9058, "mean_token_accuracy": 0.37241379618644715, "step": 71320 }, { "epoch": 0.07183929753102461, "grad_norm": 19.584321373983283, "learning_rate": 4.99413515628534e-05, "loss": 2.3878, "mean_token_accuracy": 0.42068964838981626, "step": 71325 }, { "epoch": 0.07184433358412878, "grad_norm": 12.978709994744785, "learning_rate": 4.9941324522131744e-05, "loss": 2.5934, "mean_token_accuracy": 0.36896551847457887, "step": 71330 }, { "epoch": 0.07184936963723296, "grad_norm": 10.848848452406486, "learning_rate": 4.994129747518591e-05, "loss": 2.6662, "mean_token_accuracy": 0.36896551847457887, "step": 71335 }, { "epoch": 0.07185440569033712, "grad_norm": 11.046680781075052, "learning_rate": 4.994127042201589e-05, "loss": 2.3881, "mean_token_accuracy": 0.3999999940395355, "step": 71340 }, { "epoch": 0.07185944174344129, "grad_norm": 9.852345600411851, "learning_rate": 4.9941243362621696e-05, "loss": 2.9423, "mean_token_accuracy": 0.4000000059604645, "step": 71345 }, { "epoch": 0.07186447779654546, "grad_norm": 14.362708099023973, "learning_rate": 4.9941216297003354e-05, "loss": 2.4185, "mean_token_accuracy": 0.41034482717514037, "step": 71350 }, { "epoch": 0.07186951384964964, "grad_norm": 12.331388340334398, "learning_rate": 4.994118922516085e-05, "loss": 2.4748, "mean_token_accuracy": 0.4344827473163605, "step": 71355 }, { "epoch": 0.07187454990275381, "grad_norm": 10.917560575430834, "learning_rate": 4.9941162147094195e-05, "loss": 2.1875, "mean_token_accuracy": 0.48124622106552123, "step": 71360 }, { "epoch": 0.07187958595585799, "grad_norm": 13.38597094905091, "learning_rate": 4.9941135062803405e-05, "loss": 3.2665, "mean_token_accuracy": 0.3275862067937851, "step": 71365 }, { "epoch": 0.07188462200896216, "grad_norm": 11.599937822207165, "learning_rate": 4.994110797228849e-05, "loss": 2.573, "mean_token_accuracy": 0.40889292359352114, "step": 71370 }, { "epoch": 0.07188965806206633, "grad_norm": 11.539004668726683, "learning_rate": 4.994108087554944e-05, "loss": 2.6042, "mean_token_accuracy": 0.42413792610168455, "step": 71375 }, { "epoch": 0.07189469411517051, "grad_norm": 10.027263978393824, "learning_rate": 4.994105377258627e-05, "loss": 2.6138, "mean_token_accuracy": 0.3793103456497192, "step": 71380 }, { "epoch": 0.07189973016827468, "grad_norm": 11.420537320879685, "learning_rate": 4.994102666339899e-05, "loss": 2.2196, "mean_token_accuracy": 0.47586206197738645, "step": 71385 }, { "epoch": 0.07190476622137885, "grad_norm": 19.034901146450853, "learning_rate": 4.994099954798761e-05, "loss": 3.0506, "mean_token_accuracy": 0.334482753276825, "step": 71390 }, { "epoch": 0.07190980227448303, "grad_norm": 14.237850931540196, "learning_rate": 4.9940972426352135e-05, "loss": 2.6819, "mean_token_accuracy": 0.37586206793785093, "step": 71395 }, { "epoch": 0.0719148383275872, "grad_norm": 11.061479042848587, "learning_rate": 4.994094529849257e-05, "loss": 2.5385, "mean_token_accuracy": 0.3862069010734558, "step": 71400 }, { "epoch": 0.07191987438069138, "grad_norm": 11.396092282928125, "learning_rate": 4.9940918164408937e-05, "loss": 2.4133, "mean_token_accuracy": 0.4413793087005615, "step": 71405 }, { "epoch": 0.07192491043379554, "grad_norm": 11.938490827029275, "learning_rate": 4.994089102410122e-05, "loss": 2.3523, "mean_token_accuracy": 0.4241379380226135, "step": 71410 }, { "epoch": 0.07192994648689971, "grad_norm": 9.963676438646516, "learning_rate": 4.994086387756944e-05, "loss": 2.2604, "mean_token_accuracy": 0.4672111332416534, "step": 71415 }, { "epoch": 0.07193498254000388, "grad_norm": 12.025109602407882, "learning_rate": 4.994083672481361e-05, "loss": 2.3229, "mean_token_accuracy": 0.4465214729309082, "step": 71420 }, { "epoch": 0.07194001859310806, "grad_norm": 11.84345575701104, "learning_rate": 4.9940809565833715e-05, "loss": 2.1297, "mean_token_accuracy": 0.4862069070339203, "step": 71425 }, { "epoch": 0.07194505464621223, "grad_norm": 11.478245950459932, "learning_rate": 4.994078240062979e-05, "loss": 2.4269, "mean_token_accuracy": 0.3999999940395355, "step": 71430 }, { "epoch": 0.0719500906993164, "grad_norm": 11.222448409408251, "learning_rate": 4.994075522920182e-05, "loss": 2.7749, "mean_token_accuracy": 0.37586206793785093, "step": 71435 }, { "epoch": 0.07195512675242058, "grad_norm": 14.795736672515089, "learning_rate": 4.994072805154983e-05, "loss": 2.1834, "mean_token_accuracy": 0.4862069010734558, "step": 71440 }, { "epoch": 0.07196016280552475, "grad_norm": 11.985102016269979, "learning_rate": 4.994070086767383e-05, "loss": 2.8104, "mean_token_accuracy": 0.3551724135875702, "step": 71445 }, { "epoch": 0.07196519885862893, "grad_norm": 11.758987771025575, "learning_rate": 4.994067367757381e-05, "loss": 2.4874, "mean_token_accuracy": 0.36551723480224607, "step": 71450 }, { "epoch": 0.0719702349117331, "grad_norm": 16.29002748074399, "learning_rate": 4.994064648124978e-05, "loss": 2.3843, "mean_token_accuracy": 0.42068964838981626, "step": 71455 }, { "epoch": 0.07197527096483727, "grad_norm": 14.266394796785562, "learning_rate": 4.994061927870176e-05, "loss": 2.4995, "mean_token_accuracy": 0.4068965554237366, "step": 71460 }, { "epoch": 0.07198030701794145, "grad_norm": 10.418399530343343, "learning_rate": 4.9940592069929745e-05, "loss": 2.3103, "mean_token_accuracy": 0.48620688915252686, "step": 71465 }, { "epoch": 0.07198534307104562, "grad_norm": 13.556227852654867, "learning_rate": 4.9940564854933755e-05, "loss": 2.3784, "mean_token_accuracy": 0.4125226855278015, "step": 71470 }, { "epoch": 0.0719903791241498, "grad_norm": 12.672570721125945, "learning_rate": 4.994053763371378e-05, "loss": 2.9103, "mean_token_accuracy": 0.37241379618644715, "step": 71475 }, { "epoch": 0.07199541517725395, "grad_norm": 12.99002147265881, "learning_rate": 4.994051040626985e-05, "loss": 2.8115, "mean_token_accuracy": 0.3620689570903778, "step": 71480 }, { "epoch": 0.07200045123035813, "grad_norm": 11.009675239447201, "learning_rate": 4.994048317260195e-05, "loss": 2.6494, "mean_token_accuracy": 0.42068966031074523, "step": 71485 }, { "epoch": 0.0720054872834623, "grad_norm": 10.987640962150886, "learning_rate": 4.9940455932710104e-05, "loss": 2.7236, "mean_token_accuracy": 0.3896551728248596, "step": 71490 }, { "epoch": 0.07201052333656648, "grad_norm": 13.856438387420134, "learning_rate": 4.994042868659431e-05, "loss": 2.3668, "mean_token_accuracy": 0.4432546854019165, "step": 71495 }, { "epoch": 0.07201555938967065, "grad_norm": 13.693493315924862, "learning_rate": 4.9940401434254585e-05, "loss": 2.7748, "mean_token_accuracy": 0.3807622492313385, "step": 71500 }, { "epoch": 0.07202059544277482, "grad_norm": 12.42467102695888, "learning_rate": 4.9940374175690926e-05, "loss": 2.2863, "mean_token_accuracy": 0.43793103098869324, "step": 71505 }, { "epoch": 0.072025631495879, "grad_norm": 10.613430633949436, "learning_rate": 4.994034691090335e-05, "loss": 2.7947, "mean_token_accuracy": 0.37586206793785093, "step": 71510 }, { "epoch": 0.07203066754898317, "grad_norm": 10.89109345361714, "learning_rate": 4.994031963989185e-05, "loss": 2.5677, "mean_token_accuracy": 0.3931034505367279, "step": 71515 }, { "epoch": 0.07203570360208734, "grad_norm": 10.50883382429764, "learning_rate": 4.994029236265646e-05, "loss": 2.4916, "mean_token_accuracy": 0.38620689511299133, "step": 71520 }, { "epoch": 0.07204073965519152, "grad_norm": 11.048682815706222, "learning_rate": 4.9940265079197165e-05, "loss": 3.0293, "mean_token_accuracy": 0.36896551251411436, "step": 71525 }, { "epoch": 0.07204577570829569, "grad_norm": 12.349478738727637, "learning_rate": 4.994023778951397e-05, "loss": 2.6617, "mean_token_accuracy": 0.4103448212146759, "step": 71530 }, { "epoch": 0.07205081176139987, "grad_norm": 9.684595661527045, "learning_rate": 4.9940210493606906e-05, "loss": 2.4296, "mean_token_accuracy": 0.3793103486299515, "step": 71535 }, { "epoch": 0.07205584781450404, "grad_norm": 11.113699252455035, "learning_rate": 4.994018319147595e-05, "loss": 2.3171, "mean_token_accuracy": 0.45517241954803467, "step": 71540 }, { "epoch": 0.07206088386760821, "grad_norm": 10.029020265956625, "learning_rate": 4.9940155883121135e-05, "loss": 2.8643, "mean_token_accuracy": 0.41379310488700866, "step": 71545 }, { "epoch": 0.07206591992071237, "grad_norm": 14.717267114791252, "learning_rate": 4.994012856854246e-05, "loss": 2.2799, "mean_token_accuracy": 0.475862056016922, "step": 71550 }, { "epoch": 0.07207095597381655, "grad_norm": 20.959088006214035, "learning_rate": 4.994010124773992e-05, "loss": 3.2726, "mean_token_accuracy": 0.40532365441322327, "step": 71555 }, { "epoch": 0.07207599202692072, "grad_norm": 10.440547784348782, "learning_rate": 4.994007392071355e-05, "loss": 2.4183, "mean_token_accuracy": 0.42758620381355283, "step": 71560 }, { "epoch": 0.0720810280800249, "grad_norm": 11.9672385259416, "learning_rate": 4.994004658746334e-05, "loss": 2.6002, "mean_token_accuracy": 0.3827586233615875, "step": 71565 }, { "epoch": 0.07208606413312907, "grad_norm": 12.691108105709239, "learning_rate": 4.994001924798929e-05, "loss": 2.7679, "mean_token_accuracy": 0.37931033968925476, "step": 71570 }, { "epoch": 0.07209110018623324, "grad_norm": 11.317498288857527, "learning_rate": 4.993999190229142e-05, "loss": 2.3456, "mean_token_accuracy": 0.44827585816383364, "step": 71575 }, { "epoch": 0.07209613623933742, "grad_norm": 12.32279075434726, "learning_rate": 4.993996455036974e-05, "loss": 3.2205, "mean_token_accuracy": 0.3482758581638336, "step": 71580 }, { "epoch": 0.07210117229244159, "grad_norm": 15.547267147676845, "learning_rate": 4.993993719222425e-05, "loss": 2.3294, "mean_token_accuracy": 0.4551724135875702, "step": 71585 }, { "epoch": 0.07210620834554576, "grad_norm": 11.253616591538012, "learning_rate": 4.9939909827854946e-05, "loss": 2.7361, "mean_token_accuracy": 0.3827586233615875, "step": 71590 }, { "epoch": 0.07211124439864994, "grad_norm": 10.969314150306264, "learning_rate": 4.993988245726186e-05, "loss": 2.9053, "mean_token_accuracy": 0.3551724165678024, "step": 71595 }, { "epoch": 0.07211628045175411, "grad_norm": 10.175069966923722, "learning_rate": 4.993985508044499e-05, "loss": 2.258, "mean_token_accuracy": 0.4310344815254211, "step": 71600 }, { "epoch": 0.07212131650485828, "grad_norm": 11.237158970258017, "learning_rate": 4.9939827697404346e-05, "loss": 2.4367, "mean_token_accuracy": 0.3827586233615875, "step": 71605 }, { "epoch": 0.07212635255796246, "grad_norm": 11.049151223250492, "learning_rate": 4.993980030813993e-05, "loss": 2.3572, "mean_token_accuracy": 0.37241379022598264, "step": 71610 }, { "epoch": 0.07213138861106663, "grad_norm": 11.38296387544779, "learning_rate": 4.993977291265176e-05, "loss": 2.2649, "mean_token_accuracy": 0.4551724135875702, "step": 71615 }, { "epoch": 0.07213642466417079, "grad_norm": 10.538894471367406, "learning_rate": 4.993974551093981e-05, "loss": 2.7164, "mean_token_accuracy": 0.39310344457626345, "step": 71620 }, { "epoch": 0.07214146071727497, "grad_norm": 12.113524464550544, "learning_rate": 4.9939718103004134e-05, "loss": 2.4251, "mean_token_accuracy": 0.42413792610168455, "step": 71625 }, { "epoch": 0.07214649677037914, "grad_norm": 10.377896812698255, "learning_rate": 4.993969068884471e-05, "loss": 2.9003, "mean_token_accuracy": 0.358620685338974, "step": 71630 }, { "epoch": 0.07215153282348331, "grad_norm": 9.37534405713517, "learning_rate": 4.9939663268461554e-05, "loss": 2.7102, "mean_token_accuracy": 0.3655172407627106, "step": 71635 }, { "epoch": 0.07215656887658749, "grad_norm": 13.674747681983112, "learning_rate": 4.9939635841854684e-05, "loss": 2.248, "mean_token_accuracy": 0.41379311084747317, "step": 71640 }, { "epoch": 0.07216160492969166, "grad_norm": 16.603969991780037, "learning_rate": 4.9939608409024086e-05, "loss": 2.907, "mean_token_accuracy": 0.32413792610168457, "step": 71645 }, { "epoch": 0.07216664098279583, "grad_norm": 17.820120120799444, "learning_rate": 4.993958096996978e-05, "loss": 2.6106, "mean_token_accuracy": 0.3827586144208908, "step": 71650 }, { "epoch": 0.07217167703590001, "grad_norm": 10.650218917746896, "learning_rate": 4.993955352469178e-05, "loss": 2.1474, "mean_token_accuracy": 0.4724137902259827, "step": 71655 }, { "epoch": 0.07217671308900418, "grad_norm": 9.264443813989137, "learning_rate": 4.993952607319009e-05, "loss": 2.5643, "mean_token_accuracy": 0.3931034505367279, "step": 71660 }, { "epoch": 0.07218174914210836, "grad_norm": 11.552777484702569, "learning_rate": 4.993949861546469e-05, "loss": 2.4183, "mean_token_accuracy": 0.42758620381355283, "step": 71665 }, { "epoch": 0.07218678519521253, "grad_norm": 15.255875301593512, "learning_rate": 4.993947115151563e-05, "loss": 2.7141, "mean_token_accuracy": 0.34137930572032926, "step": 71670 }, { "epoch": 0.0721918212483167, "grad_norm": 11.550434574086205, "learning_rate": 4.99394436813429e-05, "loss": 2.5029, "mean_token_accuracy": 0.41379310488700866, "step": 71675 }, { "epoch": 0.07219685730142088, "grad_norm": 10.291515343467934, "learning_rate": 4.99394162049465e-05, "loss": 2.9086, "mean_token_accuracy": 0.3324258953332901, "step": 71680 }, { "epoch": 0.07220189335452505, "grad_norm": 15.028745392036004, "learning_rate": 4.993938872232645e-05, "loss": 2.6665, "mean_token_accuracy": 0.4137930989265442, "step": 71685 }, { "epoch": 0.07220692940762921, "grad_norm": 9.97353187889815, "learning_rate": 4.993936123348274e-05, "loss": 2.7902, "mean_token_accuracy": 0.4448275864124298, "step": 71690 }, { "epoch": 0.07221196546073338, "grad_norm": 11.65076030860426, "learning_rate": 4.99393337384154e-05, "loss": 2.5136, "mean_token_accuracy": 0.41222020983695984, "step": 71695 }, { "epoch": 0.07221700151383756, "grad_norm": 11.349795190571083, "learning_rate": 4.993930623712442e-05, "loss": 2.0942, "mean_token_accuracy": 0.46551724076271056, "step": 71700 }, { "epoch": 0.07222203756694173, "grad_norm": 11.859164930650953, "learning_rate": 4.9939278729609825e-05, "loss": 2.1408, "mean_token_accuracy": 0.49999999403953554, "step": 71705 }, { "epoch": 0.0722270736200459, "grad_norm": 10.308000695684758, "learning_rate": 4.993925121587161e-05, "loss": 2.1409, "mean_token_accuracy": 0.4620689630508423, "step": 71710 }, { "epoch": 0.07223210967315008, "grad_norm": 11.749737668050528, "learning_rate": 4.9939223695909775e-05, "loss": 2.556, "mean_token_accuracy": 0.4310344815254211, "step": 71715 }, { "epoch": 0.07223714572625425, "grad_norm": 11.231502145240697, "learning_rate": 4.993919616972434e-05, "loss": 2.7329, "mean_token_accuracy": 0.37586206793785093, "step": 71720 }, { "epoch": 0.07224218177935843, "grad_norm": 14.222976137875554, "learning_rate": 4.993916863731532e-05, "loss": 2.5219, "mean_token_accuracy": 0.4702964246273041, "step": 71725 }, { "epoch": 0.0722472178324626, "grad_norm": 11.299418272584601, "learning_rate": 4.993914109868271e-05, "loss": 2.6462, "mean_token_accuracy": 0.4, "step": 71730 }, { "epoch": 0.07225225388556678, "grad_norm": 11.102176581359085, "learning_rate": 4.9939113553826516e-05, "loss": 2.585, "mean_token_accuracy": 0.3999999940395355, "step": 71735 }, { "epoch": 0.07225728993867095, "grad_norm": 10.325806036488645, "learning_rate": 4.993908600274675e-05, "loss": 2.0121, "mean_token_accuracy": 0.49165154099464414, "step": 71740 }, { "epoch": 0.07226232599177512, "grad_norm": 10.160428786610945, "learning_rate": 4.9939058445443415e-05, "loss": 2.261, "mean_token_accuracy": 0.44827585816383364, "step": 71745 }, { "epoch": 0.0722673620448793, "grad_norm": 10.12163781549173, "learning_rate": 4.9939030881916535e-05, "loss": 2.2614, "mean_token_accuracy": 0.4482758641242981, "step": 71750 }, { "epoch": 0.07227239809798347, "grad_norm": 11.985489712251821, "learning_rate": 4.9939003312166104e-05, "loss": 2.5713, "mean_token_accuracy": 0.39503931999206543, "step": 71755 }, { "epoch": 0.07227743415108763, "grad_norm": 7.619634944740773, "learning_rate": 4.993897573619213e-05, "loss": 2.3959, "mean_token_accuracy": 0.44137930274009707, "step": 71760 }, { "epoch": 0.0722824702041918, "grad_norm": 12.311164023543075, "learning_rate": 4.993894815399462e-05, "loss": 2.727, "mean_token_accuracy": 0.37241379618644715, "step": 71765 }, { "epoch": 0.07228750625729598, "grad_norm": 11.10212120097582, "learning_rate": 4.993892056557359e-05, "loss": 2.2481, "mean_token_accuracy": 0.46551724672317507, "step": 71770 }, { "epoch": 0.07229254231040015, "grad_norm": 11.572472619544609, "learning_rate": 4.993889297092903e-05, "loss": 2.8596, "mean_token_accuracy": 0.40344828367233276, "step": 71775 }, { "epoch": 0.07229757836350433, "grad_norm": 10.899249628748388, "learning_rate": 4.9938865370060964e-05, "loss": 2.2756, "mean_token_accuracy": 0.448033881187439, "step": 71780 }, { "epoch": 0.0723026144166085, "grad_norm": 17.75386818786281, "learning_rate": 4.9938837762969405e-05, "loss": 2.6961, "mean_token_accuracy": 0.4, "step": 71785 }, { "epoch": 0.07230765046971267, "grad_norm": 11.16595582573691, "learning_rate": 4.9938810149654334e-05, "loss": 2.7256, "mean_token_accuracy": 0.408711439371109, "step": 71790 }, { "epoch": 0.07231268652281685, "grad_norm": 11.250038977207629, "learning_rate": 4.993878253011579e-05, "loss": 2.7665, "mean_token_accuracy": 0.4172413766384125, "step": 71795 }, { "epoch": 0.07231772257592102, "grad_norm": 11.269175702498613, "learning_rate": 4.993875490435375e-05, "loss": 2.578, "mean_token_accuracy": 0.3517241418361664, "step": 71800 }, { "epoch": 0.0723227586290252, "grad_norm": 12.81919392102948, "learning_rate": 4.993872727236825e-05, "loss": 2.6231, "mean_token_accuracy": 0.3896551728248596, "step": 71805 }, { "epoch": 0.07232779468212937, "grad_norm": 10.507017666292143, "learning_rate": 4.9938699634159284e-05, "loss": 2.6256, "mean_token_accuracy": 0.3827586233615875, "step": 71810 }, { "epoch": 0.07233283073523354, "grad_norm": 12.620833815019466, "learning_rate": 4.993867198972686e-05, "loss": 2.6904, "mean_token_accuracy": 0.3827586233615875, "step": 71815 }, { "epoch": 0.07233786678833772, "grad_norm": 12.29287347086289, "learning_rate": 4.993864433907099e-05, "loss": 2.556, "mean_token_accuracy": 0.46896552443504336, "step": 71820 }, { "epoch": 0.07234290284144189, "grad_norm": 10.328125362362801, "learning_rate": 4.993861668219168e-05, "loss": 2.4117, "mean_token_accuracy": 0.42068965137004855, "step": 71825 }, { "epoch": 0.07234793889454605, "grad_norm": 10.046770713240933, "learning_rate": 4.9938589019088924e-05, "loss": 2.8931, "mean_token_accuracy": 0.3724137991666794, "step": 71830 }, { "epoch": 0.07235297494765022, "grad_norm": 10.040703964946038, "learning_rate": 4.993856134976275e-05, "loss": 2.4024, "mean_token_accuracy": 0.4130066573619843, "step": 71835 }, { "epoch": 0.0723580110007544, "grad_norm": 12.677489016372917, "learning_rate": 4.9938533674213154e-05, "loss": 2.4964, "mean_token_accuracy": 0.41203871965408323, "step": 71840 }, { "epoch": 0.07236304705385857, "grad_norm": 12.679327368113709, "learning_rate": 4.9938505992440156e-05, "loss": 2.7582, "mean_token_accuracy": 0.3551724076271057, "step": 71845 }, { "epoch": 0.07236808310696274, "grad_norm": 10.223952074981142, "learning_rate": 4.9938478304443736e-05, "loss": 2.3166, "mean_token_accuracy": 0.41724138259887694, "step": 71850 }, { "epoch": 0.07237311916006692, "grad_norm": 10.914424594517488, "learning_rate": 4.993845061022394e-05, "loss": 2.6105, "mean_token_accuracy": 0.34137930274009703, "step": 71855 }, { "epoch": 0.07237815521317109, "grad_norm": 13.997772289838162, "learning_rate": 4.993842290978074e-05, "loss": 2.6974, "mean_token_accuracy": 0.38965516686439516, "step": 71860 }, { "epoch": 0.07238319126627527, "grad_norm": 10.442603433876926, "learning_rate": 4.9938395203114166e-05, "loss": 2.4615, "mean_token_accuracy": 0.4030248045921326, "step": 71865 }, { "epoch": 0.07238822731937944, "grad_norm": 12.634259999590816, "learning_rate": 4.993836749022423e-05, "loss": 2.4025, "mean_token_accuracy": 0.46073804795742035, "step": 71870 }, { "epoch": 0.07239326337248361, "grad_norm": 10.207431223287152, "learning_rate": 4.9938339771110914e-05, "loss": 2.7267, "mean_token_accuracy": 0.3655172407627106, "step": 71875 }, { "epoch": 0.07239829942558779, "grad_norm": 13.36800365443819, "learning_rate": 4.9938312045774245e-05, "loss": 2.9456, "mean_token_accuracy": 0.41724138259887694, "step": 71880 }, { "epoch": 0.07240333547869196, "grad_norm": 10.166108518841511, "learning_rate": 4.9938284314214236e-05, "loss": 2.429, "mean_token_accuracy": 0.4517241299152374, "step": 71885 }, { "epoch": 0.07240837153179613, "grad_norm": 11.45276519937559, "learning_rate": 4.9938256576430866e-05, "loss": 2.4781, "mean_token_accuracy": 0.441379314661026, "step": 71890 }, { "epoch": 0.07241340758490031, "grad_norm": 11.990444026856164, "learning_rate": 4.9938228832424176e-05, "loss": 2.5159, "mean_token_accuracy": 0.4413793087005615, "step": 71895 }, { "epoch": 0.07241844363800447, "grad_norm": 11.892029101980777, "learning_rate": 4.9938201082194165e-05, "loss": 2.2349, "mean_token_accuracy": 0.4310344815254211, "step": 71900 }, { "epoch": 0.07242347969110864, "grad_norm": 12.787725198137968, "learning_rate": 4.9938173325740826e-05, "loss": 2.5388, "mean_token_accuracy": 0.38620689511299133, "step": 71905 }, { "epoch": 0.07242851574421282, "grad_norm": 10.912425670937424, "learning_rate": 4.993814556306417e-05, "loss": 2.3725, "mean_token_accuracy": 0.44211822748184204, "step": 71910 }, { "epoch": 0.07243355179731699, "grad_norm": 11.389933788171122, "learning_rate": 4.9938117794164216e-05, "loss": 2.4931, "mean_token_accuracy": 0.38620689511299133, "step": 71915 }, { "epoch": 0.07243858785042116, "grad_norm": 11.717433617948556, "learning_rate": 4.993809001904097e-05, "loss": 2.6956, "mean_token_accuracy": 0.3517241358757019, "step": 71920 }, { "epoch": 0.07244362390352534, "grad_norm": 10.261877036974804, "learning_rate": 4.993806223769443e-05, "loss": 2.1946, "mean_token_accuracy": 0.4758620738983154, "step": 71925 }, { "epoch": 0.07244865995662951, "grad_norm": 9.539748662534658, "learning_rate": 4.993803445012462e-05, "loss": 2.3636, "mean_token_accuracy": 0.4689655065536499, "step": 71930 }, { "epoch": 0.07245369600973368, "grad_norm": 11.76595595517886, "learning_rate": 4.9938006656331534e-05, "loss": 2.2152, "mean_token_accuracy": 0.4241379201412201, "step": 71935 }, { "epoch": 0.07245873206283786, "grad_norm": 15.278631243674903, "learning_rate": 4.993797885631517e-05, "loss": 2.6372, "mean_token_accuracy": 0.3793103456497192, "step": 71940 }, { "epoch": 0.07246376811594203, "grad_norm": 11.014822693461545, "learning_rate": 4.993795105007556e-05, "loss": 2.246, "mean_token_accuracy": 0.42758620977401735, "step": 71945 }, { "epoch": 0.0724688041690462, "grad_norm": 10.971033396490107, "learning_rate": 4.99379232376127e-05, "loss": 2.4775, "mean_token_accuracy": 0.441379314661026, "step": 71950 }, { "epoch": 0.07247384022215038, "grad_norm": 14.161680313459602, "learning_rate": 4.993789541892661e-05, "loss": 2.157, "mean_token_accuracy": 0.4429521977901459, "step": 71955 }, { "epoch": 0.07247887627525455, "grad_norm": 14.92910822459536, "learning_rate": 4.9937867594017265e-05, "loss": 2.7432, "mean_token_accuracy": 0.4049606740474701, "step": 71960 }, { "epoch": 0.07248391232835873, "grad_norm": 11.418711023117943, "learning_rate": 4.99378397628847e-05, "loss": 2.2859, "mean_token_accuracy": 0.44295220971107485, "step": 71965 }, { "epoch": 0.07248894838146289, "grad_norm": 10.657142537386534, "learning_rate": 4.993781192552892e-05, "loss": 2.3025, "mean_token_accuracy": 0.4413793087005615, "step": 71970 }, { "epoch": 0.07249398443456706, "grad_norm": 14.876535604831842, "learning_rate": 4.993778408194993e-05, "loss": 2.478, "mean_token_accuracy": 0.4137930989265442, "step": 71975 }, { "epoch": 0.07249902048767123, "grad_norm": 11.088488940009128, "learning_rate": 4.993775623214773e-05, "loss": 2.6126, "mean_token_accuracy": 0.3896551728248596, "step": 71980 }, { "epoch": 0.07250405654077541, "grad_norm": 11.134606476520943, "learning_rate": 4.993772837612234e-05, "loss": 2.8547, "mean_token_accuracy": 0.36896551847457887, "step": 71985 }, { "epoch": 0.07250909259387958, "grad_norm": 11.605691479055297, "learning_rate": 4.993770051387377e-05, "loss": 2.6515, "mean_token_accuracy": 0.4421182215213776, "step": 71990 }, { "epoch": 0.07251412864698376, "grad_norm": 12.938916871274403, "learning_rate": 4.9937672645402004e-05, "loss": 2.6485, "mean_token_accuracy": 0.38620689511299133, "step": 71995 }, { "epoch": 0.07251916470008793, "grad_norm": 13.305607967632266, "learning_rate": 4.9937644770707074e-05, "loss": 2.5614, "mean_token_accuracy": 0.3586206942796707, "step": 72000 }, { "epoch": 0.0725242007531921, "grad_norm": 11.999243140979976, "learning_rate": 4.993761688978898e-05, "loss": 2.3539, "mean_token_accuracy": 0.3946763455867767, "step": 72005 }, { "epoch": 0.07252923680629628, "grad_norm": 12.661554151121283, "learning_rate": 4.993758900264773e-05, "loss": 2.7432, "mean_token_accuracy": 0.37586206793785093, "step": 72010 }, { "epoch": 0.07253427285940045, "grad_norm": 12.773481139331302, "learning_rate": 4.9937561109283325e-05, "loss": 2.3902, "mean_token_accuracy": 0.40199636816978457, "step": 72015 }, { "epoch": 0.07253930891250462, "grad_norm": 9.119114489379617, "learning_rate": 4.9937533209695784e-05, "loss": 2.5232, "mean_token_accuracy": 0.4275862157344818, "step": 72020 }, { "epoch": 0.0725443449656088, "grad_norm": 13.102672201812354, "learning_rate": 4.993750530388511e-05, "loss": 2.9672, "mean_token_accuracy": 0.3689655244350433, "step": 72025 }, { "epoch": 0.07254938101871297, "grad_norm": 13.502906922192079, "learning_rate": 4.993747739185131e-05, "loss": 2.7273, "mean_token_accuracy": 0.37586206793785093, "step": 72030 }, { "epoch": 0.07255441707181715, "grad_norm": 10.931625583675523, "learning_rate": 4.993744947359439e-05, "loss": 2.6936, "mean_token_accuracy": 0.43448275327682495, "step": 72035 }, { "epoch": 0.0725594531249213, "grad_norm": 10.58375945266393, "learning_rate": 4.993742154911437e-05, "loss": 2.7918, "mean_token_accuracy": 0.42758620977401735, "step": 72040 }, { "epoch": 0.07256448917802548, "grad_norm": 9.376099288205099, "learning_rate": 4.993739361841123e-05, "loss": 3.1053, "mean_token_accuracy": 0.3482758581638336, "step": 72045 }, { "epoch": 0.07256952523112965, "grad_norm": 11.821425431240204, "learning_rate": 4.9937365681485004e-05, "loss": 2.5655, "mean_token_accuracy": 0.38620689511299133, "step": 72050 }, { "epoch": 0.07257456128423383, "grad_norm": 10.043402433289465, "learning_rate": 4.993733773833569e-05, "loss": 1.8754, "mean_token_accuracy": 0.4952813029289246, "step": 72055 }, { "epoch": 0.072579597337338, "grad_norm": 9.713557864727024, "learning_rate": 4.9937309788963294e-05, "loss": 2.9376, "mean_token_accuracy": 0.36206896901130675, "step": 72060 }, { "epoch": 0.07258463339044217, "grad_norm": 10.492289895225737, "learning_rate": 4.9937281833367834e-05, "loss": 2.5403, "mean_token_accuracy": 0.42068966031074523, "step": 72065 }, { "epoch": 0.07258966944354635, "grad_norm": 11.964340224084317, "learning_rate": 4.99372538715493e-05, "loss": 2.558, "mean_token_accuracy": 0.4172413766384125, "step": 72070 }, { "epoch": 0.07259470549665052, "grad_norm": 11.833406691465733, "learning_rate": 4.993722590350772e-05, "loss": 2.221, "mean_token_accuracy": 0.41034482717514037, "step": 72075 }, { "epoch": 0.0725997415497547, "grad_norm": 9.568507002032092, "learning_rate": 4.9937197929243084e-05, "loss": 2.1943, "mean_token_accuracy": 0.49491834044456484, "step": 72080 }, { "epoch": 0.07260477760285887, "grad_norm": 11.615251721085219, "learning_rate": 4.9937169948755406e-05, "loss": 2.2385, "mean_token_accuracy": 0.4517241358757019, "step": 72085 }, { "epoch": 0.07260981365596304, "grad_norm": 13.639964199022765, "learning_rate": 4.9937141962044706e-05, "loss": 2.7596, "mean_token_accuracy": 0.37241379618644715, "step": 72090 }, { "epoch": 0.07261484970906722, "grad_norm": 11.953674673718709, "learning_rate": 4.993711396911096e-05, "loss": 2.8162, "mean_token_accuracy": 0.3862068891525269, "step": 72095 }, { "epoch": 0.07261988576217139, "grad_norm": 14.500681391463356, "learning_rate": 4.9937085969954225e-05, "loss": 2.5205, "mean_token_accuracy": 0.47931034564971925, "step": 72100 }, { "epoch": 0.07262492181527556, "grad_norm": 11.1343364020398, "learning_rate": 4.9937057964574465e-05, "loss": 2.4026, "mean_token_accuracy": 0.37586207389831544, "step": 72105 }, { "epoch": 0.07262995786837972, "grad_norm": 11.745569630947235, "learning_rate": 4.99370299529717e-05, "loss": 2.3465, "mean_token_accuracy": 0.4068965554237366, "step": 72110 }, { "epoch": 0.0726349939214839, "grad_norm": 12.516530776257065, "learning_rate": 4.993700193514595e-05, "loss": 2.7408, "mean_token_accuracy": 0.3517241358757019, "step": 72115 }, { "epoch": 0.07264002997458807, "grad_norm": 9.915700377416577, "learning_rate": 4.993697391109721e-05, "loss": 2.5748, "mean_token_accuracy": 0.41034482717514037, "step": 72120 }, { "epoch": 0.07264506602769225, "grad_norm": 10.115278657277177, "learning_rate": 4.993694588082549e-05, "loss": 2.4531, "mean_token_accuracy": 0.42413793206214906, "step": 72125 }, { "epoch": 0.07265010208079642, "grad_norm": 12.783923515468887, "learning_rate": 4.9936917844330796e-05, "loss": 2.4075, "mean_token_accuracy": 0.4298850655555725, "step": 72130 }, { "epoch": 0.07265513813390059, "grad_norm": 10.562461824078984, "learning_rate": 4.993688980161315e-05, "loss": 2.8101, "mean_token_accuracy": 0.4034482717514038, "step": 72135 }, { "epoch": 0.07266017418700477, "grad_norm": 11.484376494793619, "learning_rate": 4.993686175267254e-05, "loss": 2.8034, "mean_token_accuracy": 0.37241379618644715, "step": 72140 }, { "epoch": 0.07266521024010894, "grad_norm": 10.654652045444353, "learning_rate": 4.993683369750899e-05, "loss": 2.2444, "mean_token_accuracy": 0.47241380214691164, "step": 72145 }, { "epoch": 0.07267024629321311, "grad_norm": 11.001404227402434, "learning_rate": 4.9936805636122495e-05, "loss": 2.5194, "mean_token_accuracy": 0.3896551728248596, "step": 72150 }, { "epoch": 0.07267528234631729, "grad_norm": 12.64844526512769, "learning_rate": 4.9936777568513074e-05, "loss": 2.7745, "mean_token_accuracy": 0.41034482717514037, "step": 72155 }, { "epoch": 0.07268031839942146, "grad_norm": 14.115372997041845, "learning_rate": 4.9936749494680726e-05, "loss": 2.5279, "mean_token_accuracy": 0.41034482717514037, "step": 72160 }, { "epoch": 0.07268535445252564, "grad_norm": 14.11745542487036, "learning_rate": 4.993672141462545e-05, "loss": 2.2876, "mean_token_accuracy": 0.40689654350280763, "step": 72165 }, { "epoch": 0.07269039050562981, "grad_norm": 10.935087388866803, "learning_rate": 4.993669332834728e-05, "loss": 2.6153, "mean_token_accuracy": 0.4413793087005615, "step": 72170 }, { "epoch": 0.07269542655873398, "grad_norm": 11.459747297695882, "learning_rate": 4.9936665235846204e-05, "loss": 2.1953, "mean_token_accuracy": 0.4413793087005615, "step": 72175 }, { "epoch": 0.07270046261183814, "grad_norm": 10.797975898809698, "learning_rate": 4.9936637137122234e-05, "loss": 2.3074, "mean_token_accuracy": 0.4206896543502808, "step": 72180 }, { "epoch": 0.07270549866494232, "grad_norm": 9.927769965576273, "learning_rate": 4.9936609032175384e-05, "loss": 2.656, "mean_token_accuracy": 0.36551723480224607, "step": 72185 }, { "epoch": 0.07271053471804649, "grad_norm": 9.9366059119274, "learning_rate": 4.9936580921005653e-05, "loss": 2.2896, "mean_token_accuracy": 0.3862069010734558, "step": 72190 }, { "epoch": 0.07271557077115066, "grad_norm": 10.182686578031506, "learning_rate": 4.993655280361306e-05, "loss": 2.5989, "mean_token_accuracy": 0.3931034475564957, "step": 72195 }, { "epoch": 0.07272060682425484, "grad_norm": 11.362470730104189, "learning_rate": 4.9936524679997594e-05, "loss": 2.7519, "mean_token_accuracy": 0.3655172407627106, "step": 72200 }, { "epoch": 0.07272564287735901, "grad_norm": 12.85851822140653, "learning_rate": 4.9936496550159284e-05, "loss": 2.4645, "mean_token_accuracy": 0.4034482777118683, "step": 72205 }, { "epoch": 0.07273067893046319, "grad_norm": 8.800501259946556, "learning_rate": 4.993646841409812e-05, "loss": 2.6502, "mean_token_accuracy": 0.42571083307266233, "step": 72210 }, { "epoch": 0.07273571498356736, "grad_norm": 10.211404845777034, "learning_rate": 4.9936440271814114e-05, "loss": 2.3605, "mean_token_accuracy": 0.4137930929660797, "step": 72215 }, { "epoch": 0.07274075103667153, "grad_norm": 17.08572072247456, "learning_rate": 4.9936412123307294e-05, "loss": 2.6309, "mean_token_accuracy": 0.42413793206214906, "step": 72220 }, { "epoch": 0.07274578708977571, "grad_norm": 11.839789272502871, "learning_rate": 4.993638396857764e-05, "loss": 2.8775, "mean_token_accuracy": 0.3620689630508423, "step": 72225 }, { "epoch": 0.07275082314287988, "grad_norm": 26.127949241353303, "learning_rate": 4.993635580762517e-05, "loss": 2.4665, "mean_token_accuracy": 0.4, "step": 72230 }, { "epoch": 0.07275585919598405, "grad_norm": 10.570506277354669, "learning_rate": 4.99363276404499e-05, "loss": 2.5491, "mean_token_accuracy": 0.4, "step": 72235 }, { "epoch": 0.07276089524908823, "grad_norm": 10.065490456879969, "learning_rate": 4.993629946705182e-05, "loss": 2.7251, "mean_token_accuracy": 0.4103448212146759, "step": 72240 }, { "epoch": 0.0727659313021924, "grad_norm": 18.147303802028727, "learning_rate": 4.993627128743097e-05, "loss": 2.3385, "mean_token_accuracy": 0.42068966031074523, "step": 72245 }, { "epoch": 0.07277096735529656, "grad_norm": 10.749321348813726, "learning_rate": 4.993624310158731e-05, "loss": 2.4098, "mean_token_accuracy": 0.3655172407627106, "step": 72250 }, { "epoch": 0.07277600340840074, "grad_norm": 24.404530509693284, "learning_rate": 4.993621490952089e-05, "loss": 2.2571, "mean_token_accuracy": 0.43793103098869324, "step": 72255 }, { "epoch": 0.07278103946150491, "grad_norm": 11.801749490094599, "learning_rate": 4.99361867112317e-05, "loss": 2.6827, "mean_token_accuracy": 0.42068966031074523, "step": 72260 }, { "epoch": 0.07278607551460908, "grad_norm": 17.657090311227172, "learning_rate": 4.993615850671975e-05, "loss": 2.6583, "mean_token_accuracy": 0.42413793206214906, "step": 72265 }, { "epoch": 0.07279111156771326, "grad_norm": 12.306756401687004, "learning_rate": 4.9936130295985045e-05, "loss": 2.3396, "mean_token_accuracy": 0.4482758641242981, "step": 72270 }, { "epoch": 0.07279614762081743, "grad_norm": 14.534504993578793, "learning_rate": 4.9936102079027595e-05, "loss": 2.618, "mean_token_accuracy": 0.38275861740112305, "step": 72275 }, { "epoch": 0.0728011836739216, "grad_norm": 10.073668040250075, "learning_rate": 4.9936073855847407e-05, "loss": 3.0102, "mean_token_accuracy": 0.4137930989265442, "step": 72280 }, { "epoch": 0.07280621972702578, "grad_norm": 14.749881275112061, "learning_rate": 4.9936045626444494e-05, "loss": 2.8117, "mean_token_accuracy": 0.4413793087005615, "step": 72285 }, { "epoch": 0.07281125578012995, "grad_norm": 9.835752814550867, "learning_rate": 4.993601739081886e-05, "loss": 2.3581, "mean_token_accuracy": 0.42068966031074523, "step": 72290 }, { "epoch": 0.07281629183323413, "grad_norm": 18.30930332703122, "learning_rate": 4.993598914897051e-05, "loss": 2.6393, "mean_token_accuracy": 0.40344826579093934, "step": 72295 }, { "epoch": 0.0728213278863383, "grad_norm": 10.852575199357078, "learning_rate": 4.993596090089946e-05, "loss": 2.9509, "mean_token_accuracy": 0.37241379618644715, "step": 72300 }, { "epoch": 0.07282636393944247, "grad_norm": 9.510306256276776, "learning_rate": 4.993593264660571e-05, "loss": 2.5359, "mean_token_accuracy": 0.3827586233615875, "step": 72305 }, { "epoch": 0.07283139999254665, "grad_norm": 9.593495084847603, "learning_rate": 4.993590438608927e-05, "loss": 3.006, "mean_token_accuracy": 0.4068965554237366, "step": 72310 }, { "epoch": 0.07283643604565082, "grad_norm": 12.223821199916264, "learning_rate": 4.9935876119350146e-05, "loss": 2.6485, "mean_token_accuracy": 0.36551724672317504, "step": 72315 }, { "epoch": 0.07284147209875498, "grad_norm": 12.001795356391348, "learning_rate": 4.993584784638835e-05, "loss": 2.3903, "mean_token_accuracy": 0.42262553572654726, "step": 72320 }, { "epoch": 0.07284650815185915, "grad_norm": 9.460677259005962, "learning_rate": 4.993581956720389e-05, "loss": 2.3558, "mean_token_accuracy": 0.41379310488700866, "step": 72325 }, { "epoch": 0.07285154420496333, "grad_norm": 9.705487608003441, "learning_rate": 4.9935791281796775e-05, "loss": 2.1228, "mean_token_accuracy": 0.45021172761917116, "step": 72330 }, { "epoch": 0.0728565802580675, "grad_norm": 15.311335958846053, "learning_rate": 4.9935762990167e-05, "loss": 2.5427, "mean_token_accuracy": 0.42413792610168455, "step": 72335 }, { "epoch": 0.07286161631117168, "grad_norm": 11.397359013493828, "learning_rate": 4.993573469231458e-05, "loss": 2.5952, "mean_token_accuracy": 0.42758620381355283, "step": 72340 }, { "epoch": 0.07286665236427585, "grad_norm": 13.633317444586888, "learning_rate": 4.993570638823954e-05, "loss": 2.3747, "mean_token_accuracy": 0.4532970368862152, "step": 72345 }, { "epoch": 0.07287168841738002, "grad_norm": 10.12042932312706, "learning_rate": 4.993567807794186e-05, "loss": 2.1829, "mean_token_accuracy": 0.48275862336158754, "step": 72350 }, { "epoch": 0.0728767244704842, "grad_norm": 12.510523246293966, "learning_rate": 4.993564976142157e-05, "loss": 3.4516, "mean_token_accuracy": 0.32068965435028074, "step": 72355 }, { "epoch": 0.07288176052358837, "grad_norm": 10.86703246565231, "learning_rate": 4.993562143867866e-05, "loss": 2.489, "mean_token_accuracy": 0.3896551787853241, "step": 72360 }, { "epoch": 0.07288679657669254, "grad_norm": 11.620403511029545, "learning_rate": 4.993559310971316e-05, "loss": 2.4835, "mean_token_accuracy": 0.43647912740707395, "step": 72365 }, { "epoch": 0.07289183262979672, "grad_norm": 10.292147022120481, "learning_rate": 4.993556477452505e-05, "loss": 2.181, "mean_token_accuracy": 0.43623715043067934, "step": 72370 }, { "epoch": 0.07289686868290089, "grad_norm": 8.893932292214556, "learning_rate": 4.993553643311437e-05, "loss": 1.9211, "mean_token_accuracy": 0.5310344755649566, "step": 72375 }, { "epoch": 0.07290190473600507, "grad_norm": 12.720922187810377, "learning_rate": 4.993550808548109e-05, "loss": 2.5298, "mean_token_accuracy": 0.3896551638841629, "step": 72380 }, { "epoch": 0.07290694078910924, "grad_norm": 11.61154605670652, "learning_rate": 4.993547973162525e-05, "loss": 2.4601, "mean_token_accuracy": 0.40344828367233276, "step": 72385 }, { "epoch": 0.0729119768422134, "grad_norm": 10.985398217705818, "learning_rate": 4.9935451371546844e-05, "loss": 2.6579, "mean_token_accuracy": 0.44137930274009707, "step": 72390 }, { "epoch": 0.07291701289531757, "grad_norm": 11.172170055961857, "learning_rate": 4.993542300524588e-05, "loss": 2.6228, "mean_token_accuracy": 0.37241379618644715, "step": 72395 }, { "epoch": 0.07292204894842175, "grad_norm": 16.236557106760667, "learning_rate": 4.993539463272236e-05, "loss": 2.7382, "mean_token_accuracy": 0.38620689511299133, "step": 72400 }, { "epoch": 0.07292708500152592, "grad_norm": 12.407548622779077, "learning_rate": 4.9935366253976314e-05, "loss": 2.2455, "mean_token_accuracy": 0.44482758045196535, "step": 72405 }, { "epoch": 0.0729321210546301, "grad_norm": 11.202663897107664, "learning_rate": 4.993533786900773e-05, "loss": 2.1573, "mean_token_accuracy": 0.4673926293849945, "step": 72410 }, { "epoch": 0.07293715710773427, "grad_norm": 12.91132643619679, "learning_rate": 4.993530947781662e-05, "loss": 2.5486, "mean_token_accuracy": 0.4310344815254211, "step": 72415 }, { "epoch": 0.07294219316083844, "grad_norm": 12.97893000198072, "learning_rate": 4.9935281080403e-05, "loss": 2.8238, "mean_token_accuracy": 0.3517241358757019, "step": 72420 }, { "epoch": 0.07294722921394262, "grad_norm": 10.834034842587899, "learning_rate": 4.993525267676686e-05, "loss": 2.462, "mean_token_accuracy": 0.36896551847457887, "step": 72425 }, { "epoch": 0.07295226526704679, "grad_norm": 10.0147594962151, "learning_rate": 4.993522426690823e-05, "loss": 2.8127, "mean_token_accuracy": 0.3999999940395355, "step": 72430 }, { "epoch": 0.07295730132015096, "grad_norm": 10.293541524180158, "learning_rate": 4.99351958508271e-05, "loss": 2.6511, "mean_token_accuracy": 0.3999999940395355, "step": 72435 }, { "epoch": 0.07296233737325514, "grad_norm": 10.250687016878397, "learning_rate": 4.993516742852349e-05, "loss": 2.3762, "mean_token_accuracy": 0.3827586203813553, "step": 72440 }, { "epoch": 0.07296737342635931, "grad_norm": 10.448315549117739, "learning_rate": 4.99351389999974e-05, "loss": 2.4875, "mean_token_accuracy": 0.4344827592372894, "step": 72445 }, { "epoch": 0.07297240947946348, "grad_norm": 13.210731001332078, "learning_rate": 4.993511056524883e-05, "loss": 2.4359, "mean_token_accuracy": 0.46551724672317507, "step": 72450 }, { "epoch": 0.07297744553256766, "grad_norm": 11.238519528966805, "learning_rate": 4.993508212427781e-05, "loss": 2.4323, "mean_token_accuracy": 0.4572292804718018, "step": 72455 }, { "epoch": 0.07298248158567182, "grad_norm": 10.99616577918631, "learning_rate": 4.993505367708433e-05, "loss": 2.4446, "mean_token_accuracy": 0.42758620381355283, "step": 72460 }, { "epoch": 0.07298751763877599, "grad_norm": 10.712084970411805, "learning_rate": 4.993502522366841e-05, "loss": 2.7277, "mean_token_accuracy": 0.4379310369491577, "step": 72465 }, { "epoch": 0.07299255369188017, "grad_norm": 16.203551666641417, "learning_rate": 4.993499676403006e-05, "loss": 2.5145, "mean_token_accuracy": 0.42413793206214906, "step": 72470 }, { "epoch": 0.07299758974498434, "grad_norm": 13.12164136223451, "learning_rate": 4.9934968298169266e-05, "loss": 2.387, "mean_token_accuracy": 0.4103448331356049, "step": 72475 }, { "epoch": 0.07300262579808851, "grad_norm": 9.449511741401048, "learning_rate": 4.9934939826086055e-05, "loss": 2.5212, "mean_token_accuracy": 0.42413793206214906, "step": 72480 }, { "epoch": 0.07300766185119269, "grad_norm": 10.526415801719274, "learning_rate": 4.993491134778043e-05, "loss": 2.9463, "mean_token_accuracy": 0.3344827562570572, "step": 72485 }, { "epoch": 0.07301269790429686, "grad_norm": 9.042886627410118, "learning_rate": 4.9934882863252394e-05, "loss": 2.5385, "mean_token_accuracy": 0.4052026629447937, "step": 72490 }, { "epoch": 0.07301773395740103, "grad_norm": 14.822220398822482, "learning_rate": 4.9934854372501966e-05, "loss": 2.4732, "mean_token_accuracy": 0.4155474901199341, "step": 72495 }, { "epoch": 0.07302277001050521, "grad_norm": 12.682939884496433, "learning_rate": 4.9934825875529145e-05, "loss": 3.0513, "mean_token_accuracy": 0.35862069129943847, "step": 72500 }, { "epoch": 0.07302780606360938, "grad_norm": 9.793739351461229, "learning_rate": 4.993479737233394e-05, "loss": 2.3138, "mean_token_accuracy": 0.44482758045196535, "step": 72505 }, { "epoch": 0.07303284211671356, "grad_norm": 13.67818198823798, "learning_rate": 4.993476886291636e-05, "loss": 2.8014, "mean_token_accuracy": 0.37241379618644715, "step": 72510 }, { "epoch": 0.07303787816981773, "grad_norm": 12.882179464599904, "learning_rate": 4.993474034727642e-05, "loss": 2.3201, "mean_token_accuracy": 0.4091349124908447, "step": 72515 }, { "epoch": 0.0730429142229219, "grad_norm": 13.22214372743453, "learning_rate": 4.99347118254141e-05, "loss": 3.0196, "mean_token_accuracy": 0.3551724135875702, "step": 72520 }, { "epoch": 0.07304795027602608, "grad_norm": 10.797604812921078, "learning_rate": 4.993468329732945e-05, "loss": 2.773, "mean_token_accuracy": 0.3896551698446274, "step": 72525 }, { "epoch": 0.07305298632913024, "grad_norm": 14.85796495571364, "learning_rate": 4.9934654763022447e-05, "loss": 2.6073, "mean_token_accuracy": 0.4068965494632721, "step": 72530 }, { "epoch": 0.07305802238223441, "grad_norm": 12.887502078811295, "learning_rate": 4.993462622249312e-05, "loss": 2.773, "mean_token_accuracy": 0.38275861740112305, "step": 72535 }, { "epoch": 0.07306305843533858, "grad_norm": 14.42940983692418, "learning_rate": 4.993459767574146e-05, "loss": 2.6287, "mean_token_accuracy": 0.3910465776920319, "step": 72540 }, { "epoch": 0.07306809448844276, "grad_norm": 12.414631158400018, "learning_rate": 4.9934569122767475e-05, "loss": 2.5637, "mean_token_accuracy": 0.4379310369491577, "step": 72545 }, { "epoch": 0.07307313054154693, "grad_norm": 16.570800641473394, "learning_rate": 4.9934540563571187e-05, "loss": 2.4767, "mean_token_accuracy": 0.44307319521903993, "step": 72550 }, { "epoch": 0.0730781665946511, "grad_norm": 10.08503365826091, "learning_rate": 4.993451199815258e-05, "loss": 2.0344, "mean_token_accuracy": 0.49479734897613525, "step": 72555 }, { "epoch": 0.07308320264775528, "grad_norm": 16.5182442673272, "learning_rate": 4.99344834265117e-05, "loss": 2.5348, "mean_token_accuracy": 0.41724138259887694, "step": 72560 }, { "epoch": 0.07308823870085945, "grad_norm": 14.428776681431172, "learning_rate": 4.993445484864851e-05, "loss": 2.4657, "mean_token_accuracy": 0.41379310488700866, "step": 72565 }, { "epoch": 0.07309327475396363, "grad_norm": 12.09529962967115, "learning_rate": 4.9934426264563055e-05, "loss": 2.4007, "mean_token_accuracy": 0.4620689630508423, "step": 72570 }, { "epoch": 0.0730983108070678, "grad_norm": 9.860982635321337, "learning_rate": 4.993439767425532e-05, "loss": 2.4117, "mean_token_accuracy": 0.41034482717514037, "step": 72575 }, { "epoch": 0.07310334686017197, "grad_norm": 9.620055548909258, "learning_rate": 4.993436907772533e-05, "loss": 2.1648, "mean_token_accuracy": 0.44137930274009707, "step": 72580 }, { "epoch": 0.07310838291327615, "grad_norm": 13.321161916269885, "learning_rate": 4.9934340474973085e-05, "loss": 2.1513, "mean_token_accuracy": 0.49655172824859617, "step": 72585 }, { "epoch": 0.07311341896638032, "grad_norm": 9.368303441956412, "learning_rate": 4.993431186599858e-05, "loss": 2.29, "mean_token_accuracy": 0.42068964838981626, "step": 72590 }, { "epoch": 0.0731184550194845, "grad_norm": 12.628769166968187, "learning_rate": 4.993428325080184e-05, "loss": 2.378, "mean_token_accuracy": 0.4050211727619171, "step": 72595 }, { "epoch": 0.07312349107258866, "grad_norm": 9.00620295301634, "learning_rate": 4.993425462938288e-05, "loss": 2.4072, "mean_token_accuracy": 0.4206896543502808, "step": 72600 }, { "epoch": 0.07312852712569283, "grad_norm": 11.16097721676781, "learning_rate": 4.993422600174168e-05, "loss": 2.5759, "mean_token_accuracy": 0.35862068831920624, "step": 72605 }, { "epoch": 0.073133563178797, "grad_norm": 11.679502619804406, "learning_rate": 4.993419736787827e-05, "loss": 2.8263, "mean_token_accuracy": 0.3896551728248596, "step": 72610 }, { "epoch": 0.07313859923190118, "grad_norm": 12.992594811845812, "learning_rate": 4.993416872779265e-05, "loss": 2.6594, "mean_token_accuracy": 0.36896551847457887, "step": 72615 }, { "epoch": 0.07314363528500535, "grad_norm": 10.745330435193642, "learning_rate": 4.993414008148483e-05, "loss": 2.5455, "mean_token_accuracy": 0.3931034505367279, "step": 72620 }, { "epoch": 0.07314867133810952, "grad_norm": 11.082977663490098, "learning_rate": 4.9934111428954815e-05, "loss": 2.9057, "mean_token_accuracy": 0.37241379022598264, "step": 72625 }, { "epoch": 0.0731537073912137, "grad_norm": 14.518551347517656, "learning_rate": 4.9934082770202625e-05, "loss": 2.5439, "mean_token_accuracy": 0.4517241299152374, "step": 72630 }, { "epoch": 0.07315874344431787, "grad_norm": 10.504482318490387, "learning_rate": 4.9934054105228246e-05, "loss": 2.2096, "mean_token_accuracy": 0.4034482777118683, "step": 72635 }, { "epoch": 0.07316377949742205, "grad_norm": 12.078324314967087, "learning_rate": 4.993402543403171e-05, "loss": 2.2463, "mean_token_accuracy": 0.46551724076271056, "step": 72640 }, { "epoch": 0.07316881555052622, "grad_norm": 11.53250257379269, "learning_rate": 4.9933996756613e-05, "loss": 2.7343, "mean_token_accuracy": 0.39655172228813174, "step": 72645 }, { "epoch": 0.0731738516036304, "grad_norm": 11.351865964699005, "learning_rate": 4.9933968072972145e-05, "loss": 2.6475, "mean_token_accuracy": 0.3482758581638336, "step": 72650 }, { "epoch": 0.07317888765673457, "grad_norm": 10.706577265826784, "learning_rate": 4.9933939383109146e-05, "loss": 2.6182, "mean_token_accuracy": 0.4, "step": 72655 }, { "epoch": 0.07318392370983874, "grad_norm": 14.59814882408465, "learning_rate": 4.993391068702401e-05, "loss": 2.7607, "mean_token_accuracy": 0.3931034475564957, "step": 72660 }, { "epoch": 0.07318895976294292, "grad_norm": 10.27141236090536, "learning_rate": 4.9933881984716745e-05, "loss": 2.8877, "mean_token_accuracy": 0.3482758581638336, "step": 72665 }, { "epoch": 0.07319399581604707, "grad_norm": 13.515274801470566, "learning_rate": 4.993385327618736e-05, "loss": 2.423, "mean_token_accuracy": 0.41034482717514037, "step": 72670 }, { "epoch": 0.07319903186915125, "grad_norm": 11.496856593179961, "learning_rate": 4.9933824561435865e-05, "loss": 2.2281, "mean_token_accuracy": 0.4344827592372894, "step": 72675 }, { "epoch": 0.07320406792225542, "grad_norm": 10.023850480946967, "learning_rate": 4.9933795840462254e-05, "loss": 2.6116, "mean_token_accuracy": 0.3862069010734558, "step": 72680 }, { "epoch": 0.0732091039753596, "grad_norm": 10.748159748281589, "learning_rate": 4.993376711326655e-05, "loss": 2.4868, "mean_token_accuracy": 0.40344828069210054, "step": 72685 }, { "epoch": 0.07321414002846377, "grad_norm": 10.671028914736983, "learning_rate": 4.9933738379848764e-05, "loss": 2.8209, "mean_token_accuracy": 0.36206896901130675, "step": 72690 }, { "epoch": 0.07321917608156794, "grad_norm": 9.402937272038015, "learning_rate": 4.993370964020889e-05, "loss": 2.1738, "mean_token_accuracy": 0.4413793087005615, "step": 72695 }, { "epoch": 0.07322421213467212, "grad_norm": 11.180710195116706, "learning_rate": 4.9933680894346945e-05, "loss": 2.7296, "mean_token_accuracy": 0.38620689809322356, "step": 72700 }, { "epoch": 0.07322924818777629, "grad_norm": 11.731293533991138, "learning_rate": 4.993365214226294e-05, "loss": 2.7864, "mean_token_accuracy": 0.39310344457626345, "step": 72705 }, { "epoch": 0.07323428424088047, "grad_norm": 12.775391319381187, "learning_rate": 4.993362338395687e-05, "loss": 2.7982, "mean_token_accuracy": 0.3620689630508423, "step": 72710 }, { "epoch": 0.07323932029398464, "grad_norm": 10.168188944155752, "learning_rate": 4.993359461942876e-05, "loss": 2.9086, "mean_token_accuracy": 0.3448275804519653, "step": 72715 }, { "epoch": 0.07324435634708881, "grad_norm": 9.619224530721427, "learning_rate": 4.99335658486786e-05, "loss": 2.3741, "mean_token_accuracy": 0.4, "step": 72720 }, { "epoch": 0.07324939240019299, "grad_norm": 12.235530491565955, "learning_rate": 4.993353707170641e-05, "loss": 2.4882, "mean_token_accuracy": 0.3931034505367279, "step": 72725 }, { "epoch": 0.07325442845329716, "grad_norm": 14.161830482019129, "learning_rate": 4.9933508288512196e-05, "loss": 2.7582, "mean_token_accuracy": 0.40344825983047483, "step": 72730 }, { "epoch": 0.07325946450640133, "grad_norm": 11.065786996003816, "learning_rate": 4.9933479499095965e-05, "loss": 2.1326, "mean_token_accuracy": 0.5124016880989075, "step": 72735 }, { "epoch": 0.0732645005595055, "grad_norm": 10.943910892945322, "learning_rate": 4.993345070345772e-05, "loss": 2.8873, "mean_token_accuracy": 0.36206896901130675, "step": 72740 }, { "epoch": 0.07326953661260967, "grad_norm": 9.084490565789176, "learning_rate": 4.9933421901597484e-05, "loss": 2.3348, "mean_token_accuracy": 0.42758620381355283, "step": 72745 }, { "epoch": 0.07327457266571384, "grad_norm": 13.439504902987066, "learning_rate": 4.993339309351524e-05, "loss": 2.7223, "mean_token_accuracy": 0.3965517282485962, "step": 72750 }, { "epoch": 0.07327960871881802, "grad_norm": 11.124198028256757, "learning_rate": 4.993336427921102e-05, "loss": 2.5535, "mean_token_accuracy": 0.39655172228813174, "step": 72755 }, { "epoch": 0.07328464477192219, "grad_norm": 9.53519373722695, "learning_rate": 4.993333545868482e-05, "loss": 2.2601, "mean_token_accuracy": 0.38965516686439516, "step": 72760 }, { "epoch": 0.07328968082502636, "grad_norm": 11.02635320199772, "learning_rate": 4.993330663193666e-05, "loss": 2.7454, "mean_token_accuracy": 0.3724137932062149, "step": 72765 }, { "epoch": 0.07329471687813054, "grad_norm": 12.42936850765551, "learning_rate": 4.993327779896652e-05, "loss": 2.4891, "mean_token_accuracy": 0.3931034505367279, "step": 72770 }, { "epoch": 0.07329975293123471, "grad_norm": 11.532799952079722, "learning_rate": 4.9933248959774445e-05, "loss": 2.331, "mean_token_accuracy": 0.41905626058578493, "step": 72775 }, { "epoch": 0.07330478898433888, "grad_norm": 15.412544825014487, "learning_rate": 4.993322011436042e-05, "loss": 2.9721, "mean_token_accuracy": 0.3517241418361664, "step": 72780 }, { "epoch": 0.07330982503744306, "grad_norm": 13.022551276168073, "learning_rate": 4.993319126272445e-05, "loss": 2.7537, "mean_token_accuracy": 0.34827586710453035, "step": 72785 }, { "epoch": 0.07331486109054723, "grad_norm": 9.848614495781279, "learning_rate": 4.993316240486656e-05, "loss": 2.2557, "mean_token_accuracy": 0.47586206197738645, "step": 72790 }, { "epoch": 0.0733198971436514, "grad_norm": 13.677853765357694, "learning_rate": 4.9933133540786743e-05, "loss": 2.3623, "mean_token_accuracy": 0.4206896543502808, "step": 72795 }, { "epoch": 0.07332493319675558, "grad_norm": 14.000033698736551, "learning_rate": 4.993310467048501e-05, "loss": 2.9051, "mean_token_accuracy": 0.36896551251411436, "step": 72800 }, { "epoch": 0.07332996924985975, "grad_norm": 10.790073204684578, "learning_rate": 4.993307579396138e-05, "loss": 2.6302, "mean_token_accuracy": 0.37586206793785093, "step": 72805 }, { "epoch": 0.07333500530296391, "grad_norm": 10.949172310787837, "learning_rate": 4.993304691121585e-05, "loss": 2.3274, "mean_token_accuracy": 0.4482758641242981, "step": 72810 }, { "epoch": 0.07334004135606809, "grad_norm": 9.36385497372334, "learning_rate": 4.993301802224843e-05, "loss": 2.376, "mean_token_accuracy": 0.41379311084747317, "step": 72815 }, { "epoch": 0.07334507740917226, "grad_norm": 12.324114294063513, "learning_rate": 4.9932989127059124e-05, "loss": 3.2396, "mean_token_accuracy": 0.3655172437429428, "step": 72820 }, { "epoch": 0.07335011346227643, "grad_norm": 10.531440125641488, "learning_rate": 4.993296022564795e-05, "loss": 2.4096, "mean_token_accuracy": 0.4, "step": 72825 }, { "epoch": 0.07335514951538061, "grad_norm": 10.631136358316258, "learning_rate": 4.993293131801491e-05, "loss": 2.3275, "mean_token_accuracy": 0.41379310488700866, "step": 72830 }, { "epoch": 0.07336018556848478, "grad_norm": 15.867949896793366, "learning_rate": 4.993290240416001e-05, "loss": 2.5764, "mean_token_accuracy": 0.4551724135875702, "step": 72835 }, { "epoch": 0.07336522162158896, "grad_norm": 13.735210710100157, "learning_rate": 4.993287348408326e-05, "loss": 2.4499, "mean_token_accuracy": 0.3601935863494873, "step": 72840 }, { "epoch": 0.07337025767469313, "grad_norm": 11.580109443927407, "learning_rate": 4.993284455778467e-05, "loss": 2.4587, "mean_token_accuracy": 0.4, "step": 72845 }, { "epoch": 0.0733752937277973, "grad_norm": 13.355206504472298, "learning_rate": 4.993281562526426e-05, "loss": 2.3773, "mean_token_accuracy": 0.43103447556495667, "step": 72850 }, { "epoch": 0.07338032978090148, "grad_norm": 15.364359142280078, "learning_rate": 4.993278668652201e-05, "loss": 2.1644, "mean_token_accuracy": 0.46613301038742067, "step": 72855 }, { "epoch": 0.07338536583400565, "grad_norm": 9.531368633444488, "learning_rate": 4.9932757741557945e-05, "loss": 2.4609, "mean_token_accuracy": 0.3862068891525269, "step": 72860 }, { "epoch": 0.07339040188710982, "grad_norm": 13.954673235391898, "learning_rate": 4.9932728790372075e-05, "loss": 2.4213, "mean_token_accuracy": 0.4344827592372894, "step": 72865 }, { "epoch": 0.073395437940214, "grad_norm": 11.378571871075621, "learning_rate": 4.99326998329644e-05, "loss": 2.6886, "mean_token_accuracy": 0.42758620977401735, "step": 72870 }, { "epoch": 0.07340047399331817, "grad_norm": 9.849227185488344, "learning_rate": 4.993267086933493e-05, "loss": 1.8146, "mean_token_accuracy": 0.5354679763317108, "step": 72875 }, { "epoch": 0.07340551004642233, "grad_norm": 10.380432057865466, "learning_rate": 4.993264189948368e-05, "loss": 2.5326, "mean_token_accuracy": 0.3965517163276672, "step": 72880 }, { "epoch": 0.0734105460995265, "grad_norm": 10.51658420074308, "learning_rate": 4.993261292341066e-05, "loss": 2.356, "mean_token_accuracy": 0.41905626058578493, "step": 72885 }, { "epoch": 0.07341558215263068, "grad_norm": 9.936882840583388, "learning_rate": 4.9932583941115855e-05, "loss": 2.4113, "mean_token_accuracy": 0.4413793057203293, "step": 72890 }, { "epoch": 0.07342061820573485, "grad_norm": 10.70935240015387, "learning_rate": 4.9932554952599305e-05, "loss": 2.6979, "mean_token_accuracy": 0.3827586203813553, "step": 72895 }, { "epoch": 0.07342565425883903, "grad_norm": 13.129191040735558, "learning_rate": 4.993252595786099e-05, "loss": 2.3924, "mean_token_accuracy": 0.3896551728248596, "step": 72900 }, { "epoch": 0.0734306903119432, "grad_norm": 10.439409353260482, "learning_rate": 4.993249695690094e-05, "loss": 2.8437, "mean_token_accuracy": 0.3724138021469116, "step": 72905 }, { "epoch": 0.07343572636504737, "grad_norm": 12.301473552007637, "learning_rate": 4.9932467949719144e-05, "loss": 2.2727, "mean_token_accuracy": 0.4310344815254211, "step": 72910 }, { "epoch": 0.07344076241815155, "grad_norm": 12.647491035609791, "learning_rate": 4.993243893631562e-05, "loss": 2.547, "mean_token_accuracy": 0.4517241299152374, "step": 72915 }, { "epoch": 0.07344579847125572, "grad_norm": 13.183942009543351, "learning_rate": 4.9932409916690377e-05, "loss": 2.3256, "mean_token_accuracy": 0.4, "step": 72920 }, { "epoch": 0.0734508345243599, "grad_norm": 9.769951232078048, "learning_rate": 4.9932380890843424e-05, "loss": 2.5652, "mean_token_accuracy": 0.379310342669487, "step": 72925 }, { "epoch": 0.07345587057746407, "grad_norm": 10.333832272303406, "learning_rate": 4.9932351858774764e-05, "loss": 2.8044, "mean_token_accuracy": 0.3793103516101837, "step": 72930 }, { "epoch": 0.07346090663056824, "grad_norm": 13.539689825793765, "learning_rate": 4.993232282048441e-05, "loss": 2.5553, "mean_token_accuracy": 0.3896551787853241, "step": 72935 }, { "epoch": 0.07346594268367242, "grad_norm": 13.204383090535488, "learning_rate": 4.993229377597237e-05, "loss": 2.619, "mean_token_accuracy": 0.35862069129943847, "step": 72940 }, { "epoch": 0.07347097873677659, "grad_norm": 14.516340446792917, "learning_rate": 4.993226472523865e-05, "loss": 2.6091, "mean_token_accuracy": 0.39310344457626345, "step": 72945 }, { "epoch": 0.07347601478988075, "grad_norm": 10.791738044635656, "learning_rate": 4.993223566828325e-05, "loss": 2.559, "mean_token_accuracy": 0.3999999940395355, "step": 72950 }, { "epoch": 0.07348105084298492, "grad_norm": 12.825047126786925, "learning_rate": 4.993220660510619e-05, "loss": 2.886, "mean_token_accuracy": 0.41379310488700866, "step": 72955 }, { "epoch": 0.0734860868960891, "grad_norm": 11.793790759054717, "learning_rate": 4.993217753570747e-05, "loss": 2.4459, "mean_token_accuracy": 0.43103447556495667, "step": 72960 }, { "epoch": 0.07349112294919327, "grad_norm": 11.096154805450695, "learning_rate": 4.993214846008711e-05, "loss": 2.6377, "mean_token_accuracy": 0.3689655065536499, "step": 72965 }, { "epoch": 0.07349615900229745, "grad_norm": 12.399624168464097, "learning_rate": 4.9932119378245105e-05, "loss": 2.4626, "mean_token_accuracy": 0.4034482777118683, "step": 72970 }, { "epoch": 0.07350119505540162, "grad_norm": 16.21415537734024, "learning_rate": 4.993209029018147e-05, "loss": 2.7677, "mean_token_accuracy": 0.4034482717514038, "step": 72975 }, { "epoch": 0.07350623110850579, "grad_norm": 10.765529652947311, "learning_rate": 4.9932061195896206e-05, "loss": 2.8114, "mean_token_accuracy": 0.3379310339689255, "step": 72980 }, { "epoch": 0.07351126716160997, "grad_norm": 10.63799905466745, "learning_rate": 4.993203209538933e-05, "loss": 2.5619, "mean_token_accuracy": 0.4068965554237366, "step": 72985 }, { "epoch": 0.07351630321471414, "grad_norm": 8.956843544101632, "learning_rate": 4.993200298866085e-05, "loss": 2.5059, "mean_token_accuracy": 0.4137930989265442, "step": 72990 }, { "epoch": 0.07352133926781831, "grad_norm": 11.615231406846759, "learning_rate": 4.993197387571076e-05, "loss": 2.3642, "mean_token_accuracy": 0.42758620977401735, "step": 72995 }, { "epoch": 0.07352637532092249, "grad_norm": 11.737252297790269, "learning_rate": 4.993194475653909e-05, "loss": 2.7102, "mean_token_accuracy": 0.37586207389831544, "step": 73000 }, { "epoch": 0.07353141137402666, "grad_norm": 14.812531579043862, "learning_rate": 4.993191563114583e-05, "loss": 2.9156, "mean_token_accuracy": 0.3689655244350433, "step": 73005 }, { "epoch": 0.07353644742713084, "grad_norm": 12.820856068959575, "learning_rate": 4.9931886499530996e-05, "loss": 2.9206, "mean_token_accuracy": 0.31379309892654417, "step": 73010 }, { "epoch": 0.07354148348023501, "grad_norm": 11.658359093517545, "learning_rate": 4.993185736169459e-05, "loss": 2.7762, "mean_token_accuracy": 0.36551723480224607, "step": 73015 }, { "epoch": 0.07354651953333917, "grad_norm": 11.594873309794945, "learning_rate": 4.993182821763663e-05, "loss": 2.6411, "mean_token_accuracy": 0.3551724076271057, "step": 73020 }, { "epoch": 0.07355155558644334, "grad_norm": 10.11901341791273, "learning_rate": 4.9931799067357116e-05, "loss": 2.6076, "mean_token_accuracy": 0.3551724135875702, "step": 73025 }, { "epoch": 0.07355659163954752, "grad_norm": 9.808915724599974, "learning_rate": 4.993176991085606e-05, "loss": 2.5748, "mean_token_accuracy": 0.42068964838981626, "step": 73030 }, { "epoch": 0.07356162769265169, "grad_norm": 24.537367685285453, "learning_rate": 4.993174074813347e-05, "loss": 2.6997, "mean_token_accuracy": 0.42413792610168455, "step": 73035 }, { "epoch": 0.07356666374575586, "grad_norm": 11.758586122645296, "learning_rate": 4.993171157918935e-05, "loss": 2.5735, "mean_token_accuracy": 0.358620685338974, "step": 73040 }, { "epoch": 0.07357169979886004, "grad_norm": 12.208058050294424, "learning_rate": 4.993168240402372e-05, "loss": 2.7919, "mean_token_accuracy": 0.37241378128528596, "step": 73045 }, { "epoch": 0.07357673585196421, "grad_norm": 12.493677360779838, "learning_rate": 4.993165322263657e-05, "loss": 2.7788, "mean_token_accuracy": 0.40193586945533755, "step": 73050 }, { "epoch": 0.07358177190506839, "grad_norm": 11.389702768056457, "learning_rate": 4.993162403502792e-05, "loss": 2.3373, "mean_token_accuracy": 0.44137930274009707, "step": 73055 }, { "epoch": 0.07358680795817256, "grad_norm": 9.421003383414227, "learning_rate": 4.9931594841197776e-05, "loss": 2.6476, "mean_token_accuracy": 0.35862068831920624, "step": 73060 }, { "epoch": 0.07359184401127673, "grad_norm": 8.400750637578986, "learning_rate": 4.9931565641146145e-05, "loss": 2.3728, "mean_token_accuracy": 0.41034482419490814, "step": 73065 }, { "epoch": 0.0735968800643809, "grad_norm": 12.137269326414177, "learning_rate": 4.993153643487303e-05, "loss": 2.2787, "mean_token_accuracy": 0.47586206793785096, "step": 73070 }, { "epoch": 0.07360191611748508, "grad_norm": 12.6350670098919, "learning_rate": 4.993150722237845e-05, "loss": 3.0055, "mean_token_accuracy": 0.3379310369491577, "step": 73075 }, { "epoch": 0.07360695217058925, "grad_norm": 13.414931316332321, "learning_rate": 4.9931478003662416e-05, "loss": 2.535, "mean_token_accuracy": 0.4068965554237366, "step": 73080 }, { "epoch": 0.07361198822369343, "grad_norm": 12.710207224975148, "learning_rate": 4.9931448778724916e-05, "loss": 2.4041, "mean_token_accuracy": 0.4034482777118683, "step": 73085 }, { "epoch": 0.07361702427679759, "grad_norm": 11.125108155865965, "learning_rate": 4.993141954756597e-05, "loss": 2.594, "mean_token_accuracy": 0.38620689511299133, "step": 73090 }, { "epoch": 0.07362206032990176, "grad_norm": 9.41901852823503, "learning_rate": 4.993139031018559e-05, "loss": 2.1291, "mean_token_accuracy": 0.5137930989265442, "step": 73095 }, { "epoch": 0.07362709638300594, "grad_norm": 11.251465777286766, "learning_rate": 4.9931361066583784e-05, "loss": 2.4294, "mean_token_accuracy": 0.4588626801967621, "step": 73100 }, { "epoch": 0.07363213243611011, "grad_norm": 12.278595844752559, "learning_rate": 4.993133181676055e-05, "loss": 2.4409, "mean_token_accuracy": 0.44289171099662783, "step": 73105 }, { "epoch": 0.07363716848921428, "grad_norm": 14.714100078449578, "learning_rate": 4.99313025607159e-05, "loss": 2.6067, "mean_token_accuracy": 0.4068965554237366, "step": 73110 }, { "epoch": 0.07364220454231846, "grad_norm": 12.336767987476094, "learning_rate": 4.993127329844986e-05, "loss": 2.5407, "mean_token_accuracy": 0.4034482717514038, "step": 73115 }, { "epoch": 0.07364724059542263, "grad_norm": 11.33591917145981, "learning_rate": 4.993124402996241e-05, "loss": 2.4933, "mean_token_accuracy": 0.39310344457626345, "step": 73120 }, { "epoch": 0.0736522766485268, "grad_norm": 11.24738157986904, "learning_rate": 4.993121475525357e-05, "loss": 2.495, "mean_token_accuracy": 0.4034482717514038, "step": 73125 }, { "epoch": 0.07365731270163098, "grad_norm": 13.058965455704127, "learning_rate": 4.993118547432335e-05, "loss": 2.8349, "mean_token_accuracy": 0.32758620083332063, "step": 73130 }, { "epoch": 0.07366234875473515, "grad_norm": 11.169526090513259, "learning_rate": 4.9931156187171764e-05, "loss": 2.4791, "mean_token_accuracy": 0.42413792610168455, "step": 73135 }, { "epoch": 0.07366738480783933, "grad_norm": 10.116047180143173, "learning_rate": 4.993112689379881e-05, "loss": 2.0562, "mean_token_accuracy": 0.4814277052879333, "step": 73140 }, { "epoch": 0.0736724208609435, "grad_norm": 10.115841487983777, "learning_rate": 4.993109759420449e-05, "loss": 2.3807, "mean_token_accuracy": 0.42758620977401735, "step": 73145 }, { "epoch": 0.07367745691404767, "grad_norm": 14.134774698324755, "learning_rate": 4.993106828838883e-05, "loss": 2.9347, "mean_token_accuracy": 0.3655172407627106, "step": 73150 }, { "epoch": 0.07368249296715185, "grad_norm": 11.87922621593828, "learning_rate": 4.993103897635182e-05, "loss": 2.734, "mean_token_accuracy": 0.37241379022598264, "step": 73155 }, { "epoch": 0.073687529020256, "grad_norm": 11.964981753196366, "learning_rate": 4.99310096580935e-05, "loss": 2.683, "mean_token_accuracy": 0.4068965494632721, "step": 73160 }, { "epoch": 0.07369256507336018, "grad_norm": 9.447267569067238, "learning_rate": 4.993098033361384e-05, "loss": 2.7416, "mean_token_accuracy": 0.4068965554237366, "step": 73165 }, { "epoch": 0.07369760112646435, "grad_norm": 11.940509159511976, "learning_rate": 4.9930951002912864e-05, "loss": 2.7267, "mean_token_accuracy": 0.40344828367233276, "step": 73170 }, { "epoch": 0.07370263717956853, "grad_norm": 11.806198565836313, "learning_rate": 4.993092166599058e-05, "loss": 2.6046, "mean_token_accuracy": 0.42982456684112547, "step": 73175 }, { "epoch": 0.0737076732326727, "grad_norm": 15.673490059999738, "learning_rate": 4.993089232284699e-05, "loss": 2.8232, "mean_token_accuracy": 0.3609195381402969, "step": 73180 }, { "epoch": 0.07371270928577688, "grad_norm": 12.507235744234682, "learning_rate": 4.993086297348213e-05, "loss": 2.5893, "mean_token_accuracy": 0.4206896543502808, "step": 73185 }, { "epoch": 0.07371774533888105, "grad_norm": 12.826594827947092, "learning_rate": 4.993083361789597e-05, "loss": 2.5548, "mean_token_accuracy": 0.4206896543502808, "step": 73190 }, { "epoch": 0.07372278139198522, "grad_norm": 10.418834537369257, "learning_rate": 4.993080425608853e-05, "loss": 2.3695, "mean_token_accuracy": 0.4310344815254211, "step": 73195 }, { "epoch": 0.0737278174450894, "grad_norm": 11.778388546034375, "learning_rate": 4.993077488805983e-05, "loss": 2.8824, "mean_token_accuracy": 0.34482758343219755, "step": 73200 }, { "epoch": 0.07373285349819357, "grad_norm": 12.483087695271434, "learning_rate": 4.993074551380988e-05, "loss": 3.0121, "mean_token_accuracy": 0.3635208696126938, "step": 73205 }, { "epoch": 0.07373788955129774, "grad_norm": 11.274167954279978, "learning_rate": 4.993071613333866e-05, "loss": 2.6433, "mean_token_accuracy": 0.38620689511299133, "step": 73210 }, { "epoch": 0.07374292560440192, "grad_norm": 10.70864629119354, "learning_rate": 4.9930686746646215e-05, "loss": 2.3339, "mean_token_accuracy": 0.44482759237289426, "step": 73215 }, { "epoch": 0.07374796165750609, "grad_norm": 13.29235905194873, "learning_rate": 4.993065735373252e-05, "loss": 2.4882, "mean_token_accuracy": 0.42758620381355283, "step": 73220 }, { "epoch": 0.07375299771061027, "grad_norm": 16.602851434641263, "learning_rate": 4.993062795459761e-05, "loss": 2.706, "mean_token_accuracy": 0.3793103456497192, "step": 73225 }, { "epoch": 0.07375803376371443, "grad_norm": 12.610507239342391, "learning_rate": 4.993059854924148e-05, "loss": 2.438, "mean_token_accuracy": 0.41724138259887694, "step": 73230 }, { "epoch": 0.0737630698168186, "grad_norm": 13.885425634307948, "learning_rate": 4.993056913766413e-05, "loss": 2.469, "mean_token_accuracy": 0.4068965554237366, "step": 73235 }, { "epoch": 0.07376810586992277, "grad_norm": 12.057756074369651, "learning_rate": 4.993053971986559e-05, "loss": 2.3373, "mean_token_accuracy": 0.47586206793785096, "step": 73240 }, { "epoch": 0.07377314192302695, "grad_norm": 12.806532649367833, "learning_rate": 4.9930510295845854e-05, "loss": 2.3493, "mean_token_accuracy": 0.46896551847457885, "step": 73245 }, { "epoch": 0.07377817797613112, "grad_norm": 9.972113055927416, "learning_rate": 4.993048086560493e-05, "loss": 2.423, "mean_token_accuracy": 0.398064124584198, "step": 73250 }, { "epoch": 0.0737832140292353, "grad_norm": 16.327261704016014, "learning_rate": 4.9930451429142824e-05, "loss": 2.6025, "mean_token_accuracy": 0.38620689511299133, "step": 73255 }, { "epoch": 0.07378825008233947, "grad_norm": 16.451730189726295, "learning_rate": 4.993042198645956e-05, "loss": 2.6037, "mean_token_accuracy": 0.4206896543502808, "step": 73260 }, { "epoch": 0.07379328613544364, "grad_norm": 13.095720142230004, "learning_rate": 4.9930392537555125e-05, "loss": 2.6395, "mean_token_accuracy": 0.4068965494632721, "step": 73265 }, { "epoch": 0.07379832218854782, "grad_norm": 11.887171489716877, "learning_rate": 4.993036308242954e-05, "loss": 2.2182, "mean_token_accuracy": 0.48275862336158754, "step": 73270 }, { "epoch": 0.07380335824165199, "grad_norm": 12.825473541932265, "learning_rate": 4.993033362108281e-05, "loss": 2.9701, "mean_token_accuracy": 0.3310344785451889, "step": 73275 }, { "epoch": 0.07380839429475616, "grad_norm": 12.856875113497944, "learning_rate": 4.993030415351494e-05, "loss": 2.4272, "mean_token_accuracy": 0.4413793087005615, "step": 73280 }, { "epoch": 0.07381343034786034, "grad_norm": 14.206885665999089, "learning_rate": 4.993027467972595e-05, "loss": 2.4515, "mean_token_accuracy": 0.41034482717514037, "step": 73285 }, { "epoch": 0.07381846640096451, "grad_norm": 12.168017849721823, "learning_rate": 4.993024519971583e-05, "loss": 2.3596, "mean_token_accuracy": 0.4310344815254211, "step": 73290 }, { "epoch": 0.07382350245406867, "grad_norm": 12.218374452565037, "learning_rate": 4.9930215713484596e-05, "loss": 2.6897, "mean_token_accuracy": 0.42413792610168455, "step": 73295 }, { "epoch": 0.07382853850717284, "grad_norm": 10.063962451529624, "learning_rate": 4.9930186221032265e-05, "loss": 2.3322, "mean_token_accuracy": 0.41379310488700866, "step": 73300 }, { "epoch": 0.07383357456027702, "grad_norm": 11.130853008735604, "learning_rate": 4.9930156722358843e-05, "loss": 2.5183, "mean_token_accuracy": 0.3517241358757019, "step": 73305 }, { "epoch": 0.07383861061338119, "grad_norm": 18.641129227695448, "learning_rate": 4.993012721746433e-05, "loss": 2.7564, "mean_token_accuracy": 0.4379310369491577, "step": 73310 }, { "epoch": 0.07384364666648537, "grad_norm": 12.818905117003162, "learning_rate": 4.9930097706348733e-05, "loss": 2.6477, "mean_token_accuracy": 0.4, "step": 73315 }, { "epoch": 0.07384868271958954, "grad_norm": 11.189332873647237, "learning_rate": 4.9930068189012066e-05, "loss": 2.6046, "mean_token_accuracy": 0.4172413766384125, "step": 73320 }, { "epoch": 0.07385371877269371, "grad_norm": 18.980725744883244, "learning_rate": 4.993003866545434e-05, "loss": 2.66, "mean_token_accuracy": 0.4034482777118683, "step": 73325 }, { "epoch": 0.07385875482579789, "grad_norm": 13.13947859116219, "learning_rate": 4.993000913567555e-05, "loss": 2.3051, "mean_token_accuracy": 0.42758620977401735, "step": 73330 }, { "epoch": 0.07386379087890206, "grad_norm": 15.785769750312156, "learning_rate": 4.9929979599675726e-05, "loss": 2.7005, "mean_token_accuracy": 0.3999999940395355, "step": 73335 }, { "epoch": 0.07386882693200623, "grad_norm": 12.087716114462479, "learning_rate": 4.9929950057454856e-05, "loss": 2.7669, "mean_token_accuracy": 0.358620685338974, "step": 73340 }, { "epoch": 0.07387386298511041, "grad_norm": 10.450370328644395, "learning_rate": 4.992992050901295e-05, "loss": 2.3548, "mean_token_accuracy": 0.4344827592372894, "step": 73345 }, { "epoch": 0.07387889903821458, "grad_norm": 9.918572847349624, "learning_rate": 4.992989095435003e-05, "loss": 2.3837, "mean_token_accuracy": 0.4206896543502808, "step": 73350 }, { "epoch": 0.07388393509131876, "grad_norm": 11.18134995551533, "learning_rate": 4.9929861393466094e-05, "loss": 2.2824, "mean_token_accuracy": 0.4517241358757019, "step": 73355 }, { "epoch": 0.07388897114442293, "grad_norm": 16.190041723660187, "learning_rate": 4.992983182636115e-05, "loss": 2.6668, "mean_token_accuracy": 0.4, "step": 73360 }, { "epoch": 0.07389400719752709, "grad_norm": 12.270344644505114, "learning_rate": 4.992980225303521e-05, "loss": 2.5687, "mean_token_accuracy": 0.43793103098869324, "step": 73365 }, { "epoch": 0.07389904325063126, "grad_norm": 12.532088683868126, "learning_rate": 4.992977267348829e-05, "loss": 2.8418, "mean_token_accuracy": 0.35862069129943847, "step": 73370 }, { "epoch": 0.07390407930373544, "grad_norm": 11.897729541711936, "learning_rate": 4.9929743087720374e-05, "loss": 2.318, "mean_token_accuracy": 0.4620689630508423, "step": 73375 }, { "epoch": 0.07390911535683961, "grad_norm": 10.775705310205334, "learning_rate": 4.9929713495731494e-05, "loss": 2.4972, "mean_token_accuracy": 0.34137930572032926, "step": 73380 }, { "epoch": 0.07391415140994378, "grad_norm": 9.662684205347084, "learning_rate": 4.9929683897521645e-05, "loss": 2.6227, "mean_token_accuracy": 0.4206896543502808, "step": 73385 }, { "epoch": 0.07391918746304796, "grad_norm": 10.574146389962053, "learning_rate": 4.992965429309085e-05, "loss": 2.2444, "mean_token_accuracy": 0.4310344934463501, "step": 73390 }, { "epoch": 0.07392422351615213, "grad_norm": 9.983893004555474, "learning_rate": 4.9929624682439093e-05, "loss": 2.4134, "mean_token_accuracy": 0.4006157636642456, "step": 73395 }, { "epoch": 0.0739292595692563, "grad_norm": 8.32590652239136, "learning_rate": 4.99295950655664e-05, "loss": 2.3049, "mean_token_accuracy": 0.4482758641242981, "step": 73400 }, { "epoch": 0.07393429562236048, "grad_norm": 14.164893740010019, "learning_rate": 4.9929565442472774e-05, "loss": 2.5778, "mean_token_accuracy": 0.4241379350423813, "step": 73405 }, { "epoch": 0.07393933167546465, "grad_norm": 10.591174181187723, "learning_rate": 4.992953581315823e-05, "loss": 2.7636, "mean_token_accuracy": 0.37241379618644715, "step": 73410 }, { "epoch": 0.07394436772856883, "grad_norm": 10.574883267593195, "learning_rate": 4.992950617762276e-05, "loss": 2.8891, "mean_token_accuracy": 0.36896551847457887, "step": 73415 }, { "epoch": 0.073949403781673, "grad_norm": 10.887037162874512, "learning_rate": 4.992947653586639e-05, "loss": 2.6807, "mean_token_accuracy": 0.36896551847457887, "step": 73420 }, { "epoch": 0.07395443983477717, "grad_norm": 13.198337434367195, "learning_rate": 4.992944688788912e-05, "loss": 2.2645, "mean_token_accuracy": 0.43793103098869324, "step": 73425 }, { "epoch": 0.07395947588788135, "grad_norm": 13.66676876916609, "learning_rate": 4.992941723369096e-05, "loss": 2.512, "mean_token_accuracy": 0.3724138021469116, "step": 73430 }, { "epoch": 0.07396451194098551, "grad_norm": 9.754701692776823, "learning_rate": 4.992938757327193e-05, "loss": 2.4385, "mean_token_accuracy": 0.42413792610168455, "step": 73435 }, { "epoch": 0.07396954799408968, "grad_norm": 12.740677496675143, "learning_rate": 4.992935790663201e-05, "loss": 2.536, "mean_token_accuracy": 0.44283121824264526, "step": 73440 }, { "epoch": 0.07397458404719386, "grad_norm": 11.446803123817192, "learning_rate": 4.992932823377122e-05, "loss": 2.2865, "mean_token_accuracy": 0.3827586233615875, "step": 73445 }, { "epoch": 0.07397962010029803, "grad_norm": 13.44293547264596, "learning_rate": 4.992929855468958e-05, "loss": 2.5088, "mean_token_accuracy": 0.4206896543502808, "step": 73450 }, { "epoch": 0.0739846561534022, "grad_norm": 9.961757320240517, "learning_rate": 4.992926886938709e-05, "loss": 2.3921, "mean_token_accuracy": 0.37931033968925476, "step": 73455 }, { "epoch": 0.07398969220650638, "grad_norm": 13.262266961680465, "learning_rate": 4.992923917786376e-05, "loss": 2.5422, "mean_token_accuracy": 0.4620689630508423, "step": 73460 }, { "epoch": 0.07399472825961055, "grad_norm": 10.811437051006283, "learning_rate": 4.99292094801196e-05, "loss": 2.0815, "mean_token_accuracy": 0.48275861144065857, "step": 73465 }, { "epoch": 0.07399976431271472, "grad_norm": 14.136141277277854, "learning_rate": 4.9929179776154605e-05, "loss": 2.7742, "mean_token_accuracy": 0.382758629322052, "step": 73470 }, { "epoch": 0.0740048003658189, "grad_norm": 11.171392109253986, "learning_rate": 4.99291500659688e-05, "loss": 2.6843, "mean_token_accuracy": 0.37241379618644715, "step": 73475 }, { "epoch": 0.07400983641892307, "grad_norm": 10.163146715926262, "learning_rate": 4.992912034956218e-05, "loss": 2.8853, "mean_token_accuracy": 0.37241379022598264, "step": 73480 }, { "epoch": 0.07401487247202725, "grad_norm": 9.574556097390916, "learning_rate": 4.9929090626934765e-05, "loss": 2.4248, "mean_token_accuracy": 0.4186932861804962, "step": 73485 }, { "epoch": 0.07401990852513142, "grad_norm": 10.902211327207015, "learning_rate": 4.992906089808656e-05, "loss": 2.6733, "mean_token_accuracy": 0.37241379022598264, "step": 73490 }, { "epoch": 0.0740249445782356, "grad_norm": 18.03793466589256, "learning_rate": 4.9929031163017576e-05, "loss": 2.7994, "mean_token_accuracy": 0.4103448331356049, "step": 73495 }, { "epoch": 0.07402998063133977, "grad_norm": 11.989701035481728, "learning_rate": 4.99290014217278e-05, "loss": 2.5555, "mean_token_accuracy": 0.3813067078590393, "step": 73500 }, { "epoch": 0.07403501668444393, "grad_norm": 11.234611477752159, "learning_rate": 4.992897167421727e-05, "loss": 2.8132, "mean_token_accuracy": 0.3569872945547104, "step": 73505 }, { "epoch": 0.0740400527375481, "grad_norm": 10.779105568387152, "learning_rate": 4.992894192048598e-05, "loss": 2.5425, "mean_token_accuracy": 0.41034482717514037, "step": 73510 }, { "epoch": 0.07404508879065227, "grad_norm": 10.815033309149127, "learning_rate": 4.9928912160533935e-05, "loss": 2.3157, "mean_token_accuracy": 0.43103447556495667, "step": 73515 }, { "epoch": 0.07405012484375645, "grad_norm": 11.653483333677219, "learning_rate": 4.9928882394361145e-05, "loss": 3.2556, "mean_token_accuracy": 0.3793103486299515, "step": 73520 }, { "epoch": 0.07405516089686062, "grad_norm": 12.153108381758727, "learning_rate": 4.992885262196763e-05, "loss": 2.6998, "mean_token_accuracy": 0.4137930989265442, "step": 73525 }, { "epoch": 0.0740601969499648, "grad_norm": 25.679417456639484, "learning_rate": 4.9928822843353384e-05, "loss": 2.8483, "mean_token_accuracy": 0.40689654350280763, "step": 73530 }, { "epoch": 0.07406523300306897, "grad_norm": 10.142071095366905, "learning_rate": 4.992879305851842e-05, "loss": 2.6875, "mean_token_accuracy": 0.36896551847457887, "step": 73535 }, { "epoch": 0.07407026905617314, "grad_norm": 9.301850941240241, "learning_rate": 4.992876326746275e-05, "loss": 2.4339, "mean_token_accuracy": 0.4482758641242981, "step": 73540 }, { "epoch": 0.07407530510927732, "grad_norm": 12.888182695309183, "learning_rate": 4.992873347018638e-05, "loss": 2.6849, "mean_token_accuracy": 0.38965516686439516, "step": 73545 }, { "epoch": 0.07408034116238149, "grad_norm": 14.152800807487276, "learning_rate": 4.992870366668931e-05, "loss": 2.8768, "mean_token_accuracy": 0.39479734301567077, "step": 73550 }, { "epoch": 0.07408537721548566, "grad_norm": 9.95740163546874, "learning_rate": 4.992867385697155e-05, "loss": 2.6272, "mean_token_accuracy": 0.358620685338974, "step": 73555 }, { "epoch": 0.07409041326858984, "grad_norm": 11.774748379783778, "learning_rate": 4.9928644041033124e-05, "loss": 2.7446, "mean_token_accuracy": 0.4103448212146759, "step": 73560 }, { "epoch": 0.07409544932169401, "grad_norm": 12.301603490002, "learning_rate": 4.992861421887403e-05, "loss": 2.2436, "mean_token_accuracy": 0.45517241954803467, "step": 73565 }, { "epoch": 0.07410048537479819, "grad_norm": 11.280020643228573, "learning_rate": 4.9928584390494275e-05, "loss": 2.4999, "mean_token_accuracy": 0.38275861740112305, "step": 73570 }, { "epoch": 0.07410552142790235, "grad_norm": 10.926622947459446, "learning_rate": 4.992855455589386e-05, "loss": 2.54, "mean_token_accuracy": 0.4241379380226135, "step": 73575 }, { "epoch": 0.07411055748100652, "grad_norm": 10.038760634445248, "learning_rate": 4.992852471507282e-05, "loss": 2.4701, "mean_token_accuracy": 0.42758620381355283, "step": 73580 }, { "epoch": 0.0741155935341107, "grad_norm": 11.186144441657776, "learning_rate": 4.9928494868031136e-05, "loss": 2.6027, "mean_token_accuracy": 0.4034482717514038, "step": 73585 }, { "epoch": 0.07412062958721487, "grad_norm": 16.284438748928075, "learning_rate": 4.9928465014768825e-05, "loss": 2.587, "mean_token_accuracy": 0.43793103098869324, "step": 73590 }, { "epoch": 0.07412566564031904, "grad_norm": 12.354442883816848, "learning_rate": 4.992843515528588e-05, "loss": 2.5547, "mean_token_accuracy": 0.42068964838981626, "step": 73595 }, { "epoch": 0.07413070169342321, "grad_norm": 11.57128647679959, "learning_rate": 4.9928405289582345e-05, "loss": 2.6626, "mean_token_accuracy": 0.3896551728248596, "step": 73600 }, { "epoch": 0.07413573774652739, "grad_norm": 11.370223771657468, "learning_rate": 4.9928375417658204e-05, "loss": 2.2989, "mean_token_accuracy": 0.40689654350280763, "step": 73605 }, { "epoch": 0.07414077379963156, "grad_norm": 9.501279251962071, "learning_rate": 4.9928345539513466e-05, "loss": 2.0364, "mean_token_accuracy": 0.48856624960899353, "step": 73610 }, { "epoch": 0.07414580985273574, "grad_norm": 12.992595672818029, "learning_rate": 4.9928315655148145e-05, "loss": 2.3876, "mean_token_accuracy": 0.3834975302219391, "step": 73615 }, { "epoch": 0.07415084590583991, "grad_norm": 12.60359006748628, "learning_rate": 4.992828576456224e-05, "loss": 2.5225, "mean_token_accuracy": 0.3988505780696869, "step": 73620 }, { "epoch": 0.07415588195894408, "grad_norm": 12.979697820965283, "learning_rate": 4.992825586775578e-05, "loss": 2.4946, "mean_token_accuracy": 0.40344828069210054, "step": 73625 }, { "epoch": 0.07416091801204826, "grad_norm": 11.567143070661796, "learning_rate": 4.992822596472875e-05, "loss": 2.5673, "mean_token_accuracy": 0.3827586233615875, "step": 73630 }, { "epoch": 0.07416595406515243, "grad_norm": 10.218169555661172, "learning_rate": 4.992819605548117e-05, "loss": 2.2641, "mean_token_accuracy": 0.4258923172950745, "step": 73635 }, { "epoch": 0.0741709901182566, "grad_norm": 10.88494920878584, "learning_rate": 4.992816614001305e-05, "loss": 2.2941, "mean_token_accuracy": 0.4068965494632721, "step": 73640 }, { "epoch": 0.07417602617136076, "grad_norm": 15.06867257197011, "learning_rate": 4.992813621832439e-05, "loss": 2.774, "mean_token_accuracy": 0.3862068891525269, "step": 73645 }, { "epoch": 0.07418106222446494, "grad_norm": 10.858335850392738, "learning_rate": 4.99281062904152e-05, "loss": 2.1657, "mean_token_accuracy": 0.4620689690113068, "step": 73650 }, { "epoch": 0.07418609827756911, "grad_norm": 12.931085581666954, "learning_rate": 4.99280763562855e-05, "loss": 2.6343, "mean_token_accuracy": 0.32413793057203294, "step": 73655 }, { "epoch": 0.07419113433067329, "grad_norm": 18.1439776834488, "learning_rate": 4.9928046415935284e-05, "loss": 2.4466, "mean_token_accuracy": 0.4034482777118683, "step": 73660 }, { "epoch": 0.07419617038377746, "grad_norm": 11.338843231084653, "learning_rate": 4.992801646936456e-05, "loss": 2.6214, "mean_token_accuracy": 0.4241379380226135, "step": 73665 }, { "epoch": 0.07420120643688163, "grad_norm": 11.021891851803653, "learning_rate": 4.9927986516573345e-05, "loss": 2.1308, "mean_token_accuracy": 0.43448275327682495, "step": 73670 }, { "epoch": 0.07420624248998581, "grad_norm": 12.188399459987645, "learning_rate": 4.992795655756165e-05, "loss": 2.5989, "mean_token_accuracy": 0.4034482717514038, "step": 73675 }, { "epoch": 0.07421127854308998, "grad_norm": 13.513414605959973, "learning_rate": 4.9927926592329474e-05, "loss": 2.7421, "mean_token_accuracy": 0.36896551251411436, "step": 73680 }, { "epoch": 0.07421631459619416, "grad_norm": 11.738558941992899, "learning_rate": 4.992789662087683e-05, "loss": 2.6094, "mean_token_accuracy": 0.3910465896129608, "step": 73685 }, { "epoch": 0.07422135064929833, "grad_norm": 11.59881670224189, "learning_rate": 4.9927866643203725e-05, "loss": 2.595, "mean_token_accuracy": 0.38620689511299133, "step": 73690 }, { "epoch": 0.0742263867024025, "grad_norm": 9.57494476153104, "learning_rate": 4.992783665931017e-05, "loss": 2.6028, "mean_token_accuracy": 0.3793103456497192, "step": 73695 }, { "epoch": 0.07423142275550668, "grad_norm": 11.096350505871342, "learning_rate": 4.992780666919617e-05, "loss": 2.6475, "mean_token_accuracy": 0.41034482717514037, "step": 73700 }, { "epoch": 0.07423645880861085, "grad_norm": 9.861435241801793, "learning_rate": 4.992777667286172e-05, "loss": 2.2941, "mean_token_accuracy": 0.42068966031074523, "step": 73705 }, { "epoch": 0.07424149486171502, "grad_norm": 10.047507831909865, "learning_rate": 4.992774667030686e-05, "loss": 2.4564, "mean_token_accuracy": 0.37241379022598264, "step": 73710 }, { "epoch": 0.07424653091481918, "grad_norm": 11.714142853487562, "learning_rate": 4.992771666153157e-05, "loss": 2.3799, "mean_token_accuracy": 0.41034482717514037, "step": 73715 }, { "epoch": 0.07425156696792336, "grad_norm": 12.139510687514797, "learning_rate": 4.992768664653588e-05, "loss": 2.5127, "mean_token_accuracy": 0.42413793206214906, "step": 73720 }, { "epoch": 0.07425660302102753, "grad_norm": 11.030309031569125, "learning_rate": 4.992765662531978e-05, "loss": 2.378, "mean_token_accuracy": 0.4206896543502808, "step": 73725 }, { "epoch": 0.0742616390741317, "grad_norm": 16.011755293173092, "learning_rate": 4.992762659788328e-05, "loss": 2.6507, "mean_token_accuracy": 0.4379310369491577, "step": 73730 }, { "epoch": 0.07426667512723588, "grad_norm": 10.666542903429121, "learning_rate": 4.992759656422641e-05, "loss": 2.3661, "mean_token_accuracy": 0.4122806966304779, "step": 73735 }, { "epoch": 0.07427171118034005, "grad_norm": 11.700557396980534, "learning_rate": 4.992756652434915e-05, "loss": 2.7827, "mean_token_accuracy": 0.37586207687854767, "step": 73740 }, { "epoch": 0.07427674723344423, "grad_norm": 14.5221424078562, "learning_rate": 4.9927536478251524e-05, "loss": 3.0883, "mean_token_accuracy": 0.3643678233027458, "step": 73745 }, { "epoch": 0.0742817832865484, "grad_norm": 9.616768043136565, "learning_rate": 4.992750642593354e-05, "loss": 2.2404, "mean_token_accuracy": 0.4448275864124298, "step": 73750 }, { "epoch": 0.07428681933965257, "grad_norm": 11.638447503321581, "learning_rate": 4.99274763673952e-05, "loss": 2.4899, "mean_token_accuracy": 0.4413793087005615, "step": 73755 }, { "epoch": 0.07429185539275675, "grad_norm": 11.575827599551001, "learning_rate": 4.992744630263652e-05, "loss": 2.5081, "mean_token_accuracy": 0.3827586114406586, "step": 73760 }, { "epoch": 0.07429689144586092, "grad_norm": 10.555713534593526, "learning_rate": 4.99274162316575e-05, "loss": 2.6938, "mean_token_accuracy": 0.34482758641242983, "step": 73765 }, { "epoch": 0.0743019274989651, "grad_norm": 11.767886188270383, "learning_rate": 4.992738615445815e-05, "loss": 2.199, "mean_token_accuracy": 0.4448275864124298, "step": 73770 }, { "epoch": 0.07430696355206927, "grad_norm": 10.267336498757054, "learning_rate": 4.992735607103849e-05, "loss": 2.7611, "mean_token_accuracy": 0.3931034505367279, "step": 73775 }, { "epoch": 0.07431199960517344, "grad_norm": 11.956318339799104, "learning_rate": 4.992732598139851e-05, "loss": 2.5448, "mean_token_accuracy": 0.4103448212146759, "step": 73780 }, { "epoch": 0.0743170356582776, "grad_norm": 12.035066326776143, "learning_rate": 4.992729588553823e-05, "loss": 2.2049, "mean_token_accuracy": 0.43103448748588563, "step": 73785 }, { "epoch": 0.07432207171138178, "grad_norm": 11.407407675402807, "learning_rate": 4.992726578345765e-05, "loss": 2.3361, "mean_token_accuracy": 0.4482758641242981, "step": 73790 }, { "epoch": 0.07432710776448595, "grad_norm": 10.56347306608944, "learning_rate": 4.99272356751568e-05, "loss": 2.211, "mean_token_accuracy": 0.46388384103775027, "step": 73795 }, { "epoch": 0.07433214381759012, "grad_norm": 10.726761001651662, "learning_rate": 4.992720556063566e-05, "loss": 2.7758, "mean_token_accuracy": 0.4206896543502808, "step": 73800 }, { "epoch": 0.0743371798706943, "grad_norm": 10.02281202840823, "learning_rate": 4.992717543989426e-05, "loss": 2.4204, "mean_token_accuracy": 0.4068965494632721, "step": 73805 }, { "epoch": 0.07434221592379847, "grad_norm": 9.929350755460154, "learning_rate": 4.992714531293259e-05, "loss": 2.1412, "mean_token_accuracy": 0.45517241954803467, "step": 73810 }, { "epoch": 0.07434725197690265, "grad_norm": 10.5921077840851, "learning_rate": 4.992711517975067e-05, "loss": 2.6367, "mean_token_accuracy": 0.35692681074142457, "step": 73815 }, { "epoch": 0.07435228803000682, "grad_norm": 11.405852522544334, "learning_rate": 4.992708504034851e-05, "loss": 2.2662, "mean_token_accuracy": 0.4536600112915039, "step": 73820 }, { "epoch": 0.07435732408311099, "grad_norm": 12.797345112831934, "learning_rate": 4.992705489472611e-05, "loss": 2.3771, "mean_token_accuracy": 0.46896551847457885, "step": 73825 }, { "epoch": 0.07436236013621517, "grad_norm": 11.526747412515364, "learning_rate": 4.992702474288349e-05, "loss": 2.5823, "mean_token_accuracy": 0.3876588046550751, "step": 73830 }, { "epoch": 0.07436739618931934, "grad_norm": 11.740531093059374, "learning_rate": 4.992699458482064e-05, "loss": 2.468, "mean_token_accuracy": 0.4, "step": 73835 }, { "epoch": 0.07437243224242351, "grad_norm": 11.66094294133679, "learning_rate": 4.992696442053758e-05, "loss": 2.4595, "mean_token_accuracy": 0.42238354682922363, "step": 73840 }, { "epoch": 0.07437746829552769, "grad_norm": 12.487936707267025, "learning_rate": 4.9926934250034325e-05, "loss": 2.3687, "mean_token_accuracy": 0.44827585816383364, "step": 73845 }, { "epoch": 0.07438250434863186, "grad_norm": 12.365072504406971, "learning_rate": 4.992690407331087e-05, "loss": 2.5797, "mean_token_accuracy": 0.358620685338974, "step": 73850 }, { "epoch": 0.07438754040173602, "grad_norm": 13.00708206286683, "learning_rate": 4.992687389036724e-05, "loss": 2.6934, "mean_token_accuracy": 0.41034482717514037, "step": 73855 }, { "epoch": 0.0743925764548402, "grad_norm": 14.072368648969313, "learning_rate": 4.992684370120342e-05, "loss": 2.5628, "mean_token_accuracy": 0.4586206912994385, "step": 73860 }, { "epoch": 0.07439761250794437, "grad_norm": 12.03604809660669, "learning_rate": 4.992681350581944e-05, "loss": 2.5602, "mean_token_accuracy": 0.4363581299781799, "step": 73865 }, { "epoch": 0.07440264856104854, "grad_norm": 12.493015138253426, "learning_rate": 4.992678330421529e-05, "loss": 2.3488, "mean_token_accuracy": 0.41034482717514037, "step": 73870 }, { "epoch": 0.07440768461415272, "grad_norm": 13.458675910740086, "learning_rate": 4.9926753096391e-05, "loss": 2.9717, "mean_token_accuracy": 0.35172412991523744, "step": 73875 }, { "epoch": 0.07441272066725689, "grad_norm": 15.650197165600286, "learning_rate": 4.992672288234656e-05, "loss": 2.5704, "mean_token_accuracy": 0.3931034505367279, "step": 73880 }, { "epoch": 0.07441775672036106, "grad_norm": 16.382106190663176, "learning_rate": 4.992669266208198e-05, "loss": 2.8511, "mean_token_accuracy": 0.3620689660310745, "step": 73885 }, { "epoch": 0.07442279277346524, "grad_norm": 9.292887021953403, "learning_rate": 4.9926662435597285e-05, "loss": 2.3653, "mean_token_accuracy": 0.4344827592372894, "step": 73890 }, { "epoch": 0.07442782882656941, "grad_norm": 10.133451892027326, "learning_rate": 4.992663220289246e-05, "loss": 2.3177, "mean_token_accuracy": 0.46702963709831236, "step": 73895 }, { "epoch": 0.07443286487967359, "grad_norm": 13.072493975901146, "learning_rate": 4.9926601963967526e-05, "loss": 2.5747, "mean_token_accuracy": 0.4137930989265442, "step": 73900 }, { "epoch": 0.07443790093277776, "grad_norm": 9.24923147309057, "learning_rate": 4.99265717188225e-05, "loss": 2.2384, "mean_token_accuracy": 0.4315270930528641, "step": 73905 }, { "epoch": 0.07444293698588193, "grad_norm": 10.921803307393334, "learning_rate": 4.9926541467457374e-05, "loss": 2.3667, "mean_token_accuracy": 0.38753780722618103, "step": 73910 }, { "epoch": 0.0744479730389861, "grad_norm": 17.321765298143042, "learning_rate": 4.9926511209872166e-05, "loss": 2.7622, "mean_token_accuracy": 0.3965517282485962, "step": 73915 }, { "epoch": 0.07445300909209028, "grad_norm": 12.299526909174983, "learning_rate": 4.9926480946066876e-05, "loss": 2.7418, "mean_token_accuracy": 0.34482758343219755, "step": 73920 }, { "epoch": 0.07445804514519444, "grad_norm": 11.327556309525347, "learning_rate": 4.9926450676041526e-05, "loss": 2.9629, "mean_token_accuracy": 0.3551724135875702, "step": 73925 }, { "epoch": 0.07446308119829861, "grad_norm": 13.44288272157233, "learning_rate": 4.9926420399796114e-05, "loss": 2.4074, "mean_token_accuracy": 0.441379314661026, "step": 73930 }, { "epoch": 0.07446811725140279, "grad_norm": 10.800823138773753, "learning_rate": 4.992639011733065e-05, "loss": 2.9734, "mean_token_accuracy": 0.4034482777118683, "step": 73935 }, { "epoch": 0.07447315330450696, "grad_norm": 15.073112974654988, "learning_rate": 4.9926359828645134e-05, "loss": 2.3611, "mean_token_accuracy": 0.37586206793785093, "step": 73940 }, { "epoch": 0.07447818935761114, "grad_norm": 10.523337776663704, "learning_rate": 4.992632953373959e-05, "loss": 2.5109, "mean_token_accuracy": 0.42758620977401735, "step": 73945 }, { "epoch": 0.07448322541071531, "grad_norm": 11.435353100698668, "learning_rate": 4.9926299232614025e-05, "loss": 2.7945, "mean_token_accuracy": 0.3448275804519653, "step": 73950 }, { "epoch": 0.07448826146381948, "grad_norm": 10.590176818118957, "learning_rate": 4.9926268925268436e-05, "loss": 2.555, "mean_token_accuracy": 0.42758620977401735, "step": 73955 }, { "epoch": 0.07449329751692366, "grad_norm": 10.30890059769074, "learning_rate": 4.9926238611702834e-05, "loss": 2.5352, "mean_token_accuracy": 0.37398669123649597, "step": 73960 }, { "epoch": 0.07449833357002783, "grad_norm": 10.143187054028136, "learning_rate": 4.9926208291917246e-05, "loss": 2.5683, "mean_token_accuracy": 0.37241379618644715, "step": 73965 }, { "epoch": 0.074503369623132, "grad_norm": 12.098449141244899, "learning_rate": 4.992617796591165e-05, "loss": 2.5848, "mean_token_accuracy": 0.41379310488700866, "step": 73970 }, { "epoch": 0.07450840567623618, "grad_norm": 12.575821298055848, "learning_rate": 4.9926147633686075e-05, "loss": 2.8461, "mean_token_accuracy": 0.36551723778247835, "step": 73975 }, { "epoch": 0.07451344172934035, "grad_norm": 15.01439106295847, "learning_rate": 4.9926117295240526e-05, "loss": 2.6533, "mean_token_accuracy": 0.4655172348022461, "step": 73980 }, { "epoch": 0.07451847778244453, "grad_norm": 10.019842620970008, "learning_rate": 4.992608695057501e-05, "loss": 2.4815, "mean_token_accuracy": 0.4655172288417816, "step": 73985 }, { "epoch": 0.0745235138355487, "grad_norm": 10.133255677470762, "learning_rate": 4.992605659968953e-05, "loss": 2.2421, "mean_token_accuracy": 0.4379310369491577, "step": 73990 }, { "epoch": 0.07452854988865286, "grad_norm": 10.761493921330587, "learning_rate": 4.9926026242584104e-05, "loss": 2.7199, "mean_token_accuracy": 0.38620689511299133, "step": 73995 }, { "epoch": 0.07453358594175703, "grad_norm": 11.69394381662905, "learning_rate": 4.992599587925874e-05, "loss": 2.7066, "mean_token_accuracy": 0.3862068891525269, "step": 74000 }, { "epoch": 0.0745386219948612, "grad_norm": 9.184443872991706, "learning_rate": 4.992596550971344e-05, "loss": 2.6132, "mean_token_accuracy": 0.4310344785451889, "step": 74005 }, { "epoch": 0.07454365804796538, "grad_norm": 11.29588590447559, "learning_rate": 4.992593513394821e-05, "loss": 2.3564, "mean_token_accuracy": 0.4068965494632721, "step": 74010 }, { "epoch": 0.07454869410106955, "grad_norm": 11.256098383171906, "learning_rate": 4.9925904751963074e-05, "loss": 2.2401, "mean_token_accuracy": 0.44827585816383364, "step": 74015 }, { "epoch": 0.07455373015417373, "grad_norm": 12.207555627296458, "learning_rate": 4.992587436375802e-05, "loss": 2.3819, "mean_token_accuracy": 0.47586206793785096, "step": 74020 }, { "epoch": 0.0745587662072779, "grad_norm": 15.714446832321306, "learning_rate": 4.992584396933306e-05, "loss": 2.5931, "mean_token_accuracy": 0.3896551638841629, "step": 74025 }, { "epoch": 0.07456380226038208, "grad_norm": 12.002240277487008, "learning_rate": 4.9925813568688215e-05, "loss": 2.745, "mean_token_accuracy": 0.42068964838981626, "step": 74030 }, { "epoch": 0.07456883831348625, "grad_norm": 11.199166995912377, "learning_rate": 4.992578316182349e-05, "loss": 2.9546, "mean_token_accuracy": 0.35995160341262816, "step": 74035 }, { "epoch": 0.07457387436659042, "grad_norm": 12.211916670978802, "learning_rate": 4.992575274873889e-05, "loss": 3.0113, "mean_token_accuracy": 0.3896551787853241, "step": 74040 }, { "epoch": 0.0745789104196946, "grad_norm": 12.28447005126784, "learning_rate": 4.992572232943442e-05, "loss": 2.5282, "mean_token_accuracy": 0.4034482717514038, "step": 74045 }, { "epoch": 0.07458394647279877, "grad_norm": 11.491021536384107, "learning_rate": 4.9925691903910096e-05, "loss": 2.8965, "mean_token_accuracy": 0.40344828367233276, "step": 74050 }, { "epoch": 0.07458898252590294, "grad_norm": 9.861451129187612, "learning_rate": 4.992566147216592e-05, "loss": 2.4128, "mean_token_accuracy": 0.3896551728248596, "step": 74055 }, { "epoch": 0.07459401857900712, "grad_norm": 11.676553106242576, "learning_rate": 4.99256310342019e-05, "loss": 2.4998, "mean_token_accuracy": 0.38965516686439516, "step": 74060 }, { "epoch": 0.07459905463211128, "grad_norm": 11.468093858443208, "learning_rate": 4.9925600590018054e-05, "loss": 2.3262, "mean_token_accuracy": 0.4120387136936188, "step": 74065 }, { "epoch": 0.07460409068521545, "grad_norm": 11.207836179168911, "learning_rate": 4.992557013961439e-05, "loss": 2.8414, "mean_token_accuracy": 0.41379310488700866, "step": 74070 }, { "epoch": 0.07460912673831963, "grad_norm": 10.411372061587082, "learning_rate": 4.99255396829909e-05, "loss": 2.1922, "mean_token_accuracy": 0.4724137902259827, "step": 74075 }, { "epoch": 0.0746141627914238, "grad_norm": 7.938392302790149, "learning_rate": 4.99255092201476e-05, "loss": 1.8021, "mean_token_accuracy": 0.5531760454177856, "step": 74080 }, { "epoch": 0.07461919884452797, "grad_norm": 13.26272150778796, "learning_rate": 4.992547875108451e-05, "loss": 2.7256, "mean_token_accuracy": 0.39655172228813174, "step": 74085 }, { "epoch": 0.07462423489763215, "grad_norm": 12.906795732617358, "learning_rate": 4.992544827580163e-05, "loss": 2.4238, "mean_token_accuracy": 0.4344827592372894, "step": 74090 }, { "epoch": 0.07462927095073632, "grad_norm": 11.635695899199026, "learning_rate": 4.992541779429896e-05, "loss": 2.4453, "mean_token_accuracy": 0.3827586233615875, "step": 74095 }, { "epoch": 0.0746343070038405, "grad_norm": 8.850676857887878, "learning_rate": 4.992538730657652e-05, "loss": 2.6257, "mean_token_accuracy": 0.41034482717514037, "step": 74100 }, { "epoch": 0.07463934305694467, "grad_norm": 9.291045497361653, "learning_rate": 4.9925356812634324e-05, "loss": 2.2709, "mean_token_accuracy": 0.4689655125141144, "step": 74105 }, { "epoch": 0.07464437911004884, "grad_norm": 12.51893398984911, "learning_rate": 4.9925326312472364e-05, "loss": 2.5777, "mean_token_accuracy": 0.38421052098274233, "step": 74110 }, { "epoch": 0.07464941516315302, "grad_norm": 14.849839847027791, "learning_rate": 4.9925295806090655e-05, "loss": 2.4177, "mean_token_accuracy": 0.4517241358757019, "step": 74115 }, { "epoch": 0.07465445121625719, "grad_norm": 11.594724948797252, "learning_rate": 4.992526529348921e-05, "loss": 2.4153, "mean_token_accuracy": 0.4482758641242981, "step": 74120 }, { "epoch": 0.07465948726936136, "grad_norm": 11.51934983668768, "learning_rate": 4.992523477466803e-05, "loss": 2.6677, "mean_token_accuracy": 0.39655172228813174, "step": 74125 }, { "epoch": 0.07466452332246554, "grad_norm": 10.365735737793699, "learning_rate": 4.992520424962713e-05, "loss": 2.4432, "mean_token_accuracy": 0.38275861740112305, "step": 74130 }, { "epoch": 0.0746695593755697, "grad_norm": 17.859420800833238, "learning_rate": 4.9925173718366515e-05, "loss": 2.5676, "mean_token_accuracy": 0.5054446518421173, "step": 74135 }, { "epoch": 0.07467459542867387, "grad_norm": 12.917236174423152, "learning_rate": 4.992514318088619e-05, "loss": 2.5153, "mean_token_accuracy": 0.3896551728248596, "step": 74140 }, { "epoch": 0.07467963148177804, "grad_norm": 10.55110949034747, "learning_rate": 4.992511263718617e-05, "loss": 2.2726, "mean_token_accuracy": 0.4, "step": 74145 }, { "epoch": 0.07468466753488222, "grad_norm": 12.13870132225336, "learning_rate": 4.992508208726647e-05, "loss": 2.6376, "mean_token_accuracy": 0.3896551787853241, "step": 74150 }, { "epoch": 0.07468970358798639, "grad_norm": 8.89327466054518, "learning_rate": 4.992505153112709e-05, "loss": 2.7559, "mean_token_accuracy": 0.42068966031074523, "step": 74155 }, { "epoch": 0.07469473964109057, "grad_norm": 10.64888647954188, "learning_rate": 4.992502096876802e-05, "loss": 2.6124, "mean_token_accuracy": 0.36896551251411436, "step": 74160 }, { "epoch": 0.07469977569419474, "grad_norm": 13.627241509361527, "learning_rate": 4.99249904001893e-05, "loss": 3.0611, "mean_token_accuracy": 0.3655172437429428, "step": 74165 }, { "epoch": 0.07470481174729891, "grad_norm": 11.908444496688013, "learning_rate": 4.992495982539093e-05, "loss": 2.4042, "mean_token_accuracy": 0.4206896543502808, "step": 74170 }, { "epoch": 0.07470984780040309, "grad_norm": 19.982741144463642, "learning_rate": 4.99249292443729e-05, "loss": 2.7375, "mean_token_accuracy": 0.4344827622175217, "step": 74175 }, { "epoch": 0.07471488385350726, "grad_norm": 12.58664641906852, "learning_rate": 4.992489865713524e-05, "loss": 2.6528, "mean_token_accuracy": 0.41724138259887694, "step": 74180 }, { "epoch": 0.07471991990661143, "grad_norm": 11.333665117570488, "learning_rate": 4.992486806367796e-05, "loss": 2.3685, "mean_token_accuracy": 0.42413793206214906, "step": 74185 }, { "epoch": 0.07472495595971561, "grad_norm": 13.665573995973682, "learning_rate": 4.992483746400105e-05, "loss": 2.6871, "mean_token_accuracy": 0.4000000059604645, "step": 74190 }, { "epoch": 0.07472999201281978, "grad_norm": 11.080945448061124, "learning_rate": 4.9924806858104525e-05, "loss": 2.6011, "mean_token_accuracy": 0.4, "step": 74195 }, { "epoch": 0.07473502806592396, "grad_norm": 11.629961530886522, "learning_rate": 4.9924776245988405e-05, "loss": 1.963, "mean_token_accuracy": 0.47931034564971925, "step": 74200 }, { "epoch": 0.07474006411902812, "grad_norm": 9.322069575672932, "learning_rate": 4.992474562765268e-05, "loss": 2.4343, "mean_token_accuracy": 0.42413792610168455, "step": 74205 }, { "epoch": 0.07474510017213229, "grad_norm": 9.621145206695322, "learning_rate": 4.992471500309736e-05, "loss": 2.3957, "mean_token_accuracy": 0.44827587008476255, "step": 74210 }, { "epoch": 0.07475013622523646, "grad_norm": 9.89626262987316, "learning_rate": 4.992468437232248e-05, "loss": 2.9162, "mean_token_accuracy": 0.38965516686439516, "step": 74215 }, { "epoch": 0.07475517227834064, "grad_norm": 16.778608014872635, "learning_rate": 4.992465373532802e-05, "loss": 2.0217, "mean_token_accuracy": 0.47931033968925474, "step": 74220 }, { "epoch": 0.07476020833144481, "grad_norm": 10.877294811953972, "learning_rate": 4.992462309211401e-05, "loss": 2.4218, "mean_token_accuracy": 0.41724138259887694, "step": 74225 }, { "epoch": 0.07476524438454898, "grad_norm": 10.196359983468803, "learning_rate": 4.992459244268044e-05, "loss": 1.9349, "mean_token_accuracy": 0.4862068951129913, "step": 74230 }, { "epoch": 0.07477028043765316, "grad_norm": 11.063878158498069, "learning_rate": 4.9924561787027324e-05, "loss": 2.4044, "mean_token_accuracy": 0.4206896543502808, "step": 74235 }, { "epoch": 0.07477531649075733, "grad_norm": 10.797848261889524, "learning_rate": 4.992453112515467e-05, "loss": 2.3162, "mean_token_accuracy": 0.41379310488700866, "step": 74240 }, { "epoch": 0.0747803525438615, "grad_norm": 14.902552168981597, "learning_rate": 4.9924500457062495e-05, "loss": 2.5545, "mean_token_accuracy": 0.41034482717514037, "step": 74245 }, { "epoch": 0.07478538859696568, "grad_norm": 11.51870346550696, "learning_rate": 4.992446978275079e-05, "loss": 2.4248, "mean_token_accuracy": 0.36551723480224607, "step": 74250 }, { "epoch": 0.07479042465006985, "grad_norm": 9.876411185200189, "learning_rate": 4.992443910221959e-05, "loss": 2.4941, "mean_token_accuracy": 0.4172413766384125, "step": 74255 }, { "epoch": 0.07479546070317403, "grad_norm": 12.343835083903386, "learning_rate": 4.9924408415468876e-05, "loss": 2.9519, "mean_token_accuracy": 0.36896551251411436, "step": 74260 }, { "epoch": 0.0748004967562782, "grad_norm": 11.875383357672037, "learning_rate": 4.992437772249867e-05, "loss": 2.6959, "mean_token_accuracy": 0.3896551728248596, "step": 74265 }, { "epoch": 0.07480553280938237, "grad_norm": 11.855927324418904, "learning_rate": 4.9924347023308986e-05, "loss": 2.3392, "mean_token_accuracy": 0.40689656138420105, "step": 74270 }, { "epoch": 0.07481056886248653, "grad_norm": 13.251153291016989, "learning_rate": 4.992431631789982e-05, "loss": 2.7221, "mean_token_accuracy": 0.3551724135875702, "step": 74275 }, { "epoch": 0.07481560491559071, "grad_norm": 9.596593419375113, "learning_rate": 4.9924285606271196e-05, "loss": 2.2946, "mean_token_accuracy": 0.43103447556495667, "step": 74280 }, { "epoch": 0.07482064096869488, "grad_norm": 20.558329273704697, "learning_rate": 4.99242548884231e-05, "loss": 2.834, "mean_token_accuracy": 0.43793103098869324, "step": 74285 }, { "epoch": 0.07482567702179906, "grad_norm": 13.820635556792803, "learning_rate": 4.992422416435556e-05, "loss": 2.7312, "mean_token_accuracy": 0.3655172407627106, "step": 74290 }, { "epoch": 0.07483071307490323, "grad_norm": 10.315810325782191, "learning_rate": 4.992419343406858e-05, "loss": 2.4181, "mean_token_accuracy": 0.41034482717514037, "step": 74295 }, { "epoch": 0.0748357491280074, "grad_norm": 9.31226254551686, "learning_rate": 4.9924162697562154e-05, "loss": 2.256, "mean_token_accuracy": 0.41724138259887694, "step": 74300 }, { "epoch": 0.07484078518111158, "grad_norm": 11.402568319134643, "learning_rate": 4.992413195483631e-05, "loss": 2.7093, "mean_token_accuracy": 0.39655172228813174, "step": 74305 }, { "epoch": 0.07484582123421575, "grad_norm": 9.96133520409277, "learning_rate": 4.992410120589106e-05, "loss": 2.4894, "mean_token_accuracy": 0.42413793206214906, "step": 74310 }, { "epoch": 0.07485085728731992, "grad_norm": 10.596791688506148, "learning_rate": 4.992407045072639e-05, "loss": 2.0331, "mean_token_accuracy": 0.5034482717514038, "step": 74315 }, { "epoch": 0.0748558933404241, "grad_norm": 9.604333871957234, "learning_rate": 4.992403968934232e-05, "loss": 2.3393, "mean_token_accuracy": 0.41034482717514037, "step": 74320 }, { "epoch": 0.07486092939352827, "grad_norm": 11.190424881352772, "learning_rate": 4.992400892173886e-05, "loss": 2.5643, "mean_token_accuracy": 0.41034482717514037, "step": 74325 }, { "epoch": 0.07486596544663245, "grad_norm": 11.41769878893656, "learning_rate": 4.992397814791602e-05, "loss": 2.1853, "mean_token_accuracy": 0.43448275327682495, "step": 74330 }, { "epoch": 0.07487100149973662, "grad_norm": 11.235681949946677, "learning_rate": 4.992394736787381e-05, "loss": 3.4314, "mean_token_accuracy": 0.32214156687259676, "step": 74335 }, { "epoch": 0.0748760375528408, "grad_norm": 9.020089417613077, "learning_rate": 4.992391658161223e-05, "loss": 2.0489, "mean_token_accuracy": 0.5137930929660797, "step": 74340 }, { "epoch": 0.07488107360594495, "grad_norm": 13.535140799396936, "learning_rate": 4.992388578913129e-05, "loss": 2.5086, "mean_token_accuracy": 0.3896551728248596, "step": 74345 }, { "epoch": 0.07488610965904913, "grad_norm": 11.196682218085444, "learning_rate": 4.9923854990431e-05, "loss": 2.618, "mean_token_accuracy": 0.4034482777118683, "step": 74350 }, { "epoch": 0.0748911457121533, "grad_norm": 9.526240174426706, "learning_rate": 4.992382418551137e-05, "loss": 2.8418, "mean_token_accuracy": 0.3655172407627106, "step": 74355 }, { "epoch": 0.07489618176525747, "grad_norm": 11.37867831104485, "learning_rate": 4.992379337437242e-05, "loss": 2.3842, "mean_token_accuracy": 0.4015124022960663, "step": 74360 }, { "epoch": 0.07490121781836165, "grad_norm": 10.805245750128467, "learning_rate": 4.992376255701414e-05, "loss": 2.4074, "mean_token_accuracy": 0.38965516090393065, "step": 74365 }, { "epoch": 0.07490625387146582, "grad_norm": 11.216256501506264, "learning_rate": 4.9923731733436555e-05, "loss": 2.1359, "mean_token_accuracy": 0.49655171632766726, "step": 74370 }, { "epoch": 0.07491128992457, "grad_norm": 13.901447306731669, "learning_rate": 4.992370090363965e-05, "loss": 2.9417, "mean_token_accuracy": 0.34482758343219755, "step": 74375 }, { "epoch": 0.07491632597767417, "grad_norm": 12.293800677139833, "learning_rate": 4.9923670067623454e-05, "loss": 3.212, "mean_token_accuracy": 0.42512314915657046, "step": 74380 }, { "epoch": 0.07492136203077834, "grad_norm": 11.986101737088507, "learning_rate": 4.992363922538797e-05, "loss": 2.5723, "mean_token_accuracy": 0.4068965554237366, "step": 74385 }, { "epoch": 0.07492639808388252, "grad_norm": 12.70300256486122, "learning_rate": 4.99236083769332e-05, "loss": 2.4379, "mean_token_accuracy": 0.4379310369491577, "step": 74390 }, { "epoch": 0.07493143413698669, "grad_norm": 10.837827362826351, "learning_rate": 4.992357752225917e-05, "loss": 2.8077, "mean_token_accuracy": 0.3689655244350433, "step": 74395 }, { "epoch": 0.07493647019009086, "grad_norm": 11.52890530681104, "learning_rate": 4.992354666136587e-05, "loss": 2.4655, "mean_token_accuracy": 0.38620689511299133, "step": 74400 }, { "epoch": 0.07494150624319504, "grad_norm": 12.805526043302219, "learning_rate": 4.992351579425332e-05, "loss": 3.0517, "mean_token_accuracy": 0.3965517163276672, "step": 74405 }, { "epoch": 0.07494654229629921, "grad_norm": 9.196198975401728, "learning_rate": 4.9923484920921515e-05, "loss": 2.6373, "mean_token_accuracy": 0.4310344815254211, "step": 74410 }, { "epoch": 0.07495157834940337, "grad_norm": 12.050633963051533, "learning_rate": 4.9923454041370476e-05, "loss": 2.4575, "mean_token_accuracy": 0.4034482777118683, "step": 74415 }, { "epoch": 0.07495661440250755, "grad_norm": 11.829061245530673, "learning_rate": 4.992342315560022e-05, "loss": 2.6844, "mean_token_accuracy": 0.39655172228813174, "step": 74420 }, { "epoch": 0.07496165045561172, "grad_norm": 11.118065710046862, "learning_rate": 4.992339226361073e-05, "loss": 2.4732, "mean_token_accuracy": 0.38965516686439516, "step": 74425 }, { "epoch": 0.0749666865087159, "grad_norm": 17.736158544830545, "learning_rate": 4.992336136540203e-05, "loss": 2.5543, "mean_token_accuracy": 0.37586206793785093, "step": 74430 }, { "epoch": 0.07497172256182007, "grad_norm": 10.290261594254197, "learning_rate": 4.992333046097413e-05, "loss": 2.452, "mean_token_accuracy": 0.3931034505367279, "step": 74435 }, { "epoch": 0.07497675861492424, "grad_norm": 11.476327631613017, "learning_rate": 4.992329955032704e-05, "loss": 2.7828, "mean_token_accuracy": 0.38275861740112305, "step": 74440 }, { "epoch": 0.07498179466802841, "grad_norm": 11.22055784981995, "learning_rate": 4.992326863346076e-05, "loss": 2.5464, "mean_token_accuracy": 0.43448275327682495, "step": 74445 }, { "epoch": 0.07498683072113259, "grad_norm": 9.429611475148532, "learning_rate": 4.9923237710375306e-05, "loss": 2.495, "mean_token_accuracy": 0.4000000059604645, "step": 74450 }, { "epoch": 0.07499186677423676, "grad_norm": 12.229032192088107, "learning_rate": 4.9923206781070676e-05, "loss": 2.3035, "mean_token_accuracy": 0.44827585816383364, "step": 74455 }, { "epoch": 0.07499690282734094, "grad_norm": 11.765983549458799, "learning_rate": 4.9923175845546894e-05, "loss": 2.627, "mean_token_accuracy": 0.3862068891525269, "step": 74460 }, { "epoch": 0.07500193888044511, "grad_norm": 10.583596940391164, "learning_rate": 4.992314490380395e-05, "loss": 2.7771, "mean_token_accuracy": 0.38620689511299133, "step": 74465 }, { "epoch": 0.07500697493354928, "grad_norm": 10.977811221583897, "learning_rate": 4.992311395584188e-05, "loss": 2.4795, "mean_token_accuracy": 0.4068965494632721, "step": 74470 }, { "epoch": 0.07501201098665346, "grad_norm": 15.17782509000339, "learning_rate": 4.992308300166066e-05, "loss": 2.7612, "mean_token_accuracy": 0.3413792997598648, "step": 74475 }, { "epoch": 0.07501704703975763, "grad_norm": 11.47026806933236, "learning_rate": 4.992305204126032e-05, "loss": 2.4991, "mean_token_accuracy": 0.41034482717514037, "step": 74480 }, { "epoch": 0.07502208309286179, "grad_norm": 10.133484811205273, "learning_rate": 4.992302107464086e-05, "loss": 2.1546, "mean_token_accuracy": 0.4310344815254211, "step": 74485 }, { "epoch": 0.07502711914596596, "grad_norm": 11.412941777633272, "learning_rate": 4.99229901018023e-05, "loss": 2.3158, "mean_token_accuracy": 0.4517241358757019, "step": 74490 }, { "epoch": 0.07503215519907014, "grad_norm": 11.299828877273477, "learning_rate": 4.992295912274463e-05, "loss": 2.6367, "mean_token_accuracy": 0.39310344159603117, "step": 74495 }, { "epoch": 0.07503719125217431, "grad_norm": 10.327596041858158, "learning_rate": 4.992292813746787e-05, "loss": 2.7698, "mean_token_accuracy": 0.4137930989265442, "step": 74500 }, { "epoch": 0.07504222730527849, "grad_norm": 11.783911849212519, "learning_rate": 4.992289714597203e-05, "loss": 2.5006, "mean_token_accuracy": 0.4172413766384125, "step": 74505 }, { "epoch": 0.07504726335838266, "grad_norm": 10.577087843329933, "learning_rate": 4.992286614825712e-05, "loss": 2.6119, "mean_token_accuracy": 0.3911675751209259, "step": 74510 }, { "epoch": 0.07505229941148683, "grad_norm": 9.820734268288287, "learning_rate": 4.9922835144323143e-05, "loss": 2.3405, "mean_token_accuracy": 0.43811252117156985, "step": 74515 }, { "epoch": 0.07505733546459101, "grad_norm": 13.293306421561798, "learning_rate": 4.992280413417011e-05, "loss": 2.6027, "mean_token_accuracy": 0.4157289773225784, "step": 74520 }, { "epoch": 0.07506237151769518, "grad_norm": 8.667513840389539, "learning_rate": 4.992277311779802e-05, "loss": 2.2602, "mean_token_accuracy": 0.4206896543502808, "step": 74525 }, { "epoch": 0.07506740757079935, "grad_norm": 12.233327291281816, "learning_rate": 4.99227420952069e-05, "loss": 2.7096, "mean_token_accuracy": 0.3482758641242981, "step": 74530 }, { "epoch": 0.07507244362390353, "grad_norm": 10.225347339780871, "learning_rate": 4.992271106639674e-05, "loss": 2.2108, "mean_token_accuracy": 0.4379310250282288, "step": 74535 }, { "epoch": 0.0750774796770077, "grad_norm": 12.041037110684764, "learning_rate": 4.992268003136756e-05, "loss": 2.3027, "mean_token_accuracy": 0.40344828367233276, "step": 74540 }, { "epoch": 0.07508251573011188, "grad_norm": 8.636441606126446, "learning_rate": 4.992264899011937e-05, "loss": 2.4258, "mean_token_accuracy": 0.44827585816383364, "step": 74545 }, { "epoch": 0.07508755178321605, "grad_norm": 10.54169446949642, "learning_rate": 4.9922617942652174e-05, "loss": 2.4574, "mean_token_accuracy": 0.40828797221183777, "step": 74550 }, { "epoch": 0.07509258783632021, "grad_norm": 11.336913707808195, "learning_rate": 4.9922586888965974e-05, "loss": 3.0471, "mean_token_accuracy": 0.40000000298023225, "step": 74555 }, { "epoch": 0.07509762388942438, "grad_norm": 14.800399641137508, "learning_rate": 4.99225558290608e-05, "loss": 2.7155, "mean_token_accuracy": 0.4068965554237366, "step": 74560 }, { "epoch": 0.07510265994252856, "grad_norm": 11.135429433151344, "learning_rate": 4.992252476293664e-05, "loss": 2.9798, "mean_token_accuracy": 0.37931033968925476, "step": 74565 }, { "epoch": 0.07510769599563273, "grad_norm": 10.607252001795178, "learning_rate": 4.99224936905935e-05, "loss": 2.5171, "mean_token_accuracy": 0.3896551728248596, "step": 74570 }, { "epoch": 0.0751127320487369, "grad_norm": 12.889920770178732, "learning_rate": 4.99224626120314e-05, "loss": 2.6022, "mean_token_accuracy": 0.43448275327682495, "step": 74575 }, { "epoch": 0.07511776810184108, "grad_norm": 12.17719043215868, "learning_rate": 4.992243152725035e-05, "loss": 2.8148, "mean_token_accuracy": 0.4034482717514038, "step": 74580 }, { "epoch": 0.07512280415494525, "grad_norm": 11.873839557877341, "learning_rate": 4.992240043625037e-05, "loss": 2.5221, "mean_token_accuracy": 0.41034482717514037, "step": 74585 }, { "epoch": 0.07512784020804943, "grad_norm": 11.211933986921421, "learning_rate": 4.992236933903143e-05, "loss": 2.75, "mean_token_accuracy": 0.4034482717514038, "step": 74590 }, { "epoch": 0.0751328762611536, "grad_norm": 11.632186027497143, "learning_rate": 4.992233823559357e-05, "loss": 2.7421, "mean_token_accuracy": 0.4, "step": 74595 }, { "epoch": 0.07513791231425777, "grad_norm": 11.803262561850177, "learning_rate": 4.99223071259368e-05, "loss": 3.2136, "mean_token_accuracy": 0.3344827562570572, "step": 74600 }, { "epoch": 0.07514294836736195, "grad_norm": 11.01942354518176, "learning_rate": 4.992227601006111e-05, "loss": 2.6837, "mean_token_accuracy": 0.3862069010734558, "step": 74605 }, { "epoch": 0.07514798442046612, "grad_norm": 13.273993822861753, "learning_rate": 4.992224488796652e-05, "loss": 2.3285, "mean_token_accuracy": 0.4379310369491577, "step": 74610 }, { "epoch": 0.0751530204735703, "grad_norm": 8.749877709278744, "learning_rate": 4.992221375965304e-05, "loss": 2.8436, "mean_token_accuracy": 0.40490664839744567, "step": 74615 }, { "epoch": 0.07515805652667447, "grad_norm": 12.88511731777342, "learning_rate": 4.9922182625120675e-05, "loss": 2.8848, "mean_token_accuracy": 0.35172412991523744, "step": 74620 }, { "epoch": 0.07516309257977863, "grad_norm": 12.392676243800622, "learning_rate": 4.992215148436943e-05, "loss": 2.6462, "mean_token_accuracy": 0.4206896543502808, "step": 74625 }, { "epoch": 0.0751681286328828, "grad_norm": 11.643080672138181, "learning_rate": 4.9922120337399314e-05, "loss": 2.7088, "mean_token_accuracy": 0.4, "step": 74630 }, { "epoch": 0.07517316468598698, "grad_norm": 12.264324180787263, "learning_rate": 4.9922089184210344e-05, "loss": 2.67, "mean_token_accuracy": 0.36551723480224607, "step": 74635 }, { "epoch": 0.07517820073909115, "grad_norm": 12.73314834192547, "learning_rate": 4.992205802480253e-05, "loss": 2.4767, "mean_token_accuracy": 0.4379310429096222, "step": 74640 }, { "epoch": 0.07518323679219532, "grad_norm": 10.956683053691346, "learning_rate": 4.992202685917586e-05, "loss": 2.3922, "mean_token_accuracy": 0.46551724672317507, "step": 74645 }, { "epoch": 0.0751882728452995, "grad_norm": 12.334739672775177, "learning_rate": 4.992199568733037e-05, "loss": 3.1054, "mean_token_accuracy": 0.334482753276825, "step": 74650 }, { "epoch": 0.07519330889840367, "grad_norm": 12.158124519690034, "learning_rate": 4.992196450926605e-05, "loss": 2.3824, "mean_token_accuracy": 0.42413793206214906, "step": 74655 }, { "epoch": 0.07519834495150785, "grad_norm": 13.63345370381226, "learning_rate": 4.9921933324982924e-05, "loss": 2.9769, "mean_token_accuracy": 0.3529340624809265, "step": 74660 }, { "epoch": 0.07520338100461202, "grad_norm": 13.752571481195474, "learning_rate": 4.992190213448098e-05, "loss": 2.9366, "mean_token_accuracy": 0.3896551728248596, "step": 74665 }, { "epoch": 0.07520841705771619, "grad_norm": 10.056726483025185, "learning_rate": 4.992187093776024e-05, "loss": 2.1112, "mean_token_accuracy": 0.4986085891723633, "step": 74670 }, { "epoch": 0.07521345311082037, "grad_norm": 21.573326588952398, "learning_rate": 4.992183973482071e-05, "loss": 2.9975, "mean_token_accuracy": 0.4034482717514038, "step": 74675 }, { "epoch": 0.07521848916392454, "grad_norm": 12.323525491808502, "learning_rate": 4.992180852566241e-05, "loss": 2.5283, "mean_token_accuracy": 0.3931034505367279, "step": 74680 }, { "epoch": 0.07522352521702871, "grad_norm": 10.163318492412415, "learning_rate": 4.9921777310285323e-05, "loss": 2.213, "mean_token_accuracy": 0.4551724135875702, "step": 74685 }, { "epoch": 0.07522856127013289, "grad_norm": 13.512074268775086, "learning_rate": 4.992174608868948e-05, "loss": 2.153, "mean_token_accuracy": 0.4931034505367279, "step": 74690 }, { "epoch": 0.07523359732323705, "grad_norm": 11.169950554489096, "learning_rate": 4.992171486087488e-05, "loss": 2.2033, "mean_token_accuracy": 0.4103448331356049, "step": 74695 }, { "epoch": 0.07523863337634122, "grad_norm": 10.07473264576246, "learning_rate": 4.992168362684153e-05, "loss": 2.3912, "mean_token_accuracy": 0.41034482717514037, "step": 74700 }, { "epoch": 0.0752436694294454, "grad_norm": 13.218831131423002, "learning_rate": 4.992165238658945e-05, "loss": 2.5886, "mean_token_accuracy": 0.40865094065666197, "step": 74705 }, { "epoch": 0.07524870548254957, "grad_norm": 11.366994764467304, "learning_rate": 4.9921621140118634e-05, "loss": 2.4869, "mean_token_accuracy": 0.4448275864124298, "step": 74710 }, { "epoch": 0.07525374153565374, "grad_norm": 11.366717384675882, "learning_rate": 4.9921589887429104e-05, "loss": 2.7037, "mean_token_accuracy": 0.36896551847457887, "step": 74715 }, { "epoch": 0.07525877758875792, "grad_norm": 13.53990782652659, "learning_rate": 4.9921558628520854e-05, "loss": 2.4363, "mean_token_accuracy": 0.39655172526836396, "step": 74720 }, { "epoch": 0.07526381364186209, "grad_norm": 10.544459823573526, "learning_rate": 4.992152736339391e-05, "loss": 2.1318, "mean_token_accuracy": 0.4551724135875702, "step": 74725 }, { "epoch": 0.07526884969496626, "grad_norm": 10.648861148027509, "learning_rate": 4.992149609204827e-05, "loss": 3.1111, "mean_token_accuracy": 0.3448275804519653, "step": 74730 }, { "epoch": 0.07527388574807044, "grad_norm": 11.409313736938818, "learning_rate": 4.9921464814483946e-05, "loss": 2.5184, "mean_token_accuracy": 0.36551723778247835, "step": 74735 }, { "epoch": 0.07527892180117461, "grad_norm": 10.7091061616946, "learning_rate": 4.992143353070095e-05, "loss": 2.2665, "mean_token_accuracy": 0.458620685338974, "step": 74740 }, { "epoch": 0.07528395785427879, "grad_norm": 11.173008504531076, "learning_rate": 4.9921402240699264e-05, "loss": 2.7655, "mean_token_accuracy": 0.3620689630508423, "step": 74745 }, { "epoch": 0.07528899390738296, "grad_norm": 12.409134088851763, "learning_rate": 4.992137094447894e-05, "loss": 2.4964, "mean_token_accuracy": 0.39655172228813174, "step": 74750 }, { "epoch": 0.07529402996048713, "grad_norm": 12.713230281053635, "learning_rate": 4.992133964203995e-05, "loss": 2.2552, "mean_token_accuracy": 0.4172413766384125, "step": 74755 }, { "epoch": 0.0752990660135913, "grad_norm": 12.23417559377399, "learning_rate": 4.992130833338233e-05, "loss": 2.7398, "mean_token_accuracy": 0.4021173596382141, "step": 74760 }, { "epoch": 0.07530410206669547, "grad_norm": 11.117608986130428, "learning_rate": 4.992127701850607e-05, "loss": 2.4732, "mean_token_accuracy": 0.4068965494632721, "step": 74765 }, { "epoch": 0.07530913811979964, "grad_norm": 12.623774228153135, "learning_rate": 4.9921245697411186e-05, "loss": 2.8768, "mean_token_accuracy": 0.3999999940395355, "step": 74770 }, { "epoch": 0.07531417417290381, "grad_norm": 13.707661883557265, "learning_rate": 4.992121437009769e-05, "loss": 2.366, "mean_token_accuracy": 0.42758620977401735, "step": 74775 }, { "epoch": 0.07531921022600799, "grad_norm": 13.63212038269677, "learning_rate": 4.992118303656558e-05, "loss": 2.5893, "mean_token_accuracy": 0.3862069010734558, "step": 74780 }, { "epoch": 0.07532424627911216, "grad_norm": 9.674901258760745, "learning_rate": 4.992115169681487e-05, "loss": 2.5405, "mean_token_accuracy": 0.4256503343582153, "step": 74785 }, { "epoch": 0.07532928233221634, "grad_norm": 9.204550764866626, "learning_rate": 4.992112035084558e-05, "loss": 2.1137, "mean_token_accuracy": 0.4730187654495239, "step": 74790 }, { "epoch": 0.07533431838532051, "grad_norm": 9.413424401161253, "learning_rate": 4.99210889986577e-05, "loss": 2.2227, "mean_token_accuracy": 0.4551724076271057, "step": 74795 }, { "epoch": 0.07533935443842468, "grad_norm": 12.444701421944044, "learning_rate": 4.9921057640251254e-05, "loss": 2.1964, "mean_token_accuracy": 0.474168187379837, "step": 74800 }, { "epoch": 0.07534439049152886, "grad_norm": 13.93003237255313, "learning_rate": 4.992102627562624e-05, "loss": 2.9358, "mean_token_accuracy": 0.3896551728248596, "step": 74805 }, { "epoch": 0.07534942654463303, "grad_norm": 12.873166648127915, "learning_rate": 4.9920994904782666e-05, "loss": 2.6602, "mean_token_accuracy": 0.3896551728248596, "step": 74810 }, { "epoch": 0.0753544625977372, "grad_norm": 12.016921192262771, "learning_rate": 4.992096352772055e-05, "loss": 2.3504, "mean_token_accuracy": 0.4551724076271057, "step": 74815 }, { "epoch": 0.07535949865084138, "grad_norm": 11.067540092049498, "learning_rate": 4.99209321444399e-05, "loss": 2.7399, "mean_token_accuracy": 0.3896551728248596, "step": 74820 }, { "epoch": 0.07536453470394555, "grad_norm": 9.775524552778675, "learning_rate": 4.992090075494071e-05, "loss": 2.6802, "mean_token_accuracy": 0.34482758641242983, "step": 74825 }, { "epoch": 0.07536957075704973, "grad_norm": 10.397829904370594, "learning_rate": 4.992086935922301e-05, "loss": 2.5004, "mean_token_accuracy": 0.4413793206214905, "step": 74830 }, { "epoch": 0.07537460681015389, "grad_norm": 10.15349748766837, "learning_rate": 4.99208379572868e-05, "loss": 2.3549, "mean_token_accuracy": 0.43103448748588563, "step": 74835 }, { "epoch": 0.07537964286325806, "grad_norm": 8.352377690529696, "learning_rate": 4.9920806549132084e-05, "loss": 2.4642, "mean_token_accuracy": 0.43793103098869324, "step": 74840 }, { "epoch": 0.07538467891636223, "grad_norm": 10.558177066840473, "learning_rate": 4.9920775134758866e-05, "loss": 2.7048, "mean_token_accuracy": 0.3907440960407257, "step": 74845 }, { "epoch": 0.0753897149694664, "grad_norm": 11.024318798441008, "learning_rate": 4.9920743714167165e-05, "loss": 2.4186, "mean_token_accuracy": 0.3965517282485962, "step": 74850 }, { "epoch": 0.07539475102257058, "grad_norm": 13.901048938707923, "learning_rate": 4.9920712287356995e-05, "loss": 2.6477, "mean_token_accuracy": 0.3931034505367279, "step": 74855 }, { "epoch": 0.07539978707567475, "grad_norm": 10.070421997084713, "learning_rate": 4.992068085432836e-05, "loss": 2.5728, "mean_token_accuracy": 0.35862069129943847, "step": 74860 }, { "epoch": 0.07540482312877893, "grad_norm": 10.291092774816319, "learning_rate": 4.992064941508125e-05, "loss": 3.0999, "mean_token_accuracy": 0.34482758939266206, "step": 74865 }, { "epoch": 0.0754098591818831, "grad_norm": 12.20101099803728, "learning_rate": 4.9920617969615695e-05, "loss": 2.5461, "mean_token_accuracy": 0.4034482777118683, "step": 74870 }, { "epoch": 0.07541489523498728, "grad_norm": 10.826455358059864, "learning_rate": 4.99205865179317e-05, "loss": 2.4063, "mean_token_accuracy": 0.47586206793785096, "step": 74875 }, { "epoch": 0.07541993128809145, "grad_norm": 13.705391209273914, "learning_rate": 4.992055506002928e-05, "loss": 2.3439, "mean_token_accuracy": 0.47126436829566953, "step": 74880 }, { "epoch": 0.07542496734119562, "grad_norm": 11.89151438287245, "learning_rate": 4.992052359590842e-05, "loss": 2.5816, "mean_token_accuracy": 0.4068965554237366, "step": 74885 }, { "epoch": 0.0754300033942998, "grad_norm": 10.02118133757943, "learning_rate": 4.9920492125569154e-05, "loss": 2.4312, "mean_token_accuracy": 0.4050211668014526, "step": 74890 }, { "epoch": 0.07543503944740397, "grad_norm": 10.72219849342627, "learning_rate": 4.992046064901147e-05, "loss": 2.333, "mean_token_accuracy": 0.3965517282485962, "step": 74895 }, { "epoch": 0.07544007550050814, "grad_norm": 10.7963269405344, "learning_rate": 4.9920429166235396e-05, "loss": 2.6162, "mean_token_accuracy": 0.4034482717514038, "step": 74900 }, { "epoch": 0.0754451115536123, "grad_norm": 11.740382548584366, "learning_rate": 4.992039767724094e-05, "loss": 2.4468, "mean_token_accuracy": 0.4103448331356049, "step": 74905 }, { "epoch": 0.07545014760671648, "grad_norm": 13.801400719597702, "learning_rate": 4.992036618202809e-05, "loss": 2.7828, "mean_token_accuracy": 0.33932244777679443, "step": 74910 }, { "epoch": 0.07545518365982065, "grad_norm": 11.367273892067944, "learning_rate": 4.992033468059687e-05, "loss": 2.152, "mean_token_accuracy": 0.458620685338974, "step": 74915 }, { "epoch": 0.07546021971292483, "grad_norm": 14.990574499188641, "learning_rate": 4.9920303172947294e-05, "loss": 2.8169, "mean_token_accuracy": 0.38275861740112305, "step": 74920 }, { "epoch": 0.075465255766029, "grad_norm": 10.951458740752074, "learning_rate": 4.9920271659079356e-05, "loss": 2.8005, "mean_token_accuracy": 0.42758620381355283, "step": 74925 }, { "epoch": 0.07547029181913317, "grad_norm": 11.804120355851495, "learning_rate": 4.992024013899308e-05, "loss": 2.6265, "mean_token_accuracy": 0.3425892323255539, "step": 74930 }, { "epoch": 0.07547532787223735, "grad_norm": 10.666261833024391, "learning_rate": 4.992020861268846e-05, "loss": 2.8334, "mean_token_accuracy": 0.3793103486299515, "step": 74935 }, { "epoch": 0.07548036392534152, "grad_norm": 15.16713252266937, "learning_rate": 4.992017708016551e-05, "loss": 2.717, "mean_token_accuracy": 0.3965517282485962, "step": 74940 }, { "epoch": 0.0754853999784457, "grad_norm": 11.39411820527837, "learning_rate": 4.992014554142424e-05, "loss": 2.8957, "mean_token_accuracy": 0.32758620381355286, "step": 74945 }, { "epoch": 0.07549043603154987, "grad_norm": 10.781770197595755, "learning_rate": 4.992011399646467e-05, "loss": 2.36, "mean_token_accuracy": 0.3965517163276672, "step": 74950 }, { "epoch": 0.07549547208465404, "grad_norm": 10.76762410311463, "learning_rate": 4.9920082445286786e-05, "loss": 2.6586, "mean_token_accuracy": 0.3999999940395355, "step": 74955 }, { "epoch": 0.07550050813775822, "grad_norm": 26.165188428211046, "learning_rate": 4.9920050887890605e-05, "loss": 2.7767, "mean_token_accuracy": 0.3862069010734558, "step": 74960 }, { "epoch": 0.07550554419086239, "grad_norm": 12.313893745380385, "learning_rate": 4.992001932427615e-05, "loss": 2.2627, "mean_token_accuracy": 0.441379314661026, "step": 74965 }, { "epoch": 0.07551058024396656, "grad_norm": 12.885221729148988, "learning_rate": 4.991998775444342e-05, "loss": 2.7019, "mean_token_accuracy": 0.36551724672317504, "step": 74970 }, { "epoch": 0.07551561629707072, "grad_norm": 15.883124242299214, "learning_rate": 4.991995617839241e-05, "loss": 3.2918, "mean_token_accuracy": 0.33103448450565337, "step": 74975 }, { "epoch": 0.0755206523501749, "grad_norm": 9.868879961174354, "learning_rate": 4.9919924596123155e-05, "loss": 2.5042, "mean_token_accuracy": 0.4137930989265442, "step": 74980 }, { "epoch": 0.07552568840327907, "grad_norm": 12.040100280413919, "learning_rate": 4.991989300763565e-05, "loss": 2.2684, "mean_token_accuracy": 0.4586206912994385, "step": 74985 }, { "epoch": 0.07553072445638324, "grad_norm": 11.377887154298785, "learning_rate": 4.99198614129299e-05, "loss": 2.7991, "mean_token_accuracy": 0.3758620649576187, "step": 74990 }, { "epoch": 0.07553576050948742, "grad_norm": 13.449003657266344, "learning_rate": 4.991982981200591e-05, "loss": 2.07, "mean_token_accuracy": 0.4413793087005615, "step": 74995 }, { "epoch": 0.07554079656259159, "grad_norm": 11.686053735967564, "learning_rate": 4.9919798204863704e-05, "loss": 2.6353, "mean_token_accuracy": 0.3931034505367279, "step": 75000 }, { "epoch": 0.07554583261569577, "grad_norm": 10.587006491035401, "learning_rate": 4.991976659150328e-05, "loss": 2.7143, "mean_token_accuracy": 0.358620685338974, "step": 75005 }, { "epoch": 0.07555086866879994, "grad_norm": 12.384930857089538, "learning_rate": 4.991973497192466e-05, "loss": 2.518, "mean_token_accuracy": 0.39655172228813174, "step": 75010 }, { "epoch": 0.07555590472190411, "grad_norm": 10.653591346609966, "learning_rate": 4.9919703346127834e-05, "loss": 2.5711, "mean_token_accuracy": 0.4068965494632721, "step": 75015 }, { "epoch": 0.07556094077500829, "grad_norm": 9.406840835700065, "learning_rate": 4.991967171411283e-05, "loss": 2.149, "mean_token_accuracy": 0.48275861144065857, "step": 75020 }, { "epoch": 0.07556597682811246, "grad_norm": 10.567768380908667, "learning_rate": 4.991964007587963e-05, "loss": 2.3411, "mean_token_accuracy": 0.4184729039669037, "step": 75025 }, { "epoch": 0.07557101288121663, "grad_norm": 14.564808823177678, "learning_rate": 4.991960843142827e-05, "loss": 2.492, "mean_token_accuracy": 0.3793103456497192, "step": 75030 }, { "epoch": 0.07557604893432081, "grad_norm": 11.328341539931163, "learning_rate": 4.991957678075874e-05, "loss": 2.5425, "mean_token_accuracy": 0.39310344457626345, "step": 75035 }, { "epoch": 0.07558108498742498, "grad_norm": 12.783123250394299, "learning_rate": 4.9919545123871066e-05, "loss": 2.1677, "mean_token_accuracy": 0.41724138259887694, "step": 75040 }, { "epoch": 0.07558612104052914, "grad_norm": 12.207032229240859, "learning_rate": 4.991951346076524e-05, "loss": 2.5307, "mean_token_accuracy": 0.37241379022598264, "step": 75045 }, { "epoch": 0.07559115709363332, "grad_norm": 10.518209511430392, "learning_rate": 4.9919481791441284e-05, "loss": 2.6607, "mean_token_accuracy": 0.37586206793785093, "step": 75050 }, { "epoch": 0.07559619314673749, "grad_norm": 10.159564609341814, "learning_rate": 4.99194501158992e-05, "loss": 2.3876, "mean_token_accuracy": 0.4034482717514038, "step": 75055 }, { "epoch": 0.07560122919984166, "grad_norm": 11.029800643270065, "learning_rate": 4.991941843413899e-05, "loss": 2.1125, "mean_token_accuracy": 0.4413793087005615, "step": 75060 }, { "epoch": 0.07560626525294584, "grad_norm": 10.07005251684402, "learning_rate": 4.991938674616068e-05, "loss": 2.519, "mean_token_accuracy": 0.38275861740112305, "step": 75065 }, { "epoch": 0.07561130130605001, "grad_norm": 8.983026109880992, "learning_rate": 4.9919355051964264e-05, "loss": 2.3491, "mean_token_accuracy": 0.45311554074287413, "step": 75070 }, { "epoch": 0.07561633735915418, "grad_norm": 11.384320478722957, "learning_rate": 4.991932335154976e-05, "loss": 2.6466, "mean_token_accuracy": 0.4, "step": 75075 }, { "epoch": 0.07562137341225836, "grad_norm": 10.983822068273314, "learning_rate": 4.991929164491717e-05, "loss": 2.7304, "mean_token_accuracy": 0.4137930989265442, "step": 75080 }, { "epoch": 0.07562640946536253, "grad_norm": 12.479218082718674, "learning_rate": 4.991925993206651e-05, "loss": 2.2225, "mean_token_accuracy": 0.44827585816383364, "step": 75085 }, { "epoch": 0.0756314455184667, "grad_norm": 17.26288710670105, "learning_rate": 4.991922821299778e-05, "loss": 2.7226, "mean_token_accuracy": 0.4011494338512421, "step": 75090 }, { "epoch": 0.07563648157157088, "grad_norm": 10.530593769733136, "learning_rate": 4.9919196487711e-05, "loss": 2.4504, "mean_token_accuracy": 0.44482759237289426, "step": 75095 }, { "epoch": 0.07564151762467505, "grad_norm": 14.00326368564235, "learning_rate": 4.991916475620617e-05, "loss": 3.0926, "mean_token_accuracy": 0.3241379290819168, "step": 75100 }, { "epoch": 0.07564655367777923, "grad_norm": 9.753542809414052, "learning_rate": 4.9919133018483294e-05, "loss": 2.1342, "mean_token_accuracy": 0.46551724672317507, "step": 75105 }, { "epoch": 0.0756515897308834, "grad_norm": 9.843300597729117, "learning_rate": 4.9919101274542396e-05, "loss": 2.3689, "mean_token_accuracy": 0.4724137902259827, "step": 75110 }, { "epoch": 0.07565662578398756, "grad_norm": 8.555182944116257, "learning_rate": 4.991906952438347e-05, "loss": 2.2321, "mean_token_accuracy": 0.4517241418361664, "step": 75115 }, { "epoch": 0.07566166183709173, "grad_norm": 10.986875355918743, "learning_rate": 4.991903776800653e-05, "loss": 2.2199, "mean_token_accuracy": 0.42068964838981626, "step": 75120 }, { "epoch": 0.07566669789019591, "grad_norm": 10.485706803715818, "learning_rate": 4.99190060054116e-05, "loss": 2.6549, "mean_token_accuracy": 0.3551724135875702, "step": 75125 }, { "epoch": 0.07567173394330008, "grad_norm": 9.47754968698919, "learning_rate": 4.991897423659867e-05, "loss": 2.666, "mean_token_accuracy": 0.39716748893260956, "step": 75130 }, { "epoch": 0.07567676999640426, "grad_norm": 9.510873780546204, "learning_rate": 4.9918942461567744e-05, "loss": 2.2567, "mean_token_accuracy": 0.4517241358757019, "step": 75135 }, { "epoch": 0.07568180604950843, "grad_norm": 10.237405787222407, "learning_rate": 4.991891068031884e-05, "loss": 2.5903, "mean_token_accuracy": 0.37241379618644715, "step": 75140 }, { "epoch": 0.0756868421026126, "grad_norm": 13.129441059355418, "learning_rate": 4.991887889285198e-05, "loss": 2.8956, "mean_token_accuracy": 0.38275861740112305, "step": 75145 }, { "epoch": 0.07569187815571678, "grad_norm": 9.53919184752354, "learning_rate": 4.991884709916716e-05, "loss": 2.6506, "mean_token_accuracy": 0.43103448748588563, "step": 75150 }, { "epoch": 0.07569691420882095, "grad_norm": 10.903487703731537, "learning_rate": 4.991881529926438e-05, "loss": 2.5109, "mean_token_accuracy": 0.41379310488700866, "step": 75155 }, { "epoch": 0.07570195026192512, "grad_norm": 11.465358873248912, "learning_rate": 4.991878349314365e-05, "loss": 2.4678, "mean_token_accuracy": 0.4, "step": 75160 }, { "epoch": 0.0757069863150293, "grad_norm": 13.629843458820442, "learning_rate": 4.9918751680805e-05, "loss": 2.7878, "mean_token_accuracy": 0.3551724195480347, "step": 75165 }, { "epoch": 0.07571202236813347, "grad_norm": 12.560534730204415, "learning_rate": 4.9918719862248436e-05, "loss": 2.6814, "mean_token_accuracy": 0.39310344457626345, "step": 75170 }, { "epoch": 0.07571705842123765, "grad_norm": 12.18031538631984, "learning_rate": 4.991868803747394e-05, "loss": 2.6904, "mean_token_accuracy": 0.36206896901130675, "step": 75175 }, { "epoch": 0.07572209447434182, "grad_norm": 10.788068454084915, "learning_rate": 4.991865620648154e-05, "loss": 2.3876, "mean_token_accuracy": 0.4448275864124298, "step": 75180 }, { "epoch": 0.07572713052744598, "grad_norm": 14.966159818821014, "learning_rate": 4.991862436927124e-05, "loss": 2.4394, "mean_token_accuracy": 0.40816696882247927, "step": 75185 }, { "epoch": 0.07573216658055015, "grad_norm": 13.041823685079718, "learning_rate": 4.991859252584306e-05, "loss": 2.6527, "mean_token_accuracy": 0.41911675930023196, "step": 75190 }, { "epoch": 0.07573720263365433, "grad_norm": 12.014922361026857, "learning_rate": 4.9918560676197e-05, "loss": 2.7402, "mean_token_accuracy": 0.33793103098869326, "step": 75195 }, { "epoch": 0.0757422386867585, "grad_norm": 11.441267165296328, "learning_rate": 4.991852882033306e-05, "loss": 2.2222, "mean_token_accuracy": 0.4122807025909424, "step": 75200 }, { "epoch": 0.07574727473986267, "grad_norm": 11.42361996016671, "learning_rate": 4.9918496958251255e-05, "loss": 2.3394, "mean_token_accuracy": 0.4448275834321976, "step": 75205 }, { "epoch": 0.07575231079296685, "grad_norm": 9.409352254791383, "learning_rate": 4.99184650899516e-05, "loss": 2.5803, "mean_token_accuracy": 0.4068965554237366, "step": 75210 }, { "epoch": 0.07575734684607102, "grad_norm": 11.569302664848456, "learning_rate": 4.9918433215434106e-05, "loss": 2.8479, "mean_token_accuracy": 0.3655172407627106, "step": 75215 }, { "epoch": 0.0757623828991752, "grad_norm": 17.280672085888906, "learning_rate": 4.991840133469877e-05, "loss": 2.5755, "mean_token_accuracy": 0.3793103456497192, "step": 75220 }, { "epoch": 0.07576741895227937, "grad_norm": 10.896118151161081, "learning_rate": 4.991836944774561e-05, "loss": 2.9679, "mean_token_accuracy": 0.3620689630508423, "step": 75225 }, { "epoch": 0.07577245500538354, "grad_norm": 13.55580319769656, "learning_rate": 4.991833755457463e-05, "loss": 2.0373, "mean_token_accuracy": 0.44827587008476255, "step": 75230 }, { "epoch": 0.07577749105848772, "grad_norm": 12.551735767135769, "learning_rate": 4.991830565518584e-05, "loss": 2.6958, "mean_token_accuracy": 0.37241379022598264, "step": 75235 }, { "epoch": 0.07578252711159189, "grad_norm": 12.31565082725626, "learning_rate": 4.9918273749579244e-05, "loss": 2.3541, "mean_token_accuracy": 0.41185722351074217, "step": 75240 }, { "epoch": 0.07578756316469606, "grad_norm": 10.348166999400974, "learning_rate": 4.9918241837754865e-05, "loss": 2.5623, "mean_token_accuracy": 0.41724138259887694, "step": 75245 }, { "epoch": 0.07579259921780024, "grad_norm": 12.465487218834705, "learning_rate": 4.99182099197127e-05, "loss": 2.3039, "mean_token_accuracy": 0.4034482717514038, "step": 75250 }, { "epoch": 0.0757976352709044, "grad_norm": 11.270258523822815, "learning_rate": 4.991817799545277e-05, "loss": 2.1368, "mean_token_accuracy": 0.44319419264793397, "step": 75255 }, { "epoch": 0.07580267132400857, "grad_norm": 14.004005560534106, "learning_rate": 4.9918146064975055e-05, "loss": 2.4613, "mean_token_accuracy": 0.4620689690113068, "step": 75260 }, { "epoch": 0.07580770737711275, "grad_norm": 11.713717500858031, "learning_rate": 4.99181141282796e-05, "loss": 2.558, "mean_token_accuracy": 0.3517241358757019, "step": 75265 }, { "epoch": 0.07581274343021692, "grad_norm": 12.189310493823035, "learning_rate": 4.991808218536639e-05, "loss": 2.2257, "mean_token_accuracy": 0.41379310488700866, "step": 75270 }, { "epoch": 0.0758177794833211, "grad_norm": 20.639764066035305, "learning_rate": 4.991805023623544e-05, "loss": 2.7185, "mean_token_accuracy": 0.41034482717514037, "step": 75275 }, { "epoch": 0.07582281553642527, "grad_norm": 10.784292056196412, "learning_rate": 4.991801828088676e-05, "loss": 2.5384, "mean_token_accuracy": 0.42758620977401735, "step": 75280 }, { "epoch": 0.07582785158952944, "grad_norm": 11.041400958105278, "learning_rate": 4.991798631932036e-05, "loss": 3.1766, "mean_token_accuracy": 0.3241379201412201, "step": 75285 }, { "epoch": 0.07583288764263361, "grad_norm": 11.957961793293695, "learning_rate": 4.991795435153626e-05, "loss": 3.0443, "mean_token_accuracy": 0.3551724076271057, "step": 75290 }, { "epoch": 0.07583792369573779, "grad_norm": 11.03728810755462, "learning_rate": 4.9917922377534435e-05, "loss": 1.9332, "mean_token_accuracy": 0.5034482717514038, "step": 75295 }, { "epoch": 0.07584295974884196, "grad_norm": 7.7894521237925, "learning_rate": 4.991789039731493e-05, "loss": 2.3052, "mean_token_accuracy": 0.4809437394142151, "step": 75300 }, { "epoch": 0.07584799580194614, "grad_norm": 9.925731783100467, "learning_rate": 4.9917858410877736e-05, "loss": 2.4541, "mean_token_accuracy": 0.42413793206214906, "step": 75305 }, { "epoch": 0.07585303185505031, "grad_norm": 12.186828387113746, "learning_rate": 4.9917826418222866e-05, "loss": 1.9564, "mean_token_accuracy": 0.4793103516101837, "step": 75310 }, { "epoch": 0.07585806790815448, "grad_norm": 9.130023978551153, "learning_rate": 4.9917794419350334e-05, "loss": 2.1532, "mean_token_accuracy": 0.4862069010734558, "step": 75315 }, { "epoch": 0.07586310396125866, "grad_norm": 10.719552968103416, "learning_rate": 4.991776241426013e-05, "loss": 2.6554, "mean_token_accuracy": 0.3862068891525269, "step": 75320 }, { "epoch": 0.07586814001436282, "grad_norm": 11.588590398022935, "learning_rate": 4.991773040295228e-05, "loss": 2.801, "mean_token_accuracy": 0.37931033968925476, "step": 75325 }, { "epoch": 0.07587317606746699, "grad_norm": 10.377063041000913, "learning_rate": 4.99176983854268e-05, "loss": 3.0175, "mean_token_accuracy": 0.32413792610168457, "step": 75330 }, { "epoch": 0.07587821212057116, "grad_norm": 11.329290877943746, "learning_rate": 4.991766636168368e-05, "loss": 2.7734, "mean_token_accuracy": 0.4068965554237366, "step": 75335 }, { "epoch": 0.07588324817367534, "grad_norm": 10.184275511932269, "learning_rate": 4.991763433172293e-05, "loss": 2.6586, "mean_token_accuracy": 0.3655172407627106, "step": 75340 }, { "epoch": 0.07588828422677951, "grad_norm": 11.522456523744934, "learning_rate": 4.991760229554457e-05, "loss": 2.3303, "mean_token_accuracy": 0.42413793206214906, "step": 75345 }, { "epoch": 0.07589332027988369, "grad_norm": 10.294440350523576, "learning_rate": 4.991757025314861e-05, "loss": 2.8296, "mean_token_accuracy": 0.3551724135875702, "step": 75350 }, { "epoch": 0.07589835633298786, "grad_norm": 12.98451454093618, "learning_rate": 4.9917538204535046e-05, "loss": 2.5555, "mean_token_accuracy": 0.35517241060733795, "step": 75355 }, { "epoch": 0.07590339238609203, "grad_norm": 9.376752104112684, "learning_rate": 4.99175061497039e-05, "loss": 1.9258, "mean_token_accuracy": 0.5241379261016845, "step": 75360 }, { "epoch": 0.07590842843919621, "grad_norm": 10.175879366119066, "learning_rate": 4.991747408865517e-05, "loss": 2.341, "mean_token_accuracy": 0.4567453145980835, "step": 75365 }, { "epoch": 0.07591346449230038, "grad_norm": 14.439015036731913, "learning_rate": 4.991744202138887e-05, "loss": 2.4136, "mean_token_accuracy": 0.4379310369491577, "step": 75370 }, { "epoch": 0.07591850054540455, "grad_norm": 11.081259550873272, "learning_rate": 4.991740994790501e-05, "loss": 2.6835, "mean_token_accuracy": 0.3931034505367279, "step": 75375 }, { "epoch": 0.07592353659850873, "grad_norm": 13.832344843354749, "learning_rate": 4.991737786820361e-05, "loss": 2.7176, "mean_token_accuracy": 0.4, "step": 75380 }, { "epoch": 0.0759285726516129, "grad_norm": 13.03687956768484, "learning_rate": 4.991734578228465e-05, "loss": 2.3686, "mean_token_accuracy": 0.4896551728248596, "step": 75385 }, { "epoch": 0.07593360870471708, "grad_norm": 11.489477238906288, "learning_rate": 4.991731369014816e-05, "loss": 2.1997, "mean_token_accuracy": 0.44482757449150084, "step": 75390 }, { "epoch": 0.07593864475782124, "grad_norm": 11.3353927141494, "learning_rate": 4.991728159179414e-05, "loss": 2.6566, "mean_token_accuracy": 0.3793103456497192, "step": 75395 }, { "epoch": 0.07594368081092541, "grad_norm": 9.45146214503811, "learning_rate": 4.991724948722261e-05, "loss": 2.2825, "mean_token_accuracy": 0.4172413766384125, "step": 75400 }, { "epoch": 0.07594871686402958, "grad_norm": 11.096286193814361, "learning_rate": 4.9917217376433575e-05, "loss": 2.7253, "mean_token_accuracy": 0.37586206793785093, "step": 75405 }, { "epoch": 0.07595375291713376, "grad_norm": 10.350356556885208, "learning_rate": 4.991718525942704e-05, "loss": 2.9124, "mean_token_accuracy": 0.3931034505367279, "step": 75410 }, { "epoch": 0.07595878897023793, "grad_norm": 12.469181930014196, "learning_rate": 4.991715313620301e-05, "loss": 2.5351, "mean_token_accuracy": 0.4, "step": 75415 }, { "epoch": 0.0759638250233421, "grad_norm": 10.705802817729872, "learning_rate": 4.99171210067615e-05, "loss": 2.6078, "mean_token_accuracy": 0.42413792610168455, "step": 75420 }, { "epoch": 0.07596886107644628, "grad_norm": 10.836660424684972, "learning_rate": 4.991708887110252e-05, "loss": 2.7352, "mean_token_accuracy": 0.34827586114406583, "step": 75425 }, { "epoch": 0.07597389712955045, "grad_norm": 12.088037949854911, "learning_rate": 4.991705672922608e-05, "loss": 2.4011, "mean_token_accuracy": 0.43793103098869324, "step": 75430 }, { "epoch": 0.07597893318265463, "grad_norm": 10.52323614940045, "learning_rate": 4.991702458113218e-05, "loss": 2.2881, "mean_token_accuracy": 0.38965516686439516, "step": 75435 }, { "epoch": 0.0759839692357588, "grad_norm": 10.062417117715869, "learning_rate": 4.991699242682084e-05, "loss": 2.2873, "mean_token_accuracy": 0.4517241358757019, "step": 75440 }, { "epoch": 0.07598900528886297, "grad_norm": 10.416310908719684, "learning_rate": 4.9916960266292055e-05, "loss": 2.4658, "mean_token_accuracy": 0.39655172228813174, "step": 75445 }, { "epoch": 0.07599404134196715, "grad_norm": 11.000369668934095, "learning_rate": 4.9916928099545846e-05, "loss": 2.4586, "mean_token_accuracy": 0.4413793087005615, "step": 75450 }, { "epoch": 0.07599907739507132, "grad_norm": 11.457982912788287, "learning_rate": 4.991689592658222e-05, "loss": 2.442, "mean_token_accuracy": 0.4172413766384125, "step": 75455 }, { "epoch": 0.0760041134481755, "grad_norm": 12.307633852153732, "learning_rate": 4.991686374740118e-05, "loss": 2.5387, "mean_token_accuracy": 0.4034482777118683, "step": 75460 }, { "epoch": 0.07600914950127965, "grad_norm": 10.954265543886638, "learning_rate": 4.991683156200275e-05, "loss": 1.9785, "mean_token_accuracy": 0.5206896543502808, "step": 75465 }, { "epoch": 0.07601418555438383, "grad_norm": 12.20215608218267, "learning_rate": 4.9916799370386915e-05, "loss": 2.4452, "mean_token_accuracy": 0.42758620381355283, "step": 75470 }, { "epoch": 0.076019221607488, "grad_norm": 9.70822357113351, "learning_rate": 4.991676717255371e-05, "loss": 2.5331, "mean_token_accuracy": 0.38275861740112305, "step": 75475 }, { "epoch": 0.07602425766059218, "grad_norm": 16.52616441978274, "learning_rate": 4.991673496850312e-05, "loss": 2.6698, "mean_token_accuracy": 0.36551723778247835, "step": 75480 }, { "epoch": 0.07602929371369635, "grad_norm": 10.132111930425728, "learning_rate": 4.9916702758235165e-05, "loss": 2.826, "mean_token_accuracy": 0.33103448152542114, "step": 75485 }, { "epoch": 0.07603432976680052, "grad_norm": 11.558014058055821, "learning_rate": 4.991667054174985e-05, "loss": 2.3401, "mean_token_accuracy": 0.4068965554237366, "step": 75490 }, { "epoch": 0.0760393658199047, "grad_norm": 13.188818462888594, "learning_rate": 4.9916638319047204e-05, "loss": 2.5967, "mean_token_accuracy": 0.3655172407627106, "step": 75495 }, { "epoch": 0.07604440187300887, "grad_norm": 15.504357527271171, "learning_rate": 4.991660609012721e-05, "loss": 2.7195, "mean_token_accuracy": 0.3896551728248596, "step": 75500 }, { "epoch": 0.07604943792611304, "grad_norm": 12.890457659504058, "learning_rate": 4.991657385498989e-05, "loss": 3.0247, "mean_token_accuracy": 0.3517241358757019, "step": 75505 }, { "epoch": 0.07605447397921722, "grad_norm": 10.865775989051082, "learning_rate": 4.9916541613635256e-05, "loss": 2.3676, "mean_token_accuracy": 0.4766009867191315, "step": 75510 }, { "epoch": 0.07605951003232139, "grad_norm": 11.447554423078827, "learning_rate": 4.99165093660633e-05, "loss": 2.6091, "mean_token_accuracy": 0.40211735367774964, "step": 75515 }, { "epoch": 0.07606454608542557, "grad_norm": 12.192775550775172, "learning_rate": 4.991647711227404e-05, "loss": 2.2874, "mean_token_accuracy": 0.4551724135875702, "step": 75520 }, { "epoch": 0.07606958213852974, "grad_norm": 11.445999369619152, "learning_rate": 4.991644485226749e-05, "loss": 2.4792, "mean_token_accuracy": 0.4482758641242981, "step": 75525 }, { "epoch": 0.07607461819163391, "grad_norm": 10.741578388438642, "learning_rate": 4.991641258604365e-05, "loss": 2.549, "mean_token_accuracy": 0.41034482717514037, "step": 75530 }, { "epoch": 0.07607965424473807, "grad_norm": 12.080167817450432, "learning_rate": 4.991638031360254e-05, "loss": 2.6935, "mean_token_accuracy": 0.3896551728248596, "step": 75535 }, { "epoch": 0.07608469029784225, "grad_norm": 13.905967665457455, "learning_rate": 4.991634803494417e-05, "loss": 2.5429, "mean_token_accuracy": 0.4034482777118683, "step": 75540 }, { "epoch": 0.07608972635094642, "grad_norm": 9.955735813353055, "learning_rate": 4.9916315750068535e-05, "loss": 2.6584, "mean_token_accuracy": 0.4034482777118683, "step": 75545 }, { "epoch": 0.0760947624040506, "grad_norm": 11.70529111528291, "learning_rate": 4.991628345897565e-05, "loss": 2.5981, "mean_token_accuracy": 0.3758620649576187, "step": 75550 }, { "epoch": 0.07609979845715477, "grad_norm": 17.690014833130675, "learning_rate": 4.991625116166553e-05, "loss": 2.6763, "mean_token_accuracy": 0.4103448331356049, "step": 75555 }, { "epoch": 0.07610483451025894, "grad_norm": 11.515322335558984, "learning_rate": 4.991621885813818e-05, "loss": 2.9642, "mean_token_accuracy": 0.38965517580509185, "step": 75560 }, { "epoch": 0.07610987056336312, "grad_norm": 12.495993637539948, "learning_rate": 4.9916186548393596e-05, "loss": 2.6704, "mean_token_accuracy": 0.35862069129943847, "step": 75565 }, { "epoch": 0.07611490661646729, "grad_norm": 14.325867460775958, "learning_rate": 4.9916154232431806e-05, "loss": 2.2145, "mean_token_accuracy": 0.4620689570903778, "step": 75570 }, { "epoch": 0.07611994266957146, "grad_norm": 11.182823668119719, "learning_rate": 4.9916121910252815e-05, "loss": 2.5401, "mean_token_accuracy": 0.4551724076271057, "step": 75575 }, { "epoch": 0.07612497872267564, "grad_norm": 10.766892936911312, "learning_rate": 4.991608958185662e-05, "loss": 2.0896, "mean_token_accuracy": 0.44640048742294314, "step": 75580 }, { "epoch": 0.07613001477577981, "grad_norm": 11.20327310654806, "learning_rate": 4.9916057247243256e-05, "loss": 2.2858, "mean_token_accuracy": 0.44482759237289426, "step": 75585 }, { "epoch": 0.07613505082888399, "grad_norm": 16.05615146700863, "learning_rate": 4.991602490641271e-05, "loss": 2.6344, "mean_token_accuracy": 0.4034482777118683, "step": 75590 }, { "epoch": 0.07614008688198816, "grad_norm": 11.014872425188628, "learning_rate": 4.9915992559364986e-05, "loss": 2.4541, "mean_token_accuracy": 0.4620689630508423, "step": 75595 }, { "epoch": 0.07614512293509233, "grad_norm": 11.084625853288069, "learning_rate": 4.99159602061001e-05, "loss": 2.2378, "mean_token_accuracy": 0.4551724076271057, "step": 75600 }, { "epoch": 0.07615015898819649, "grad_norm": 12.029958990516516, "learning_rate": 4.991592784661807e-05, "loss": 2.4505, "mean_token_accuracy": 0.44343616962432864, "step": 75605 }, { "epoch": 0.07615519504130067, "grad_norm": 9.449986214103564, "learning_rate": 4.991589548091891e-05, "loss": 2.3957, "mean_token_accuracy": 0.4310344815254211, "step": 75610 }, { "epoch": 0.07616023109440484, "grad_norm": 11.236790398904777, "learning_rate": 4.99158631090026e-05, "loss": 2.5347, "mean_token_accuracy": 0.3793103516101837, "step": 75615 }, { "epoch": 0.07616526714750901, "grad_norm": 12.7769487512638, "learning_rate": 4.991583073086918e-05, "loss": 2.7342, "mean_token_accuracy": 0.38965516686439516, "step": 75620 }, { "epoch": 0.07617030320061319, "grad_norm": 13.557394454357604, "learning_rate": 4.991579834651864e-05, "loss": 2.5632, "mean_token_accuracy": 0.4263157844543457, "step": 75625 }, { "epoch": 0.07617533925371736, "grad_norm": 11.01410154138524, "learning_rate": 4.991576595595099e-05, "loss": 2.5489, "mean_token_accuracy": 0.39999999701976774, "step": 75630 }, { "epoch": 0.07618037530682154, "grad_norm": 10.00399654338808, "learning_rate": 4.9915733559166256e-05, "loss": 2.4339, "mean_token_accuracy": 0.3758620619773865, "step": 75635 }, { "epoch": 0.07618541135992571, "grad_norm": 15.52837338351932, "learning_rate": 4.9915701156164425e-05, "loss": 2.5025, "mean_token_accuracy": 0.3880822718143463, "step": 75640 }, { "epoch": 0.07619044741302988, "grad_norm": 10.646481008341645, "learning_rate": 4.9915668746945515e-05, "loss": 2.2505, "mean_token_accuracy": 0.4655172348022461, "step": 75645 }, { "epoch": 0.07619548346613406, "grad_norm": 9.818662496962382, "learning_rate": 4.991563633150954e-05, "loss": 2.6408, "mean_token_accuracy": 0.3931034505367279, "step": 75650 }, { "epoch": 0.07620051951923823, "grad_norm": 12.10137909331914, "learning_rate": 4.991560390985651e-05, "loss": 2.4038, "mean_token_accuracy": 0.45366001725196836, "step": 75655 }, { "epoch": 0.0762055555723424, "grad_norm": 10.83122901867495, "learning_rate": 4.991557148198642e-05, "loss": 2.2971, "mean_token_accuracy": 0.46896552443504336, "step": 75660 }, { "epoch": 0.07621059162544658, "grad_norm": 12.343724186293136, "learning_rate": 4.9915539047899286e-05, "loss": 2.1792, "mean_token_accuracy": 0.47586206793785096, "step": 75665 }, { "epoch": 0.07621562767855075, "grad_norm": 10.272449368344718, "learning_rate": 4.991550660759512e-05, "loss": 2.8551, "mean_token_accuracy": 0.37931033968925476, "step": 75670 }, { "epoch": 0.07622066373165491, "grad_norm": 12.521664763843308, "learning_rate": 4.9915474161073936e-05, "loss": 2.376, "mean_token_accuracy": 0.4620689690113068, "step": 75675 }, { "epoch": 0.07622569978475909, "grad_norm": 8.823733341684603, "learning_rate": 4.991544170833573e-05, "loss": 2.2021, "mean_token_accuracy": 0.4551724135875702, "step": 75680 }, { "epoch": 0.07623073583786326, "grad_norm": 10.227872309189037, "learning_rate": 4.991540924938052e-05, "loss": 2.6469, "mean_token_accuracy": 0.3793103516101837, "step": 75685 }, { "epoch": 0.07623577189096743, "grad_norm": 11.176571091423007, "learning_rate": 4.991537678420831e-05, "loss": 2.8674, "mean_token_accuracy": 0.36206896007061007, "step": 75690 }, { "epoch": 0.0762408079440716, "grad_norm": 9.119711797283024, "learning_rate": 4.991534431281912e-05, "loss": 2.5772, "mean_token_accuracy": 0.40895341634750365, "step": 75695 }, { "epoch": 0.07624584399717578, "grad_norm": 13.13742528826164, "learning_rate": 4.9915311835212944e-05, "loss": 2.5808, "mean_token_accuracy": 0.42758620977401735, "step": 75700 }, { "epoch": 0.07625088005027995, "grad_norm": 15.485259453933407, "learning_rate": 4.991527935138979e-05, "loss": 2.6134, "mean_token_accuracy": 0.39310344457626345, "step": 75705 }, { "epoch": 0.07625591610338413, "grad_norm": 12.245407266659196, "learning_rate": 4.991524686134969e-05, "loss": 2.6322, "mean_token_accuracy": 0.4034482777118683, "step": 75710 }, { "epoch": 0.0762609521564883, "grad_norm": 10.410091201452078, "learning_rate": 4.991521436509263e-05, "loss": 2.3037, "mean_token_accuracy": 0.458620685338974, "step": 75715 }, { "epoch": 0.07626598820959248, "grad_norm": 11.236174950689106, "learning_rate": 4.991518186261863e-05, "loss": 2.5427, "mean_token_accuracy": 0.41034482717514037, "step": 75720 }, { "epoch": 0.07627102426269665, "grad_norm": 10.479069129460154, "learning_rate": 4.9915149353927696e-05, "loss": 2.579, "mean_token_accuracy": 0.38275861740112305, "step": 75725 }, { "epoch": 0.07627606031580082, "grad_norm": 10.383650281887649, "learning_rate": 4.991511683901983e-05, "loss": 2.8116, "mean_token_accuracy": 0.3965517282485962, "step": 75730 }, { "epoch": 0.076281096368905, "grad_norm": 13.455229293508971, "learning_rate": 4.991508431789505e-05, "loss": 2.5172, "mean_token_accuracy": 0.3758620709180832, "step": 75735 }, { "epoch": 0.07628613242200917, "grad_norm": 10.967299834092879, "learning_rate": 4.991505179055337e-05, "loss": 2.4315, "mean_token_accuracy": 0.4275861978530884, "step": 75740 }, { "epoch": 0.07629116847511333, "grad_norm": 11.403334189242738, "learning_rate": 4.991501925699478e-05, "loss": 2.5231, "mean_token_accuracy": 0.3931034505367279, "step": 75745 }, { "epoch": 0.0762962045282175, "grad_norm": 11.555463087203313, "learning_rate": 4.991498671721931e-05, "loss": 2.3855, "mean_token_accuracy": 0.41724138557910917, "step": 75750 }, { "epoch": 0.07630124058132168, "grad_norm": 12.708598989309463, "learning_rate": 4.991495417122696e-05, "loss": 2.6184, "mean_token_accuracy": 0.4, "step": 75755 }, { "epoch": 0.07630627663442585, "grad_norm": 10.568363549210302, "learning_rate": 4.991492161901773e-05, "loss": 2.31, "mean_token_accuracy": 0.44482758045196535, "step": 75760 }, { "epoch": 0.07631131268753003, "grad_norm": 15.921399308053875, "learning_rate": 4.991488906059164e-05, "loss": 2.8976, "mean_token_accuracy": 0.4329703629016876, "step": 75765 }, { "epoch": 0.0763163487406342, "grad_norm": 9.652541944142751, "learning_rate": 4.9914856495948706e-05, "loss": 2.631, "mean_token_accuracy": 0.403448274731636, "step": 75770 }, { "epoch": 0.07632138479373837, "grad_norm": 11.303889242152, "learning_rate": 4.991482392508892e-05, "loss": 2.8653, "mean_token_accuracy": 0.3275861978530884, "step": 75775 }, { "epoch": 0.07632642084684255, "grad_norm": 17.392034790670053, "learning_rate": 4.991479134801231e-05, "loss": 2.725, "mean_token_accuracy": 0.36896551847457887, "step": 75780 }, { "epoch": 0.07633145689994672, "grad_norm": 11.004525866629024, "learning_rate": 4.9914758764718865e-05, "loss": 2.4916, "mean_token_accuracy": 0.4034482777118683, "step": 75785 }, { "epoch": 0.0763364929530509, "grad_norm": 11.529170184847597, "learning_rate": 4.9914726175208605e-05, "loss": 2.4252, "mean_token_accuracy": 0.43103447556495667, "step": 75790 }, { "epoch": 0.07634152900615507, "grad_norm": 15.050603487959778, "learning_rate": 4.991469357948153e-05, "loss": 2.5003, "mean_token_accuracy": 0.3896551728248596, "step": 75795 }, { "epoch": 0.07634656505925924, "grad_norm": 10.789635727337897, "learning_rate": 4.991466097753767e-05, "loss": 2.4802, "mean_token_accuracy": 0.4448275864124298, "step": 75800 }, { "epoch": 0.07635160111236342, "grad_norm": 11.683563473038868, "learning_rate": 4.9914628369377e-05, "loss": 2.5668, "mean_token_accuracy": 0.39310344457626345, "step": 75805 }, { "epoch": 0.07635663716546759, "grad_norm": 10.765910310748934, "learning_rate": 4.991459575499956e-05, "loss": 2.6054, "mean_token_accuracy": 0.3896551728248596, "step": 75810 }, { "epoch": 0.07636167321857175, "grad_norm": 9.635504361083918, "learning_rate": 4.9914563134405355e-05, "loss": 2.2248, "mean_token_accuracy": 0.4517241358757019, "step": 75815 }, { "epoch": 0.07636670927167592, "grad_norm": 21.691697319154112, "learning_rate": 4.991453050759438e-05, "loss": 2.8796, "mean_token_accuracy": 0.4000000089406967, "step": 75820 }, { "epoch": 0.0763717453247801, "grad_norm": 11.224333017730409, "learning_rate": 4.991449787456665e-05, "loss": 2.5865, "mean_token_accuracy": 0.44137930274009707, "step": 75825 }, { "epoch": 0.07637678137788427, "grad_norm": 8.47216788328823, "learning_rate": 4.991446523532218e-05, "loss": 2.1865, "mean_token_accuracy": 0.49243800044059755, "step": 75830 }, { "epoch": 0.07638181743098844, "grad_norm": 11.29522572380612, "learning_rate": 4.9914432589860977e-05, "loss": 2.2213, "mean_token_accuracy": 0.4643678247928619, "step": 75835 }, { "epoch": 0.07638685348409262, "grad_norm": 9.267673691772961, "learning_rate": 4.991439993818304e-05, "loss": 2.5495, "mean_token_accuracy": 0.417241370677948, "step": 75840 }, { "epoch": 0.07639188953719679, "grad_norm": 10.437566058020566, "learning_rate": 4.9914367280288394e-05, "loss": 2.0566, "mean_token_accuracy": 0.46896551847457885, "step": 75845 }, { "epoch": 0.07639692559030097, "grad_norm": 13.906158956665738, "learning_rate": 4.991433461617703e-05, "loss": 2.5037, "mean_token_accuracy": 0.3551724135875702, "step": 75850 }, { "epoch": 0.07640196164340514, "grad_norm": 12.036452471682875, "learning_rate": 4.991430194584898e-05, "loss": 2.7087, "mean_token_accuracy": 0.4034482717514038, "step": 75855 }, { "epoch": 0.07640699769650931, "grad_norm": 10.73824030041405, "learning_rate": 4.991426926930422e-05, "loss": 2.418, "mean_token_accuracy": 0.3965517163276672, "step": 75860 }, { "epoch": 0.07641203374961349, "grad_norm": 14.802172998273921, "learning_rate": 4.991423658654279e-05, "loss": 2.21, "mean_token_accuracy": 0.4344827592372894, "step": 75865 }, { "epoch": 0.07641706980271766, "grad_norm": 10.500910000184115, "learning_rate": 4.9914203897564696e-05, "loss": 2.0921, "mean_token_accuracy": 0.47428917288780215, "step": 75870 }, { "epoch": 0.07642210585582183, "grad_norm": 11.144134351017298, "learning_rate": 4.9914171202369925e-05, "loss": 2.2022, "mean_token_accuracy": 0.44137930274009707, "step": 75875 }, { "epoch": 0.07642714190892601, "grad_norm": 12.494834211201681, "learning_rate": 4.991413850095851e-05, "loss": 2.6443, "mean_token_accuracy": 0.375862056016922, "step": 75880 }, { "epoch": 0.07643217796203017, "grad_norm": 13.508432611391783, "learning_rate": 4.9914105793330446e-05, "loss": 2.5734, "mean_token_accuracy": 0.3965517282485962, "step": 75885 }, { "epoch": 0.07643721401513434, "grad_norm": 10.966296543416753, "learning_rate": 4.991407307948574e-05, "loss": 2.6002, "mean_token_accuracy": 0.3689655214548111, "step": 75890 }, { "epoch": 0.07644225006823852, "grad_norm": 11.5297554266137, "learning_rate": 4.991404035942441e-05, "loss": 2.2939, "mean_token_accuracy": 0.4413793087005615, "step": 75895 }, { "epoch": 0.07644728612134269, "grad_norm": 12.691866007702652, "learning_rate": 4.991400763314647e-05, "loss": 2.5595, "mean_token_accuracy": 0.4206896543502808, "step": 75900 }, { "epoch": 0.07645232217444686, "grad_norm": 10.596178340938968, "learning_rate": 4.9913974900651914e-05, "loss": 2.0649, "mean_token_accuracy": 0.4910098612308502, "step": 75905 }, { "epoch": 0.07645735822755104, "grad_norm": 12.24591751101665, "learning_rate": 4.9913942161940765e-05, "loss": 2.3893, "mean_token_accuracy": 0.42413793206214906, "step": 75910 }, { "epoch": 0.07646239428065521, "grad_norm": 10.41183630750871, "learning_rate": 4.991390941701302e-05, "loss": 2.536, "mean_token_accuracy": 0.3931034505367279, "step": 75915 }, { "epoch": 0.07646743033375938, "grad_norm": 9.28940065559611, "learning_rate": 4.99138766658687e-05, "loss": 2.1056, "mean_token_accuracy": 0.47731398344039916, "step": 75920 }, { "epoch": 0.07647246638686356, "grad_norm": 11.280956626278062, "learning_rate": 4.9913843908507794e-05, "loss": 2.6147, "mean_token_accuracy": 0.41379310488700866, "step": 75925 }, { "epoch": 0.07647750243996773, "grad_norm": 10.418037589122415, "learning_rate": 4.991381114493033e-05, "loss": 2.0753, "mean_token_accuracy": 0.4137930989265442, "step": 75930 }, { "epoch": 0.0764825384930719, "grad_norm": 12.295293040516704, "learning_rate": 4.991377837513632e-05, "loss": 2.773, "mean_token_accuracy": 0.3482758551836014, "step": 75935 }, { "epoch": 0.07648757454617608, "grad_norm": 9.647666117705828, "learning_rate": 4.991374559912576e-05, "loss": 2.5544, "mean_token_accuracy": 0.40689654350280763, "step": 75940 }, { "epoch": 0.07649261059928025, "grad_norm": 10.129150239704519, "learning_rate": 4.991371281689867e-05, "loss": 2.1761, "mean_token_accuracy": 0.46896551847457885, "step": 75945 }, { "epoch": 0.07649764665238443, "grad_norm": 11.802967044004843, "learning_rate": 4.991368002845504e-05, "loss": 2.833, "mean_token_accuracy": 0.31379309892654417, "step": 75950 }, { "epoch": 0.07650268270548859, "grad_norm": 9.096263347688152, "learning_rate": 4.99136472337949e-05, "loss": 2.2723, "mean_token_accuracy": 0.4620689690113068, "step": 75955 }, { "epoch": 0.07650771875859276, "grad_norm": 12.2025513547769, "learning_rate": 4.9913614432918256e-05, "loss": 2.8166, "mean_token_accuracy": 0.38275861740112305, "step": 75960 }, { "epoch": 0.07651275481169693, "grad_norm": 9.476075024808113, "learning_rate": 4.9913581625825107e-05, "loss": 2.3497, "mean_token_accuracy": 0.43793103098869324, "step": 75965 }, { "epoch": 0.07651779086480111, "grad_norm": 8.167493639580654, "learning_rate": 4.991354881251547e-05, "loss": 1.9435, "mean_token_accuracy": 0.46551724076271056, "step": 75970 }, { "epoch": 0.07652282691790528, "grad_norm": 11.920780312356031, "learning_rate": 4.9913515992989345e-05, "loss": 2.0221, "mean_token_accuracy": 0.4862068831920624, "step": 75975 }, { "epoch": 0.07652786297100946, "grad_norm": 10.868402975467268, "learning_rate": 4.991348316724675e-05, "loss": 2.1724, "mean_token_accuracy": 0.4551724135875702, "step": 75980 }, { "epoch": 0.07653289902411363, "grad_norm": 16.608397154067035, "learning_rate": 4.9913450335287705e-05, "loss": 2.8311, "mean_token_accuracy": 0.3896551728248596, "step": 75985 }, { "epoch": 0.0765379350772178, "grad_norm": 10.539757071855773, "learning_rate": 4.9913417497112194e-05, "loss": 2.394, "mean_token_accuracy": 0.42758620977401735, "step": 75990 }, { "epoch": 0.07654297113032198, "grad_norm": 16.595900709794915, "learning_rate": 4.991338465272023e-05, "loss": 2.7186, "mean_token_accuracy": 0.4068965494632721, "step": 75995 }, { "epoch": 0.07654800718342615, "grad_norm": 9.316557899786188, "learning_rate": 4.9913351802111844e-05, "loss": 2.0665, "mean_token_accuracy": 0.4913490653038025, "step": 76000 }, { "epoch": 0.07655304323653032, "grad_norm": 9.346339293452436, "learning_rate": 4.991331894528703e-05, "loss": 2.2158, "mean_token_accuracy": 0.44137930274009707, "step": 76005 }, { "epoch": 0.0765580792896345, "grad_norm": 15.873856871661653, "learning_rate": 4.99132860822458e-05, "loss": 3.0836, "mean_token_accuracy": 0.3724137991666794, "step": 76010 }, { "epoch": 0.07656311534273867, "grad_norm": 10.017197081567192, "learning_rate": 4.991325321298815e-05, "loss": 2.3409, "mean_token_accuracy": 0.42068966031074523, "step": 76015 }, { "epoch": 0.07656815139584285, "grad_norm": 11.805809406800645, "learning_rate": 4.9913220337514105e-05, "loss": 2.6357, "mean_token_accuracy": 0.3774954617023468, "step": 76020 }, { "epoch": 0.076573187448947, "grad_norm": 10.951204614392159, "learning_rate": 4.991318745582367e-05, "loss": 3.0615, "mean_token_accuracy": 0.37241379022598264, "step": 76025 }, { "epoch": 0.07657822350205118, "grad_norm": 12.913009617101537, "learning_rate": 4.991315456791686e-05, "loss": 2.5665, "mean_token_accuracy": 0.40496068000793456, "step": 76030 }, { "epoch": 0.07658325955515535, "grad_norm": 9.701074470634271, "learning_rate": 4.9913121673793675e-05, "loss": 2.3706, "mean_token_accuracy": 0.4310344815254211, "step": 76035 }, { "epoch": 0.07658829560825953, "grad_norm": 10.769471869941352, "learning_rate": 4.9913088773454125e-05, "loss": 2.2415, "mean_token_accuracy": 0.4620689630508423, "step": 76040 }, { "epoch": 0.0765933316613637, "grad_norm": 14.152698115220998, "learning_rate": 4.991305586689822e-05, "loss": 2.9488, "mean_token_accuracy": 0.3965517282485962, "step": 76045 }, { "epoch": 0.07659836771446787, "grad_norm": 11.285083982795774, "learning_rate": 4.9913022954125974e-05, "loss": 2.9882, "mean_token_accuracy": 0.3793103516101837, "step": 76050 }, { "epoch": 0.07660340376757205, "grad_norm": 12.285616983751531, "learning_rate": 4.991299003513739e-05, "loss": 2.4519, "mean_token_accuracy": 0.44827585816383364, "step": 76055 }, { "epoch": 0.07660843982067622, "grad_norm": 9.699297240927653, "learning_rate": 4.991295710993248e-05, "loss": 2.4837, "mean_token_accuracy": 0.4517241418361664, "step": 76060 }, { "epoch": 0.0766134758737804, "grad_norm": 10.660974168449924, "learning_rate": 4.991292417851126e-05, "loss": 2.4614, "mean_token_accuracy": 0.4137930989265442, "step": 76065 }, { "epoch": 0.07661851192688457, "grad_norm": 10.477693084192317, "learning_rate": 4.991289124087372e-05, "loss": 2.8269, "mean_token_accuracy": 0.37586206793785093, "step": 76070 }, { "epoch": 0.07662354797998874, "grad_norm": 10.094157344090616, "learning_rate": 4.991285829701988e-05, "loss": 2.5221, "mean_token_accuracy": 0.41379310488700866, "step": 76075 }, { "epoch": 0.07662858403309292, "grad_norm": 13.617982164643193, "learning_rate": 4.991282534694976e-05, "loss": 2.6224, "mean_token_accuracy": 0.4, "step": 76080 }, { "epoch": 0.07663362008619709, "grad_norm": 10.226769012475385, "learning_rate": 4.991279239066336e-05, "loss": 2.5625, "mean_token_accuracy": 0.3965517282485962, "step": 76085 }, { "epoch": 0.07663865613930126, "grad_norm": 11.753950744444511, "learning_rate": 4.991275942816068e-05, "loss": 2.4381, "mean_token_accuracy": 0.4379310369491577, "step": 76090 }, { "epoch": 0.07664369219240542, "grad_norm": 13.238021353430796, "learning_rate": 4.991272645944174e-05, "loss": 2.6456, "mean_token_accuracy": 0.4310344815254211, "step": 76095 }, { "epoch": 0.0766487282455096, "grad_norm": 13.221475504182036, "learning_rate": 4.9912693484506554e-05, "loss": 2.5872, "mean_token_accuracy": 0.4, "step": 76100 }, { "epoch": 0.07665376429861377, "grad_norm": 11.153834513315722, "learning_rate": 4.9912660503355114e-05, "loss": 2.5138, "mean_token_accuracy": 0.39842710494995115, "step": 76105 }, { "epoch": 0.07665880035171795, "grad_norm": 12.348462795601518, "learning_rate": 4.991262751598744e-05, "loss": 2.5586, "mean_token_accuracy": 0.40344826579093934, "step": 76110 }, { "epoch": 0.07666383640482212, "grad_norm": 11.058857682253631, "learning_rate": 4.991259452240354e-05, "loss": 2.6813, "mean_token_accuracy": 0.3931034505367279, "step": 76115 }, { "epoch": 0.0766688724579263, "grad_norm": 13.45940396384504, "learning_rate": 4.991256152260343e-05, "loss": 2.3806, "mean_token_accuracy": 0.3758620649576187, "step": 76120 }, { "epoch": 0.07667390851103047, "grad_norm": 10.380294446826063, "learning_rate": 4.991252851658711e-05, "loss": 2.5863, "mean_token_accuracy": 0.38965516686439516, "step": 76125 }, { "epoch": 0.07667894456413464, "grad_norm": 10.210874984121272, "learning_rate": 4.9912495504354594e-05, "loss": 2.4781, "mean_token_accuracy": 0.4068965554237366, "step": 76130 }, { "epoch": 0.07668398061723881, "grad_norm": 12.355744459605228, "learning_rate": 4.991246248590588e-05, "loss": 2.4452, "mean_token_accuracy": 0.4103448331356049, "step": 76135 }, { "epoch": 0.07668901667034299, "grad_norm": 14.781936750895781, "learning_rate": 4.9912429461240996e-05, "loss": 2.4778, "mean_token_accuracy": 0.4482758641242981, "step": 76140 }, { "epoch": 0.07669405272344716, "grad_norm": 10.311429793408207, "learning_rate": 4.991239643035994e-05, "loss": 2.1637, "mean_token_accuracy": 0.46896551847457885, "step": 76145 }, { "epoch": 0.07669908877655134, "grad_norm": 12.740274594516142, "learning_rate": 4.991236339326272e-05, "loss": 2.7306, "mean_token_accuracy": 0.3793103516101837, "step": 76150 }, { "epoch": 0.07670412482965551, "grad_norm": 13.465462341415407, "learning_rate": 4.9912330349949347e-05, "loss": 2.7967, "mean_token_accuracy": 0.3517241358757019, "step": 76155 }, { "epoch": 0.07670916088275968, "grad_norm": 22.085605173399905, "learning_rate": 4.9912297300419816e-05, "loss": 2.5842, "mean_token_accuracy": 0.42413792610168455, "step": 76160 }, { "epoch": 0.07671419693586384, "grad_norm": 11.377126568261668, "learning_rate": 4.991226424467417e-05, "loss": 2.9084, "mean_token_accuracy": 0.341379314661026, "step": 76165 }, { "epoch": 0.07671923298896802, "grad_norm": 9.068799027785182, "learning_rate": 4.991223118271239e-05, "loss": 2.1182, "mean_token_accuracy": 0.4379310369491577, "step": 76170 }, { "epoch": 0.07672426904207219, "grad_norm": 11.215560346188667, "learning_rate": 4.99121981145345e-05, "loss": 2.5119, "mean_token_accuracy": 0.4068965554237366, "step": 76175 }, { "epoch": 0.07672930509517636, "grad_norm": 10.954607236901209, "learning_rate": 4.99121650401405e-05, "loss": 2.6158, "mean_token_accuracy": 0.3448275774717331, "step": 76180 }, { "epoch": 0.07673434114828054, "grad_norm": 12.593666432344227, "learning_rate": 4.99121319595304e-05, "loss": 2.678, "mean_token_accuracy": 0.3655172407627106, "step": 76185 }, { "epoch": 0.07673937720138471, "grad_norm": 10.397434917774126, "learning_rate": 4.991209887270422e-05, "loss": 2.3829, "mean_token_accuracy": 0.4413793087005615, "step": 76190 }, { "epoch": 0.07674441325448889, "grad_norm": 9.991234837891225, "learning_rate": 4.9912065779661956e-05, "loss": 2.2518, "mean_token_accuracy": 0.42758620381355283, "step": 76195 }, { "epoch": 0.07674944930759306, "grad_norm": 12.336582522125166, "learning_rate": 4.991203268040362e-05, "loss": 2.2671, "mean_token_accuracy": 0.4586206912994385, "step": 76200 }, { "epoch": 0.07675448536069723, "grad_norm": 10.379597317759364, "learning_rate": 4.991199957492922e-05, "loss": 2.2362, "mean_token_accuracy": 0.4172413766384125, "step": 76205 }, { "epoch": 0.07675952141380141, "grad_norm": 10.465127083070813, "learning_rate": 4.991196646323878e-05, "loss": 2.6993, "mean_token_accuracy": 0.3862069010734558, "step": 76210 }, { "epoch": 0.07676455746690558, "grad_norm": 15.090868812203627, "learning_rate": 4.9911933345332286e-05, "loss": 2.7168, "mean_token_accuracy": 0.3206896513700485, "step": 76215 }, { "epoch": 0.07676959352000975, "grad_norm": 12.652042280143231, "learning_rate": 4.991190022120977e-05, "loss": 2.638, "mean_token_accuracy": 0.4125226855278015, "step": 76220 }, { "epoch": 0.07677462957311393, "grad_norm": 13.18150121367197, "learning_rate": 4.9911867090871215e-05, "loss": 2.7215, "mean_token_accuracy": 0.43448275327682495, "step": 76225 }, { "epoch": 0.0767796656262181, "grad_norm": 11.995238335150015, "learning_rate": 4.991183395431665e-05, "loss": 2.6699, "mean_token_accuracy": 0.4517241358757019, "step": 76230 }, { "epoch": 0.07678470167932226, "grad_norm": 10.423203353669022, "learning_rate": 4.991180081154608e-05, "loss": 2.3231, "mean_token_accuracy": 0.42068964838981626, "step": 76235 }, { "epoch": 0.07678973773242644, "grad_norm": 12.167635075071818, "learning_rate": 4.9911767662559514e-05, "loss": 2.698, "mean_token_accuracy": 0.379310342669487, "step": 76240 }, { "epoch": 0.07679477378553061, "grad_norm": 11.959190249219661, "learning_rate": 4.9911734507356965e-05, "loss": 2.6089, "mean_token_accuracy": 0.4172413766384125, "step": 76245 }, { "epoch": 0.07679980983863478, "grad_norm": 12.57301525973076, "learning_rate": 4.9911701345938435e-05, "loss": 2.3945, "mean_token_accuracy": 0.40344826579093934, "step": 76250 }, { "epoch": 0.07680484589173896, "grad_norm": 11.488919352560186, "learning_rate": 4.9911668178303936e-05, "loss": 2.4881, "mean_token_accuracy": 0.4310344815254211, "step": 76255 }, { "epoch": 0.07680988194484313, "grad_norm": 11.666034113108683, "learning_rate": 4.9911635004453476e-05, "loss": 2.4, "mean_token_accuracy": 0.38965516686439516, "step": 76260 }, { "epoch": 0.0768149179979473, "grad_norm": 11.383287288828535, "learning_rate": 4.991160182438707e-05, "loss": 2.7235, "mean_token_accuracy": 0.35862068831920624, "step": 76265 }, { "epoch": 0.07681995405105148, "grad_norm": 9.501910846751839, "learning_rate": 4.991156863810472e-05, "loss": 2.3704, "mean_token_accuracy": 0.42758620381355283, "step": 76270 }, { "epoch": 0.07682499010415565, "grad_norm": 10.52889500955914, "learning_rate": 4.9911535445606436e-05, "loss": 2.3017, "mean_token_accuracy": 0.4517241358757019, "step": 76275 }, { "epoch": 0.07683002615725983, "grad_norm": 10.127493030419503, "learning_rate": 4.9911502246892225e-05, "loss": 2.3911, "mean_token_accuracy": 0.4206896543502808, "step": 76280 }, { "epoch": 0.076835062210364, "grad_norm": 10.467548309573269, "learning_rate": 4.9911469041962114e-05, "loss": 2.856, "mean_token_accuracy": 0.37241379618644715, "step": 76285 }, { "epoch": 0.07684009826346817, "grad_norm": 13.920408180237013, "learning_rate": 4.991143583081608e-05, "loss": 3.2088, "mean_token_accuracy": 0.37586206793785093, "step": 76290 }, { "epoch": 0.07684513431657235, "grad_norm": 14.69910938298018, "learning_rate": 4.991140261345417e-05, "loss": 2.4855, "mean_token_accuracy": 0.3517241418361664, "step": 76295 }, { "epoch": 0.07685017036967652, "grad_norm": 12.792508531622563, "learning_rate": 4.991136938987636e-05, "loss": 2.3878, "mean_token_accuracy": 0.4172413766384125, "step": 76300 }, { "epoch": 0.07685520642278068, "grad_norm": 12.281887660239775, "learning_rate": 4.991133616008268e-05, "loss": 2.3622, "mean_token_accuracy": 0.4517241358757019, "step": 76305 }, { "epoch": 0.07686024247588485, "grad_norm": 13.286946207923359, "learning_rate": 4.9911302924073125e-05, "loss": 2.7695, "mean_token_accuracy": 0.41724138259887694, "step": 76310 }, { "epoch": 0.07686527852898903, "grad_norm": 14.49087467089417, "learning_rate": 4.991126968184772e-05, "loss": 2.3895, "mean_token_accuracy": 0.39491833448410035, "step": 76315 }, { "epoch": 0.0768703145820932, "grad_norm": 11.705957293550219, "learning_rate": 4.991123643340646e-05, "loss": 2.3873, "mean_token_accuracy": 0.4344827651977539, "step": 76320 }, { "epoch": 0.07687535063519738, "grad_norm": 14.995636561082682, "learning_rate": 4.991120317874937e-05, "loss": 2.8071, "mean_token_accuracy": 0.36896551251411436, "step": 76325 }, { "epoch": 0.07688038668830155, "grad_norm": 10.024962210705622, "learning_rate": 4.991116991787643e-05, "loss": 2.4787, "mean_token_accuracy": 0.39655172228813174, "step": 76330 }, { "epoch": 0.07688542274140572, "grad_norm": 11.147795370803149, "learning_rate": 4.991113665078768e-05, "loss": 2.7896, "mean_token_accuracy": 0.38275861740112305, "step": 76335 }, { "epoch": 0.0768904587945099, "grad_norm": 11.17179787394256, "learning_rate": 4.991110337748312e-05, "loss": 2.4725, "mean_token_accuracy": 0.43793103098869324, "step": 76340 }, { "epoch": 0.07689549484761407, "grad_norm": 12.938188381538987, "learning_rate": 4.991107009796275e-05, "loss": 2.3297, "mean_token_accuracy": 0.42068966031074523, "step": 76345 }, { "epoch": 0.07690053090071824, "grad_norm": 9.673208918236057, "learning_rate": 4.99110368122266e-05, "loss": 2.7345, "mean_token_accuracy": 0.4517241358757019, "step": 76350 }, { "epoch": 0.07690556695382242, "grad_norm": 9.300023807391415, "learning_rate": 4.991100352027465e-05, "loss": 2.4001, "mean_token_accuracy": 0.4655172348022461, "step": 76355 }, { "epoch": 0.07691060300692659, "grad_norm": 12.743634072448442, "learning_rate": 4.991097022210693e-05, "loss": 3.0314, "mean_token_accuracy": 0.3517241358757019, "step": 76360 }, { "epoch": 0.07691563906003077, "grad_norm": 16.05954807301064, "learning_rate": 4.9910936917723446e-05, "loss": 2.8275, "mean_token_accuracy": 0.36896551549434664, "step": 76365 }, { "epoch": 0.07692067511313494, "grad_norm": 11.978254526751734, "learning_rate": 4.99109036071242e-05, "loss": 2.8069, "mean_token_accuracy": 0.3689655244350433, "step": 76370 }, { "epoch": 0.0769257111662391, "grad_norm": 12.11489318519132, "learning_rate": 4.991087029030921e-05, "loss": 2.6078, "mean_token_accuracy": 0.37586206793785093, "step": 76375 }, { "epoch": 0.07693074721934327, "grad_norm": 18.95962277745913, "learning_rate": 4.991083696727848e-05, "loss": 2.5207, "mean_token_accuracy": 0.4310344815254211, "step": 76380 }, { "epoch": 0.07693578327244745, "grad_norm": 12.507215749014307, "learning_rate": 4.9910803638032024e-05, "loss": 2.3747, "mean_token_accuracy": 0.43103448748588563, "step": 76385 }, { "epoch": 0.07694081932555162, "grad_norm": 11.040131696764059, "learning_rate": 4.991077030256984e-05, "loss": 2.4414, "mean_token_accuracy": 0.39655172228813174, "step": 76390 }, { "epoch": 0.0769458553786558, "grad_norm": 11.162206482310072, "learning_rate": 4.991073696089195e-05, "loss": 2.6542, "mean_token_accuracy": 0.3379310339689255, "step": 76395 }, { "epoch": 0.07695089143175997, "grad_norm": 10.077112072607074, "learning_rate": 4.991070361299836e-05, "loss": 2.6271, "mean_token_accuracy": 0.3931034505367279, "step": 76400 }, { "epoch": 0.07695592748486414, "grad_norm": 11.070413085272996, "learning_rate": 4.991067025888908e-05, "loss": 2.2648, "mean_token_accuracy": 0.4620689630508423, "step": 76405 }, { "epoch": 0.07696096353796832, "grad_norm": 9.917713323973205, "learning_rate": 4.9910636898564116e-05, "loss": 2.4849, "mean_token_accuracy": 0.4034482717514038, "step": 76410 }, { "epoch": 0.07696599959107249, "grad_norm": 11.47594400030761, "learning_rate": 4.991060353202347e-05, "loss": 2.0804, "mean_token_accuracy": 0.4435571551322937, "step": 76415 }, { "epoch": 0.07697103564417666, "grad_norm": 11.729643919708229, "learning_rate": 4.991057015926717e-05, "loss": 2.6369, "mean_token_accuracy": 0.40538414716720583, "step": 76420 }, { "epoch": 0.07697607169728084, "grad_norm": 13.291758812686288, "learning_rate": 4.991053678029521e-05, "loss": 2.5011, "mean_token_accuracy": 0.42758620381355283, "step": 76425 }, { "epoch": 0.07698110775038501, "grad_norm": 13.620731717411319, "learning_rate": 4.99105033951076e-05, "loss": 2.6175, "mean_token_accuracy": 0.4034482777118683, "step": 76430 }, { "epoch": 0.07698614380348918, "grad_norm": 12.220741589624009, "learning_rate": 4.991047000370436e-05, "loss": 2.574, "mean_token_accuracy": 0.3482758641242981, "step": 76435 }, { "epoch": 0.07699117985659336, "grad_norm": 13.102777587086644, "learning_rate": 4.9910436606085495e-05, "loss": 2.393, "mean_token_accuracy": 0.38620689511299133, "step": 76440 }, { "epoch": 0.07699621590969752, "grad_norm": 10.567325511205329, "learning_rate": 4.991040320225101e-05, "loss": 2.2524, "mean_token_accuracy": 0.4724137902259827, "step": 76445 }, { "epoch": 0.07700125196280169, "grad_norm": 13.399178970245588, "learning_rate": 4.9910369792200904e-05, "loss": 2.5242, "mean_token_accuracy": 0.428796124458313, "step": 76450 }, { "epoch": 0.07700628801590587, "grad_norm": 10.514605026897222, "learning_rate": 4.991033637593521e-05, "loss": 1.9882, "mean_token_accuracy": 0.4918330252170563, "step": 76455 }, { "epoch": 0.07701132406901004, "grad_norm": 10.480547488976926, "learning_rate": 4.9910302953453924e-05, "loss": 2.4896, "mean_token_accuracy": 0.42758620381355283, "step": 76460 }, { "epoch": 0.07701636012211421, "grad_norm": 14.0726287037887, "learning_rate": 4.9910269524757054e-05, "loss": 2.604, "mean_token_accuracy": 0.4068965494632721, "step": 76465 }, { "epoch": 0.07702139617521839, "grad_norm": 11.20674176483731, "learning_rate": 4.9910236089844616e-05, "loss": 2.4799, "mean_token_accuracy": 0.4257108271121979, "step": 76470 }, { "epoch": 0.07702643222832256, "grad_norm": 10.417336116662733, "learning_rate": 4.9910202648716616e-05, "loss": 2.2341, "mean_token_accuracy": 0.47931033968925474, "step": 76475 }, { "epoch": 0.07703146828142673, "grad_norm": 10.816822667545958, "learning_rate": 4.991016920137306e-05, "loss": 2.2454, "mean_token_accuracy": 0.43793103098869324, "step": 76480 }, { "epoch": 0.07703650433453091, "grad_norm": 11.45095069200206, "learning_rate": 4.9910135747813965e-05, "loss": 2.5324, "mean_token_accuracy": 0.4551724076271057, "step": 76485 }, { "epoch": 0.07704154038763508, "grad_norm": 15.113232681737985, "learning_rate": 4.991010228803932e-05, "loss": 2.7173, "mean_token_accuracy": 0.3965517282485962, "step": 76490 }, { "epoch": 0.07704657644073926, "grad_norm": 9.01982280753244, "learning_rate": 4.991006882204916e-05, "loss": 2.3972, "mean_token_accuracy": 0.41597095131874084, "step": 76495 }, { "epoch": 0.07705161249384343, "grad_norm": 15.21468290379302, "learning_rate": 4.991003534984349e-05, "loss": 2.5124, "mean_token_accuracy": 0.403448274731636, "step": 76500 }, { "epoch": 0.0770566485469476, "grad_norm": 9.744995689730901, "learning_rate": 4.991000187142231e-05, "loss": 2.3012, "mean_token_accuracy": 0.4482758641242981, "step": 76505 }, { "epoch": 0.07706168460005178, "grad_norm": 9.801163742930056, "learning_rate": 4.990996838678563e-05, "loss": 2.6783, "mean_token_accuracy": 0.4206896543502808, "step": 76510 }, { "epoch": 0.07706672065315594, "grad_norm": 14.915655934076208, "learning_rate": 4.9909934895933455e-05, "loss": 2.7936, "mean_token_accuracy": 0.4034482777118683, "step": 76515 }, { "epoch": 0.07707175670626011, "grad_norm": 11.998220359109627, "learning_rate": 4.9909901398865806e-05, "loss": 2.3207, "mean_token_accuracy": 0.4172413766384125, "step": 76520 }, { "epoch": 0.07707679275936428, "grad_norm": 10.869897207155173, "learning_rate": 4.9909867895582685e-05, "loss": 2.4536, "mean_token_accuracy": 0.4620689630508423, "step": 76525 }, { "epoch": 0.07708182881246846, "grad_norm": 9.082628587109152, "learning_rate": 4.990983438608411e-05, "loss": 1.9954, "mean_token_accuracy": 0.49655171632766726, "step": 76530 }, { "epoch": 0.07708686486557263, "grad_norm": 10.34806382485586, "learning_rate": 4.990980087037008e-05, "loss": 2.6256, "mean_token_accuracy": 0.4379310369491577, "step": 76535 }, { "epoch": 0.0770919009186768, "grad_norm": 12.660727962211393, "learning_rate": 4.9909767348440614e-05, "loss": 2.4198, "mean_token_accuracy": 0.48275862336158754, "step": 76540 }, { "epoch": 0.07709693697178098, "grad_norm": 12.717709141544155, "learning_rate": 4.9909733820295706e-05, "loss": 2.2693, "mean_token_accuracy": 0.44827585816383364, "step": 76545 }, { "epoch": 0.07710197302488515, "grad_norm": 12.716322951460041, "learning_rate": 4.990970028593538e-05, "loss": 2.2004, "mean_token_accuracy": 0.44827585816383364, "step": 76550 }, { "epoch": 0.07710700907798933, "grad_norm": 13.334343662746278, "learning_rate": 4.990966674535964e-05, "loss": 2.9398, "mean_token_accuracy": 0.36551724672317504, "step": 76555 }, { "epoch": 0.0771120451310935, "grad_norm": 10.184239888061123, "learning_rate": 4.9909633198568495e-05, "loss": 2.5475, "mean_token_accuracy": 0.3793103456497192, "step": 76560 }, { "epoch": 0.07711708118419767, "grad_norm": 13.435392843453657, "learning_rate": 4.990959964556195e-05, "loss": 3.0107, "mean_token_accuracy": 0.33448275923728943, "step": 76565 }, { "epoch": 0.07712211723730185, "grad_norm": 10.490908638250238, "learning_rate": 4.990956608634003e-05, "loss": 2.6949, "mean_token_accuracy": 0.4, "step": 76570 }, { "epoch": 0.07712715329040602, "grad_norm": 10.76983059693207, "learning_rate": 4.990953252090272e-05, "loss": 2.4025, "mean_token_accuracy": 0.3862068891525269, "step": 76575 }, { "epoch": 0.0771321893435102, "grad_norm": 10.674401436385175, "learning_rate": 4.990949894925005e-05, "loss": 2.3607, "mean_token_accuracy": 0.4517241418361664, "step": 76580 }, { "epoch": 0.07713722539661436, "grad_norm": 12.45154984299301, "learning_rate": 4.990946537138202e-05, "loss": 2.1622, "mean_token_accuracy": 0.4565270960330963, "step": 76585 }, { "epoch": 0.07714226144971853, "grad_norm": 11.973738759218802, "learning_rate": 4.9909431787298644e-05, "loss": 2.8777, "mean_token_accuracy": 0.3965517282485962, "step": 76590 }, { "epoch": 0.0771472975028227, "grad_norm": 12.357447580176368, "learning_rate": 4.990939819699992e-05, "loss": 2.8391, "mean_token_accuracy": 0.358620685338974, "step": 76595 }, { "epoch": 0.07715233355592688, "grad_norm": 11.742388869704904, "learning_rate": 4.990936460048588e-05, "loss": 2.3411, "mean_token_accuracy": 0.4344827651977539, "step": 76600 }, { "epoch": 0.07715736960903105, "grad_norm": 11.273713494057555, "learning_rate": 4.9909330997756506e-05, "loss": 2.701, "mean_token_accuracy": 0.37586206793785093, "step": 76605 }, { "epoch": 0.07716240566213523, "grad_norm": 12.415423112366938, "learning_rate": 4.9909297388811825e-05, "loss": 2.8771, "mean_token_accuracy": 0.35862069129943847, "step": 76610 }, { "epoch": 0.0771674417152394, "grad_norm": 10.065836057157659, "learning_rate": 4.9909263773651846e-05, "loss": 2.176, "mean_token_accuracy": 0.447005432844162, "step": 76615 }, { "epoch": 0.07717247776834357, "grad_norm": 12.12845413323766, "learning_rate": 4.990923015227656e-05, "loss": 2.7089, "mean_token_accuracy": 0.3655172437429428, "step": 76620 }, { "epoch": 0.07717751382144775, "grad_norm": 13.282871201607344, "learning_rate": 4.990919652468601e-05, "loss": 2.6983, "mean_token_accuracy": 0.37241379022598264, "step": 76625 }, { "epoch": 0.07718254987455192, "grad_norm": 11.831922340032506, "learning_rate": 4.990916289088017e-05, "loss": 2.7447, "mean_token_accuracy": 0.33793103098869326, "step": 76630 }, { "epoch": 0.0771875859276561, "grad_norm": 12.146721469423477, "learning_rate": 4.990912925085908e-05, "loss": 2.6499, "mean_token_accuracy": 0.36551723480224607, "step": 76635 }, { "epoch": 0.07719262198076027, "grad_norm": 11.138006972203636, "learning_rate": 4.990909560462272e-05, "loss": 2.1437, "mean_token_accuracy": 0.4620689690113068, "step": 76640 }, { "epoch": 0.07719765803386444, "grad_norm": 8.764424727819648, "learning_rate": 4.990906195217112e-05, "loss": 2.3502, "mean_token_accuracy": 0.44482758045196535, "step": 76645 }, { "epoch": 0.07720269408696862, "grad_norm": 10.57674547070549, "learning_rate": 4.9909028293504285e-05, "loss": 2.4115, "mean_token_accuracy": 0.3931034505367279, "step": 76650 }, { "epoch": 0.07720773014007278, "grad_norm": 11.668071100589232, "learning_rate": 4.9908994628622225e-05, "loss": 2.8501, "mean_token_accuracy": 0.35517241060733795, "step": 76655 }, { "epoch": 0.07721276619317695, "grad_norm": 11.580241426482525, "learning_rate": 4.9908960957524936e-05, "loss": 2.5062, "mean_token_accuracy": 0.3965517163276672, "step": 76660 }, { "epoch": 0.07721780224628112, "grad_norm": 10.068895550433217, "learning_rate": 4.9908927280212444e-05, "loss": 2.0169, "mean_token_accuracy": 0.47791893482208253, "step": 76665 }, { "epoch": 0.0772228382993853, "grad_norm": 18.721556810818345, "learning_rate": 4.990889359668476e-05, "loss": 2.6902, "mean_token_accuracy": 0.37241379022598264, "step": 76670 }, { "epoch": 0.07722787435248947, "grad_norm": 19.770743462184516, "learning_rate": 4.9908859906941874e-05, "loss": 2.3348, "mean_token_accuracy": 0.46637930870056155, "step": 76675 }, { "epoch": 0.07723291040559364, "grad_norm": 10.747204373323715, "learning_rate": 4.990882621098381e-05, "loss": 2.2541, "mean_token_accuracy": 0.4344827592372894, "step": 76680 }, { "epoch": 0.07723794645869782, "grad_norm": 8.670902402922255, "learning_rate": 4.9908792508810574e-05, "loss": 2.3974, "mean_token_accuracy": 0.42625529170036314, "step": 76685 }, { "epoch": 0.07724298251180199, "grad_norm": 12.54872853327849, "learning_rate": 4.990875880042218e-05, "loss": 2.5139, "mean_token_accuracy": 0.4379310369491577, "step": 76690 }, { "epoch": 0.07724801856490617, "grad_norm": 9.833978259582063, "learning_rate": 4.990872508581863e-05, "loss": 2.5944, "mean_token_accuracy": 0.39655172228813174, "step": 76695 }, { "epoch": 0.07725305461801034, "grad_norm": 14.217952865564726, "learning_rate": 4.9908691364999935e-05, "loss": 2.5453, "mean_token_accuracy": 0.3965517282485962, "step": 76700 }, { "epoch": 0.07725809067111451, "grad_norm": 11.377033165299036, "learning_rate": 4.990865763796611e-05, "loss": 2.8863, "mean_token_accuracy": 0.3827586233615875, "step": 76705 }, { "epoch": 0.07726312672421869, "grad_norm": 10.807157102551283, "learning_rate": 4.990862390471716e-05, "loss": 2.1228, "mean_token_accuracy": 0.5068965554237366, "step": 76710 }, { "epoch": 0.07726816277732286, "grad_norm": 8.96121664337475, "learning_rate": 4.990859016525309e-05, "loss": 2.4218, "mean_token_accuracy": 0.43448275327682495, "step": 76715 }, { "epoch": 0.07727319883042703, "grad_norm": 11.592092091633278, "learning_rate": 4.990855641957391e-05, "loss": 2.1826, "mean_token_accuracy": 0.42758620977401735, "step": 76720 }, { "epoch": 0.0772782348835312, "grad_norm": 13.238505096344939, "learning_rate": 4.990852266767965e-05, "loss": 2.6173, "mean_token_accuracy": 0.3896551728248596, "step": 76725 }, { "epoch": 0.07728327093663537, "grad_norm": 12.9143989630657, "learning_rate": 4.990848890957029e-05, "loss": 2.7285, "mean_token_accuracy": 0.3275862097740173, "step": 76730 }, { "epoch": 0.07728830698973954, "grad_norm": 12.617083302482092, "learning_rate": 4.990845514524586e-05, "loss": 3.0764, "mean_token_accuracy": 0.3482758641242981, "step": 76735 }, { "epoch": 0.07729334304284372, "grad_norm": 14.81254473176798, "learning_rate": 4.990842137470635e-05, "loss": 3.1178, "mean_token_accuracy": 0.32068965435028074, "step": 76740 }, { "epoch": 0.07729837909594789, "grad_norm": 13.43898111578754, "learning_rate": 4.990838759795179e-05, "loss": 2.7771, "mean_token_accuracy": 0.3620689630508423, "step": 76745 }, { "epoch": 0.07730341514905206, "grad_norm": 20.497545315103874, "learning_rate": 4.990835381498217e-05, "loss": 2.5833, "mean_token_accuracy": 0.4344827592372894, "step": 76750 }, { "epoch": 0.07730845120215624, "grad_norm": 21.708362381297757, "learning_rate": 4.990832002579752e-05, "loss": 2.4177, "mean_token_accuracy": 0.44640048742294314, "step": 76755 }, { "epoch": 0.07731348725526041, "grad_norm": 12.329471219398691, "learning_rate": 4.990828623039784e-05, "loss": 2.3797, "mean_token_accuracy": 0.41724138259887694, "step": 76760 }, { "epoch": 0.07731852330836458, "grad_norm": 11.289764539150537, "learning_rate": 4.990825242878313e-05, "loss": 2.4791, "mean_token_accuracy": 0.43593466877937315, "step": 76765 }, { "epoch": 0.07732355936146876, "grad_norm": 11.264886517167614, "learning_rate": 4.990821862095341e-05, "loss": 2.7125, "mean_token_accuracy": 0.36896551251411436, "step": 76770 }, { "epoch": 0.07732859541457293, "grad_norm": 11.48903927551036, "learning_rate": 4.990818480690869e-05, "loss": 2.3278, "mean_token_accuracy": 0.4206896543502808, "step": 76775 }, { "epoch": 0.0773336314676771, "grad_norm": 10.448613586665761, "learning_rate": 4.990815098664897e-05, "loss": 2.3312, "mean_token_accuracy": 0.44137930274009707, "step": 76780 }, { "epoch": 0.07733866752078128, "grad_norm": 13.39716654387735, "learning_rate": 4.990811716017427e-05, "loss": 2.7367, "mean_token_accuracy": 0.4034482777118683, "step": 76785 }, { "epoch": 0.07734370357388545, "grad_norm": 12.903394287129473, "learning_rate": 4.99080833274846e-05, "loss": 2.203, "mean_token_accuracy": 0.44301270246505736, "step": 76790 }, { "epoch": 0.07734873962698961, "grad_norm": 11.637415678341826, "learning_rate": 4.990804948857995e-05, "loss": 2.4361, "mean_token_accuracy": 0.42413792610168455, "step": 76795 }, { "epoch": 0.07735377568009379, "grad_norm": 13.273942442474059, "learning_rate": 4.990801564346037e-05, "loss": 2.7994, "mean_token_accuracy": 0.3793103456497192, "step": 76800 }, { "epoch": 0.07735881173319796, "grad_norm": 11.144215004101454, "learning_rate": 4.990798179212582e-05, "loss": 2.3099, "mean_token_accuracy": 0.3999999940395355, "step": 76805 }, { "epoch": 0.07736384778630213, "grad_norm": 11.025364371891106, "learning_rate": 4.9907947934576336e-05, "loss": 1.9878, "mean_token_accuracy": 0.4949788212776184, "step": 76810 }, { "epoch": 0.07736888383940631, "grad_norm": 9.952123290815283, "learning_rate": 4.9907914070811936e-05, "loss": 2.4067, "mean_token_accuracy": 0.4206896543502808, "step": 76815 }, { "epoch": 0.07737391989251048, "grad_norm": 11.798667252282023, "learning_rate": 4.9907880200832605e-05, "loss": 2.8622, "mean_token_accuracy": 0.40889292359352114, "step": 76820 }, { "epoch": 0.07737895594561466, "grad_norm": 11.045752987632365, "learning_rate": 4.990784632463837e-05, "loss": 2.1986, "mean_token_accuracy": 0.44827587008476255, "step": 76825 }, { "epoch": 0.07738399199871883, "grad_norm": 10.172233878407384, "learning_rate": 4.990781244222924e-05, "loss": 2.2363, "mean_token_accuracy": 0.41724138259887694, "step": 76830 }, { "epoch": 0.077389028051823, "grad_norm": 8.50641354289392, "learning_rate": 4.9907778553605206e-05, "loss": 2.4945, "mean_token_accuracy": 0.47931033968925474, "step": 76835 }, { "epoch": 0.07739406410492718, "grad_norm": 10.730578535301627, "learning_rate": 4.99077446587663e-05, "loss": 2.2329, "mean_token_accuracy": 0.47931034564971925, "step": 76840 }, { "epoch": 0.07739910015803135, "grad_norm": 13.61393164705282, "learning_rate": 4.9907710757712514e-05, "loss": 2.3032, "mean_token_accuracy": 0.42413793206214906, "step": 76845 }, { "epoch": 0.07740413621113552, "grad_norm": 11.29643594938041, "learning_rate": 4.990767685044387e-05, "loss": 2.3159, "mean_token_accuracy": 0.41034482717514037, "step": 76850 }, { "epoch": 0.0774091722642397, "grad_norm": 10.20622086958567, "learning_rate": 4.990764293696039e-05, "loss": 2.5905, "mean_token_accuracy": 0.3862069010734558, "step": 76855 }, { "epoch": 0.07741420831734387, "grad_norm": 11.304876840964768, "learning_rate": 4.9907609017262045e-05, "loss": 2.7241, "mean_token_accuracy": 0.36896551251411436, "step": 76860 }, { "epoch": 0.07741924437044803, "grad_norm": 10.504188508325953, "learning_rate": 4.990757509134887e-05, "loss": 2.191, "mean_token_accuracy": 0.4310344815254211, "step": 76865 }, { "epoch": 0.0774242804235522, "grad_norm": 11.821432158141727, "learning_rate": 4.990754115922088e-05, "loss": 2.6792, "mean_token_accuracy": 0.4137930929660797, "step": 76870 }, { "epoch": 0.07742931647665638, "grad_norm": 10.740577286919532, "learning_rate": 4.9907507220878066e-05, "loss": 2.6304, "mean_token_accuracy": 0.4119177222251892, "step": 76875 }, { "epoch": 0.07743435252976055, "grad_norm": 10.39652888062975, "learning_rate": 4.990747327632045e-05, "loss": 2.6618, "mean_token_accuracy": 0.37241379618644715, "step": 76880 }, { "epoch": 0.07743938858286473, "grad_norm": 10.014224301433858, "learning_rate": 4.990743932554804e-05, "loss": 2.3836, "mean_token_accuracy": 0.4413793087005615, "step": 76885 }, { "epoch": 0.0774444246359689, "grad_norm": 12.000288053337657, "learning_rate": 4.990740536856084e-05, "loss": 2.6225, "mean_token_accuracy": 0.38620689511299133, "step": 76890 }, { "epoch": 0.07744946068907307, "grad_norm": 11.293931783274486, "learning_rate": 4.990737140535887e-05, "loss": 2.539, "mean_token_accuracy": 0.38965516686439516, "step": 76895 }, { "epoch": 0.07745449674217725, "grad_norm": 12.037554042093848, "learning_rate": 4.990733743594212e-05, "loss": 2.2965, "mean_token_accuracy": 0.3965517282485962, "step": 76900 }, { "epoch": 0.07745953279528142, "grad_norm": 12.199135105204514, "learning_rate": 4.990730346031061e-05, "loss": 2.3232, "mean_token_accuracy": 0.45359952449798585, "step": 76905 }, { "epoch": 0.0774645688483856, "grad_norm": 11.57851067043257, "learning_rate": 4.9907269478464366e-05, "loss": 2.7526, "mean_token_accuracy": 0.36551723480224607, "step": 76910 }, { "epoch": 0.07746960490148977, "grad_norm": 10.786326529970497, "learning_rate": 4.990723549040337e-05, "loss": 2.5696, "mean_token_accuracy": 0.4324258863925934, "step": 76915 }, { "epoch": 0.07747464095459394, "grad_norm": 10.272268245864172, "learning_rate": 4.9907201496127646e-05, "loss": 2.5471, "mean_token_accuracy": 0.42758620381355283, "step": 76920 }, { "epoch": 0.07747967700769812, "grad_norm": 9.00461927597447, "learning_rate": 4.99071674956372e-05, "loss": 2.0975, "mean_token_accuracy": 0.4673926174640656, "step": 76925 }, { "epoch": 0.07748471306080229, "grad_norm": 10.374740276214355, "learning_rate": 4.990713348893205e-05, "loss": 2.6679, "mean_token_accuracy": 0.4103448212146759, "step": 76930 }, { "epoch": 0.07748974911390645, "grad_norm": 10.850154591565463, "learning_rate": 4.9907099476012194e-05, "loss": 2.1278, "mean_token_accuracy": 0.4781004309654236, "step": 76935 }, { "epoch": 0.07749478516701062, "grad_norm": 10.917676404025936, "learning_rate": 4.990706545687764e-05, "loss": 2.4571, "mean_token_accuracy": 0.42758620977401735, "step": 76940 }, { "epoch": 0.0774998212201148, "grad_norm": 9.758435948472803, "learning_rate": 4.99070314315284e-05, "loss": 2.2532, "mean_token_accuracy": 0.43653962314128875, "step": 76945 }, { "epoch": 0.07750485727321897, "grad_norm": 12.685584803701056, "learning_rate": 4.990699739996449e-05, "loss": 3.1165, "mean_token_accuracy": 0.3738656997680664, "step": 76950 }, { "epoch": 0.07750989332632315, "grad_norm": 9.13455298637988, "learning_rate": 4.9906963362185936e-05, "loss": 2.402, "mean_token_accuracy": 0.3931034505367279, "step": 76955 }, { "epoch": 0.07751492937942732, "grad_norm": 9.88367341178582, "learning_rate": 4.9906929318192705e-05, "loss": 2.5017, "mean_token_accuracy": 0.4344827592372894, "step": 76960 }, { "epoch": 0.07751996543253149, "grad_norm": 10.865189248617265, "learning_rate": 4.990689526798483e-05, "loss": 2.365, "mean_token_accuracy": 0.44137930274009707, "step": 76965 }, { "epoch": 0.07752500148563567, "grad_norm": 12.906540801424704, "learning_rate": 4.990686121156232e-05, "loss": 2.5605, "mean_token_accuracy": 0.41379310488700866, "step": 76970 }, { "epoch": 0.07753003753873984, "grad_norm": 12.200622409007982, "learning_rate": 4.990682714892519e-05, "loss": 2.7969, "mean_token_accuracy": 0.33103448152542114, "step": 76975 }, { "epoch": 0.07753507359184401, "grad_norm": 12.551370541912643, "learning_rate": 4.9906793080073435e-05, "loss": 2.564, "mean_token_accuracy": 0.41379310488700866, "step": 76980 }, { "epoch": 0.07754010964494819, "grad_norm": 11.073032989595514, "learning_rate": 4.990675900500708e-05, "loss": 2.4957, "mean_token_accuracy": 0.4344827592372894, "step": 76985 }, { "epoch": 0.07754514569805236, "grad_norm": 10.549245650561533, "learning_rate": 4.9906724923726116e-05, "loss": 2.2975, "mean_token_accuracy": 0.42068966031074523, "step": 76990 }, { "epoch": 0.07755018175115654, "grad_norm": 11.067350904676875, "learning_rate": 4.9906690836230574e-05, "loss": 2.7179, "mean_token_accuracy": 0.3965517282485962, "step": 76995 }, { "epoch": 0.0775552178042607, "grad_norm": 9.383835295026707, "learning_rate": 4.9906656742520446e-05, "loss": 2.5448, "mean_token_accuracy": 0.3896551728248596, "step": 77000 }, { "epoch": 0.07756025385736487, "grad_norm": 9.001360798490378, "learning_rate": 4.9906622642595746e-05, "loss": 2.6067, "mean_token_accuracy": 0.38620689511299133, "step": 77005 }, { "epoch": 0.07756528991046904, "grad_norm": 12.140236485318464, "learning_rate": 4.9906588536456494e-05, "loss": 2.5044, "mean_token_accuracy": 0.4, "step": 77010 }, { "epoch": 0.07757032596357322, "grad_norm": 12.20984322279911, "learning_rate": 4.990655442410268e-05, "loss": 2.2532, "mean_token_accuracy": 0.47791893482208253, "step": 77015 }, { "epoch": 0.07757536201667739, "grad_norm": 16.296733836829336, "learning_rate": 4.990652030553434e-05, "loss": 3.2788, "mean_token_accuracy": 0.31379309892654417, "step": 77020 }, { "epoch": 0.07758039806978156, "grad_norm": 13.490840504622232, "learning_rate": 4.9906486180751467e-05, "loss": 2.4881, "mean_token_accuracy": 0.42820197343826294, "step": 77025 }, { "epoch": 0.07758543412288574, "grad_norm": 10.258618845610997, "learning_rate": 4.9906452049754054e-05, "loss": 2.1728, "mean_token_accuracy": 0.45172414779663084, "step": 77030 }, { "epoch": 0.07759047017598991, "grad_norm": 13.046263594947206, "learning_rate": 4.9906417912542144e-05, "loss": 2.39, "mean_token_accuracy": 0.4172413766384125, "step": 77035 }, { "epoch": 0.07759550622909409, "grad_norm": 11.720428962883737, "learning_rate": 4.990638376911572e-05, "loss": 2.9226, "mean_token_accuracy": 0.358620685338974, "step": 77040 }, { "epoch": 0.07760054228219826, "grad_norm": 11.36679894835108, "learning_rate": 4.990634961947481e-05, "loss": 2.6462, "mean_token_accuracy": 0.41935873627662656, "step": 77045 }, { "epoch": 0.07760557833530243, "grad_norm": 8.042992720656542, "learning_rate": 4.990631546361941e-05, "loss": 2.1437, "mean_token_accuracy": 0.4738916367292404, "step": 77050 }, { "epoch": 0.0776106143884066, "grad_norm": 11.33166364550852, "learning_rate": 4.990628130154954e-05, "loss": 2.5928, "mean_token_accuracy": 0.4379310369491577, "step": 77055 }, { "epoch": 0.07761565044151078, "grad_norm": 15.931823682485888, "learning_rate": 4.99062471332652e-05, "loss": 2.9103, "mean_token_accuracy": 0.43260737657547, "step": 77060 }, { "epoch": 0.07762068649461495, "grad_norm": 11.696198242036498, "learning_rate": 4.9906212958766405e-05, "loss": 2.2325, "mean_token_accuracy": 0.4744101643562317, "step": 77065 }, { "epoch": 0.07762572254771911, "grad_norm": 11.868072379388977, "learning_rate": 4.990617877805317e-05, "loss": 2.726, "mean_token_accuracy": 0.3620689630508423, "step": 77070 }, { "epoch": 0.07763075860082329, "grad_norm": 12.575450868013142, "learning_rate": 4.990614459112549e-05, "loss": 2.2071, "mean_token_accuracy": 0.4482758641242981, "step": 77075 }, { "epoch": 0.07763579465392746, "grad_norm": 9.8635547296028, "learning_rate": 4.990611039798338e-05, "loss": 2.6878, "mean_token_accuracy": 0.42661826610565184, "step": 77080 }, { "epoch": 0.07764083070703164, "grad_norm": 10.656224504866053, "learning_rate": 4.9906076198626855e-05, "loss": 2.5413, "mean_token_accuracy": 0.39655172526836396, "step": 77085 }, { "epoch": 0.07764586676013581, "grad_norm": 12.21576913900044, "learning_rate": 4.990604199305592e-05, "loss": 2.463, "mean_token_accuracy": 0.4, "step": 77090 }, { "epoch": 0.07765090281323998, "grad_norm": 10.542462046468861, "learning_rate": 4.990600778127059e-05, "loss": 2.498, "mean_token_accuracy": 0.3931034505367279, "step": 77095 }, { "epoch": 0.07765593886634416, "grad_norm": 10.419810014271121, "learning_rate": 4.990597356327086e-05, "loss": 2.6778, "mean_token_accuracy": 0.37241379022598264, "step": 77100 }, { "epoch": 0.07766097491944833, "grad_norm": 12.272945888129174, "learning_rate": 4.990593933905676e-05, "loss": 2.655, "mean_token_accuracy": 0.3965517163276672, "step": 77105 }, { "epoch": 0.0776660109725525, "grad_norm": 8.488296768323771, "learning_rate": 4.990590510862829e-05, "loss": 2.3775, "mean_token_accuracy": 0.4448275864124298, "step": 77110 }, { "epoch": 0.07767104702565668, "grad_norm": 11.327549218912567, "learning_rate": 4.990587087198545e-05, "loss": 2.464, "mean_token_accuracy": 0.4034482777118683, "step": 77115 }, { "epoch": 0.07767608307876085, "grad_norm": 13.340035863347502, "learning_rate": 4.990583662912826e-05, "loss": 2.4023, "mean_token_accuracy": 0.4034482717514038, "step": 77120 }, { "epoch": 0.07768111913186503, "grad_norm": 10.111026447498293, "learning_rate": 4.990580238005673e-05, "loss": 2.4683, "mean_token_accuracy": 0.42758620977401735, "step": 77125 }, { "epoch": 0.0776861551849692, "grad_norm": 12.198276974909005, "learning_rate": 4.990576812477087e-05, "loss": 2.3424, "mean_token_accuracy": 0.44482757449150084, "step": 77130 }, { "epoch": 0.07769119123807337, "grad_norm": 12.269838164945227, "learning_rate": 4.990573386327068e-05, "loss": 2.7472, "mean_token_accuracy": 0.38620689511299133, "step": 77135 }, { "epoch": 0.07769622729117753, "grad_norm": 11.214334445580265, "learning_rate": 4.9905699595556186e-05, "loss": 2.6984, "mean_token_accuracy": 0.34137930274009703, "step": 77140 }, { "epoch": 0.0777012633442817, "grad_norm": 11.868328614473265, "learning_rate": 4.9905665321627385e-05, "loss": 2.8829, "mean_token_accuracy": 0.37241379618644715, "step": 77145 }, { "epoch": 0.07770629939738588, "grad_norm": 9.692249384541242, "learning_rate": 4.9905631041484276e-05, "loss": 2.3625, "mean_token_accuracy": 0.44137930274009707, "step": 77150 }, { "epoch": 0.07771133545049005, "grad_norm": 9.260543071964502, "learning_rate": 4.990559675512689e-05, "loss": 2.388, "mean_token_accuracy": 0.46424682140350343, "step": 77155 }, { "epoch": 0.07771637150359423, "grad_norm": 10.25993233647754, "learning_rate": 4.9905562462555234e-05, "loss": 2.4785, "mean_token_accuracy": 0.4379310369491577, "step": 77160 }, { "epoch": 0.0777214075566984, "grad_norm": 14.465526981916323, "learning_rate": 4.990552816376931e-05, "loss": 2.3105, "mean_token_accuracy": 0.42413792610168455, "step": 77165 }, { "epoch": 0.07772644360980258, "grad_norm": 13.472094789788683, "learning_rate": 4.9905493858769123e-05, "loss": 2.6939, "mean_token_accuracy": 0.44972777366638184, "step": 77170 }, { "epoch": 0.07773147966290675, "grad_norm": 10.675178218827837, "learning_rate": 4.990545954755469e-05, "loss": 2.2165, "mean_token_accuracy": 0.4931034505367279, "step": 77175 }, { "epoch": 0.07773651571601092, "grad_norm": 15.638613658725355, "learning_rate": 4.990542523012602e-05, "loss": 2.3421, "mean_token_accuracy": 0.47586206793785096, "step": 77180 }, { "epoch": 0.0777415517691151, "grad_norm": 11.171000247407731, "learning_rate": 4.9905390906483125e-05, "loss": 2.5364, "mean_token_accuracy": 0.4172413766384125, "step": 77185 }, { "epoch": 0.07774658782221927, "grad_norm": 10.72922325361121, "learning_rate": 4.990535657662601e-05, "loss": 2.5636, "mean_token_accuracy": 0.4103448331356049, "step": 77190 }, { "epoch": 0.07775162387532344, "grad_norm": 13.229697650166804, "learning_rate": 4.9905322240554685e-05, "loss": 2.5378, "mean_token_accuracy": 0.40852994918823243, "step": 77195 }, { "epoch": 0.07775665992842762, "grad_norm": 11.55424419307672, "learning_rate": 4.9905287898269166e-05, "loss": 2.6512, "mean_token_accuracy": 0.44827585816383364, "step": 77200 }, { "epoch": 0.07776169598153179, "grad_norm": 12.450374053633396, "learning_rate": 4.990525354976944e-05, "loss": 2.9809, "mean_token_accuracy": 0.3827586233615875, "step": 77205 }, { "epoch": 0.07776673203463595, "grad_norm": 11.808871111308113, "learning_rate": 4.990521919505555e-05, "loss": 2.4263, "mean_token_accuracy": 0.3655172407627106, "step": 77210 }, { "epoch": 0.07777176808774013, "grad_norm": 10.688207290992883, "learning_rate": 4.9905184834127486e-05, "loss": 2.5481, "mean_token_accuracy": 0.3758620619773865, "step": 77215 }, { "epoch": 0.0777768041408443, "grad_norm": 11.820694163833142, "learning_rate": 4.990515046698526e-05, "loss": 2.357, "mean_token_accuracy": 0.46896551847457885, "step": 77220 }, { "epoch": 0.07778184019394847, "grad_norm": 12.063104326206414, "learning_rate": 4.9905116093628875e-05, "loss": 2.5647, "mean_token_accuracy": 0.4103448212146759, "step": 77225 }, { "epoch": 0.07778687624705265, "grad_norm": 10.62478618256457, "learning_rate": 4.990508171405835e-05, "loss": 2.4493, "mean_token_accuracy": 0.39310343861579894, "step": 77230 }, { "epoch": 0.07779191230015682, "grad_norm": 17.639347325034, "learning_rate": 4.9905047328273696e-05, "loss": 2.6097, "mean_token_accuracy": 0.4206896543502808, "step": 77235 }, { "epoch": 0.077796948353261, "grad_norm": 12.862348450890504, "learning_rate": 4.990501293627491e-05, "loss": 3.1285, "mean_token_accuracy": 0.36551724672317504, "step": 77240 }, { "epoch": 0.07780198440636517, "grad_norm": 8.260103950497417, "learning_rate": 4.9904978538062024e-05, "loss": 2.0416, "mean_token_accuracy": 0.5353448271751404, "step": 77245 }, { "epoch": 0.07780702045946934, "grad_norm": 11.992925238808867, "learning_rate": 4.9904944133635024e-05, "loss": 2.2865, "mean_token_accuracy": 0.40344828367233276, "step": 77250 }, { "epoch": 0.07781205651257352, "grad_norm": 11.40457495918098, "learning_rate": 4.990490972299393e-05, "loss": 2.5308, "mean_token_accuracy": 0.42758620381355283, "step": 77255 }, { "epoch": 0.07781709256567769, "grad_norm": 12.143363322759624, "learning_rate": 4.9904875306138756e-05, "loss": 2.4874, "mean_token_accuracy": 0.41379310488700866, "step": 77260 }, { "epoch": 0.07782212861878186, "grad_norm": 10.511484738956336, "learning_rate": 4.99048408830695e-05, "loss": 2.5931, "mean_token_accuracy": 0.3620689570903778, "step": 77265 }, { "epoch": 0.07782716467188604, "grad_norm": 11.486398400465754, "learning_rate": 4.9904806453786185e-05, "loss": 2.216, "mean_token_accuracy": 0.4551724135875702, "step": 77270 }, { "epoch": 0.07783220072499021, "grad_norm": 8.304870115817954, "learning_rate": 4.9904772018288804e-05, "loss": 2.1638, "mean_token_accuracy": 0.47767695784568787, "step": 77275 }, { "epoch": 0.07783723677809437, "grad_norm": 10.641655282768328, "learning_rate": 4.990473757657738e-05, "loss": 2.0042, "mean_token_accuracy": 0.4604355812072754, "step": 77280 }, { "epoch": 0.07784227283119854, "grad_norm": 15.624176700946522, "learning_rate": 4.990470312865192e-05, "loss": 2.438, "mean_token_accuracy": 0.42413793206214906, "step": 77285 }, { "epoch": 0.07784730888430272, "grad_norm": 10.778359249061541, "learning_rate": 4.990466867451244e-05, "loss": 2.2777, "mean_token_accuracy": 0.43793101906776427, "step": 77290 }, { "epoch": 0.07785234493740689, "grad_norm": 11.972668001524685, "learning_rate": 4.990463421415892e-05, "loss": 2.5658, "mean_token_accuracy": 0.41034482717514037, "step": 77295 }, { "epoch": 0.07785738099051107, "grad_norm": 10.822023264494888, "learning_rate": 4.990459974759141e-05, "loss": 2.253, "mean_token_accuracy": 0.46896551847457885, "step": 77300 }, { "epoch": 0.07786241704361524, "grad_norm": 9.67239073341343, "learning_rate": 4.990456527480989e-05, "loss": 2.5554, "mean_token_accuracy": 0.36551724672317504, "step": 77305 }, { "epoch": 0.07786745309671941, "grad_norm": 10.868694307630763, "learning_rate": 4.990453079581439e-05, "loss": 2.6433, "mean_token_accuracy": 0.3896551728248596, "step": 77310 }, { "epoch": 0.07787248914982359, "grad_norm": 10.405223147062465, "learning_rate": 4.99044963106049e-05, "loss": 2.2549, "mean_token_accuracy": 0.4379310369491577, "step": 77315 }, { "epoch": 0.07787752520292776, "grad_norm": 12.01185719692832, "learning_rate": 4.990446181918144e-05, "loss": 2.7374, "mean_token_accuracy": 0.3689655065536499, "step": 77320 }, { "epoch": 0.07788256125603193, "grad_norm": 15.449307330379405, "learning_rate": 4.990442732154403e-05, "loss": 2.6466, "mean_token_accuracy": 0.3517241358757019, "step": 77325 }, { "epoch": 0.07788759730913611, "grad_norm": 8.708893643828267, "learning_rate": 4.9904392817692655e-05, "loss": 2.2835, "mean_token_accuracy": 0.4712038695812225, "step": 77330 }, { "epoch": 0.07789263336224028, "grad_norm": 9.249796535795644, "learning_rate": 4.990435830762735e-05, "loss": 2.6712, "mean_token_accuracy": 0.3946763426065445, "step": 77335 }, { "epoch": 0.07789766941534446, "grad_norm": 15.216945957980988, "learning_rate": 4.9904323791348106e-05, "loss": 2.7996, "mean_token_accuracy": 0.3517241358757019, "step": 77340 }, { "epoch": 0.07790270546844863, "grad_norm": 10.228957677910476, "learning_rate": 4.9904289268854945e-05, "loss": 2.5776, "mean_token_accuracy": 0.417241370677948, "step": 77345 }, { "epoch": 0.07790774152155279, "grad_norm": 10.671223399816135, "learning_rate": 4.990425474014786e-05, "loss": 2.4744, "mean_token_accuracy": 0.42413792610168455, "step": 77350 }, { "epoch": 0.07791277757465696, "grad_norm": 11.711857515719785, "learning_rate": 4.990422020522688e-05, "loss": 2.7866, "mean_token_accuracy": 0.3896551728248596, "step": 77355 }, { "epoch": 0.07791781362776114, "grad_norm": 11.155314803416971, "learning_rate": 4.9904185664092e-05, "loss": 2.3209, "mean_token_accuracy": 0.41379310488700866, "step": 77360 }, { "epoch": 0.07792284968086531, "grad_norm": 13.383179664023888, "learning_rate": 4.990415111674324e-05, "loss": 2.5017, "mean_token_accuracy": 0.4, "step": 77365 }, { "epoch": 0.07792788573396948, "grad_norm": 10.281608306125618, "learning_rate": 4.9904116563180605e-05, "loss": 2.1787, "mean_token_accuracy": 0.4206896543502808, "step": 77370 }, { "epoch": 0.07793292178707366, "grad_norm": 12.190920997018454, "learning_rate": 4.99040820034041e-05, "loss": 2.793, "mean_token_accuracy": 0.42413793206214906, "step": 77375 }, { "epoch": 0.07793795784017783, "grad_norm": 10.823050388879269, "learning_rate": 4.990404743741374e-05, "loss": 2.4381, "mean_token_accuracy": 0.3793103456497192, "step": 77380 }, { "epoch": 0.077942993893282, "grad_norm": 8.443542591124924, "learning_rate": 4.9904012865209544e-05, "loss": 2.45, "mean_token_accuracy": 0.39842710494995115, "step": 77385 }, { "epoch": 0.07794802994638618, "grad_norm": 12.59677876481605, "learning_rate": 4.99039782867915e-05, "loss": 2.5679, "mean_token_accuracy": 0.43103448748588563, "step": 77390 }, { "epoch": 0.07795306599949035, "grad_norm": 11.752848582766706, "learning_rate": 4.9903943702159636e-05, "loss": 2.2728, "mean_token_accuracy": 0.42413792610168455, "step": 77395 }, { "epoch": 0.07795810205259453, "grad_norm": 12.170300177241133, "learning_rate": 4.990390911131395e-05, "loss": 2.6047, "mean_token_accuracy": 0.38620689511299133, "step": 77400 }, { "epoch": 0.0779631381056987, "grad_norm": 11.626570584768757, "learning_rate": 4.9903874514254455e-05, "loss": 2.1739, "mean_token_accuracy": 0.47586207985877993, "step": 77405 }, { "epoch": 0.07796817415880287, "grad_norm": 9.541032503015185, "learning_rate": 4.990383991098117e-05, "loss": 2.3631, "mean_token_accuracy": 0.4172413766384125, "step": 77410 }, { "epoch": 0.07797321021190705, "grad_norm": 11.004657934081129, "learning_rate": 4.990380530149409e-05, "loss": 2.8755, "mean_token_accuracy": 0.3896551728248596, "step": 77415 }, { "epoch": 0.07797824626501121, "grad_norm": 14.366950146538729, "learning_rate": 4.9903770685793225e-05, "loss": 2.8183, "mean_token_accuracy": 0.37586206793785093, "step": 77420 }, { "epoch": 0.07798328231811538, "grad_norm": 9.618222271910392, "learning_rate": 4.99037360638786e-05, "loss": 2.4779, "mean_token_accuracy": 0.4344827592372894, "step": 77425 }, { "epoch": 0.07798831837121956, "grad_norm": 9.190506763231031, "learning_rate": 4.990370143575022e-05, "loss": 2.4545, "mean_token_accuracy": 0.41379311084747317, "step": 77430 }, { "epoch": 0.07799335442432373, "grad_norm": 11.497295798524467, "learning_rate": 4.990366680140808e-05, "loss": 2.4318, "mean_token_accuracy": 0.4275861978530884, "step": 77435 }, { "epoch": 0.0779983904774279, "grad_norm": 9.496800597117138, "learning_rate": 4.99036321608522e-05, "loss": 2.2893, "mean_token_accuracy": 0.441379314661026, "step": 77440 }, { "epoch": 0.07800342653053208, "grad_norm": 10.152861910006921, "learning_rate": 4.99035975140826e-05, "loss": 2.3003, "mean_token_accuracy": 0.4620689630508423, "step": 77445 }, { "epoch": 0.07800846258363625, "grad_norm": 12.35096853437537, "learning_rate": 4.990356286109927e-05, "loss": 2.5475, "mean_token_accuracy": 0.3482758581638336, "step": 77450 }, { "epoch": 0.07801349863674042, "grad_norm": 11.840694224903142, "learning_rate": 4.990352820190222e-05, "loss": 2.6721, "mean_token_accuracy": 0.43793103098869324, "step": 77455 }, { "epoch": 0.0780185346898446, "grad_norm": 10.858485155808516, "learning_rate": 4.990349353649148e-05, "loss": 2.2983, "mean_token_accuracy": 0.42758620381355283, "step": 77460 }, { "epoch": 0.07802357074294877, "grad_norm": 82.99634059416334, "learning_rate": 4.990345886486704e-05, "loss": 2.8293, "mean_token_accuracy": 0.34827585369348524, "step": 77465 }, { "epoch": 0.07802860679605295, "grad_norm": 10.649379035150293, "learning_rate": 4.9903424187028927e-05, "loss": 2.4238, "mean_token_accuracy": 0.41524500846862794, "step": 77470 }, { "epoch": 0.07803364284915712, "grad_norm": 10.263754428461128, "learning_rate": 4.990338950297713e-05, "loss": 2.7195, "mean_token_accuracy": 0.38965516686439516, "step": 77475 }, { "epoch": 0.0780386789022613, "grad_norm": 11.122087049645675, "learning_rate": 4.9903354812711675e-05, "loss": 2.5077, "mean_token_accuracy": 0.3949788331985474, "step": 77480 }, { "epoch": 0.07804371495536547, "grad_norm": 8.688720918703758, "learning_rate": 4.9903320116232567e-05, "loss": 2.3453, "mean_token_accuracy": 0.42413793206214906, "step": 77485 }, { "epoch": 0.07804875100846963, "grad_norm": 9.092528986781291, "learning_rate": 4.9903285413539814e-05, "loss": 2.2281, "mean_token_accuracy": 0.43793103098869324, "step": 77490 }, { "epoch": 0.0780537870615738, "grad_norm": 10.449192995940253, "learning_rate": 4.990325070463342e-05, "loss": 2.7452, "mean_token_accuracy": 0.3827586233615875, "step": 77495 }, { "epoch": 0.07805882311467797, "grad_norm": 11.473330004250018, "learning_rate": 4.990321598951341e-05, "loss": 2.4092, "mean_token_accuracy": 0.3986085891723633, "step": 77500 }, { "epoch": 0.07806385916778215, "grad_norm": 10.37823741854408, "learning_rate": 4.9903181268179774e-05, "loss": 2.4559, "mean_token_accuracy": 0.42068964838981626, "step": 77505 }, { "epoch": 0.07806889522088632, "grad_norm": 17.58227547817744, "learning_rate": 4.990314654063255e-05, "loss": 2.8371, "mean_token_accuracy": 0.36551724672317504, "step": 77510 }, { "epoch": 0.0780739312739905, "grad_norm": 9.642618620771664, "learning_rate": 4.9903111806871703e-05, "loss": 2.3973, "mean_token_accuracy": 0.42068966031074523, "step": 77515 }, { "epoch": 0.07807896732709467, "grad_norm": 12.428955489996632, "learning_rate": 4.990307706689729e-05, "loss": 2.378, "mean_token_accuracy": 0.42758620977401735, "step": 77520 }, { "epoch": 0.07808400338019884, "grad_norm": 9.031763481518894, "learning_rate": 4.9903042320709294e-05, "loss": 2.3154, "mean_token_accuracy": 0.4413793087005615, "step": 77525 }, { "epoch": 0.07808903943330302, "grad_norm": 12.109553461055786, "learning_rate": 4.9903007568307736e-05, "loss": 3.0783, "mean_token_accuracy": 0.3517241358757019, "step": 77530 }, { "epoch": 0.07809407548640719, "grad_norm": 9.973297290249706, "learning_rate": 4.990297280969261e-05, "loss": 2.0104, "mean_token_accuracy": 0.4620689630508423, "step": 77535 }, { "epoch": 0.07809911153951136, "grad_norm": 13.298594226513286, "learning_rate": 4.9902938044863944e-05, "loss": 2.7978, "mean_token_accuracy": 0.36896551847457887, "step": 77540 }, { "epoch": 0.07810414759261554, "grad_norm": 10.326294686298171, "learning_rate": 4.990290327382174e-05, "loss": 2.5082, "mean_token_accuracy": 0.4379310250282288, "step": 77545 }, { "epoch": 0.07810918364571971, "grad_norm": 9.936743616913024, "learning_rate": 4.9902868496565996e-05, "loss": 2.5449, "mean_token_accuracy": 0.4448275864124298, "step": 77550 }, { "epoch": 0.07811421969882389, "grad_norm": 9.44577001416769, "learning_rate": 4.990283371309674e-05, "loss": 2.2585, "mean_token_accuracy": 0.45547489523887635, "step": 77555 }, { "epoch": 0.07811925575192805, "grad_norm": 10.373712115208718, "learning_rate": 4.990279892341398e-05, "loss": 2.4515, "mean_token_accuracy": 0.4103448152542114, "step": 77560 }, { "epoch": 0.07812429180503222, "grad_norm": 10.619338880950037, "learning_rate": 4.9902764127517714e-05, "loss": 2.2902, "mean_token_accuracy": 0.44295220375061034, "step": 77565 }, { "epoch": 0.0781293278581364, "grad_norm": 9.006362702548142, "learning_rate": 4.990272932540796e-05, "loss": 2.51, "mean_token_accuracy": 0.40344828367233276, "step": 77570 }, { "epoch": 0.07813436391124057, "grad_norm": 13.01488985470614, "learning_rate": 4.990269451708472e-05, "loss": 2.5705, "mean_token_accuracy": 0.4436781644821167, "step": 77575 }, { "epoch": 0.07813939996434474, "grad_norm": 15.26902274071977, "learning_rate": 4.990265970254802e-05, "loss": 2.2906, "mean_token_accuracy": 0.4517241418361664, "step": 77580 }, { "epoch": 0.07814443601744891, "grad_norm": 11.576915857324082, "learning_rate": 4.990262488179785e-05, "loss": 2.3379, "mean_token_accuracy": 0.441379314661026, "step": 77585 }, { "epoch": 0.07814947207055309, "grad_norm": 20.549483069711094, "learning_rate": 4.990259005483423e-05, "loss": 2.8464, "mean_token_accuracy": 0.358620685338974, "step": 77590 }, { "epoch": 0.07815450812365726, "grad_norm": 12.046618130577828, "learning_rate": 4.9902555221657175e-05, "loss": 2.7274, "mean_token_accuracy": 0.3793103456497192, "step": 77595 }, { "epoch": 0.07815954417676144, "grad_norm": 10.130409015216102, "learning_rate": 4.9902520382266674e-05, "loss": 2.7123, "mean_token_accuracy": 0.37241379618644715, "step": 77600 }, { "epoch": 0.07816458022986561, "grad_norm": 11.914377863204965, "learning_rate": 4.9902485536662766e-05, "loss": 2.7487, "mean_token_accuracy": 0.3517241358757019, "step": 77605 }, { "epoch": 0.07816961628296978, "grad_norm": 11.462090845801173, "learning_rate": 4.990245068484543e-05, "loss": 2.8318, "mean_token_accuracy": 0.4068965494632721, "step": 77610 }, { "epoch": 0.07817465233607396, "grad_norm": 11.709615840720161, "learning_rate": 4.9902415826814704e-05, "loss": 2.6475, "mean_token_accuracy": 0.37586207389831544, "step": 77615 }, { "epoch": 0.07817968838917813, "grad_norm": 10.041665743247364, "learning_rate": 4.990238096257057e-05, "loss": 2.8671, "mean_token_accuracy": 0.3931034505367279, "step": 77620 }, { "epoch": 0.0781847244422823, "grad_norm": 15.994878669830632, "learning_rate": 4.990234609211307e-05, "loss": 2.9271, "mean_token_accuracy": 0.4034482777118683, "step": 77625 }, { "epoch": 0.07818976049538647, "grad_norm": 10.421153728606646, "learning_rate": 4.990231121544218e-05, "loss": 2.3234, "mean_token_accuracy": 0.4689655125141144, "step": 77630 }, { "epoch": 0.07819479654849064, "grad_norm": 14.663087599721488, "learning_rate": 4.9902276332557935e-05, "loss": 2.3937, "mean_token_accuracy": 0.4, "step": 77635 }, { "epoch": 0.07819983260159481, "grad_norm": 10.84457910859915, "learning_rate": 4.990224144346033e-05, "loss": 2.3409, "mean_token_accuracy": 0.42068966031074523, "step": 77640 }, { "epoch": 0.07820486865469899, "grad_norm": 11.029728344334556, "learning_rate": 4.9902206548149384e-05, "loss": 2.6145, "mean_token_accuracy": 0.4137930989265442, "step": 77645 }, { "epoch": 0.07820990470780316, "grad_norm": 12.36272269839812, "learning_rate": 4.990217164662509e-05, "loss": 2.5562, "mean_token_accuracy": 0.39806411862373353, "step": 77650 }, { "epoch": 0.07821494076090733, "grad_norm": 10.871148998262232, "learning_rate": 4.9902136738887483e-05, "loss": 2.4392, "mean_token_accuracy": 0.4310344815254211, "step": 77655 }, { "epoch": 0.07821997681401151, "grad_norm": 9.143362222638888, "learning_rate": 4.990210182493656e-05, "loss": 2.1655, "mean_token_accuracy": 0.4413793087005615, "step": 77660 }, { "epoch": 0.07822501286711568, "grad_norm": 11.813268953545558, "learning_rate": 4.990206690477233e-05, "loss": 2.0417, "mean_token_accuracy": 0.49879008531570435, "step": 77665 }, { "epoch": 0.07823004892021986, "grad_norm": 12.2874043873012, "learning_rate": 4.99020319783948e-05, "loss": 2.43, "mean_token_accuracy": 0.3931034505367279, "step": 77670 }, { "epoch": 0.07823508497332403, "grad_norm": 12.474367189892519, "learning_rate": 4.990199704580398e-05, "loss": 2.8515, "mean_token_accuracy": 0.3896551728248596, "step": 77675 }, { "epoch": 0.0782401210264282, "grad_norm": 11.362234313004377, "learning_rate": 4.9901962106999886e-05, "loss": 2.5978, "mean_token_accuracy": 0.37241379618644715, "step": 77680 }, { "epoch": 0.07824515707953238, "grad_norm": 16.68431183229866, "learning_rate": 4.990192716198252e-05, "loss": 2.4829, "mean_token_accuracy": 0.44482758045196535, "step": 77685 }, { "epoch": 0.07825019313263655, "grad_norm": 10.477123648196688, "learning_rate": 4.99018922107519e-05, "loss": 2.6772, "mean_token_accuracy": 0.38082274198532107, "step": 77690 }, { "epoch": 0.07825522918574072, "grad_norm": 10.534043742860597, "learning_rate": 4.9901857253308035e-05, "loss": 2.5877, "mean_token_accuracy": 0.3999999940395355, "step": 77695 }, { "epoch": 0.07826026523884488, "grad_norm": 12.662280196862161, "learning_rate": 4.990182228965093e-05, "loss": 2.1179, "mean_token_accuracy": 0.4793103516101837, "step": 77700 }, { "epoch": 0.07826530129194906, "grad_norm": 11.314560749507471, "learning_rate": 4.9901787319780594e-05, "loss": 2.015, "mean_token_accuracy": 0.4448275864124298, "step": 77705 }, { "epoch": 0.07827033734505323, "grad_norm": 10.133887669552253, "learning_rate": 4.9901752343697035e-05, "loss": 2.0298, "mean_token_accuracy": 0.46896551847457885, "step": 77710 }, { "epoch": 0.0782753733981574, "grad_norm": 11.3691894100655, "learning_rate": 4.990171736140027e-05, "loss": 2.2396, "mean_token_accuracy": 0.4620689690113068, "step": 77715 }, { "epoch": 0.07828040945126158, "grad_norm": 14.87837346497743, "learning_rate": 4.9901682372890305e-05, "loss": 3.0485, "mean_token_accuracy": 0.28965516984462736, "step": 77720 }, { "epoch": 0.07828544550436575, "grad_norm": 10.914356450856655, "learning_rate": 4.990164737816715e-05, "loss": 2.4296, "mean_token_accuracy": 0.4310344815254211, "step": 77725 }, { "epoch": 0.07829048155746993, "grad_norm": 13.270479447871297, "learning_rate": 4.9901612377230815e-05, "loss": 2.5211, "mean_token_accuracy": 0.4310344815254211, "step": 77730 }, { "epoch": 0.0782955176105741, "grad_norm": 10.238196086365328, "learning_rate": 4.990157737008131e-05, "loss": 2.4752, "mean_token_accuracy": 0.41379310488700866, "step": 77735 }, { "epoch": 0.07830055366367827, "grad_norm": 11.050401422033408, "learning_rate": 4.990154235671865e-05, "loss": 2.5396, "mean_token_accuracy": 0.4310344815254211, "step": 77740 }, { "epoch": 0.07830558971678245, "grad_norm": 9.237864583147893, "learning_rate": 4.990150733714282e-05, "loss": 2.5144, "mean_token_accuracy": 0.4344827592372894, "step": 77745 }, { "epoch": 0.07831062576988662, "grad_norm": 15.751728247058942, "learning_rate": 4.990147231135386e-05, "loss": 2.8522, "mean_token_accuracy": 0.36551724672317504, "step": 77750 }, { "epoch": 0.0783156618229908, "grad_norm": 10.760721915459754, "learning_rate": 4.990143727935177e-05, "loss": 2.1458, "mean_token_accuracy": 0.4517241358757019, "step": 77755 }, { "epoch": 0.07832069787609497, "grad_norm": 12.6336216729002, "learning_rate": 4.990140224113656e-05, "loss": 2.8572, "mean_token_accuracy": 0.38620689511299133, "step": 77760 }, { "epoch": 0.07832573392919914, "grad_norm": 11.989119826292361, "learning_rate": 4.990136719670823e-05, "loss": 2.2877, "mean_token_accuracy": 0.43793103098869324, "step": 77765 }, { "epoch": 0.0783307699823033, "grad_norm": 18.65303384860608, "learning_rate": 4.9901332146066804e-05, "loss": 2.8702, "mean_token_accuracy": 0.3551724135875702, "step": 77770 }, { "epoch": 0.07833580603540748, "grad_norm": 13.41770640591508, "learning_rate": 4.9901297089212276e-05, "loss": 2.9513, "mean_token_accuracy": 0.4229280114173889, "step": 77775 }, { "epoch": 0.07834084208851165, "grad_norm": 11.37938688058401, "learning_rate": 4.9901262026144674e-05, "loss": 2.3659, "mean_token_accuracy": 0.4103448212146759, "step": 77780 }, { "epoch": 0.07834587814161582, "grad_norm": 8.918228064831073, "learning_rate": 4.9901226956863996e-05, "loss": 2.2168, "mean_token_accuracy": 0.46551724076271056, "step": 77785 }, { "epoch": 0.07835091419472, "grad_norm": 9.845983978064451, "learning_rate": 4.9901191881370245e-05, "loss": 2.4309, "mean_token_accuracy": 0.42068966031074523, "step": 77790 }, { "epoch": 0.07835595024782417, "grad_norm": 10.020543979179083, "learning_rate": 4.990115679966345e-05, "loss": 2.3145, "mean_token_accuracy": 0.4310344815254211, "step": 77795 }, { "epoch": 0.07836098630092835, "grad_norm": 14.177133947426526, "learning_rate": 4.990112171174361e-05, "loss": 2.7465, "mean_token_accuracy": 0.3827586114406586, "step": 77800 }, { "epoch": 0.07836602235403252, "grad_norm": 15.02286040244891, "learning_rate": 4.990108661761073e-05, "loss": 2.5008, "mean_token_accuracy": 0.4448275864124298, "step": 77805 }, { "epoch": 0.07837105840713669, "grad_norm": 11.229555815813425, "learning_rate": 4.990105151726483e-05, "loss": 2.3011, "mean_token_accuracy": 0.43793103098869324, "step": 77810 }, { "epoch": 0.07837609446024087, "grad_norm": 11.442773974119374, "learning_rate": 4.990101641070592e-05, "loss": 2.7001, "mean_token_accuracy": 0.3620689630508423, "step": 77815 }, { "epoch": 0.07838113051334504, "grad_norm": 8.993383660399237, "learning_rate": 4.990098129793399e-05, "loss": 2.5105, "mean_token_accuracy": 0.42758620977401735, "step": 77820 }, { "epoch": 0.07838616656644921, "grad_norm": 12.538639900582348, "learning_rate": 4.9900946178949074e-05, "loss": 2.3592, "mean_token_accuracy": 0.43448275327682495, "step": 77825 }, { "epoch": 0.07839120261955339, "grad_norm": 11.642842560089266, "learning_rate": 4.9900911053751164e-05, "loss": 3.0603, "mean_token_accuracy": 0.3448275804519653, "step": 77830 }, { "epoch": 0.07839623867265756, "grad_norm": 10.167745246079939, "learning_rate": 4.990087592234029e-05, "loss": 2.3072, "mean_token_accuracy": 0.43793103098869324, "step": 77835 }, { "epoch": 0.07840127472576172, "grad_norm": 10.983872289131089, "learning_rate": 4.9900840784716446e-05, "loss": 2.6044, "mean_token_accuracy": 0.39473684430122374, "step": 77840 }, { "epoch": 0.0784063107788659, "grad_norm": 10.705447759493023, "learning_rate": 4.990080564087964e-05, "loss": 2.603, "mean_token_accuracy": 0.4068965494632721, "step": 77845 }, { "epoch": 0.07841134683197007, "grad_norm": 10.865801967797859, "learning_rate": 4.990077049082989e-05, "loss": 2.1642, "mean_token_accuracy": 0.45862069725990295, "step": 77850 }, { "epoch": 0.07841638288507424, "grad_norm": 18.314199933038083, "learning_rate": 4.99007353345672e-05, "loss": 3.0366, "mean_token_accuracy": 0.34137930870056155, "step": 77855 }, { "epoch": 0.07842141893817842, "grad_norm": 10.64427808962319, "learning_rate": 4.9900700172091584e-05, "loss": 2.6459, "mean_token_accuracy": 0.43448275327682495, "step": 77860 }, { "epoch": 0.07842645499128259, "grad_norm": 10.48096390187752, "learning_rate": 4.990066500340305e-05, "loss": 2.714, "mean_token_accuracy": 0.38620689511299133, "step": 77865 }, { "epoch": 0.07843149104438676, "grad_norm": 14.82336082220531, "learning_rate": 4.990062982850161e-05, "loss": 2.6551, "mean_token_accuracy": 0.4172413766384125, "step": 77870 }, { "epoch": 0.07843652709749094, "grad_norm": 10.624774862309952, "learning_rate": 4.990059464738727e-05, "loss": 2.6161, "mean_token_accuracy": 0.39086509943008424, "step": 77875 }, { "epoch": 0.07844156315059511, "grad_norm": 10.099761055375216, "learning_rate": 4.9900559460060046e-05, "loss": 2.3368, "mean_token_accuracy": 0.4, "step": 77880 }, { "epoch": 0.07844659920369929, "grad_norm": 10.866077071626687, "learning_rate": 4.990052426651994e-05, "loss": 2.1657, "mean_token_accuracy": 0.4310344815254211, "step": 77885 }, { "epoch": 0.07845163525680346, "grad_norm": 12.525396996980634, "learning_rate": 4.9900489066766956e-05, "loss": 2.642, "mean_token_accuracy": 0.41034482717514037, "step": 77890 }, { "epoch": 0.07845667130990763, "grad_norm": 9.679378957677699, "learning_rate": 4.990045386080113e-05, "loss": 2.5682, "mean_token_accuracy": 0.42758620977401735, "step": 77895 }, { "epoch": 0.0784617073630118, "grad_norm": 10.401800885080567, "learning_rate": 4.990041864862244e-05, "loss": 2.2683, "mean_token_accuracy": 0.4517241418361664, "step": 77900 }, { "epoch": 0.07846674341611598, "grad_norm": 14.962260149546244, "learning_rate": 4.9900383430230925e-05, "loss": 2.4249, "mean_token_accuracy": 0.43660011887550354, "step": 77905 }, { "epoch": 0.07847177946922014, "grad_norm": 10.783198967962724, "learning_rate": 4.990034820562657e-05, "loss": 2.4569, "mean_token_accuracy": 0.42413793206214906, "step": 77910 }, { "epoch": 0.07847681552232431, "grad_norm": 11.112238044939174, "learning_rate": 4.990031297480939e-05, "loss": 2.3168, "mean_token_accuracy": 0.4482758641242981, "step": 77915 }, { "epoch": 0.07848185157542849, "grad_norm": 11.517421531163409, "learning_rate": 4.990027773777941e-05, "loss": 2.1478, "mean_token_accuracy": 0.43793103098869324, "step": 77920 }, { "epoch": 0.07848688762853266, "grad_norm": 10.552561776653235, "learning_rate": 4.9900242494536624e-05, "loss": 2.0982, "mean_token_accuracy": 0.48620688915252686, "step": 77925 }, { "epoch": 0.07849192368163684, "grad_norm": 11.778632147178488, "learning_rate": 4.990020724508105e-05, "loss": 2.3175, "mean_token_accuracy": 0.4068965554237366, "step": 77930 }, { "epoch": 0.07849695973474101, "grad_norm": 12.470143389751312, "learning_rate": 4.990017198941269e-05, "loss": 2.5224, "mean_token_accuracy": 0.34137930870056155, "step": 77935 }, { "epoch": 0.07850199578784518, "grad_norm": 9.650937328947304, "learning_rate": 4.9900136727531566e-05, "loss": 2.2958, "mean_token_accuracy": 0.443254691362381, "step": 77940 }, { "epoch": 0.07850703184094936, "grad_norm": 13.149721790520653, "learning_rate": 4.9900101459437673e-05, "loss": 2.3286, "mean_token_accuracy": 0.477339905500412, "step": 77945 }, { "epoch": 0.07851206789405353, "grad_norm": 11.06665839861471, "learning_rate": 4.990006618513103e-05, "loss": 2.9532, "mean_token_accuracy": 0.3724137932062149, "step": 77950 }, { "epoch": 0.0785171039471577, "grad_norm": 11.665802059271918, "learning_rate": 4.990003090461165e-05, "loss": 2.6354, "mean_token_accuracy": 0.38275861740112305, "step": 77955 }, { "epoch": 0.07852214000026188, "grad_norm": 12.252963413251779, "learning_rate": 4.989999561787953e-05, "loss": 2.6806, "mean_token_accuracy": 0.3862068891525269, "step": 77960 }, { "epoch": 0.07852717605336605, "grad_norm": 11.424115839781741, "learning_rate": 4.989996032493469e-05, "loss": 2.4271, "mean_token_accuracy": 0.4034482777118683, "step": 77965 }, { "epoch": 0.07853221210647023, "grad_norm": 10.443348288980115, "learning_rate": 4.9899925025777145e-05, "loss": 2.6289, "mean_token_accuracy": 0.4, "step": 77970 }, { "epoch": 0.0785372481595744, "grad_norm": 12.22355407825415, "learning_rate": 4.98998897204069e-05, "loss": 2.4876, "mean_token_accuracy": 0.4172413766384125, "step": 77975 }, { "epoch": 0.07854228421267856, "grad_norm": 10.188499107388187, "learning_rate": 4.9899854408823956e-05, "loss": 2.3492, "mean_token_accuracy": 0.44827585816383364, "step": 77980 }, { "epoch": 0.07854732026578273, "grad_norm": 12.686641330895586, "learning_rate": 4.989981909102832e-05, "loss": 2.7979, "mean_token_accuracy": 0.3827586233615875, "step": 77985 }, { "epoch": 0.0785523563188869, "grad_norm": 12.798850868888554, "learning_rate": 4.989978376702001e-05, "loss": 2.8899, "mean_token_accuracy": 0.37931033968925476, "step": 77990 }, { "epoch": 0.07855739237199108, "grad_norm": 10.912500127828649, "learning_rate": 4.989974843679905e-05, "loss": 2.3205, "mean_token_accuracy": 0.47931034564971925, "step": 77995 }, { "epoch": 0.07856242842509525, "grad_norm": 10.00375394393882, "learning_rate": 4.989971310036543e-05, "loss": 2.2352, "mean_token_accuracy": 0.45517241954803467, "step": 78000 }, { "epoch": 0.07856746447819943, "grad_norm": 12.388914182107136, "learning_rate": 4.989967775771917e-05, "loss": 2.6464, "mean_token_accuracy": 0.3896551787853241, "step": 78005 }, { "epoch": 0.0785725005313036, "grad_norm": 11.986064372242483, "learning_rate": 4.989964240886027e-05, "loss": 2.5315, "mean_token_accuracy": 0.42413793206214906, "step": 78010 }, { "epoch": 0.07857753658440778, "grad_norm": 11.919565627560743, "learning_rate": 4.989960705378875e-05, "loss": 2.705, "mean_token_accuracy": 0.3862069010734558, "step": 78015 }, { "epoch": 0.07858257263751195, "grad_norm": 9.718258522378472, "learning_rate": 4.9899571692504606e-05, "loss": 2.5904, "mean_token_accuracy": 0.4103448331356049, "step": 78020 }, { "epoch": 0.07858760869061612, "grad_norm": 9.924243929549906, "learning_rate": 4.989953632500787e-05, "loss": 2.7506, "mean_token_accuracy": 0.37241379618644715, "step": 78025 }, { "epoch": 0.0785926447437203, "grad_norm": 11.598700105704713, "learning_rate": 4.9899500951298535e-05, "loss": 2.5934, "mean_token_accuracy": 0.4103448212146759, "step": 78030 }, { "epoch": 0.07859768079682447, "grad_norm": 10.628935677654898, "learning_rate": 4.9899465571376615e-05, "loss": 2.6348, "mean_token_accuracy": 0.38427101969718935, "step": 78035 }, { "epoch": 0.07860271684992864, "grad_norm": 11.034283704894001, "learning_rate": 4.9899430185242115e-05, "loss": 2.5264, "mean_token_accuracy": 0.37241379022598264, "step": 78040 }, { "epoch": 0.07860775290303282, "grad_norm": 13.187847758266072, "learning_rate": 4.989939479289505e-05, "loss": 2.595, "mean_token_accuracy": 0.3551724135875702, "step": 78045 }, { "epoch": 0.07861278895613698, "grad_norm": 9.679061406343292, "learning_rate": 4.989935939433543e-05, "loss": 2.0989, "mean_token_accuracy": 0.43448275327682495, "step": 78050 }, { "epoch": 0.07861782500924115, "grad_norm": 11.821685886627446, "learning_rate": 4.989932398956327e-05, "loss": 2.3925, "mean_token_accuracy": 0.4255898356437683, "step": 78055 }, { "epoch": 0.07862286106234533, "grad_norm": 10.644059295370624, "learning_rate": 4.989928857857856e-05, "loss": 2.5019, "mean_token_accuracy": 0.4172413766384125, "step": 78060 }, { "epoch": 0.0786278971154495, "grad_norm": 11.0075802549604, "learning_rate": 4.9899253161381335e-05, "loss": 2.9096, "mean_token_accuracy": 0.3620689630508423, "step": 78065 }, { "epoch": 0.07863293316855367, "grad_norm": 10.710006767371173, "learning_rate": 4.98992177379716e-05, "loss": 2.499, "mean_token_accuracy": 0.4034482777118683, "step": 78070 }, { "epoch": 0.07863796922165785, "grad_norm": 10.414498772499007, "learning_rate": 4.989918230834934e-05, "loss": 1.995, "mean_token_accuracy": 0.4987900793552399, "step": 78075 }, { "epoch": 0.07864300527476202, "grad_norm": 15.929957814957554, "learning_rate": 4.98991468725146e-05, "loss": 2.87, "mean_token_accuracy": 0.3551724165678024, "step": 78080 }, { "epoch": 0.0786480413278662, "grad_norm": 11.275233058068917, "learning_rate": 4.989911143046736e-05, "loss": 2.5036, "mean_token_accuracy": 0.37586206793785093, "step": 78085 }, { "epoch": 0.07865307738097037, "grad_norm": 11.990835674424549, "learning_rate": 4.989907598220765e-05, "loss": 2.7431, "mean_token_accuracy": 0.34827585220336915, "step": 78090 }, { "epoch": 0.07865811343407454, "grad_norm": 11.93487259957259, "learning_rate": 4.989904052773548e-05, "loss": 2.2425, "mean_token_accuracy": 0.39655172228813174, "step": 78095 }, { "epoch": 0.07866314948717872, "grad_norm": 11.17485918984423, "learning_rate": 4.989900506705084e-05, "loss": 2.3554, "mean_token_accuracy": 0.42758620381355283, "step": 78100 }, { "epoch": 0.07866818554028289, "grad_norm": 10.75351023064932, "learning_rate": 4.989896960015376e-05, "loss": 2.0886, "mean_token_accuracy": 0.5275862097740174, "step": 78105 }, { "epoch": 0.07867322159338706, "grad_norm": 20.11025771059245, "learning_rate": 4.989893412704424e-05, "loss": 2.6657, "mean_token_accuracy": 0.45015124082565305, "step": 78110 }, { "epoch": 0.07867825764649124, "grad_norm": 10.536596538058998, "learning_rate": 4.989889864772229e-05, "loss": 2.1567, "mean_token_accuracy": 0.4551724076271057, "step": 78115 }, { "epoch": 0.0786832936995954, "grad_norm": 11.115732843162577, "learning_rate": 4.9898863162187934e-05, "loss": 2.606, "mean_token_accuracy": 0.41034482717514037, "step": 78120 }, { "epoch": 0.07868832975269957, "grad_norm": 11.238252077459066, "learning_rate": 4.9898827670441145e-05, "loss": 2.5128, "mean_token_accuracy": 0.37719298601150514, "step": 78125 }, { "epoch": 0.07869336580580374, "grad_norm": 11.708453129461873, "learning_rate": 4.989879217248197e-05, "loss": 2.209, "mean_token_accuracy": 0.4310344815254211, "step": 78130 }, { "epoch": 0.07869840185890792, "grad_norm": 11.33740620553316, "learning_rate": 4.9898756668310415e-05, "loss": 2.4957, "mean_token_accuracy": 0.44482758045196535, "step": 78135 }, { "epoch": 0.07870343791201209, "grad_norm": 12.193548063808446, "learning_rate": 4.9898721157926475e-05, "loss": 2.5928, "mean_token_accuracy": 0.41530550718307496, "step": 78140 }, { "epoch": 0.07870847396511627, "grad_norm": 9.019370941574635, "learning_rate": 4.9898685641330165e-05, "loss": 2.8856, "mean_token_accuracy": 0.3896551728248596, "step": 78145 }, { "epoch": 0.07871351001822044, "grad_norm": 12.854838406201386, "learning_rate": 4.98986501185215e-05, "loss": 3.0098, "mean_token_accuracy": 0.3241379290819168, "step": 78150 }, { "epoch": 0.07871854607132461, "grad_norm": 13.864132592054066, "learning_rate": 4.989861458950048e-05, "loss": 2.6143, "mean_token_accuracy": 0.4172413766384125, "step": 78155 }, { "epoch": 0.07872358212442879, "grad_norm": 26.63420088731216, "learning_rate": 4.989857905426712e-05, "loss": 2.5678, "mean_token_accuracy": 0.40689654350280763, "step": 78160 }, { "epoch": 0.07872861817753296, "grad_norm": 11.158711318742565, "learning_rate": 4.9898543512821435e-05, "loss": 2.4516, "mean_token_accuracy": 0.44482759237289426, "step": 78165 }, { "epoch": 0.07873365423063713, "grad_norm": 10.633237016291352, "learning_rate": 4.989850796516343e-05, "loss": 2.7876, "mean_token_accuracy": 0.37931033968925476, "step": 78170 }, { "epoch": 0.07873869028374131, "grad_norm": 10.878535885777696, "learning_rate": 4.989847241129312e-05, "loss": 2.1544, "mean_token_accuracy": 0.48118571639060975, "step": 78175 }, { "epoch": 0.07874372633684548, "grad_norm": 10.870838227236137, "learning_rate": 4.98984368512105e-05, "loss": 2.8581, "mean_token_accuracy": 0.3517241358757019, "step": 78180 }, { "epoch": 0.07874876238994966, "grad_norm": 11.29869714069804, "learning_rate": 4.989840128491559e-05, "loss": 2.6302, "mean_token_accuracy": 0.38965516686439516, "step": 78185 }, { "epoch": 0.07875379844305382, "grad_norm": 11.768136283267921, "learning_rate": 4.989836571240841e-05, "loss": 2.7723, "mean_token_accuracy": 0.37586206793785093, "step": 78190 }, { "epoch": 0.07875883449615799, "grad_norm": 12.465739118803386, "learning_rate": 4.989833013368895e-05, "loss": 2.6103, "mean_token_accuracy": 0.4034482717514038, "step": 78195 }, { "epoch": 0.07876387054926216, "grad_norm": 11.579993657782195, "learning_rate": 4.9898294548757234e-05, "loss": 2.5917, "mean_token_accuracy": 0.37241379618644715, "step": 78200 }, { "epoch": 0.07876890660236634, "grad_norm": 11.871879928764042, "learning_rate": 4.989825895761326e-05, "loss": 2.6607, "mean_token_accuracy": 0.42413793206214906, "step": 78205 }, { "epoch": 0.07877394265547051, "grad_norm": 10.877670938099344, "learning_rate": 4.989822336025706e-05, "loss": 2.3815, "mean_token_accuracy": 0.4103448331356049, "step": 78210 }, { "epoch": 0.07877897870857468, "grad_norm": 9.601080917141719, "learning_rate": 4.989818775668862e-05, "loss": 2.5743, "mean_token_accuracy": 0.44137930274009707, "step": 78215 }, { "epoch": 0.07878401476167886, "grad_norm": 12.315288445007246, "learning_rate": 4.989815214690796e-05, "loss": 2.6005, "mean_token_accuracy": 0.4275861978530884, "step": 78220 }, { "epoch": 0.07878905081478303, "grad_norm": 10.692325740790034, "learning_rate": 4.989811653091509e-05, "loss": 2.3773, "mean_token_accuracy": 0.4620689570903778, "step": 78225 }, { "epoch": 0.0787940868678872, "grad_norm": 10.718865183674883, "learning_rate": 4.989808090871002e-05, "loss": 2.6373, "mean_token_accuracy": 0.37931033968925476, "step": 78230 }, { "epoch": 0.07879912292099138, "grad_norm": 12.791412510972354, "learning_rate": 4.989804528029276e-05, "loss": 2.8541, "mean_token_accuracy": 0.39310344457626345, "step": 78235 }, { "epoch": 0.07880415897409555, "grad_norm": 8.768748426886317, "learning_rate": 4.989800964566331e-05, "loss": 2.1414, "mean_token_accuracy": 0.45862067937850953, "step": 78240 }, { "epoch": 0.07880919502719973, "grad_norm": 15.67332287652288, "learning_rate": 4.9897974004821696e-05, "loss": 2.4058, "mean_token_accuracy": 0.4344827651977539, "step": 78245 }, { "epoch": 0.0788142310803039, "grad_norm": 17.235932745695596, "learning_rate": 4.989793835776791e-05, "loss": 2.542, "mean_token_accuracy": 0.39310344457626345, "step": 78250 }, { "epoch": 0.07881926713340807, "grad_norm": 11.97124630879913, "learning_rate": 4.989790270450199e-05, "loss": 2.9625, "mean_token_accuracy": 0.37931033968925476, "step": 78255 }, { "epoch": 0.07882430318651223, "grad_norm": 11.641866945741356, "learning_rate": 4.989786704502391e-05, "loss": 2.1454, "mean_token_accuracy": 0.4310344815254211, "step": 78260 }, { "epoch": 0.07882933923961641, "grad_norm": 13.39836576076939, "learning_rate": 4.989783137933372e-05, "loss": 2.6927, "mean_token_accuracy": 0.40689654350280763, "step": 78265 }, { "epoch": 0.07883437529272058, "grad_norm": 11.896583936661386, "learning_rate": 4.9897795707431396e-05, "loss": 2.2044, "mean_token_accuracy": 0.46896551847457885, "step": 78270 }, { "epoch": 0.07883941134582476, "grad_norm": 11.969052013258853, "learning_rate": 4.989776002931695e-05, "loss": 2.5295, "mean_token_accuracy": 0.3965517282485962, "step": 78275 }, { "epoch": 0.07884444739892893, "grad_norm": 11.77452714337248, "learning_rate": 4.9897724344990415e-05, "loss": 2.3893, "mean_token_accuracy": 0.4172413766384125, "step": 78280 }, { "epoch": 0.0788494834520331, "grad_norm": 8.82001672170787, "learning_rate": 4.989768865445178e-05, "loss": 2.1824, "mean_token_accuracy": 0.4569872975349426, "step": 78285 }, { "epoch": 0.07885451950513728, "grad_norm": 12.685321253895612, "learning_rate": 4.9897652957701065e-05, "loss": 2.3342, "mean_token_accuracy": 0.4344827651977539, "step": 78290 }, { "epoch": 0.07885955555824145, "grad_norm": 11.106801939889955, "learning_rate": 4.9897617254738276e-05, "loss": 2.4034, "mean_token_accuracy": 0.43793103098869324, "step": 78295 }, { "epoch": 0.07886459161134562, "grad_norm": 12.515324623807322, "learning_rate": 4.989758154556343e-05, "loss": 2.59, "mean_token_accuracy": 0.42758620381355283, "step": 78300 }, { "epoch": 0.0788696276644498, "grad_norm": 8.449631689722558, "learning_rate": 4.989754583017652e-05, "loss": 2.5047, "mean_token_accuracy": 0.382758629322052, "step": 78305 }, { "epoch": 0.07887466371755397, "grad_norm": 9.545694913009477, "learning_rate": 4.989751010857758e-05, "loss": 2.3564, "mean_token_accuracy": 0.4517241299152374, "step": 78310 }, { "epoch": 0.07887969977065815, "grad_norm": 12.286567312827732, "learning_rate": 4.98974743807666e-05, "loss": 2.5101, "mean_token_accuracy": 0.38965517580509185, "step": 78315 }, { "epoch": 0.07888473582376232, "grad_norm": 11.504772314547145, "learning_rate": 4.9897438646743596e-05, "loss": 2.7542, "mean_token_accuracy": 0.38275861740112305, "step": 78320 }, { "epoch": 0.0788897718768665, "grad_norm": 10.75957659578681, "learning_rate": 4.9897402906508585e-05, "loss": 3.0954, "mean_token_accuracy": 0.35862068831920624, "step": 78325 }, { "epoch": 0.07889480792997065, "grad_norm": 9.327434533143025, "learning_rate": 4.9897367160061564e-05, "loss": 2.3284, "mean_token_accuracy": 0.41724137365818026, "step": 78330 }, { "epoch": 0.07889984398307483, "grad_norm": 11.36911630927558, "learning_rate": 4.989733140740255e-05, "loss": 2.6571, "mean_token_accuracy": 0.36896551847457887, "step": 78335 }, { "epoch": 0.078904880036179, "grad_norm": 11.28981496466912, "learning_rate": 4.9897295648531556e-05, "loss": 2.8349, "mean_token_accuracy": 0.4241379380226135, "step": 78340 }, { "epoch": 0.07890991608928317, "grad_norm": 11.712463525010799, "learning_rate": 4.989725988344859e-05, "loss": 2.507, "mean_token_accuracy": 0.37586206793785093, "step": 78345 }, { "epoch": 0.07891495214238735, "grad_norm": 12.308655374647532, "learning_rate": 4.989722411215366e-05, "loss": 2.6173, "mean_token_accuracy": 0.4053841531276703, "step": 78350 }, { "epoch": 0.07891998819549152, "grad_norm": 21.8597767339935, "learning_rate": 4.9897188334646774e-05, "loss": 2.7904, "mean_token_accuracy": 0.3655172437429428, "step": 78355 }, { "epoch": 0.0789250242485957, "grad_norm": 9.836588536941452, "learning_rate": 4.9897152550927945e-05, "loss": 2.3368, "mean_token_accuracy": 0.4344827651977539, "step": 78360 }, { "epoch": 0.07893006030169987, "grad_norm": 10.364152791705735, "learning_rate": 4.989711676099718e-05, "loss": 2.1182, "mean_token_accuracy": 0.44827585816383364, "step": 78365 }, { "epoch": 0.07893509635480404, "grad_norm": 10.715354779091028, "learning_rate": 4.989708096485449e-05, "loss": 2.3901, "mean_token_accuracy": 0.41724138855934145, "step": 78370 }, { "epoch": 0.07894013240790822, "grad_norm": 11.602813852620008, "learning_rate": 4.9897045162499886e-05, "loss": 2.3435, "mean_token_accuracy": 0.42068964838981626, "step": 78375 }, { "epoch": 0.07894516846101239, "grad_norm": 10.253440608869086, "learning_rate": 4.9897009353933386e-05, "loss": 2.808, "mean_token_accuracy": 0.37586207389831544, "step": 78380 }, { "epoch": 0.07895020451411656, "grad_norm": 11.335797322360873, "learning_rate": 4.989697353915498e-05, "loss": 2.8148, "mean_token_accuracy": 0.3758620619773865, "step": 78385 }, { "epoch": 0.07895524056722074, "grad_norm": 11.143192649833749, "learning_rate": 4.989693771816471e-05, "loss": 2.674, "mean_token_accuracy": 0.37586206793785093, "step": 78390 }, { "epoch": 0.07896027662032491, "grad_norm": 14.164056237246406, "learning_rate": 4.989690189096255e-05, "loss": 2.6687, "mean_token_accuracy": 0.4034482777118683, "step": 78395 }, { "epoch": 0.07896531267342907, "grad_norm": 11.924397553928824, "learning_rate": 4.989686605754852e-05, "loss": 2.5302, "mean_token_accuracy": 0.4310344815254211, "step": 78400 }, { "epoch": 0.07897034872653325, "grad_norm": 13.27827908848532, "learning_rate": 4.989683021792265e-05, "loss": 2.5886, "mean_token_accuracy": 0.37586206793785093, "step": 78405 }, { "epoch": 0.07897538477963742, "grad_norm": 10.714199390729068, "learning_rate": 4.989679437208493e-05, "loss": 2.1248, "mean_token_accuracy": 0.4848154842853546, "step": 78410 }, { "epoch": 0.0789804208327416, "grad_norm": 12.211020084039685, "learning_rate": 4.9896758520035375e-05, "loss": 2.2881, "mean_token_accuracy": 0.4551724076271057, "step": 78415 }, { "epoch": 0.07898545688584577, "grad_norm": 12.936267522472036, "learning_rate": 4.9896722661773994e-05, "loss": 2.7644, "mean_token_accuracy": 0.3517241418361664, "step": 78420 }, { "epoch": 0.07899049293894994, "grad_norm": 12.982692873641906, "learning_rate": 4.98966867973008e-05, "loss": 2.6529, "mean_token_accuracy": 0.42413793206214906, "step": 78425 }, { "epoch": 0.07899552899205411, "grad_norm": 11.433586607962994, "learning_rate": 4.98966509266158e-05, "loss": 2.7599, "mean_token_accuracy": 0.3896551728248596, "step": 78430 }, { "epoch": 0.07900056504515829, "grad_norm": 9.317903010298034, "learning_rate": 4.9896615049719005e-05, "loss": 2.358, "mean_token_accuracy": 0.4724137902259827, "step": 78435 }, { "epoch": 0.07900560109826246, "grad_norm": 9.206169183648598, "learning_rate": 4.989657916661043e-05, "loss": 2.4098, "mean_token_accuracy": 0.42068966031074523, "step": 78440 }, { "epoch": 0.07901063715136664, "grad_norm": 10.618737812209181, "learning_rate": 4.9896543277290076e-05, "loss": 2.4763, "mean_token_accuracy": 0.4344827592372894, "step": 78445 }, { "epoch": 0.07901567320447081, "grad_norm": 13.249988257152847, "learning_rate": 4.989650738175796e-05, "loss": 2.2168, "mean_token_accuracy": 0.4551724135875702, "step": 78450 }, { "epoch": 0.07902070925757498, "grad_norm": 9.789724521286935, "learning_rate": 4.9896471480014086e-05, "loss": 2.356, "mean_token_accuracy": 0.41379310488700866, "step": 78455 }, { "epoch": 0.07902574531067916, "grad_norm": 12.737389570698216, "learning_rate": 4.989643557205846e-05, "loss": 2.6491, "mean_token_accuracy": 0.3448275923728943, "step": 78460 }, { "epoch": 0.07903078136378333, "grad_norm": 12.171954394195058, "learning_rate": 4.9896399657891116e-05, "loss": 2.3918, "mean_token_accuracy": 0.40459770560264585, "step": 78465 }, { "epoch": 0.07903581741688749, "grad_norm": 9.626619542814526, "learning_rate": 4.989636373751204e-05, "loss": 2.5488, "mean_token_accuracy": 0.4517241299152374, "step": 78470 }, { "epoch": 0.07904085346999166, "grad_norm": 12.786666036995987, "learning_rate": 4.989632781092125e-05, "loss": 2.8176, "mean_token_accuracy": 0.3793103456497192, "step": 78475 }, { "epoch": 0.07904588952309584, "grad_norm": 9.672040446618084, "learning_rate": 4.9896291878118745e-05, "loss": 2.5524, "mean_token_accuracy": 0.4310344815254211, "step": 78480 }, { "epoch": 0.07905092557620001, "grad_norm": 14.22173018931263, "learning_rate": 4.989625593910455e-05, "loss": 2.5094, "mean_token_accuracy": 0.4275862157344818, "step": 78485 }, { "epoch": 0.07905596162930419, "grad_norm": 11.453353872963568, "learning_rate": 4.9896219993878676e-05, "loss": 2.7493, "mean_token_accuracy": 0.37241379022598264, "step": 78490 }, { "epoch": 0.07906099768240836, "grad_norm": 10.614517530726477, "learning_rate": 4.989618404244112e-05, "loss": 2.3692, "mean_token_accuracy": 0.4586206912994385, "step": 78495 }, { "epoch": 0.07906603373551253, "grad_norm": 11.345152082085463, "learning_rate": 4.9896148084791904e-05, "loss": 2.5453, "mean_token_accuracy": 0.37241379618644715, "step": 78500 }, { "epoch": 0.07907106978861671, "grad_norm": 10.192445124255062, "learning_rate": 4.9896112120931034e-05, "loss": 2.821, "mean_token_accuracy": 0.401935875415802, "step": 78505 }, { "epoch": 0.07907610584172088, "grad_norm": 11.426046682426364, "learning_rate": 4.989607615085851e-05, "loss": 2.1573, "mean_token_accuracy": 0.4517241418361664, "step": 78510 }, { "epoch": 0.07908114189482505, "grad_norm": 11.006950411716689, "learning_rate": 4.989604017457436e-05, "loss": 2.3259, "mean_token_accuracy": 0.43103448748588563, "step": 78515 }, { "epoch": 0.07908617794792923, "grad_norm": 11.472446347213923, "learning_rate": 4.9896004192078574e-05, "loss": 2.3709, "mean_token_accuracy": 0.4517241418361664, "step": 78520 }, { "epoch": 0.0790912140010334, "grad_norm": 14.135369688170858, "learning_rate": 4.989596820337118e-05, "loss": 2.33, "mean_token_accuracy": 0.4436176657676697, "step": 78525 }, { "epoch": 0.07909625005413758, "grad_norm": 7.583904831950299, "learning_rate": 4.9895932208452185e-05, "loss": 2.2528, "mean_token_accuracy": 0.44053236246109007, "step": 78530 }, { "epoch": 0.07910128610724175, "grad_norm": 12.696054766514253, "learning_rate": 4.9895896207321594e-05, "loss": 2.6148, "mean_token_accuracy": 0.3931034505367279, "step": 78535 }, { "epoch": 0.07910632216034591, "grad_norm": 11.855054225172006, "learning_rate": 4.989586019997941e-05, "loss": 2.3555, "mean_token_accuracy": 0.42329097986221315, "step": 78540 }, { "epoch": 0.07911135821345008, "grad_norm": 15.98435187546126, "learning_rate": 4.9895824186425646e-05, "loss": 2.446, "mean_token_accuracy": 0.47586206793785096, "step": 78545 }, { "epoch": 0.07911639426655426, "grad_norm": 13.448484458402408, "learning_rate": 4.989578816666033e-05, "loss": 2.6782, "mean_token_accuracy": 0.41379310488700866, "step": 78550 }, { "epoch": 0.07912143031965843, "grad_norm": 10.973437750868962, "learning_rate": 4.989575214068346e-05, "loss": 2.347, "mean_token_accuracy": 0.4122202038764954, "step": 78555 }, { "epoch": 0.0791264663727626, "grad_norm": 11.05906416877406, "learning_rate": 4.9895716108495035e-05, "loss": 2.1053, "mean_token_accuracy": 0.4965517222881317, "step": 78560 }, { "epoch": 0.07913150242586678, "grad_norm": 15.603490232309085, "learning_rate": 4.9895680070095076e-05, "loss": 2.7859, "mean_token_accuracy": 0.46551723778247833, "step": 78565 }, { "epoch": 0.07913653847897095, "grad_norm": 13.585602495892667, "learning_rate": 4.989564402548359e-05, "loss": 2.5675, "mean_token_accuracy": 0.4137930989265442, "step": 78570 }, { "epoch": 0.07914157453207513, "grad_norm": 14.125507906797763, "learning_rate": 4.989560797466059e-05, "loss": 2.2475, "mean_token_accuracy": 0.458620685338974, "step": 78575 }, { "epoch": 0.0791466105851793, "grad_norm": 11.39027051451844, "learning_rate": 4.9895571917626094e-05, "loss": 2.6381, "mean_token_accuracy": 0.38620689511299133, "step": 78580 }, { "epoch": 0.07915164663828347, "grad_norm": 10.286879992247956, "learning_rate": 4.989553585438009e-05, "loss": 2.4588, "mean_token_accuracy": 0.3987900733947754, "step": 78585 }, { "epoch": 0.07915668269138765, "grad_norm": 10.036019679285083, "learning_rate": 4.98954997849226e-05, "loss": 2.1679, "mean_token_accuracy": 0.47241378426551817, "step": 78590 }, { "epoch": 0.07916171874449182, "grad_norm": 11.005379804823022, "learning_rate": 4.989546370925364e-05, "loss": 2.2059, "mean_token_accuracy": 0.4689655125141144, "step": 78595 }, { "epoch": 0.079166754797596, "grad_norm": 9.036371321835324, "learning_rate": 4.989542762737321e-05, "loss": 2.2338, "mean_token_accuracy": 0.39655172228813174, "step": 78600 }, { "epoch": 0.07917179085070017, "grad_norm": 10.758523305561363, "learning_rate": 4.989539153928134e-05, "loss": 2.3823, "mean_token_accuracy": 0.4344827592372894, "step": 78605 }, { "epoch": 0.07917682690380433, "grad_norm": 10.918537252225011, "learning_rate": 4.989535544497801e-05, "loss": 2.371, "mean_token_accuracy": 0.4310344815254211, "step": 78610 }, { "epoch": 0.0791818629569085, "grad_norm": 14.586341409415386, "learning_rate": 4.989531934446325e-05, "loss": 2.7082, "mean_token_accuracy": 0.38620689511299133, "step": 78615 }, { "epoch": 0.07918689901001268, "grad_norm": 11.865600807322288, "learning_rate": 4.989528323773706e-05, "loss": 2.4662, "mean_token_accuracy": 0.45517241954803467, "step": 78620 }, { "epoch": 0.07919193506311685, "grad_norm": 11.802136188100564, "learning_rate": 4.989524712479946e-05, "loss": 2.4161, "mean_token_accuracy": 0.4551724135875702, "step": 78625 }, { "epoch": 0.07919697111622102, "grad_norm": 11.268328041407166, "learning_rate": 4.989521100565045e-05, "loss": 2.538, "mean_token_accuracy": 0.4413793087005615, "step": 78630 }, { "epoch": 0.0792020071693252, "grad_norm": 12.968375867174217, "learning_rate": 4.989517488029005e-05, "loss": 2.2995, "mean_token_accuracy": 0.441379314661026, "step": 78635 }, { "epoch": 0.07920704322242937, "grad_norm": 10.799306106715989, "learning_rate": 4.989513874871825e-05, "loss": 2.3502, "mean_token_accuracy": 0.46896551847457885, "step": 78640 }, { "epoch": 0.07921207927553355, "grad_norm": 14.87386045397428, "learning_rate": 4.98951026109351e-05, "loss": 2.5901, "mean_token_accuracy": 0.37586206793785093, "step": 78645 }, { "epoch": 0.07921711532863772, "grad_norm": 13.477035956769711, "learning_rate": 4.989506646694056e-05, "loss": 2.8726, "mean_token_accuracy": 0.358620685338974, "step": 78650 }, { "epoch": 0.07922215138174189, "grad_norm": 12.933320886047325, "learning_rate": 4.989503031673469e-05, "loss": 2.1843, "mean_token_accuracy": 0.4460979998111725, "step": 78655 }, { "epoch": 0.07922718743484607, "grad_norm": 11.061208132985234, "learning_rate": 4.989499416031745e-05, "loss": 2.7445, "mean_token_accuracy": 0.4103448331356049, "step": 78660 }, { "epoch": 0.07923222348795024, "grad_norm": 13.287697380642538, "learning_rate": 4.989495799768889e-05, "loss": 2.7283, "mean_token_accuracy": 0.44271021485328677, "step": 78665 }, { "epoch": 0.07923725954105441, "grad_norm": 13.057654779275143, "learning_rate": 4.9894921828849e-05, "loss": 2.7293, "mean_token_accuracy": 0.3875983119010925, "step": 78670 }, { "epoch": 0.07924229559415859, "grad_norm": 12.87269880187759, "learning_rate": 4.98948856537978e-05, "loss": 2.3815, "mean_token_accuracy": 0.4562807857990265, "step": 78675 }, { "epoch": 0.07924733164726275, "grad_norm": 11.001155470594274, "learning_rate": 4.989484947253529e-05, "loss": 2.5103, "mean_token_accuracy": 0.38620689511299133, "step": 78680 }, { "epoch": 0.07925236770036692, "grad_norm": 12.85507519341154, "learning_rate": 4.989481328506148e-05, "loss": 2.3292, "mean_token_accuracy": 0.4448275864124298, "step": 78685 }, { "epoch": 0.0792574037534711, "grad_norm": 10.873968555930258, "learning_rate": 4.989477709137639e-05, "loss": 2.4972, "mean_token_accuracy": 0.45862069725990295, "step": 78690 }, { "epoch": 0.07926243980657527, "grad_norm": 9.120444015320231, "learning_rate": 4.989474089148002e-05, "loss": 2.4746, "mean_token_accuracy": 0.4344827592372894, "step": 78695 }, { "epoch": 0.07926747585967944, "grad_norm": 11.689257984390649, "learning_rate": 4.989470468537239e-05, "loss": 2.3675, "mean_token_accuracy": 0.44827585220336913, "step": 78700 }, { "epoch": 0.07927251191278362, "grad_norm": 15.050799163266042, "learning_rate": 4.989466847305351e-05, "loss": 3.0557, "mean_token_accuracy": 0.3448275804519653, "step": 78705 }, { "epoch": 0.07927754796588779, "grad_norm": 10.952844709093384, "learning_rate": 4.989463225452338e-05, "loss": 2.3942, "mean_token_accuracy": 0.4457350313663483, "step": 78710 }, { "epoch": 0.07928258401899196, "grad_norm": 9.914772054164848, "learning_rate": 4.9894596029782015e-05, "loss": 2.5209, "mean_token_accuracy": 0.4086509346961975, "step": 78715 }, { "epoch": 0.07928762007209614, "grad_norm": 12.307315501031175, "learning_rate": 4.989455979882943e-05, "loss": 2.5019, "mean_token_accuracy": 0.37931033968925476, "step": 78720 }, { "epoch": 0.07929265612520031, "grad_norm": 10.282622224261065, "learning_rate": 4.989452356166562e-05, "loss": 2.1762, "mean_token_accuracy": 0.4482758641242981, "step": 78725 }, { "epoch": 0.07929769217830449, "grad_norm": 17.520930150895307, "learning_rate": 4.989448731829062e-05, "loss": 2.7802, "mean_token_accuracy": 0.3793103337287903, "step": 78730 }, { "epoch": 0.07930272823140866, "grad_norm": 13.783355895095683, "learning_rate": 4.9894451068704415e-05, "loss": 2.5909, "mean_token_accuracy": 0.3586206793785095, "step": 78735 }, { "epoch": 0.07930776428451283, "grad_norm": 10.399493469513057, "learning_rate": 4.9894414812907026e-05, "loss": 2.2132, "mean_token_accuracy": 0.4344827592372894, "step": 78740 }, { "epoch": 0.079312800337617, "grad_norm": 11.531716203666184, "learning_rate": 4.989437855089846e-05, "loss": 2.5759, "mean_token_accuracy": 0.42413792610168455, "step": 78745 }, { "epoch": 0.07931783639072117, "grad_norm": 11.95798148547566, "learning_rate": 4.9894342282678736e-05, "loss": 2.2443, "mean_token_accuracy": 0.4533575356006622, "step": 78750 }, { "epoch": 0.07932287244382534, "grad_norm": 13.766153512752494, "learning_rate": 4.989430600824786e-05, "loss": 2.9394, "mean_token_accuracy": 0.39310343861579894, "step": 78755 }, { "epoch": 0.07932790849692951, "grad_norm": 10.605239506360876, "learning_rate": 4.989426972760583e-05, "loss": 2.8322, "mean_token_accuracy": 0.358620685338974, "step": 78760 }, { "epoch": 0.07933294455003369, "grad_norm": 12.716464716503063, "learning_rate": 4.989423344075267e-05, "loss": 2.2988, "mean_token_accuracy": 0.42413793206214906, "step": 78765 }, { "epoch": 0.07933798060313786, "grad_norm": 10.853243913568248, "learning_rate": 4.9894197147688386e-05, "loss": 2.5564, "mean_token_accuracy": 0.3965517163276672, "step": 78770 }, { "epoch": 0.07934301665624204, "grad_norm": 19.491684384785483, "learning_rate": 4.9894160848412986e-05, "loss": 2.6944, "mean_token_accuracy": 0.3620689630508423, "step": 78775 }, { "epoch": 0.07934805270934621, "grad_norm": 12.154723347879697, "learning_rate": 4.989412454292649e-05, "loss": 2.5494, "mean_token_accuracy": 0.4538415014743805, "step": 78780 }, { "epoch": 0.07935308876245038, "grad_norm": 10.741446964274648, "learning_rate": 4.989408823122889e-05, "loss": 2.7048, "mean_token_accuracy": 0.40689654350280763, "step": 78785 }, { "epoch": 0.07935812481555456, "grad_norm": 11.029414854368818, "learning_rate": 4.9894051913320214e-05, "loss": 2.8098, "mean_token_accuracy": 0.3896551728248596, "step": 78790 }, { "epoch": 0.07936316086865873, "grad_norm": 12.150412561726041, "learning_rate": 4.989401558920046e-05, "loss": 2.5273, "mean_token_accuracy": 0.4034482777118683, "step": 78795 }, { "epoch": 0.0793681969217629, "grad_norm": 11.101015916840135, "learning_rate": 4.989397925886964e-05, "loss": 2.7468, "mean_token_accuracy": 0.4103448212146759, "step": 78800 }, { "epoch": 0.07937323297486708, "grad_norm": 12.009185015198462, "learning_rate": 4.989394292232777e-05, "loss": 2.5021, "mean_token_accuracy": 0.4275257170200348, "step": 78805 }, { "epoch": 0.07937826902797125, "grad_norm": 10.455358762100778, "learning_rate": 4.989390657957486e-05, "loss": 2.6097, "mean_token_accuracy": 0.4103448331356049, "step": 78810 }, { "epoch": 0.07938330508107543, "grad_norm": 9.961001676297018, "learning_rate": 4.9893870230610905e-05, "loss": 2.3874, "mean_token_accuracy": 0.4034482777118683, "step": 78815 }, { "epoch": 0.07938834113417959, "grad_norm": 12.877451003272213, "learning_rate": 4.989383387543593e-05, "loss": 2.5086, "mean_token_accuracy": 0.42413793206214906, "step": 78820 }, { "epoch": 0.07939337718728376, "grad_norm": 9.07781812412281, "learning_rate": 4.989379751404995e-05, "loss": 2.5926, "mean_token_accuracy": 0.43448275327682495, "step": 78825 }, { "epoch": 0.07939841324038793, "grad_norm": 10.865417837493823, "learning_rate": 4.989376114645296e-05, "loss": 2.133, "mean_token_accuracy": 0.4241379380226135, "step": 78830 }, { "epoch": 0.0794034492934921, "grad_norm": 11.610448477741116, "learning_rate": 4.989372477264498e-05, "loss": 2.3317, "mean_token_accuracy": 0.47586206793785096, "step": 78835 }, { "epoch": 0.07940848534659628, "grad_norm": 10.720084598734902, "learning_rate": 4.9893688392626016e-05, "loss": 2.189, "mean_token_accuracy": 0.46551724672317507, "step": 78840 }, { "epoch": 0.07941352139970045, "grad_norm": 8.9643043953185, "learning_rate": 4.989365200639608e-05, "loss": 2.3278, "mean_token_accuracy": 0.48349754214286805, "step": 78845 }, { "epoch": 0.07941855745280463, "grad_norm": 9.640619694269047, "learning_rate": 4.989361561395518e-05, "loss": 2.299, "mean_token_accuracy": 0.4034482777118683, "step": 78850 }, { "epoch": 0.0794235935059088, "grad_norm": 22.134719662697403, "learning_rate": 4.989357921530333e-05, "loss": 2.6759, "mean_token_accuracy": 0.4551724076271057, "step": 78855 }, { "epoch": 0.07942862955901298, "grad_norm": 9.35638709776434, "learning_rate": 4.989354281044052e-05, "loss": 2.1153, "mean_token_accuracy": 0.4379310429096222, "step": 78860 }, { "epoch": 0.07943366561211715, "grad_norm": 9.469346322042352, "learning_rate": 4.98935063993668e-05, "loss": 2.5177, "mean_token_accuracy": 0.43103448748588563, "step": 78865 }, { "epoch": 0.07943870166522132, "grad_norm": 11.22746914136466, "learning_rate": 4.9893469982082154e-05, "loss": 2.6082, "mean_token_accuracy": 0.3862068891525269, "step": 78870 }, { "epoch": 0.0794437377183255, "grad_norm": 10.072645815832288, "learning_rate": 4.9893433558586586e-05, "loss": 2.2005, "mean_token_accuracy": 0.42262552976608275, "step": 78875 }, { "epoch": 0.07944877377142967, "grad_norm": 11.779955280243406, "learning_rate": 4.989339712888012e-05, "loss": 2.2313, "mean_token_accuracy": 0.4572897791862488, "step": 78880 }, { "epoch": 0.07945380982453384, "grad_norm": 9.609052843104465, "learning_rate": 4.989336069296276e-05, "loss": 2.7638, "mean_token_accuracy": 0.4000000089406967, "step": 78885 }, { "epoch": 0.079458845877638, "grad_norm": 9.16755332617589, "learning_rate": 4.989332425083452e-05, "loss": 2.402, "mean_token_accuracy": 0.4517241299152374, "step": 78890 }, { "epoch": 0.07946388193074218, "grad_norm": 11.664351058790395, "learning_rate": 4.989328780249541e-05, "loss": 2.3447, "mean_token_accuracy": 0.4172413766384125, "step": 78895 }, { "epoch": 0.07946891798384635, "grad_norm": 12.990235074942706, "learning_rate": 4.989325134794543e-05, "loss": 2.6636, "mean_token_accuracy": 0.39310344159603117, "step": 78900 }, { "epoch": 0.07947395403695053, "grad_norm": 10.37226894374493, "learning_rate": 4.989321488718461e-05, "loss": 2.0345, "mean_token_accuracy": 0.46551724076271056, "step": 78905 }, { "epoch": 0.0794789900900547, "grad_norm": 8.524706626251902, "learning_rate": 4.9893178420212935e-05, "loss": 2.4904, "mean_token_accuracy": 0.42413793206214906, "step": 78910 }, { "epoch": 0.07948402614315887, "grad_norm": 9.80941971775653, "learning_rate": 4.989314194703044e-05, "loss": 2.4988, "mean_token_accuracy": 0.39310344457626345, "step": 78915 }, { "epoch": 0.07948906219626305, "grad_norm": 14.686539685166338, "learning_rate": 4.989310546763712e-05, "loss": 3.1782, "mean_token_accuracy": 0.35862069129943847, "step": 78920 }, { "epoch": 0.07949409824936722, "grad_norm": 11.711138792220204, "learning_rate": 4.989306898203299e-05, "loss": 2.4634, "mean_token_accuracy": 0.42256503701210024, "step": 78925 }, { "epoch": 0.0794991343024714, "grad_norm": 9.64314148430209, "learning_rate": 4.9893032490218054e-05, "loss": 2.2562, "mean_token_accuracy": 0.4517241358757019, "step": 78930 }, { "epoch": 0.07950417035557557, "grad_norm": 12.987007564596025, "learning_rate": 4.989299599219233e-05, "loss": 2.9858, "mean_token_accuracy": 0.3839080512523651, "step": 78935 }, { "epoch": 0.07950920640867974, "grad_norm": 9.625225588429506, "learning_rate": 4.989295948795582e-05, "loss": 2.3827, "mean_token_accuracy": 0.3758620709180832, "step": 78940 }, { "epoch": 0.07951424246178392, "grad_norm": 10.73914048859907, "learning_rate": 4.9892922977508545e-05, "loss": 2.5297, "mean_token_accuracy": 0.4103448212146759, "step": 78945 }, { "epoch": 0.07951927851488809, "grad_norm": 10.098809545148274, "learning_rate": 4.989288646085051e-05, "loss": 2.1728, "mean_token_accuracy": 0.4379310369491577, "step": 78950 }, { "epoch": 0.07952431456799226, "grad_norm": 16.568157861316664, "learning_rate": 4.9892849937981726e-05, "loss": 2.5212, "mean_token_accuracy": 0.38620689511299133, "step": 78955 }, { "epoch": 0.07952935062109642, "grad_norm": 12.32224581310678, "learning_rate": 4.98928134089022e-05, "loss": 2.2655, "mean_token_accuracy": 0.4724137902259827, "step": 78960 }, { "epoch": 0.0795343866742006, "grad_norm": 10.195108793425687, "learning_rate": 4.989277687361193e-05, "loss": 2.9423, "mean_token_accuracy": 0.3448275804519653, "step": 78965 }, { "epoch": 0.07953942272730477, "grad_norm": 10.731613936504187, "learning_rate": 4.989274033211096e-05, "loss": 2.7249, "mean_token_accuracy": 0.40344828367233276, "step": 78970 }, { "epoch": 0.07954445878040894, "grad_norm": 12.180209031044248, "learning_rate": 4.989270378439928e-05, "loss": 2.3179, "mean_token_accuracy": 0.44482759237289426, "step": 78975 }, { "epoch": 0.07954949483351312, "grad_norm": 12.780501983801642, "learning_rate": 4.9892667230476886e-05, "loss": 2.425, "mean_token_accuracy": 0.4448275864124298, "step": 78980 }, { "epoch": 0.07955453088661729, "grad_norm": 12.066920059026836, "learning_rate": 4.9892630670343805e-05, "loss": 2.3433, "mean_token_accuracy": 0.42758620977401735, "step": 78985 }, { "epoch": 0.07955956693972147, "grad_norm": 10.467678043016184, "learning_rate": 4.989259410400005e-05, "loss": 2.4125, "mean_token_accuracy": 0.45517241954803467, "step": 78990 }, { "epoch": 0.07956460299282564, "grad_norm": 12.494793377844234, "learning_rate": 4.9892557531445615e-05, "loss": 2.9205, "mean_token_accuracy": 0.3655172407627106, "step": 78995 }, { "epoch": 0.07956963904592981, "grad_norm": 10.26614850114355, "learning_rate": 4.9892520952680526e-05, "loss": 2.6883, "mean_token_accuracy": 0.4188142716884613, "step": 79000 }, { "epoch": 0.07957467509903399, "grad_norm": 11.361967622074925, "learning_rate": 4.9892484367704795e-05, "loss": 2.5872, "mean_token_accuracy": 0.4, "step": 79005 }, { "epoch": 0.07957971115213816, "grad_norm": 11.918631573846808, "learning_rate": 4.989244777651842e-05, "loss": 2.7012, "mean_token_accuracy": 0.3827586263418198, "step": 79010 }, { "epoch": 0.07958474720524233, "grad_norm": 9.967427030029807, "learning_rate": 4.989241117912142e-05, "loss": 2.4855, "mean_token_accuracy": 0.4310344815254211, "step": 79015 }, { "epoch": 0.07958978325834651, "grad_norm": 10.273479260232072, "learning_rate": 4.989237457551379e-05, "loss": 2.1678, "mean_token_accuracy": 0.46896551847457885, "step": 79020 }, { "epoch": 0.07959481931145068, "grad_norm": 10.468749290174168, "learning_rate": 4.989233796569556e-05, "loss": 2.4507, "mean_token_accuracy": 0.4206896543502808, "step": 79025 }, { "epoch": 0.07959985536455484, "grad_norm": 10.891219009419883, "learning_rate": 4.989230134966673e-05, "loss": 2.3346, "mean_token_accuracy": 0.41034482717514037, "step": 79030 }, { "epoch": 0.07960489141765902, "grad_norm": 9.679580535328958, "learning_rate": 4.989226472742731e-05, "loss": 2.1463, "mean_token_accuracy": 0.4569268047809601, "step": 79035 }, { "epoch": 0.07960992747076319, "grad_norm": 10.169769202129375, "learning_rate": 4.989222809897732e-05, "loss": 2.2505, "mean_token_accuracy": 0.41034482717514037, "step": 79040 }, { "epoch": 0.07961496352386736, "grad_norm": 10.606880381568496, "learning_rate": 4.989219146431675e-05, "loss": 2.4708, "mean_token_accuracy": 0.4068965554237366, "step": 79045 }, { "epoch": 0.07961999957697154, "grad_norm": 10.666109848881554, "learning_rate": 4.989215482344563e-05, "loss": 2.5997, "mean_token_accuracy": 0.3896551787853241, "step": 79050 }, { "epoch": 0.07962503563007571, "grad_norm": 13.305649768803201, "learning_rate": 4.989211817636396e-05, "loss": 2.9108, "mean_token_accuracy": 0.3448275804519653, "step": 79055 }, { "epoch": 0.07963007168317988, "grad_norm": 10.28781482381563, "learning_rate": 4.989208152307175e-05, "loss": 2.7319, "mean_token_accuracy": 0.358620685338974, "step": 79060 }, { "epoch": 0.07963510773628406, "grad_norm": 9.357041371563932, "learning_rate": 4.989204486356901e-05, "loss": 2.3215, "mean_token_accuracy": 0.42413792610168455, "step": 79065 }, { "epoch": 0.07964014378938823, "grad_norm": 12.330352138985358, "learning_rate": 4.989200819785576e-05, "loss": 2.4427, "mean_token_accuracy": 0.3655172407627106, "step": 79070 }, { "epoch": 0.0796451798424924, "grad_norm": 11.604326180890489, "learning_rate": 4.9891971525932e-05, "loss": 2.5581, "mean_token_accuracy": 0.3896551728248596, "step": 79075 }, { "epoch": 0.07965021589559658, "grad_norm": 11.880013785101163, "learning_rate": 4.989193484779775e-05, "loss": 2.3486, "mean_token_accuracy": 0.46733213067054746, "step": 79080 }, { "epoch": 0.07965525194870075, "grad_norm": 12.204031395575958, "learning_rate": 4.989189816345301e-05, "loss": 2.6204, "mean_token_accuracy": 0.37241379618644715, "step": 79085 }, { "epoch": 0.07966028800180493, "grad_norm": 11.249298395271282, "learning_rate": 4.9891861472897785e-05, "loss": 2.4101, "mean_token_accuracy": 0.41034482717514037, "step": 79090 }, { "epoch": 0.0796653240549091, "grad_norm": 11.409319515844729, "learning_rate": 4.98918247761321e-05, "loss": 2.342, "mean_token_accuracy": 0.4586206912994385, "step": 79095 }, { "epoch": 0.07967036010801326, "grad_norm": 14.619813850222037, "learning_rate": 4.989178807315596e-05, "loss": 2.5412, "mean_token_accuracy": 0.41379310488700866, "step": 79100 }, { "epoch": 0.07967539616111743, "grad_norm": 11.591505544280942, "learning_rate": 4.989175136396937e-05, "loss": 2.461, "mean_token_accuracy": 0.4310344815254211, "step": 79105 }, { "epoch": 0.07968043221422161, "grad_norm": 12.188130524149408, "learning_rate": 4.989171464857235e-05, "loss": 2.4768, "mean_token_accuracy": 0.42413793206214906, "step": 79110 }, { "epoch": 0.07968546826732578, "grad_norm": 10.025258511065248, "learning_rate": 4.9891677926964905e-05, "loss": 2.6062, "mean_token_accuracy": 0.42413793206214906, "step": 79115 }, { "epoch": 0.07969050432042996, "grad_norm": 10.353772316045388, "learning_rate": 4.9891641199147035e-05, "loss": 2.2524, "mean_token_accuracy": 0.417241370677948, "step": 79120 }, { "epoch": 0.07969554037353413, "grad_norm": 10.598122069498606, "learning_rate": 4.989160446511877e-05, "loss": 2.5497, "mean_token_accuracy": 0.3896551698446274, "step": 79125 }, { "epoch": 0.0797005764266383, "grad_norm": 16.909737503054817, "learning_rate": 4.989156772488011e-05, "loss": 2.641, "mean_token_accuracy": 0.4206896543502808, "step": 79130 }, { "epoch": 0.07970561247974248, "grad_norm": 13.120796250005803, "learning_rate": 4.989153097843105e-05, "loss": 2.8636, "mean_token_accuracy": 0.3620689570903778, "step": 79135 }, { "epoch": 0.07971064853284665, "grad_norm": 10.786061274805299, "learning_rate": 4.989149422577162e-05, "loss": 2.3404, "mean_token_accuracy": 0.43623715043067934, "step": 79140 }, { "epoch": 0.07971568458595082, "grad_norm": 10.297845580415794, "learning_rate": 4.989145746690184e-05, "loss": 2.5618, "mean_token_accuracy": 0.42413793206214906, "step": 79145 }, { "epoch": 0.079720720639055, "grad_norm": 11.605618769114415, "learning_rate": 4.9891420701821694e-05, "loss": 2.2503, "mean_token_accuracy": 0.4538415014743805, "step": 79150 }, { "epoch": 0.07972575669215917, "grad_norm": 13.484618283359, "learning_rate": 4.9891383930531214e-05, "loss": 2.6268, "mean_token_accuracy": 0.38275861740112305, "step": 79155 }, { "epoch": 0.07973079274526335, "grad_norm": 12.150237886939786, "learning_rate": 4.989134715303039e-05, "loss": 2.4696, "mean_token_accuracy": 0.3827586233615875, "step": 79160 }, { "epoch": 0.07973582879836752, "grad_norm": 12.327805385403058, "learning_rate": 4.989131036931924e-05, "loss": 2.5036, "mean_token_accuracy": 0.39655172228813174, "step": 79165 }, { "epoch": 0.07974086485147168, "grad_norm": 11.228189666213435, "learning_rate": 4.989127357939779e-05, "loss": 2.3775, "mean_token_accuracy": 0.42232305407524107, "step": 79170 }, { "epoch": 0.07974590090457585, "grad_norm": 15.766313352953114, "learning_rate": 4.989123678326603e-05, "loss": 2.9528, "mean_token_accuracy": 0.3551724076271057, "step": 79175 }, { "epoch": 0.07975093695768003, "grad_norm": 14.661573875554476, "learning_rate": 4.989119998092397e-05, "loss": 2.7111, "mean_token_accuracy": 0.4482758641242981, "step": 79180 }, { "epoch": 0.0797559730107842, "grad_norm": 10.27537912167993, "learning_rate": 4.989116317237164e-05, "loss": 2.3425, "mean_token_accuracy": 0.41379310488700866, "step": 79185 }, { "epoch": 0.07976100906388837, "grad_norm": 9.666719547046183, "learning_rate": 4.989112635760902e-05, "loss": 2.4718, "mean_token_accuracy": 0.3999999940395355, "step": 79190 }, { "epoch": 0.07976604511699255, "grad_norm": 12.82334278739967, "learning_rate": 4.9891089536636156e-05, "loss": 2.277, "mean_token_accuracy": 0.4586206912994385, "step": 79195 }, { "epoch": 0.07977108117009672, "grad_norm": 12.451765225044706, "learning_rate": 4.989105270945303e-05, "loss": 2.3536, "mean_token_accuracy": 0.4620689630508423, "step": 79200 }, { "epoch": 0.0797761172232009, "grad_norm": 12.061392168171809, "learning_rate": 4.9891015876059663e-05, "loss": 2.5005, "mean_token_accuracy": 0.4206896543502808, "step": 79205 }, { "epoch": 0.07978115327630507, "grad_norm": 11.569599215930866, "learning_rate": 4.9890979036456064e-05, "loss": 2.4667, "mean_token_accuracy": 0.3862068891525269, "step": 79210 }, { "epoch": 0.07978618932940924, "grad_norm": 10.922474577070293, "learning_rate": 4.989094219064225e-05, "loss": 2.4825, "mean_token_accuracy": 0.41379310488700866, "step": 79215 }, { "epoch": 0.07979122538251342, "grad_norm": 9.916546552249, "learning_rate": 4.989090533861821e-05, "loss": 2.6726, "mean_token_accuracy": 0.3999999940395355, "step": 79220 }, { "epoch": 0.07979626143561759, "grad_norm": 9.326509209526781, "learning_rate": 4.989086848038398e-05, "loss": 2.5372, "mean_token_accuracy": 0.3965517163276672, "step": 79225 }, { "epoch": 0.07980129748872176, "grad_norm": 11.621949220131109, "learning_rate": 4.989083161593956e-05, "loss": 2.362, "mean_token_accuracy": 0.42413792610168455, "step": 79230 }, { "epoch": 0.07980633354182594, "grad_norm": 11.158118043453529, "learning_rate": 4.9890794745284955e-05, "loss": 2.6022, "mean_token_accuracy": 0.4137930989265442, "step": 79235 }, { "epoch": 0.0798113695949301, "grad_norm": 11.61270927968363, "learning_rate": 4.989075786842018e-05, "loss": 2.6715, "mean_token_accuracy": 0.4137930989265442, "step": 79240 }, { "epoch": 0.07981640564803427, "grad_norm": 13.703164591070829, "learning_rate": 4.989072098534524e-05, "loss": 2.915, "mean_token_accuracy": 0.34827586710453035, "step": 79245 }, { "epoch": 0.07982144170113845, "grad_norm": 15.24638430762623, "learning_rate": 4.9890684096060166e-05, "loss": 3.1375, "mean_token_accuracy": 0.3310344725847244, "step": 79250 }, { "epoch": 0.07982647775424262, "grad_norm": 9.540202790414654, "learning_rate": 4.989064720056494e-05, "loss": 2.6545, "mean_token_accuracy": 0.42413793206214906, "step": 79255 }, { "epoch": 0.0798315138073468, "grad_norm": 10.596690551745844, "learning_rate": 4.989061029885959e-05, "loss": 2.7929, "mean_token_accuracy": 0.38965516686439516, "step": 79260 }, { "epoch": 0.07983654986045097, "grad_norm": 9.936718320036045, "learning_rate": 4.989057339094412e-05, "loss": 2.6504, "mean_token_accuracy": 0.3758620709180832, "step": 79265 }, { "epoch": 0.07984158591355514, "grad_norm": 11.048165294774309, "learning_rate": 4.989053647681853e-05, "loss": 2.5218, "mean_token_accuracy": 0.39310344457626345, "step": 79270 }, { "epoch": 0.07984662196665931, "grad_norm": 10.04943602513441, "learning_rate": 4.989049955648285e-05, "loss": 2.6439, "mean_token_accuracy": 0.37241379022598264, "step": 79275 }, { "epoch": 0.07985165801976349, "grad_norm": 8.82320170070897, "learning_rate": 4.989046262993708e-05, "loss": 2.4372, "mean_token_accuracy": 0.41929823756217954, "step": 79280 }, { "epoch": 0.07985669407286766, "grad_norm": 11.795950361431165, "learning_rate": 4.989042569718124e-05, "loss": 2.707, "mean_token_accuracy": 0.34827586114406583, "step": 79285 }, { "epoch": 0.07986173012597184, "grad_norm": 16.722107412778513, "learning_rate": 4.989038875821532e-05, "loss": 2.794, "mean_token_accuracy": 0.3793103456497192, "step": 79290 }, { "epoch": 0.07986676617907601, "grad_norm": 11.847275538094214, "learning_rate": 4.989035181303935e-05, "loss": 3.0336, "mean_token_accuracy": 0.39310343861579894, "step": 79295 }, { "epoch": 0.07987180223218018, "grad_norm": 11.73960278611399, "learning_rate": 4.989031486165332e-05, "loss": 2.72, "mean_token_accuracy": 0.334482753276825, "step": 79300 }, { "epoch": 0.07987683828528436, "grad_norm": 10.140944938298517, "learning_rate": 4.989027790405727e-05, "loss": 2.3956, "mean_token_accuracy": 0.4103448331356049, "step": 79305 }, { "epoch": 0.07988187433838852, "grad_norm": 10.651185281269854, "learning_rate": 4.989024094025118e-05, "loss": 2.802, "mean_token_accuracy": 0.3482758581638336, "step": 79310 }, { "epoch": 0.07988691039149269, "grad_norm": 10.065431122913454, "learning_rate": 4.9890203970235076e-05, "loss": 2.7406, "mean_token_accuracy": 0.36896551847457887, "step": 79315 }, { "epoch": 0.07989194644459686, "grad_norm": 15.532866536386312, "learning_rate": 4.989016699400897e-05, "loss": 3.0697, "mean_token_accuracy": 0.324137932062149, "step": 79320 }, { "epoch": 0.07989698249770104, "grad_norm": 11.036025609573155, "learning_rate": 4.989013001157286e-05, "loss": 2.6956, "mean_token_accuracy": 0.41167573928833007, "step": 79325 }, { "epoch": 0.07990201855080521, "grad_norm": 12.862779580150415, "learning_rate": 4.989009302292677e-05, "loss": 2.795, "mean_token_accuracy": 0.3689655244350433, "step": 79330 }, { "epoch": 0.07990705460390939, "grad_norm": 12.042692920864773, "learning_rate": 4.9890056028070704e-05, "loss": 2.6827, "mean_token_accuracy": 0.36896551847457887, "step": 79335 }, { "epoch": 0.07991209065701356, "grad_norm": 10.637622421730365, "learning_rate": 4.9890019027004675e-05, "loss": 2.8021, "mean_token_accuracy": 0.4034482777118683, "step": 79340 }, { "epoch": 0.07991712671011773, "grad_norm": 11.788736092348069, "learning_rate": 4.988998201972868e-05, "loss": 2.449, "mean_token_accuracy": 0.3999999940395355, "step": 79345 }, { "epoch": 0.07992216276322191, "grad_norm": 10.888020666608107, "learning_rate": 4.9889945006242756e-05, "loss": 2.3412, "mean_token_accuracy": 0.458620685338974, "step": 79350 }, { "epoch": 0.07992719881632608, "grad_norm": 12.733626524190512, "learning_rate": 4.988990798654689e-05, "loss": 2.2461, "mean_token_accuracy": 0.42413792610168455, "step": 79355 }, { "epoch": 0.07993223486943025, "grad_norm": 10.538600366181154, "learning_rate": 4.9889870960641095e-05, "loss": 2.2895, "mean_token_accuracy": 0.43448275327682495, "step": 79360 }, { "epoch": 0.07993727092253443, "grad_norm": 9.958773640758265, "learning_rate": 4.988983392852538e-05, "loss": 2.3394, "mean_token_accuracy": 0.4482758641242981, "step": 79365 }, { "epoch": 0.0799423069756386, "grad_norm": 12.862544927659476, "learning_rate": 4.988979689019977e-05, "loss": 2.5036, "mean_token_accuracy": 0.45862067937850953, "step": 79370 }, { "epoch": 0.07994734302874278, "grad_norm": 12.494765187812574, "learning_rate": 4.988975984566427e-05, "loss": 2.8422, "mean_token_accuracy": 0.2931034505367279, "step": 79375 }, { "epoch": 0.07995237908184694, "grad_norm": 13.62324926522192, "learning_rate": 4.9889722794918884e-05, "loss": 2.5984, "mean_token_accuracy": 0.43793103098869324, "step": 79380 }, { "epoch": 0.07995741513495111, "grad_norm": 12.306840764973211, "learning_rate": 4.9889685737963615e-05, "loss": 2.4794, "mean_token_accuracy": 0.4413793087005615, "step": 79385 }, { "epoch": 0.07996245118805528, "grad_norm": 12.593377958422776, "learning_rate": 4.9889648674798496e-05, "loss": 2.5965, "mean_token_accuracy": 0.43599515557289126, "step": 79390 }, { "epoch": 0.07996748724115946, "grad_norm": 15.03936765847017, "learning_rate": 4.988961160542352e-05, "loss": 2.1961, "mean_token_accuracy": 0.44827585220336913, "step": 79395 }, { "epoch": 0.07997252329426363, "grad_norm": 12.81936495222095, "learning_rate": 4.98895745298387e-05, "loss": 2.9628, "mean_token_accuracy": 0.3241379290819168, "step": 79400 }, { "epoch": 0.0799775593473678, "grad_norm": 15.82207117024875, "learning_rate": 4.988953744804405e-05, "loss": 2.798, "mean_token_accuracy": 0.3896551728248596, "step": 79405 }, { "epoch": 0.07998259540047198, "grad_norm": 15.50903712450827, "learning_rate": 4.988950036003958e-05, "loss": 2.7759, "mean_token_accuracy": 0.3724137842655182, "step": 79410 }, { "epoch": 0.07998763145357615, "grad_norm": 10.24700952681741, "learning_rate": 4.98894632658253e-05, "loss": 2.5835, "mean_token_accuracy": 0.4068965554237366, "step": 79415 }, { "epoch": 0.07999266750668033, "grad_norm": 11.702107205661529, "learning_rate": 4.988942616540121e-05, "loss": 2.5464, "mean_token_accuracy": 0.42413793206214906, "step": 79420 }, { "epoch": 0.0799977035597845, "grad_norm": 9.62508201514851, "learning_rate": 4.9889389058767336e-05, "loss": 2.616, "mean_token_accuracy": 0.4068965554237366, "step": 79425 }, { "epoch": 0.08000273961288867, "grad_norm": 10.912720722740826, "learning_rate": 4.988935194592368e-05, "loss": 2.6568, "mean_token_accuracy": 0.4206896543502808, "step": 79430 }, { "epoch": 0.08000777566599285, "grad_norm": 10.91455076097727, "learning_rate": 4.988931482687025e-05, "loss": 2.487, "mean_token_accuracy": 0.45045371651649474, "step": 79435 }, { "epoch": 0.08001281171909702, "grad_norm": 10.86007851545835, "learning_rate": 4.988927770160706e-05, "loss": 2.4039, "mean_token_accuracy": 0.4379310369491577, "step": 79440 }, { "epoch": 0.0800178477722012, "grad_norm": 10.90186781334332, "learning_rate": 4.988924057013413e-05, "loss": 2.4577, "mean_token_accuracy": 0.3551724135875702, "step": 79445 }, { "epoch": 0.08002288382530535, "grad_norm": 9.539891858373304, "learning_rate": 4.988920343245145e-05, "loss": 2.5775, "mean_token_accuracy": 0.42413793206214906, "step": 79450 }, { "epoch": 0.08002791987840953, "grad_norm": 10.036409760452624, "learning_rate": 4.988916628855905e-05, "loss": 2.3667, "mean_token_accuracy": 0.4068965554237366, "step": 79455 }, { "epoch": 0.0800329559315137, "grad_norm": 12.020411914002882, "learning_rate": 4.988912913845692e-05, "loss": 2.4363, "mean_token_accuracy": 0.4103448212146759, "step": 79460 }, { "epoch": 0.08003799198461788, "grad_norm": 11.553561959101906, "learning_rate": 4.988909198214509e-05, "loss": 2.5985, "mean_token_accuracy": 0.4068965554237366, "step": 79465 }, { "epoch": 0.08004302803772205, "grad_norm": 12.018691851414367, "learning_rate": 4.988905481962356e-05, "loss": 2.5504, "mean_token_accuracy": 0.4310344815254211, "step": 79470 }, { "epoch": 0.08004806409082622, "grad_norm": 10.79057088037424, "learning_rate": 4.988901765089235e-05, "loss": 2.5433, "mean_token_accuracy": 0.3724137842655182, "step": 79475 }, { "epoch": 0.0800531001439304, "grad_norm": 16.415658349364726, "learning_rate": 4.988898047595145e-05, "loss": 2.8936, "mean_token_accuracy": 0.3793103516101837, "step": 79480 }, { "epoch": 0.08005813619703457, "grad_norm": 10.98445964087828, "learning_rate": 4.988894329480088e-05, "loss": 2.1526, "mean_token_accuracy": 0.47241379618644713, "step": 79485 }, { "epoch": 0.08006317225013874, "grad_norm": 10.04208022061848, "learning_rate": 4.988890610744067e-05, "loss": 2.9845, "mean_token_accuracy": 0.37241379022598264, "step": 79490 }, { "epoch": 0.08006820830324292, "grad_norm": 9.28462111364128, "learning_rate": 4.9888868913870794e-05, "loss": 2.2329, "mean_token_accuracy": 0.4344827592372894, "step": 79495 }, { "epoch": 0.08007324435634709, "grad_norm": 13.963385023502035, "learning_rate": 4.988883171409129e-05, "loss": 2.5368, "mean_token_accuracy": 0.3793103516101837, "step": 79500 }, { "epoch": 0.08007828040945127, "grad_norm": 11.910005668954845, "learning_rate": 4.988879450810216e-05, "loss": 2.4019, "mean_token_accuracy": 0.41724138259887694, "step": 79505 }, { "epoch": 0.08008331646255544, "grad_norm": 10.484470060039783, "learning_rate": 4.9888757295903416e-05, "loss": 2.5086, "mean_token_accuracy": 0.4156079888343811, "step": 79510 }, { "epoch": 0.08008835251565961, "grad_norm": 10.622992811141126, "learning_rate": 4.988872007749507e-05, "loss": 2.3137, "mean_token_accuracy": 0.4517241418361664, "step": 79515 }, { "epoch": 0.08009338856876377, "grad_norm": 10.542131848954652, "learning_rate": 4.988868285287712e-05, "loss": 2.6186, "mean_token_accuracy": 0.39655172526836396, "step": 79520 }, { "epoch": 0.08009842462186795, "grad_norm": 11.752066254907898, "learning_rate": 4.988864562204959e-05, "loss": 2.4696, "mean_token_accuracy": 0.4, "step": 79525 }, { "epoch": 0.08010346067497212, "grad_norm": 12.31359622501086, "learning_rate": 4.988860838501249e-05, "loss": 2.9548, "mean_token_accuracy": 0.3896551728248596, "step": 79530 }, { "epoch": 0.0801084967280763, "grad_norm": 10.015468910699429, "learning_rate": 4.988857114176582e-05, "loss": 2.3145, "mean_token_accuracy": 0.4689655125141144, "step": 79535 }, { "epoch": 0.08011353278118047, "grad_norm": 11.515838320838489, "learning_rate": 4.98885338923096e-05, "loss": 2.7304, "mean_token_accuracy": 0.4206896543502808, "step": 79540 }, { "epoch": 0.08011856883428464, "grad_norm": 11.36721479113331, "learning_rate": 4.988849663664383e-05, "loss": 2.5044, "mean_token_accuracy": 0.41724138259887694, "step": 79545 }, { "epoch": 0.08012360488738882, "grad_norm": 11.68749132872101, "learning_rate": 4.988845937476853e-05, "loss": 2.5287, "mean_token_accuracy": 0.4172413766384125, "step": 79550 }, { "epoch": 0.08012864094049299, "grad_norm": 10.99722820556555, "learning_rate": 4.9888422106683713e-05, "loss": 2.2147, "mean_token_accuracy": 0.4354679763317108, "step": 79555 }, { "epoch": 0.08013367699359716, "grad_norm": 10.15516759491134, "learning_rate": 4.9888384832389385e-05, "loss": 2.3187, "mean_token_accuracy": 0.3931034475564957, "step": 79560 }, { "epoch": 0.08013871304670134, "grad_norm": 10.90276231219662, "learning_rate": 4.988834755188555e-05, "loss": 2.3714, "mean_token_accuracy": 0.40689656138420105, "step": 79565 }, { "epoch": 0.08014374909980551, "grad_norm": 11.89418994604228, "learning_rate": 4.988831026517222e-05, "loss": 2.6674, "mean_token_accuracy": 0.3896551728248596, "step": 79570 }, { "epoch": 0.08014878515290969, "grad_norm": 11.53098581794788, "learning_rate": 4.9888272972249406e-05, "loss": 2.4177, "mean_token_accuracy": 0.4103448331356049, "step": 79575 }, { "epoch": 0.08015382120601386, "grad_norm": 10.979256057860457, "learning_rate": 4.988823567311713e-05, "loss": 2.5364, "mean_token_accuracy": 0.4566243290901184, "step": 79580 }, { "epoch": 0.08015885725911803, "grad_norm": 12.53954995100159, "learning_rate": 4.988819836777539e-05, "loss": 2.2688, "mean_token_accuracy": 0.4482758641242981, "step": 79585 }, { "epoch": 0.08016389331222219, "grad_norm": 8.985642105762485, "learning_rate": 4.98881610562242e-05, "loss": 2.514, "mean_token_accuracy": 0.39999998807907106, "step": 79590 }, { "epoch": 0.08016892936532637, "grad_norm": 15.280237782500492, "learning_rate": 4.988812373846357e-05, "loss": 2.6493, "mean_token_accuracy": 0.4, "step": 79595 }, { "epoch": 0.08017396541843054, "grad_norm": 12.741831699507062, "learning_rate": 4.988808641449351e-05, "loss": 2.5508, "mean_token_accuracy": 0.44137930274009707, "step": 79600 }, { "epoch": 0.08017900147153471, "grad_norm": 11.058858186616307, "learning_rate": 4.988804908431403e-05, "loss": 3.029, "mean_token_accuracy": 0.37586206793785093, "step": 79605 }, { "epoch": 0.08018403752463889, "grad_norm": 12.225983720457378, "learning_rate": 4.9888011747925136e-05, "loss": 2.6872, "mean_token_accuracy": 0.42758620977401735, "step": 79610 }, { "epoch": 0.08018907357774306, "grad_norm": 10.153732164002742, "learning_rate": 4.988797440532685e-05, "loss": 2.2537, "mean_token_accuracy": 0.4396249234676361, "step": 79615 }, { "epoch": 0.08019410963084724, "grad_norm": 9.692566389941037, "learning_rate": 4.9887937056519176e-05, "loss": 2.6086, "mean_token_accuracy": 0.36551724672317504, "step": 79620 }, { "epoch": 0.08019914568395141, "grad_norm": 11.19721172131227, "learning_rate": 4.988789970150212e-05, "loss": 3.1417, "mean_token_accuracy": 0.3931034505367279, "step": 79625 }, { "epoch": 0.08020418173705558, "grad_norm": 10.626421905870574, "learning_rate": 4.9887862340275696e-05, "loss": 2.4906, "mean_token_accuracy": 0.4, "step": 79630 }, { "epoch": 0.08020921779015976, "grad_norm": 13.48661091513199, "learning_rate": 4.9887824972839924e-05, "loss": 2.4112, "mean_token_accuracy": 0.417241370677948, "step": 79635 }, { "epoch": 0.08021425384326393, "grad_norm": 11.391024371543178, "learning_rate": 4.98877875991948e-05, "loss": 2.5537, "mean_token_accuracy": 0.3896551728248596, "step": 79640 }, { "epoch": 0.0802192898963681, "grad_norm": 10.763922938761674, "learning_rate": 4.988775021934034e-05, "loss": 3.0357, "mean_token_accuracy": 0.3137931048870087, "step": 79645 }, { "epoch": 0.08022432594947228, "grad_norm": 11.802065843454798, "learning_rate": 4.988771283327654e-05, "loss": 2.6795, "mean_token_accuracy": 0.41724138259887694, "step": 79650 }, { "epoch": 0.08022936200257645, "grad_norm": 13.615335567169247, "learning_rate": 4.9887675441003454e-05, "loss": 2.8496, "mean_token_accuracy": 0.4, "step": 79655 }, { "epoch": 0.08023439805568061, "grad_norm": 12.75109226458029, "learning_rate": 4.988763804252104e-05, "loss": 2.3276, "mean_token_accuracy": 0.43793103098869324, "step": 79660 }, { "epoch": 0.08023943410878479, "grad_norm": 10.966371952175226, "learning_rate": 4.9887600637829335e-05, "loss": 2.4453, "mean_token_accuracy": 0.3999999940395355, "step": 79665 }, { "epoch": 0.08024447016188896, "grad_norm": 9.734638172584472, "learning_rate": 4.9887563226928345e-05, "loss": 2.1118, "mean_token_accuracy": 0.47931034564971925, "step": 79670 }, { "epoch": 0.08024950621499313, "grad_norm": 10.442486615425338, "learning_rate": 4.988752580981809e-05, "loss": 2.5411, "mean_token_accuracy": 0.3655172407627106, "step": 79675 }, { "epoch": 0.0802545422680973, "grad_norm": 12.56943107664165, "learning_rate": 4.9887488386498566e-05, "loss": 2.5066, "mean_token_accuracy": 0.41034482717514037, "step": 79680 }, { "epoch": 0.08025957832120148, "grad_norm": 12.139026560210146, "learning_rate": 4.988745095696978e-05, "loss": 2.7677, "mean_token_accuracy": 0.36896551847457887, "step": 79685 }, { "epoch": 0.08026461437430565, "grad_norm": 10.783844072848778, "learning_rate": 4.9887413521231755e-05, "loss": 2.1863, "mean_token_accuracy": 0.47241379618644713, "step": 79690 }, { "epoch": 0.08026965042740983, "grad_norm": 12.714070347197381, "learning_rate": 4.98873760792845e-05, "loss": 2.8798, "mean_token_accuracy": 0.39999999701976774, "step": 79695 }, { "epoch": 0.080274686480514, "grad_norm": 12.097323280906638, "learning_rate": 4.988733863112803e-05, "loss": 2.8448, "mean_token_accuracy": 0.3586206793785095, "step": 79700 }, { "epoch": 0.08027972253361818, "grad_norm": 11.318475823153298, "learning_rate": 4.988730117676234e-05, "loss": 2.5614, "mean_token_accuracy": 0.38275861740112305, "step": 79705 }, { "epoch": 0.08028475858672235, "grad_norm": 12.342792221412806, "learning_rate": 4.988726371618745e-05, "loss": 2.2565, "mean_token_accuracy": 0.47447065711021424, "step": 79710 }, { "epoch": 0.08028979463982652, "grad_norm": 13.079296823306526, "learning_rate": 4.9887226249403366e-05, "loss": 3.2312, "mean_token_accuracy": 0.3034482777118683, "step": 79715 }, { "epoch": 0.0802948306929307, "grad_norm": 10.189609894998457, "learning_rate": 4.9887188776410104e-05, "loss": 2.212, "mean_token_accuracy": 0.446059113740921, "step": 79720 }, { "epoch": 0.08029986674603487, "grad_norm": 12.880201862628562, "learning_rate": 4.988715129720767e-05, "loss": 2.5805, "mean_token_accuracy": 0.4379310369491577, "step": 79725 }, { "epoch": 0.08030490279913903, "grad_norm": 10.701305269857007, "learning_rate": 4.988711381179608e-05, "loss": 2.2229, "mean_token_accuracy": 0.4482758641242981, "step": 79730 }, { "epoch": 0.0803099388522432, "grad_norm": 10.74862441620486, "learning_rate": 4.9887076320175344e-05, "loss": 2.3637, "mean_token_accuracy": 0.379310342669487, "step": 79735 }, { "epoch": 0.08031497490534738, "grad_norm": 10.315495838198936, "learning_rate": 4.988703882234546e-05, "loss": 2.13, "mean_token_accuracy": 0.4793103516101837, "step": 79740 }, { "epoch": 0.08032001095845155, "grad_norm": 9.859496816317955, "learning_rate": 4.9887001318306445e-05, "loss": 2.5702, "mean_token_accuracy": 0.3873563170433044, "step": 79745 }, { "epoch": 0.08032504701155573, "grad_norm": 11.39502861035003, "learning_rate": 4.9886963808058316e-05, "loss": 2.2766, "mean_token_accuracy": 0.4482758641242981, "step": 79750 }, { "epoch": 0.0803300830646599, "grad_norm": 11.030634730035914, "learning_rate": 4.988692629160109e-05, "loss": 2.442, "mean_token_accuracy": 0.4121597111225128, "step": 79755 }, { "epoch": 0.08033511911776407, "grad_norm": 11.330420949228133, "learning_rate": 4.988688876893475e-05, "loss": 2.7946, "mean_token_accuracy": 0.34137930870056155, "step": 79760 }, { "epoch": 0.08034015517086825, "grad_norm": 10.644631745597845, "learning_rate": 4.9886851240059326e-05, "loss": 2.5511, "mean_token_accuracy": 0.4310344815254211, "step": 79765 }, { "epoch": 0.08034519122397242, "grad_norm": 9.669454591323623, "learning_rate": 4.9886813704974836e-05, "loss": 2.8437, "mean_token_accuracy": 0.38620689511299133, "step": 79770 }, { "epoch": 0.0803502272770766, "grad_norm": 11.374484953795273, "learning_rate": 4.988677616368127e-05, "loss": 2.2781, "mean_token_accuracy": 0.4295220851898193, "step": 79775 }, { "epoch": 0.08035526333018077, "grad_norm": 9.795831235410121, "learning_rate": 4.9886738616178646e-05, "loss": 2.5287, "mean_token_accuracy": 0.41379310488700866, "step": 79780 }, { "epoch": 0.08036029938328494, "grad_norm": 10.655827587640758, "learning_rate": 4.9886701062466986e-05, "loss": 2.3407, "mean_token_accuracy": 0.38965516090393065, "step": 79785 }, { "epoch": 0.08036533543638912, "grad_norm": 12.586780844475395, "learning_rate": 4.9886663502546284e-05, "loss": 2.4843, "mean_token_accuracy": 0.39655173420906065, "step": 79790 }, { "epoch": 0.08037037148949329, "grad_norm": 11.927960813721993, "learning_rate": 4.9886625936416566e-05, "loss": 2.678, "mean_token_accuracy": 0.3793103516101837, "step": 79795 }, { "epoch": 0.08037540754259745, "grad_norm": 9.521429979326083, "learning_rate": 4.9886588364077825e-05, "loss": 2.7178, "mean_token_accuracy": 0.40689654350280763, "step": 79800 }, { "epoch": 0.08038044359570162, "grad_norm": 11.899903336030563, "learning_rate": 4.9886550785530076e-05, "loss": 2.3666, "mean_token_accuracy": 0.42068966031074523, "step": 79805 }, { "epoch": 0.0803854796488058, "grad_norm": 12.300148921835435, "learning_rate": 4.9886513200773344e-05, "loss": 2.2913, "mean_token_accuracy": 0.4517241299152374, "step": 79810 }, { "epoch": 0.08039051570190997, "grad_norm": 10.563158826999986, "learning_rate": 4.988647560980762e-05, "loss": 2.2174, "mean_token_accuracy": 0.42758620381355283, "step": 79815 }, { "epoch": 0.08039555175501414, "grad_norm": 10.750558351441953, "learning_rate": 4.988643801263293e-05, "loss": 2.3908, "mean_token_accuracy": 0.43448275327682495, "step": 79820 }, { "epoch": 0.08040058780811832, "grad_norm": 14.524846482764712, "learning_rate": 4.988640040924928e-05, "loss": 2.5401, "mean_token_accuracy": 0.3896551728248596, "step": 79825 }, { "epoch": 0.08040562386122249, "grad_norm": 12.221590944296963, "learning_rate": 4.988636279965667e-05, "loss": 2.8444, "mean_token_accuracy": 0.36896551847457887, "step": 79830 }, { "epoch": 0.08041065991432667, "grad_norm": 12.74631890165887, "learning_rate": 4.988632518385513e-05, "loss": 2.5182, "mean_token_accuracy": 0.42068964838981626, "step": 79835 }, { "epoch": 0.08041569596743084, "grad_norm": 11.687276256584504, "learning_rate": 4.988628756184465e-05, "loss": 2.7375, "mean_token_accuracy": 0.3482758641242981, "step": 79840 }, { "epoch": 0.08042073202053501, "grad_norm": 14.082311400520085, "learning_rate": 4.988624993362525e-05, "loss": 2.2607, "mean_token_accuracy": 0.493103438615799, "step": 79845 }, { "epoch": 0.08042576807363919, "grad_norm": 12.966420308281704, "learning_rate": 4.988621229919694e-05, "loss": 2.5812, "mean_token_accuracy": 0.41034482717514037, "step": 79850 }, { "epoch": 0.08043080412674336, "grad_norm": 10.991819187188435, "learning_rate": 4.988617465855974e-05, "loss": 2.6653, "mean_token_accuracy": 0.42758620381355283, "step": 79855 }, { "epoch": 0.08043584017984753, "grad_norm": 10.739196316802005, "learning_rate": 4.9886137011713644e-05, "loss": 2.4861, "mean_token_accuracy": 0.41724138259887694, "step": 79860 }, { "epoch": 0.08044087623295171, "grad_norm": 12.909467950256174, "learning_rate": 4.988609935865867e-05, "loss": 2.5022, "mean_token_accuracy": 0.4431336998939514, "step": 79865 }, { "epoch": 0.08044591228605587, "grad_norm": 11.670864669742258, "learning_rate": 4.988606169939482e-05, "loss": 2.6424, "mean_token_accuracy": 0.4172413766384125, "step": 79870 }, { "epoch": 0.08045094833916004, "grad_norm": 15.221183157398855, "learning_rate": 4.988602403392212e-05, "loss": 2.7179, "mean_token_accuracy": 0.3448275804519653, "step": 79875 }, { "epoch": 0.08045598439226422, "grad_norm": 11.838323875449849, "learning_rate": 4.988598636224057e-05, "loss": 2.2819, "mean_token_accuracy": 0.4344827592372894, "step": 79880 }, { "epoch": 0.08046102044536839, "grad_norm": 11.225833823955256, "learning_rate": 4.988594868435019e-05, "loss": 2.6866, "mean_token_accuracy": 0.39310344457626345, "step": 79885 }, { "epoch": 0.08046605649847256, "grad_norm": 12.98032907270105, "learning_rate": 4.988591100025097e-05, "loss": 2.3301, "mean_token_accuracy": 0.4206896543502808, "step": 79890 }, { "epoch": 0.08047109255157674, "grad_norm": 13.410106224052436, "learning_rate": 4.988587330994295e-05, "loss": 2.4293, "mean_token_accuracy": 0.4000000059604645, "step": 79895 }, { "epoch": 0.08047612860468091, "grad_norm": 13.169469317421113, "learning_rate": 4.988583561342612e-05, "loss": 2.318, "mean_token_accuracy": 0.44137930274009707, "step": 79900 }, { "epoch": 0.08048116465778508, "grad_norm": 10.486903331143807, "learning_rate": 4.9885797910700485e-05, "loss": 2.9473, "mean_token_accuracy": 0.3844525098800659, "step": 79905 }, { "epoch": 0.08048620071088926, "grad_norm": 8.90262439587754, "learning_rate": 4.9885760201766074e-05, "loss": 2.435, "mean_token_accuracy": 0.38275861740112305, "step": 79910 }, { "epoch": 0.08049123676399343, "grad_norm": 11.884550774846595, "learning_rate": 4.988572248662289e-05, "loss": 2.6421, "mean_token_accuracy": 0.40344828069210054, "step": 79915 }, { "epoch": 0.0804962728170976, "grad_norm": 11.787390357462737, "learning_rate": 4.988568476527094e-05, "loss": 2.2563, "mean_token_accuracy": 0.44137930274009707, "step": 79920 }, { "epoch": 0.08050130887020178, "grad_norm": 10.90209612277338, "learning_rate": 4.9885647037710236e-05, "loss": 2.3895, "mean_token_accuracy": 0.41034482717514037, "step": 79925 }, { "epoch": 0.08050634492330595, "grad_norm": 11.093734961714159, "learning_rate": 4.988560930394079e-05, "loss": 2.5695, "mean_token_accuracy": 0.3551724135875702, "step": 79930 }, { "epoch": 0.08051138097641013, "grad_norm": 11.126401106702302, "learning_rate": 4.9885571563962617e-05, "loss": 2.6632, "mean_token_accuracy": 0.4068965554237366, "step": 79935 }, { "epoch": 0.08051641702951429, "grad_norm": 11.105289026821751, "learning_rate": 4.988553381777571e-05, "loss": 2.3929, "mean_token_accuracy": 0.4068965494632721, "step": 79940 }, { "epoch": 0.08052145308261846, "grad_norm": 10.380835545502952, "learning_rate": 4.98854960653801e-05, "loss": 2.4595, "mean_token_accuracy": 0.42413792610168455, "step": 79945 }, { "epoch": 0.08052648913572263, "grad_norm": 12.144611770987705, "learning_rate": 4.988545830677579e-05, "loss": 2.7452, "mean_token_accuracy": 0.44343616962432864, "step": 79950 }, { "epoch": 0.08053152518882681, "grad_norm": 10.389274956117513, "learning_rate": 4.988542054196279e-05, "loss": 2.6751, "mean_token_accuracy": 0.38620689511299133, "step": 79955 }, { "epoch": 0.08053656124193098, "grad_norm": 11.256659021841969, "learning_rate": 4.988538277094111e-05, "loss": 2.3903, "mean_token_accuracy": 0.41379310488700866, "step": 79960 }, { "epoch": 0.08054159729503516, "grad_norm": 13.262805525885387, "learning_rate": 4.988534499371076e-05, "loss": 3.2736, "mean_token_accuracy": 0.31379310190677645, "step": 79965 }, { "epoch": 0.08054663334813933, "grad_norm": 12.817269076485031, "learning_rate": 4.988530721027174e-05, "loss": 2.4415, "mean_token_accuracy": 0.4103448152542114, "step": 79970 }, { "epoch": 0.0805516694012435, "grad_norm": 10.52413004771572, "learning_rate": 4.988526942062408e-05, "loss": 2.3273, "mean_token_accuracy": 0.49522080421447756, "step": 79975 }, { "epoch": 0.08055670545434768, "grad_norm": 10.340954679511736, "learning_rate": 4.988523162476779e-05, "loss": 2.9416, "mean_token_accuracy": 0.3689655214548111, "step": 79980 }, { "epoch": 0.08056174150745185, "grad_norm": 11.591033114318126, "learning_rate": 4.988519382270286e-05, "loss": 2.5983, "mean_token_accuracy": 0.4, "step": 79985 }, { "epoch": 0.08056677756055602, "grad_norm": 8.90478801103833, "learning_rate": 4.988515601442932e-05, "loss": 2.9364, "mean_token_accuracy": 0.3862068921327591, "step": 79990 }, { "epoch": 0.0805718136136602, "grad_norm": 13.762661286990895, "learning_rate": 4.988511819994717e-05, "loss": 2.2698, "mean_token_accuracy": 0.4655172526836395, "step": 79995 }, { "epoch": 0.08057684966676437, "grad_norm": 10.912207858523226, "learning_rate": 4.988508037925643e-05, "loss": 2.1762, "mean_token_accuracy": 0.4413793087005615, "step": 80000 }, { "epoch": 0.08058188571986855, "grad_norm": 9.51618328037348, "learning_rate": 4.98850425523571e-05, "loss": 2.237, "mean_token_accuracy": 0.42758620381355283, "step": 80005 }, { "epoch": 0.0805869217729727, "grad_norm": 11.867797072088862, "learning_rate": 4.9885004719249196e-05, "loss": 2.853, "mean_token_accuracy": 0.3793103456497192, "step": 80010 }, { "epoch": 0.08059195782607688, "grad_norm": 13.887625076652968, "learning_rate": 4.9884966879932725e-05, "loss": 2.7307, "mean_token_accuracy": 0.38275861740112305, "step": 80015 }, { "epoch": 0.08059699387918105, "grad_norm": 10.029960111341182, "learning_rate": 4.98849290344077e-05, "loss": 2.0672, "mean_token_accuracy": 0.5068965494632721, "step": 80020 }, { "epoch": 0.08060202993228523, "grad_norm": 9.974470470143453, "learning_rate": 4.988489118267413e-05, "loss": 2.5035, "mean_token_accuracy": 0.38275861740112305, "step": 80025 }, { "epoch": 0.0806070659853894, "grad_norm": 11.435146159564116, "learning_rate": 4.988485332473204e-05, "loss": 2.6457, "mean_token_accuracy": 0.36896551251411436, "step": 80030 }, { "epoch": 0.08061210203849357, "grad_norm": 12.318485210083049, "learning_rate": 4.988481546058141e-05, "loss": 2.4789, "mean_token_accuracy": 0.4261947989463806, "step": 80035 }, { "epoch": 0.08061713809159775, "grad_norm": 13.11043811340994, "learning_rate": 4.988477759022228e-05, "loss": 2.3176, "mean_token_accuracy": 0.47090138792991637, "step": 80040 }, { "epoch": 0.08062217414470192, "grad_norm": 9.232902861239058, "learning_rate": 4.988473971365464e-05, "loss": 2.2328, "mean_token_accuracy": 0.41724138557910917, "step": 80045 }, { "epoch": 0.0806272101978061, "grad_norm": 15.231336022835224, "learning_rate": 4.9884701830878514e-05, "loss": 2.7632, "mean_token_accuracy": 0.3758620649576187, "step": 80050 }, { "epoch": 0.08063224625091027, "grad_norm": 11.171454595368516, "learning_rate": 4.988466394189391e-05, "loss": 2.3664, "mean_token_accuracy": 0.4310344815254211, "step": 80055 }, { "epoch": 0.08063728230401444, "grad_norm": 9.475104022093126, "learning_rate": 4.988462604670083e-05, "loss": 2.5767, "mean_token_accuracy": 0.4068965494632721, "step": 80060 }, { "epoch": 0.08064231835711862, "grad_norm": 10.139071607950687, "learning_rate": 4.988458814529929e-05, "loss": 2.3909, "mean_token_accuracy": 0.4020568609237671, "step": 80065 }, { "epoch": 0.08064735441022279, "grad_norm": 11.389050934764665, "learning_rate": 4.988455023768931e-05, "loss": 2.4227, "mean_token_accuracy": 0.4655172288417816, "step": 80070 }, { "epoch": 0.08065239046332696, "grad_norm": 8.679097856972227, "learning_rate": 4.988451232387088e-05, "loss": 2.3241, "mean_token_accuracy": 0.4724137902259827, "step": 80075 }, { "epoch": 0.08065742651643112, "grad_norm": 12.146492281222727, "learning_rate": 4.988447440384402e-05, "loss": 2.6556, "mean_token_accuracy": 0.43448275327682495, "step": 80080 }, { "epoch": 0.0806624625695353, "grad_norm": 14.02000345331648, "learning_rate": 4.988443647760876e-05, "loss": 2.5912, "mean_token_accuracy": 0.3620689660310745, "step": 80085 }, { "epoch": 0.08066749862263947, "grad_norm": 11.506086277764037, "learning_rate": 4.9884398545165076e-05, "loss": 2.6011, "mean_token_accuracy": 0.37931033968925476, "step": 80090 }, { "epoch": 0.08067253467574365, "grad_norm": 10.218058789640365, "learning_rate": 4.9884360606513006e-05, "loss": 2.2922, "mean_token_accuracy": 0.4868118643760681, "step": 80095 }, { "epoch": 0.08067757072884782, "grad_norm": 14.27204041098857, "learning_rate": 4.988432266165254e-05, "loss": 2.2842, "mean_token_accuracy": 0.441379314661026, "step": 80100 }, { "epoch": 0.080682606781952, "grad_norm": 12.51315977299802, "learning_rate": 4.9884284710583714e-05, "loss": 2.3819, "mean_token_accuracy": 0.38275861740112305, "step": 80105 }, { "epoch": 0.08068764283505617, "grad_norm": 10.869885159969265, "learning_rate": 4.988424675330651e-05, "loss": 2.545, "mean_token_accuracy": 0.3793103486299515, "step": 80110 }, { "epoch": 0.08069267888816034, "grad_norm": 10.941300973683427, "learning_rate": 4.9884208789820955e-05, "loss": 2.5336, "mean_token_accuracy": 0.4, "step": 80115 }, { "epoch": 0.08069771494126451, "grad_norm": 11.211951391725778, "learning_rate": 4.988417082012706e-05, "loss": 2.5925, "mean_token_accuracy": 0.43944343328475954, "step": 80120 }, { "epoch": 0.08070275099436869, "grad_norm": 10.058595229674568, "learning_rate": 4.988413284422482e-05, "loss": 2.7034, "mean_token_accuracy": 0.4034482717514038, "step": 80125 }, { "epoch": 0.08070778704747286, "grad_norm": 19.511809694640156, "learning_rate": 4.988409486211427e-05, "loss": 2.2603, "mean_token_accuracy": 0.4862069010734558, "step": 80130 }, { "epoch": 0.08071282310057704, "grad_norm": 11.644891592733625, "learning_rate": 4.9884056873795406e-05, "loss": 2.9915, "mean_token_accuracy": 0.358620685338974, "step": 80135 }, { "epoch": 0.08071785915368121, "grad_norm": 15.7495380519481, "learning_rate": 4.988401887926824e-05, "loss": 3.0508, "mean_token_accuracy": 0.358620685338974, "step": 80140 }, { "epoch": 0.08072289520678538, "grad_norm": 12.507406787038004, "learning_rate": 4.988398087853277e-05, "loss": 3.0027, "mean_token_accuracy": 0.3517241358757019, "step": 80145 }, { "epoch": 0.08072793125988954, "grad_norm": 10.038542193021751, "learning_rate": 4.9883942871589036e-05, "loss": 2.4415, "mean_token_accuracy": 0.43448275327682495, "step": 80150 }, { "epoch": 0.08073296731299372, "grad_norm": 9.411735462598953, "learning_rate": 4.988390485843702e-05, "loss": 2.237, "mean_token_accuracy": 0.43793103098869324, "step": 80155 }, { "epoch": 0.08073800336609789, "grad_norm": 9.72046616540649, "learning_rate": 4.988386683907675e-05, "loss": 2.2092, "mean_token_accuracy": 0.4793103337287903, "step": 80160 }, { "epoch": 0.08074303941920206, "grad_norm": 12.138774288037089, "learning_rate": 4.988382881350824e-05, "loss": 2.7918, "mean_token_accuracy": 0.33793103098869326, "step": 80165 }, { "epoch": 0.08074807547230624, "grad_norm": 18.716953340008946, "learning_rate": 4.988379078173148e-05, "loss": 2.6917, "mean_token_accuracy": 0.40000000298023225, "step": 80170 }, { "epoch": 0.08075311152541041, "grad_norm": 10.86495990632601, "learning_rate": 4.9883752743746495e-05, "loss": 2.1994, "mean_token_accuracy": 0.4586206912994385, "step": 80175 }, { "epoch": 0.08075814757851459, "grad_norm": 9.131201003497042, "learning_rate": 4.98837146995533e-05, "loss": 2.6416, "mean_token_accuracy": 0.4034482777118683, "step": 80180 }, { "epoch": 0.08076318363161876, "grad_norm": 10.07188350968997, "learning_rate": 4.988367664915188e-05, "loss": 2.5278, "mean_token_accuracy": 0.42068964838981626, "step": 80185 }, { "epoch": 0.08076821968472293, "grad_norm": 9.71708828707169, "learning_rate": 4.988363859254228e-05, "loss": 2.1993, "mean_token_accuracy": 0.45517241954803467, "step": 80190 }, { "epoch": 0.08077325573782711, "grad_norm": 11.043367429420679, "learning_rate": 4.988360052972449e-05, "loss": 2.1815, "mean_token_accuracy": 0.4551724076271057, "step": 80195 }, { "epoch": 0.08077829179093128, "grad_norm": 9.97235453722267, "learning_rate": 4.9883562460698524e-05, "loss": 2.4646, "mean_token_accuracy": 0.4344827592372894, "step": 80200 }, { "epoch": 0.08078332784403545, "grad_norm": 12.43980982333486, "learning_rate": 4.988352438546439e-05, "loss": 2.1286, "mean_token_accuracy": 0.4586206912994385, "step": 80205 }, { "epoch": 0.08078836389713963, "grad_norm": 11.658957859859289, "learning_rate": 4.9883486304022106e-05, "loss": 2.5462, "mean_token_accuracy": 0.4172413766384125, "step": 80210 }, { "epoch": 0.0807933999502438, "grad_norm": 11.799511945372018, "learning_rate": 4.988344821637168e-05, "loss": 2.2434, "mean_token_accuracy": 0.43448275327682495, "step": 80215 }, { "epoch": 0.08079843600334796, "grad_norm": 12.098013786992764, "learning_rate": 4.9883410122513116e-05, "loss": 2.6017, "mean_token_accuracy": 0.3793103456497192, "step": 80220 }, { "epoch": 0.08080347205645214, "grad_norm": 12.422930525499568, "learning_rate": 4.9883372022446435e-05, "loss": 2.7533, "mean_token_accuracy": 0.41034482717514037, "step": 80225 }, { "epoch": 0.08080850810955631, "grad_norm": 14.897009786497573, "learning_rate": 4.988333391617164e-05, "loss": 2.8545, "mean_token_accuracy": 0.35862069129943847, "step": 80230 }, { "epoch": 0.08081354416266048, "grad_norm": 11.826513220996073, "learning_rate": 4.988329580368875e-05, "loss": 2.246, "mean_token_accuracy": 0.43793103098869324, "step": 80235 }, { "epoch": 0.08081858021576466, "grad_norm": 15.203374607575356, "learning_rate": 4.988325768499775e-05, "loss": 2.8242, "mean_token_accuracy": 0.3931034505367279, "step": 80240 }, { "epoch": 0.08082361626886883, "grad_norm": 11.174047717908518, "learning_rate": 4.9883219560098696e-05, "loss": 2.1627, "mean_token_accuracy": 0.43103447556495667, "step": 80245 }, { "epoch": 0.080828652321973, "grad_norm": 12.682668948223332, "learning_rate": 4.988318142899156e-05, "loss": 2.7145, "mean_token_accuracy": 0.3986690878868103, "step": 80250 }, { "epoch": 0.08083368837507718, "grad_norm": 12.50380770396913, "learning_rate": 4.988314329167636e-05, "loss": 2.496, "mean_token_accuracy": 0.36896551251411436, "step": 80255 }, { "epoch": 0.08083872442818135, "grad_norm": 13.32962268240316, "learning_rate": 4.988310514815312e-05, "loss": 2.3401, "mean_token_accuracy": 0.48784029483795166, "step": 80260 }, { "epoch": 0.08084376048128553, "grad_norm": 12.12451603861171, "learning_rate": 4.9883066998421835e-05, "loss": 2.4031, "mean_token_accuracy": 0.4517241418361664, "step": 80265 }, { "epoch": 0.0808487965343897, "grad_norm": 15.099892896296414, "learning_rate": 4.9883028842482524e-05, "loss": 2.9451, "mean_token_accuracy": 0.3482758551836014, "step": 80270 }, { "epoch": 0.08085383258749387, "grad_norm": 12.094979330002506, "learning_rate": 4.9882990680335206e-05, "loss": 2.7055, "mean_token_accuracy": 0.38965516686439516, "step": 80275 }, { "epoch": 0.08085886864059805, "grad_norm": 11.665387114445087, "learning_rate": 4.9882952511979875e-05, "loss": 2.2193, "mean_token_accuracy": 0.4448275864124298, "step": 80280 }, { "epoch": 0.08086390469370222, "grad_norm": 14.10158159130548, "learning_rate": 4.9882914337416545e-05, "loss": 3.0817, "mean_token_accuracy": 0.34319419860839845, "step": 80285 }, { "epoch": 0.08086894074680638, "grad_norm": 10.179150286633963, "learning_rate": 4.9882876156645236e-05, "loss": 2.7764, "mean_token_accuracy": 0.34482758641242983, "step": 80290 }, { "epoch": 0.08087397679991055, "grad_norm": 10.871569142489253, "learning_rate": 4.9882837969665955e-05, "loss": 2.6984, "mean_token_accuracy": 0.39655172526836396, "step": 80295 }, { "epoch": 0.08087901285301473, "grad_norm": 12.875749533800725, "learning_rate": 4.98827997764787e-05, "loss": 2.5701, "mean_token_accuracy": 0.4103448331356049, "step": 80300 }, { "epoch": 0.0808840489061189, "grad_norm": 13.002409126383187, "learning_rate": 4.98827615770835e-05, "loss": 2.4456, "mean_token_accuracy": 0.4034482777118683, "step": 80305 }, { "epoch": 0.08088908495922308, "grad_norm": 10.513427848861115, "learning_rate": 4.988272337148036e-05, "loss": 2.5324, "mean_token_accuracy": 0.4379310429096222, "step": 80310 }, { "epoch": 0.08089412101232725, "grad_norm": 10.709374298765223, "learning_rate": 4.988268515966928e-05, "loss": 2.6932, "mean_token_accuracy": 0.4103448212146759, "step": 80315 }, { "epoch": 0.08089915706543142, "grad_norm": 11.039740581851477, "learning_rate": 4.988264694165029e-05, "loss": 2.6414, "mean_token_accuracy": 0.34482758939266206, "step": 80320 }, { "epoch": 0.0809041931185356, "grad_norm": 9.579921519239816, "learning_rate": 4.988260871742337e-05, "loss": 1.9643, "mean_token_accuracy": 0.5034482777118683, "step": 80325 }, { "epoch": 0.08090922917163977, "grad_norm": 15.581674042565298, "learning_rate": 4.988257048698857e-05, "loss": 2.9888, "mean_token_accuracy": 0.3620689630508423, "step": 80330 }, { "epoch": 0.08091426522474394, "grad_norm": 8.304602156844403, "learning_rate": 4.9882532250345876e-05, "loss": 1.9195, "mean_token_accuracy": 0.5086206912994384, "step": 80335 }, { "epoch": 0.08091930127784812, "grad_norm": 13.298919231907949, "learning_rate": 4.98824940074953e-05, "loss": 2.7577, "mean_token_accuracy": 0.3965517163276672, "step": 80340 }, { "epoch": 0.08092433733095229, "grad_norm": 11.09491780705811, "learning_rate": 4.988245575843685e-05, "loss": 2.4673, "mean_token_accuracy": 0.4206896543502808, "step": 80345 }, { "epoch": 0.08092937338405647, "grad_norm": 10.64679779297878, "learning_rate": 4.988241750317055e-05, "loss": 2.3759, "mean_token_accuracy": 0.4517241418361664, "step": 80350 }, { "epoch": 0.08093440943716064, "grad_norm": 10.777953916426817, "learning_rate": 4.9882379241696404e-05, "loss": 2.4596, "mean_token_accuracy": 0.4034482777118683, "step": 80355 }, { "epoch": 0.0809394454902648, "grad_norm": 10.967138295568134, "learning_rate": 4.9882340974014426e-05, "loss": 2.2674, "mean_token_accuracy": 0.4310344815254211, "step": 80360 }, { "epoch": 0.08094448154336897, "grad_norm": 12.183091784277773, "learning_rate": 4.9882302700124604e-05, "loss": 2.9372, "mean_token_accuracy": 0.3482758641242981, "step": 80365 }, { "epoch": 0.08094951759647315, "grad_norm": 13.835472889224372, "learning_rate": 4.988226442002698e-05, "loss": 2.6108, "mean_token_accuracy": 0.39655172228813174, "step": 80370 }, { "epoch": 0.08095455364957732, "grad_norm": 10.378683760424224, "learning_rate": 4.988222613372155e-05, "loss": 2.3908, "mean_token_accuracy": 0.42413793206214906, "step": 80375 }, { "epoch": 0.0809595897026815, "grad_norm": 10.333852996097557, "learning_rate": 4.988218784120833e-05, "loss": 2.316, "mean_token_accuracy": 0.43793103098869324, "step": 80380 }, { "epoch": 0.08096462575578567, "grad_norm": 11.81532983430724, "learning_rate": 4.988214954248732e-05, "loss": 2.4685, "mean_token_accuracy": 0.4206896543502808, "step": 80385 }, { "epoch": 0.08096966180888984, "grad_norm": 13.466744886582683, "learning_rate": 4.988211123755854e-05, "loss": 2.5621, "mean_token_accuracy": 0.3827586233615875, "step": 80390 }, { "epoch": 0.08097469786199402, "grad_norm": 10.700760446542624, "learning_rate": 4.9882072926422e-05, "loss": 2.4769, "mean_token_accuracy": 0.45172414779663084, "step": 80395 }, { "epoch": 0.08097973391509819, "grad_norm": 9.993418519949108, "learning_rate": 4.98820346090777e-05, "loss": 2.7326, "mean_token_accuracy": 0.37586206793785093, "step": 80400 }, { "epoch": 0.08098476996820236, "grad_norm": 10.983636937786894, "learning_rate": 4.9881996285525676e-05, "loss": 2.3388, "mean_token_accuracy": 0.3876587986946106, "step": 80405 }, { "epoch": 0.08098980602130654, "grad_norm": 16.44801484288338, "learning_rate": 4.9881957955765906e-05, "loss": 2.5329, "mean_token_accuracy": 0.4482758641242981, "step": 80410 }, { "epoch": 0.08099484207441071, "grad_norm": 11.539544601640708, "learning_rate": 4.9881919619798415e-05, "loss": 2.4551, "mean_token_accuracy": 0.4034482777118683, "step": 80415 }, { "epoch": 0.08099987812751488, "grad_norm": 10.210634267785238, "learning_rate": 4.988188127762322e-05, "loss": 2.3251, "mean_token_accuracy": 0.44482758045196535, "step": 80420 }, { "epoch": 0.08100491418061906, "grad_norm": 12.300412509665371, "learning_rate": 4.9881842929240336e-05, "loss": 2.2474, "mean_token_accuracy": 0.4310344815254211, "step": 80425 }, { "epoch": 0.08100995023372322, "grad_norm": 16.171579462595655, "learning_rate": 4.9881804574649755e-05, "loss": 2.7253, "mean_token_accuracy": 0.36896551847457887, "step": 80430 }, { "epoch": 0.08101498628682739, "grad_norm": 12.045463551483095, "learning_rate": 4.98817662138515e-05, "loss": 2.6204, "mean_token_accuracy": 0.4034482777118683, "step": 80435 }, { "epoch": 0.08102002233993157, "grad_norm": 16.59282934226582, "learning_rate": 4.988172784684558e-05, "loss": 2.6744, "mean_token_accuracy": 0.3965517163276672, "step": 80440 }, { "epoch": 0.08102505839303574, "grad_norm": 11.697301474228242, "learning_rate": 4.9881689473632e-05, "loss": 2.7192, "mean_token_accuracy": 0.39310345649719236, "step": 80445 }, { "epoch": 0.08103009444613991, "grad_norm": 12.67939947331828, "learning_rate": 4.988165109421077e-05, "loss": 2.63, "mean_token_accuracy": 0.3551724076271057, "step": 80450 }, { "epoch": 0.08103513049924409, "grad_norm": 11.676686093097386, "learning_rate": 4.9881612708581915e-05, "loss": 2.3637, "mean_token_accuracy": 0.4172413766384125, "step": 80455 }, { "epoch": 0.08104016655234826, "grad_norm": 10.519003900866483, "learning_rate": 4.988157431674543e-05, "loss": 2.5983, "mean_token_accuracy": 0.43629764318466185, "step": 80460 }, { "epoch": 0.08104520260545243, "grad_norm": 11.984610301428427, "learning_rate": 4.9881535918701336e-05, "loss": 2.6656, "mean_token_accuracy": 0.3772534728050232, "step": 80465 }, { "epoch": 0.08105023865855661, "grad_norm": 10.495241763905344, "learning_rate": 4.988149751444964e-05, "loss": 2.6824, "mean_token_accuracy": 0.38965516686439516, "step": 80470 }, { "epoch": 0.08105527471166078, "grad_norm": 11.054858489966934, "learning_rate": 4.988145910399036e-05, "loss": 2.6188, "mean_token_accuracy": 0.4275862067937851, "step": 80475 }, { "epoch": 0.08106031076476496, "grad_norm": 15.446198923535718, "learning_rate": 4.9881420687323484e-05, "loss": 2.4128, "mean_token_accuracy": 0.44482758045196535, "step": 80480 }, { "epoch": 0.08106534681786913, "grad_norm": 9.937294660003108, "learning_rate": 4.988138226444904e-05, "loss": 2.4816, "mean_token_accuracy": 0.42758620977401735, "step": 80485 }, { "epoch": 0.0810703828709733, "grad_norm": 12.60511158817512, "learning_rate": 4.988134383536704e-05, "loss": 2.7229, "mean_token_accuracy": 0.3965517282485962, "step": 80490 }, { "epoch": 0.08107541892407748, "grad_norm": 10.867341439604449, "learning_rate": 4.988130540007749e-05, "loss": 2.3165, "mean_token_accuracy": 0.42413792610168455, "step": 80495 }, { "epoch": 0.08108045497718164, "grad_norm": 11.315171846829784, "learning_rate": 4.98812669585804e-05, "loss": 2.3471, "mean_token_accuracy": 0.44482758045196535, "step": 80500 }, { "epoch": 0.08108549103028581, "grad_norm": 11.326825572156388, "learning_rate": 4.988122851087579e-05, "loss": 2.6614, "mean_token_accuracy": 0.3931034505367279, "step": 80505 }, { "epoch": 0.08109052708338998, "grad_norm": 13.25628106266195, "learning_rate": 4.988119005696366e-05, "loss": 2.6381, "mean_token_accuracy": 0.36551723480224607, "step": 80510 }, { "epoch": 0.08109556313649416, "grad_norm": 11.740345704206694, "learning_rate": 4.988115159684402e-05, "loss": 2.7657, "mean_token_accuracy": 0.4, "step": 80515 }, { "epoch": 0.08110059918959833, "grad_norm": 11.858175323397843, "learning_rate": 4.9881113130516876e-05, "loss": 2.3949, "mean_token_accuracy": 0.45862069725990295, "step": 80520 }, { "epoch": 0.0811056352427025, "grad_norm": 8.921639315255058, "learning_rate": 4.9881074657982256e-05, "loss": 2.3166, "mean_token_accuracy": 0.4517241418361664, "step": 80525 }, { "epoch": 0.08111067129580668, "grad_norm": 9.84686569291601, "learning_rate": 4.9881036179240165e-05, "loss": 2.6999, "mean_token_accuracy": 0.4310344815254211, "step": 80530 }, { "epoch": 0.08111570734891085, "grad_norm": 13.04787468673603, "learning_rate": 4.9880997694290604e-05, "loss": 2.8996, "mean_token_accuracy": 0.33448275923728943, "step": 80535 }, { "epoch": 0.08112074340201503, "grad_norm": 9.54958123655461, "learning_rate": 4.988095920313359e-05, "loss": 2.1938, "mean_token_accuracy": 0.4379310369491577, "step": 80540 }, { "epoch": 0.0811257794551192, "grad_norm": 10.934699852134056, "learning_rate": 4.9880920705769136e-05, "loss": 2.2643, "mean_token_accuracy": 0.4103448301553726, "step": 80545 }, { "epoch": 0.08113081550822338, "grad_norm": 9.247376493774293, "learning_rate": 4.988088220219725e-05, "loss": 2.6106, "mean_token_accuracy": 0.452339905500412, "step": 80550 }, { "epoch": 0.08113585156132755, "grad_norm": 12.145502451659977, "learning_rate": 4.988084369241794e-05, "loss": 2.6369, "mean_token_accuracy": 0.4241379201412201, "step": 80555 }, { "epoch": 0.08114088761443172, "grad_norm": 11.13356690082388, "learning_rate": 4.988080517643122e-05, "loss": 2.4567, "mean_token_accuracy": 0.4068965554237366, "step": 80560 }, { "epoch": 0.0811459236675359, "grad_norm": 16.310321519827284, "learning_rate": 4.988076665423711e-05, "loss": 3.1015, "mean_token_accuracy": 0.3517241358757019, "step": 80565 }, { "epoch": 0.08115095972064006, "grad_norm": 11.69844915258311, "learning_rate": 4.9880728125835596e-05, "loss": 2.7174, "mean_token_accuracy": 0.36896551847457887, "step": 80570 }, { "epoch": 0.08115599577374423, "grad_norm": 12.55853948228214, "learning_rate": 4.988068959122671e-05, "loss": 2.7191, "mean_token_accuracy": 0.3999999940395355, "step": 80575 }, { "epoch": 0.0811610318268484, "grad_norm": 9.916995438745841, "learning_rate": 4.988065105041046e-05, "loss": 2.3463, "mean_token_accuracy": 0.43103448748588563, "step": 80580 }, { "epoch": 0.08116606787995258, "grad_norm": 11.445161336951655, "learning_rate": 4.988061250338685e-05, "loss": 2.2488, "mean_token_accuracy": 0.45347853302955626, "step": 80585 }, { "epoch": 0.08117110393305675, "grad_norm": 13.775213524954982, "learning_rate": 4.988057395015589e-05, "loss": 2.6585, "mean_token_accuracy": 0.38620689511299133, "step": 80590 }, { "epoch": 0.08117613998616093, "grad_norm": 12.221722020087352, "learning_rate": 4.9880535390717596e-05, "loss": 2.2576, "mean_token_accuracy": 0.42413793206214906, "step": 80595 }, { "epoch": 0.0811811760392651, "grad_norm": 9.562788938033352, "learning_rate": 4.988049682507198e-05, "loss": 2.0319, "mean_token_accuracy": 0.5172413647174835, "step": 80600 }, { "epoch": 0.08118621209236927, "grad_norm": 13.39292304397339, "learning_rate": 4.9880458253219046e-05, "loss": 2.7925, "mean_token_accuracy": 0.35862068831920624, "step": 80605 }, { "epoch": 0.08119124814547345, "grad_norm": 10.781562737169876, "learning_rate": 4.988041967515881e-05, "loss": 2.2439, "mean_token_accuracy": 0.42758620381355283, "step": 80610 }, { "epoch": 0.08119628419857762, "grad_norm": 12.143270616113, "learning_rate": 4.988038109089128e-05, "loss": 2.3752, "mean_token_accuracy": 0.4379310250282288, "step": 80615 }, { "epoch": 0.0812013202516818, "grad_norm": 10.879837736361424, "learning_rate": 4.9880342500416466e-05, "loss": 2.5596, "mean_token_accuracy": 0.38965516686439516, "step": 80620 }, { "epoch": 0.08120635630478597, "grad_norm": 11.84345163700897, "learning_rate": 4.988030390373439e-05, "loss": 2.6932, "mean_token_accuracy": 0.3896551728248596, "step": 80625 }, { "epoch": 0.08121139235789014, "grad_norm": 12.507698336899674, "learning_rate": 4.9880265300845044e-05, "loss": 2.6337, "mean_token_accuracy": 0.39655172228813174, "step": 80630 }, { "epoch": 0.0812164284109943, "grad_norm": 10.812020139835624, "learning_rate": 4.988022669174845e-05, "loss": 2.5561, "mean_token_accuracy": 0.3862069010734558, "step": 80635 }, { "epoch": 0.08122146446409848, "grad_norm": 9.544459293385454, "learning_rate": 4.988018807644462e-05, "loss": 2.5739, "mean_token_accuracy": 0.37586206793785093, "step": 80640 }, { "epoch": 0.08122650051720265, "grad_norm": 16.935397057645687, "learning_rate": 4.9880149454933554e-05, "loss": 2.5199, "mean_token_accuracy": 0.441379314661026, "step": 80645 }, { "epoch": 0.08123153657030682, "grad_norm": 11.279963659866782, "learning_rate": 4.988011082721528e-05, "loss": 2.3649, "mean_token_accuracy": 0.41724138259887694, "step": 80650 }, { "epoch": 0.081236572623411, "grad_norm": 14.104323968969988, "learning_rate": 4.988007219328979e-05, "loss": 2.4824, "mean_token_accuracy": 0.38620689511299133, "step": 80655 }, { "epoch": 0.08124160867651517, "grad_norm": 9.579509212770297, "learning_rate": 4.988003355315711e-05, "loss": 2.2785, "mean_token_accuracy": 0.4517241358757019, "step": 80660 }, { "epoch": 0.08124664472961934, "grad_norm": 13.321921090103697, "learning_rate": 4.987999490681724e-05, "loss": 2.8125, "mean_token_accuracy": 0.3758620619773865, "step": 80665 }, { "epoch": 0.08125168078272352, "grad_norm": 13.455675274793983, "learning_rate": 4.98799562542702e-05, "loss": 2.6695, "mean_token_accuracy": 0.4034482777118683, "step": 80670 }, { "epoch": 0.08125671683582769, "grad_norm": 12.337300330850573, "learning_rate": 4.987991759551599e-05, "loss": 2.6627, "mean_token_accuracy": 0.4, "step": 80675 }, { "epoch": 0.08126175288893187, "grad_norm": 10.13627910878303, "learning_rate": 4.987987893055463e-05, "loss": 2.7342, "mean_token_accuracy": 0.4, "step": 80680 }, { "epoch": 0.08126678894203604, "grad_norm": 12.098234710743167, "learning_rate": 4.987984025938612e-05, "loss": 2.9283, "mean_token_accuracy": 0.33793103098869326, "step": 80685 }, { "epoch": 0.08127182499514021, "grad_norm": 17.27515027493642, "learning_rate": 4.9879801582010486e-05, "loss": 2.4923, "mean_token_accuracy": 0.4344827651977539, "step": 80690 }, { "epoch": 0.08127686104824439, "grad_norm": 9.235110828918094, "learning_rate": 4.987976289842773e-05, "loss": 2.6481, "mean_token_accuracy": 0.41929824352264405, "step": 80695 }, { "epoch": 0.08128189710134856, "grad_norm": 11.372605573155063, "learning_rate": 4.987972420863786e-05, "loss": 3.056, "mean_token_accuracy": 0.4068965554237366, "step": 80700 }, { "epoch": 0.08128693315445272, "grad_norm": 12.678842363017122, "learning_rate": 4.987968551264089e-05, "loss": 2.3025, "mean_token_accuracy": 0.441379314661026, "step": 80705 }, { "epoch": 0.0812919692075569, "grad_norm": 10.202974900111595, "learning_rate": 4.987964681043683e-05, "loss": 2.4311, "mean_token_accuracy": 0.41191771626472473, "step": 80710 }, { "epoch": 0.08129700526066107, "grad_norm": 10.493990314270032, "learning_rate": 4.987960810202569e-05, "loss": 2.6218, "mean_token_accuracy": 0.4068965554237366, "step": 80715 }, { "epoch": 0.08130204131376524, "grad_norm": 13.703774648739824, "learning_rate": 4.987956938740749e-05, "loss": 3.001, "mean_token_accuracy": 0.3586206793785095, "step": 80720 }, { "epoch": 0.08130707736686942, "grad_norm": 14.270355294741167, "learning_rate": 4.9879530666582225e-05, "loss": 2.711, "mean_token_accuracy": 0.36896551549434664, "step": 80725 }, { "epoch": 0.08131211341997359, "grad_norm": 9.939802733315174, "learning_rate": 4.987949193954992e-05, "loss": 2.3826, "mean_token_accuracy": 0.4172413766384125, "step": 80730 }, { "epoch": 0.08131714947307776, "grad_norm": 11.59544362284299, "learning_rate": 4.9879453206310575e-05, "loss": 3.4914, "mean_token_accuracy": 0.34827586114406583, "step": 80735 }, { "epoch": 0.08132218552618194, "grad_norm": 10.291815631230232, "learning_rate": 4.987941446686421e-05, "loss": 2.3719, "mean_token_accuracy": 0.43793103098869324, "step": 80740 }, { "epoch": 0.08132722157928611, "grad_norm": 10.861994219440632, "learning_rate": 4.9879375721210825e-05, "loss": 2.4622, "mean_token_accuracy": 0.40689656138420105, "step": 80745 }, { "epoch": 0.08133225763239028, "grad_norm": 10.867006763651423, "learning_rate": 4.987933696935044e-05, "loss": 2.1243, "mean_token_accuracy": 0.43448275327682495, "step": 80750 }, { "epoch": 0.08133729368549446, "grad_norm": 16.05325381802022, "learning_rate": 4.987929821128306e-05, "loss": 2.6652, "mean_token_accuracy": 0.3793103456497192, "step": 80755 }, { "epoch": 0.08134232973859863, "grad_norm": 12.228687821770087, "learning_rate": 4.9879259447008705e-05, "loss": 2.5697, "mean_token_accuracy": 0.34482758641242983, "step": 80760 }, { "epoch": 0.0813473657917028, "grad_norm": 10.336646190393555, "learning_rate": 4.987922067652737e-05, "loss": 2.4539, "mean_token_accuracy": 0.3862068891525269, "step": 80765 }, { "epoch": 0.08135240184480698, "grad_norm": 11.36042793705165, "learning_rate": 4.987918189983908e-05, "loss": 2.3363, "mean_token_accuracy": 0.47931034564971925, "step": 80770 }, { "epoch": 0.08135743789791114, "grad_norm": 9.997425549224465, "learning_rate": 4.9879143116943834e-05, "loss": 2.6877, "mean_token_accuracy": 0.3448275804519653, "step": 80775 }, { "epoch": 0.08136247395101531, "grad_norm": 9.517219810764468, "learning_rate": 4.9879104327841655e-05, "loss": 2.0694, "mean_token_accuracy": 0.47931034564971925, "step": 80780 }, { "epoch": 0.08136751000411949, "grad_norm": 12.900330376974662, "learning_rate": 4.987906553253255e-05, "loss": 2.7264, "mean_token_accuracy": 0.38275861740112305, "step": 80785 }, { "epoch": 0.08137254605722366, "grad_norm": 13.128841987131354, "learning_rate": 4.987902673101652e-05, "loss": 2.4306, "mean_token_accuracy": 0.44482758045196535, "step": 80790 }, { "epoch": 0.08137758211032783, "grad_norm": 32.49612206356215, "learning_rate": 4.98789879232936e-05, "loss": 2.98, "mean_token_accuracy": 0.3965517163276672, "step": 80795 }, { "epoch": 0.08138261816343201, "grad_norm": 10.304227890398673, "learning_rate": 4.9878949109363766e-05, "loss": 3.0978, "mean_token_accuracy": 0.3068965464830399, "step": 80800 }, { "epoch": 0.08138765421653618, "grad_norm": 11.76261118447038, "learning_rate": 4.987891028922705e-05, "loss": 2.7044, "mean_token_accuracy": 0.38620689511299133, "step": 80805 }, { "epoch": 0.08139269026964036, "grad_norm": 9.579032064442766, "learning_rate": 4.987887146288346e-05, "loss": 2.7355, "mean_token_accuracy": 0.3862069010734558, "step": 80810 }, { "epoch": 0.08139772632274453, "grad_norm": 9.038490077277798, "learning_rate": 4.9878832630333014e-05, "loss": 2.4376, "mean_token_accuracy": 0.4, "step": 80815 }, { "epoch": 0.0814027623758487, "grad_norm": 14.756393535797246, "learning_rate": 4.987879379157571e-05, "loss": 2.7933, "mean_token_accuracy": 0.37586206793785093, "step": 80820 }, { "epoch": 0.08140779842895288, "grad_norm": 10.246050987750717, "learning_rate": 4.9878754946611565e-05, "loss": 2.5148, "mean_token_accuracy": 0.39655171930789945, "step": 80825 }, { "epoch": 0.08141283448205705, "grad_norm": 10.816268117662576, "learning_rate": 4.987871609544059e-05, "loss": 2.708, "mean_token_accuracy": 0.40344826579093934, "step": 80830 }, { "epoch": 0.08141787053516122, "grad_norm": 11.651370278472523, "learning_rate": 4.9878677238062795e-05, "loss": 2.7155, "mean_token_accuracy": 0.3793103456497192, "step": 80835 }, { "epoch": 0.0814229065882654, "grad_norm": 12.180002383182236, "learning_rate": 4.987863837447818e-05, "loss": 2.5, "mean_token_accuracy": 0.42413792610168455, "step": 80840 }, { "epoch": 0.08142794264136956, "grad_norm": 16.070291286245048, "learning_rate": 4.987859950468677e-05, "loss": 2.5269, "mean_token_accuracy": 0.42068966031074523, "step": 80845 }, { "epoch": 0.08143297869447373, "grad_norm": 10.580847623124962, "learning_rate": 4.9878560628688586e-05, "loss": 2.5778, "mean_token_accuracy": 0.44137931764125826, "step": 80850 }, { "epoch": 0.0814380147475779, "grad_norm": 11.882722821194134, "learning_rate": 4.9878521746483615e-05, "loss": 2.6037, "mean_token_accuracy": 0.3793103486299515, "step": 80855 }, { "epoch": 0.08144305080068208, "grad_norm": 13.474487396525651, "learning_rate": 4.987848285807188e-05, "loss": 2.7154, "mean_token_accuracy": 0.3827586114406586, "step": 80860 }, { "epoch": 0.08144808685378625, "grad_norm": 10.421423643089492, "learning_rate": 4.987844396345338e-05, "loss": 2.0914, "mean_token_accuracy": 0.44827585816383364, "step": 80865 }, { "epoch": 0.08145312290689043, "grad_norm": 9.848272006215531, "learning_rate": 4.9878405062628146e-05, "loss": 2.4973, "mean_token_accuracy": 0.4034482777118683, "step": 80870 }, { "epoch": 0.0814581589599946, "grad_norm": 9.261356638443749, "learning_rate": 4.987836615559617e-05, "loss": 2.1214, "mean_token_accuracy": 0.5034482717514038, "step": 80875 }, { "epoch": 0.08146319501309877, "grad_norm": 18.075430985244505, "learning_rate": 4.9878327242357485e-05, "loss": 2.9689, "mean_token_accuracy": 0.37931033968925476, "step": 80880 }, { "epoch": 0.08146823106620295, "grad_norm": 12.571121944768532, "learning_rate": 4.987828832291207e-05, "loss": 2.5551, "mean_token_accuracy": 0.4517241418361664, "step": 80885 }, { "epoch": 0.08147326711930712, "grad_norm": 12.730770016847952, "learning_rate": 4.987824939725997e-05, "loss": 2.9734, "mean_token_accuracy": 0.3172413736581802, "step": 80890 }, { "epoch": 0.0814783031724113, "grad_norm": 10.614779103187868, "learning_rate": 4.987821046540116e-05, "loss": 2.7368, "mean_token_accuracy": 0.36551723480224607, "step": 80895 }, { "epoch": 0.08148333922551547, "grad_norm": 14.846407870020794, "learning_rate": 4.987817152733568e-05, "loss": 2.9338, "mean_token_accuracy": 0.37586206793785093, "step": 80900 }, { "epoch": 0.08148837527861964, "grad_norm": 11.768193154523427, "learning_rate": 4.987813258306353e-05, "loss": 2.8692, "mean_token_accuracy": 0.3931034505367279, "step": 80905 }, { "epoch": 0.08149341133172382, "grad_norm": 12.714827909541008, "learning_rate": 4.9878093632584724e-05, "loss": 2.2394, "mean_token_accuracy": 0.44966728091239927, "step": 80910 }, { "epoch": 0.08149844738482798, "grad_norm": 11.059337599429616, "learning_rate": 4.987805467589927e-05, "loss": 2.4695, "mean_token_accuracy": 0.42068966031074523, "step": 80915 }, { "epoch": 0.08150348343793215, "grad_norm": 10.303392276725578, "learning_rate": 4.9878015713007175e-05, "loss": 2.3578, "mean_token_accuracy": 0.38620689511299133, "step": 80920 }, { "epoch": 0.08150851949103632, "grad_norm": 12.041536715508318, "learning_rate": 4.9877976743908455e-05, "loss": 2.3054, "mean_token_accuracy": 0.43793103098869324, "step": 80925 }, { "epoch": 0.0815135555441405, "grad_norm": 12.56715689495364, "learning_rate": 4.9877937768603124e-05, "loss": 2.6111, "mean_token_accuracy": 0.36702964305877683, "step": 80930 }, { "epoch": 0.08151859159724467, "grad_norm": 11.72463429644742, "learning_rate": 4.987789878709118e-05, "loss": 2.6106, "mean_token_accuracy": 0.3551724135875702, "step": 80935 }, { "epoch": 0.08152362765034885, "grad_norm": 11.824934475719614, "learning_rate": 4.987785979937265e-05, "loss": 2.3363, "mean_token_accuracy": 0.43793103098869324, "step": 80940 }, { "epoch": 0.08152866370345302, "grad_norm": 10.86685550254957, "learning_rate": 4.987782080544753e-05, "loss": 2.5327, "mean_token_accuracy": 0.4379310369491577, "step": 80945 }, { "epoch": 0.08153369975655719, "grad_norm": 10.62866279330211, "learning_rate": 4.987778180531585e-05, "loss": 2.1464, "mean_token_accuracy": 0.4586206912994385, "step": 80950 }, { "epoch": 0.08153873580966137, "grad_norm": 11.33004527422251, "learning_rate": 4.98777427989776e-05, "loss": 2.909, "mean_token_accuracy": 0.36896551847457887, "step": 80955 }, { "epoch": 0.08154377186276554, "grad_norm": 11.196355243749153, "learning_rate": 4.98777037864328e-05, "loss": 2.6224, "mean_token_accuracy": 0.35862069129943847, "step": 80960 }, { "epoch": 0.08154880791586971, "grad_norm": 11.031057410550053, "learning_rate": 4.987766476768146e-05, "loss": 2.767, "mean_token_accuracy": 0.42068966031074523, "step": 80965 }, { "epoch": 0.08155384396897389, "grad_norm": 10.725091472969833, "learning_rate": 4.987762574272359e-05, "loss": 2.3007, "mean_token_accuracy": 0.47931034564971925, "step": 80970 }, { "epoch": 0.08155888002207806, "grad_norm": 11.915379982721193, "learning_rate": 4.98775867115592e-05, "loss": 2.3971, "mean_token_accuracy": 0.4103448331356049, "step": 80975 }, { "epoch": 0.08156391607518224, "grad_norm": 13.398931031628925, "learning_rate": 4.987754767418831e-05, "loss": 2.7, "mean_token_accuracy": 0.37241379022598264, "step": 80980 }, { "epoch": 0.0815689521282864, "grad_norm": 11.980596998938738, "learning_rate": 4.987750863061092e-05, "loss": 2.7258, "mean_token_accuracy": 0.3517241418361664, "step": 80985 }, { "epoch": 0.08157398818139057, "grad_norm": 12.050181510152736, "learning_rate": 4.9877469580827046e-05, "loss": 2.3833, "mean_token_accuracy": 0.4103448212146759, "step": 80990 }, { "epoch": 0.08157902423449474, "grad_norm": 11.9405839134718, "learning_rate": 4.98774305248367e-05, "loss": 2.2712, "mean_token_accuracy": 0.4801724135875702, "step": 80995 }, { "epoch": 0.08158406028759892, "grad_norm": 10.521387340723319, "learning_rate": 4.9877391462639875e-05, "loss": 2.475, "mean_token_accuracy": 0.4310344815254211, "step": 81000 }, { "epoch": 0.08158909634070309, "grad_norm": 9.62052871298048, "learning_rate": 4.987735239423662e-05, "loss": 2.2586, "mean_token_accuracy": 0.43103448748588563, "step": 81005 }, { "epoch": 0.08159413239380726, "grad_norm": 9.637375199138168, "learning_rate": 4.9877313319626914e-05, "loss": 2.2953, "mean_token_accuracy": 0.43448275327682495, "step": 81010 }, { "epoch": 0.08159916844691144, "grad_norm": 10.785433247382567, "learning_rate": 4.987727423881077e-05, "loss": 2.2796, "mean_token_accuracy": 0.4655172348022461, "step": 81015 }, { "epoch": 0.08160420450001561, "grad_norm": 11.18051399030864, "learning_rate": 4.987723515178821e-05, "loss": 2.2251, "mean_token_accuracy": 0.39310344457626345, "step": 81020 }, { "epoch": 0.08160924055311979, "grad_norm": 9.237148460856131, "learning_rate": 4.9877196058559236e-05, "loss": 2.4817, "mean_token_accuracy": 0.4034482777118683, "step": 81025 }, { "epoch": 0.08161427660622396, "grad_norm": 11.521486360077368, "learning_rate": 4.9877156959123864e-05, "loss": 2.5331, "mean_token_accuracy": 0.40163339376449586, "step": 81030 }, { "epoch": 0.08161931265932813, "grad_norm": 12.014665381750389, "learning_rate": 4.987711785348211e-05, "loss": 2.7369, "mean_token_accuracy": 0.36206896901130675, "step": 81035 }, { "epoch": 0.08162434871243231, "grad_norm": 11.295725638311858, "learning_rate": 4.987707874163398e-05, "loss": 2.5128, "mean_token_accuracy": 0.39655172228813174, "step": 81040 }, { "epoch": 0.08162938476553648, "grad_norm": 9.620289602564789, "learning_rate": 4.987703962357948e-05, "loss": 2.5488, "mean_token_accuracy": 0.42413793206214906, "step": 81045 }, { "epoch": 0.08163442081864065, "grad_norm": 10.723430668467355, "learning_rate": 4.987700049931862e-05, "loss": 2.4451, "mean_token_accuracy": 0.3827586114406586, "step": 81050 }, { "epoch": 0.08163945687174481, "grad_norm": 11.849135233110083, "learning_rate": 4.9876961368851424e-05, "loss": 2.7102, "mean_token_accuracy": 0.3724137872457504, "step": 81055 }, { "epoch": 0.08164449292484899, "grad_norm": 10.540233038215963, "learning_rate": 4.987692223217789e-05, "loss": 2.6537, "mean_token_accuracy": 0.38620689511299133, "step": 81060 }, { "epoch": 0.08164952897795316, "grad_norm": 13.15798465431207, "learning_rate": 4.987688308929803e-05, "loss": 2.8105, "mean_token_accuracy": 0.4137930989265442, "step": 81065 }, { "epoch": 0.08165456503105734, "grad_norm": 10.32822121119542, "learning_rate": 4.9876843940211865e-05, "loss": 2.4364, "mean_token_accuracy": 0.42413793206214906, "step": 81070 }, { "epoch": 0.08165960108416151, "grad_norm": 12.71600164309924, "learning_rate": 4.98768047849194e-05, "loss": 2.3828, "mean_token_accuracy": 0.40834846496582033, "step": 81075 }, { "epoch": 0.08166463713726568, "grad_norm": 8.28888240311657, "learning_rate": 4.987676562342063e-05, "loss": 2.0854, "mean_token_accuracy": 0.5131276428699494, "step": 81080 }, { "epoch": 0.08166967319036986, "grad_norm": 10.145516571151319, "learning_rate": 4.987672645571559e-05, "loss": 2.2285, "mean_token_accuracy": 0.43629764318466185, "step": 81085 }, { "epoch": 0.08167470924347403, "grad_norm": 11.115886152634676, "learning_rate": 4.9876687281804285e-05, "loss": 2.6775, "mean_token_accuracy": 0.4448275864124298, "step": 81090 }, { "epoch": 0.0816797452965782, "grad_norm": 13.424199232884765, "learning_rate": 4.987664810168672e-05, "loss": 2.3594, "mean_token_accuracy": 0.42758620381355283, "step": 81095 }, { "epoch": 0.08168478134968238, "grad_norm": 10.448907505988625, "learning_rate": 4.98766089153629e-05, "loss": 2.4992, "mean_token_accuracy": 0.41379310488700866, "step": 81100 }, { "epoch": 0.08168981740278655, "grad_norm": 10.920361393007603, "learning_rate": 4.987656972283286e-05, "loss": 2.6147, "mean_token_accuracy": 0.36551723480224607, "step": 81105 }, { "epoch": 0.08169485345589073, "grad_norm": 11.549175706480233, "learning_rate": 4.987653052409658e-05, "loss": 2.8872, "mean_token_accuracy": 0.3517241418361664, "step": 81110 }, { "epoch": 0.0816998895089949, "grad_norm": 11.680179664268882, "learning_rate": 4.98764913191541e-05, "loss": 2.3128, "mean_token_accuracy": 0.428078818321228, "step": 81115 }, { "epoch": 0.08170492556209907, "grad_norm": 11.162243522567726, "learning_rate": 4.98764521080054e-05, "loss": 2.3091, "mean_token_accuracy": 0.4379310369491577, "step": 81120 }, { "epoch": 0.08170996161520323, "grad_norm": 9.822567084184392, "learning_rate": 4.987641289065052e-05, "loss": 2.6185, "mean_token_accuracy": 0.3620689630508423, "step": 81125 }, { "epoch": 0.08171499766830741, "grad_norm": 8.93756438664108, "learning_rate": 4.987637366708945e-05, "loss": 2.2498, "mean_token_accuracy": 0.4223835408687592, "step": 81130 }, { "epoch": 0.08172003372141158, "grad_norm": 14.753175349206241, "learning_rate": 4.987633443732222e-05, "loss": 2.3147, "mean_token_accuracy": 0.44827585816383364, "step": 81135 }, { "epoch": 0.08172506977451575, "grad_norm": 9.73792950577434, "learning_rate": 4.987629520134883e-05, "loss": 1.9809, "mean_token_accuracy": 0.5068965494632721, "step": 81140 }, { "epoch": 0.08173010582761993, "grad_norm": 10.919551548041344, "learning_rate": 4.987625595916928e-05, "loss": 2.6392, "mean_token_accuracy": 0.358620685338974, "step": 81145 }, { "epoch": 0.0817351418807241, "grad_norm": 11.258242545266995, "learning_rate": 4.98762167107836e-05, "loss": 2.8099, "mean_token_accuracy": 0.36896551847457887, "step": 81150 }, { "epoch": 0.08174017793382828, "grad_norm": 11.035441564656917, "learning_rate": 4.9876177456191785e-05, "loss": 1.8944, "mean_token_accuracy": 0.4758620738983154, "step": 81155 }, { "epoch": 0.08174521398693245, "grad_norm": 11.419492494209264, "learning_rate": 4.987613819539386e-05, "loss": 2.184, "mean_token_accuracy": 0.4517241358757019, "step": 81160 }, { "epoch": 0.08175025004003662, "grad_norm": 15.362632091442393, "learning_rate": 4.987609892838983e-05, "loss": 2.5469, "mean_token_accuracy": 0.379310342669487, "step": 81165 }, { "epoch": 0.0817552860931408, "grad_norm": 10.948886071885237, "learning_rate": 4.98760596551797e-05, "loss": 2.7753, "mean_token_accuracy": 0.41034482717514037, "step": 81170 }, { "epoch": 0.08176032214624497, "grad_norm": 10.424296943075129, "learning_rate": 4.987602037576349e-05, "loss": 2.5047, "mean_token_accuracy": 0.4448275864124298, "step": 81175 }, { "epoch": 0.08176535819934914, "grad_norm": 10.26206713251904, "learning_rate": 4.987598109014121e-05, "loss": 2.4, "mean_token_accuracy": 0.38275861740112305, "step": 81180 }, { "epoch": 0.08177039425245332, "grad_norm": 11.038373921761233, "learning_rate": 4.9875941798312866e-05, "loss": 2.4196, "mean_token_accuracy": 0.43793103098869324, "step": 81185 }, { "epoch": 0.08177543030555749, "grad_norm": 10.152708213978844, "learning_rate": 4.987590250027846e-05, "loss": 2.209, "mean_token_accuracy": 0.4551724076271057, "step": 81190 }, { "epoch": 0.08178046635866165, "grad_norm": 9.82734789084404, "learning_rate": 4.987586319603803e-05, "loss": 2.4812, "mean_token_accuracy": 0.4206896543502808, "step": 81195 }, { "epoch": 0.08178550241176583, "grad_norm": 10.750970673794946, "learning_rate": 4.9875823885591565e-05, "loss": 2.489, "mean_token_accuracy": 0.458620685338974, "step": 81200 }, { "epoch": 0.08179053846487, "grad_norm": 15.08769117332296, "learning_rate": 4.9875784568939085e-05, "loss": 2.4726, "mean_token_accuracy": 0.42068964838981626, "step": 81205 }, { "epoch": 0.08179557451797417, "grad_norm": 15.253691948709307, "learning_rate": 4.98757452460806e-05, "loss": 2.4641, "mean_token_accuracy": 0.3896551728248596, "step": 81210 }, { "epoch": 0.08180061057107835, "grad_norm": 10.295033977631194, "learning_rate": 4.9875705917016105e-05, "loss": 2.5201, "mean_token_accuracy": 0.45704779028892517, "step": 81215 }, { "epoch": 0.08180564662418252, "grad_norm": 11.975836658117911, "learning_rate": 4.987566658174564e-05, "loss": 2.3585, "mean_token_accuracy": 0.4034482717514038, "step": 81220 }, { "epoch": 0.0818106826772867, "grad_norm": 9.948566422066165, "learning_rate": 4.9875627240269184e-05, "loss": 2.4004, "mean_token_accuracy": 0.43793103098869324, "step": 81225 }, { "epoch": 0.08181571873039087, "grad_norm": 12.307416332226992, "learning_rate": 4.987558789258678e-05, "loss": 2.7307, "mean_token_accuracy": 0.3862069010734558, "step": 81230 }, { "epoch": 0.08182075478349504, "grad_norm": 12.002340596906384, "learning_rate": 4.987554853869842e-05, "loss": 2.2401, "mean_token_accuracy": 0.4533575356006622, "step": 81235 }, { "epoch": 0.08182579083659922, "grad_norm": 13.23536567022831, "learning_rate": 4.987550917860411e-05, "loss": 2.8642, "mean_token_accuracy": 0.3428917109966278, "step": 81240 }, { "epoch": 0.08183082688970339, "grad_norm": 11.444329663869775, "learning_rate": 4.9875469812303874e-05, "loss": 2.6881, "mean_token_accuracy": 0.3896551728248596, "step": 81245 }, { "epoch": 0.08183586294280756, "grad_norm": 10.337526132040864, "learning_rate": 4.9875430439797716e-05, "loss": 2.1752, "mean_token_accuracy": 0.43260737657547, "step": 81250 }, { "epoch": 0.08184089899591174, "grad_norm": 10.420520038880248, "learning_rate": 4.987539106108566e-05, "loss": 2.3796, "mean_token_accuracy": 0.42413793206214906, "step": 81255 }, { "epoch": 0.08184593504901591, "grad_norm": 12.108330417189388, "learning_rate": 4.987535167616769e-05, "loss": 2.5087, "mean_token_accuracy": 0.4482758641242981, "step": 81260 }, { "epoch": 0.08185097110212007, "grad_norm": 17.833796206525847, "learning_rate": 4.987531228504384e-05, "loss": 2.5276, "mean_token_accuracy": 0.4034482777118683, "step": 81265 }, { "epoch": 0.08185600715522424, "grad_norm": 8.032582884638982, "learning_rate": 4.987527288771412e-05, "loss": 2.0377, "mean_token_accuracy": 0.5008620738983154, "step": 81270 }, { "epoch": 0.08186104320832842, "grad_norm": 10.342873646082817, "learning_rate": 4.9875233484178524e-05, "loss": 2.44, "mean_token_accuracy": 0.38620689511299133, "step": 81275 }, { "epoch": 0.08186607926143259, "grad_norm": 12.889865072343847, "learning_rate": 4.9875194074437076e-05, "loss": 2.8287, "mean_token_accuracy": 0.34137930274009703, "step": 81280 }, { "epoch": 0.08187111531453677, "grad_norm": 12.483038298095767, "learning_rate": 4.9875154658489784e-05, "loss": 2.6404, "mean_token_accuracy": 0.38275861740112305, "step": 81285 }, { "epoch": 0.08187615136764094, "grad_norm": 11.282489430913788, "learning_rate": 4.987511523633667e-05, "loss": 2.5054, "mean_token_accuracy": 0.4034482777118683, "step": 81290 }, { "epoch": 0.08188118742074511, "grad_norm": 12.31106979400585, "learning_rate": 4.987507580797772e-05, "loss": 2.1622, "mean_token_accuracy": 0.48154870271682737, "step": 81295 }, { "epoch": 0.08188622347384929, "grad_norm": 12.385695953066737, "learning_rate": 4.987503637341296e-05, "loss": 2.2948, "mean_token_accuracy": 0.4482758641242981, "step": 81300 }, { "epoch": 0.08189125952695346, "grad_norm": 10.844347294984537, "learning_rate": 4.9874996932642406e-05, "loss": 2.7856, "mean_token_accuracy": 0.4190562665462494, "step": 81305 }, { "epoch": 0.08189629558005763, "grad_norm": 11.500084795989771, "learning_rate": 4.987495748566606e-05, "loss": 2.8147, "mean_token_accuracy": 0.37586206793785093, "step": 81310 }, { "epoch": 0.08190133163316181, "grad_norm": 11.786504526093998, "learning_rate": 4.987491803248394e-05, "loss": 2.5437, "mean_token_accuracy": 0.4158499717712402, "step": 81315 }, { "epoch": 0.08190636768626598, "grad_norm": 11.920140643983734, "learning_rate": 4.9874878573096054e-05, "loss": 2.4836, "mean_token_accuracy": 0.3703569233417511, "step": 81320 }, { "epoch": 0.08191140373937016, "grad_norm": 7.397232508638606, "learning_rate": 4.9874839107502405e-05, "loss": 2.4173, "mean_token_accuracy": 0.42982457280159, "step": 81325 }, { "epoch": 0.08191643979247433, "grad_norm": 9.566002506836593, "learning_rate": 4.987479963570302e-05, "loss": 2.1633, "mean_token_accuracy": 0.4862068951129913, "step": 81330 }, { "epoch": 0.08192147584557849, "grad_norm": 10.2917756475441, "learning_rate": 4.987476015769789e-05, "loss": 2.1104, "mean_token_accuracy": 0.4448275864124298, "step": 81335 }, { "epoch": 0.08192651189868266, "grad_norm": 11.220089186264747, "learning_rate": 4.9874720673487043e-05, "loss": 2.4531, "mean_token_accuracy": 0.4344827592372894, "step": 81340 }, { "epoch": 0.08193154795178684, "grad_norm": 12.107307079566278, "learning_rate": 4.9874681183070485e-05, "loss": 2.8069, "mean_token_accuracy": 0.420689657330513, "step": 81345 }, { "epoch": 0.08193658400489101, "grad_norm": 10.745517735179662, "learning_rate": 4.9874641686448224e-05, "loss": 2.6981, "mean_token_accuracy": 0.4068965494632721, "step": 81350 }, { "epoch": 0.08194162005799518, "grad_norm": 11.857550199321265, "learning_rate": 4.9874602183620274e-05, "loss": 2.6019, "mean_token_accuracy": 0.40689654350280763, "step": 81355 }, { "epoch": 0.08194665611109936, "grad_norm": 10.46215998687996, "learning_rate": 4.987456267458664e-05, "loss": 2.7538, "mean_token_accuracy": 0.3896551728248596, "step": 81360 }, { "epoch": 0.08195169216420353, "grad_norm": 13.716299429833622, "learning_rate": 4.987452315934734e-05, "loss": 2.6918, "mean_token_accuracy": 0.4068965554237366, "step": 81365 }, { "epoch": 0.0819567282173077, "grad_norm": 10.33593890629514, "learning_rate": 4.987448363790239e-05, "loss": 2.2856, "mean_token_accuracy": 0.4206896543502808, "step": 81370 }, { "epoch": 0.08196176427041188, "grad_norm": 9.765382202081087, "learning_rate": 4.9874444110251785e-05, "loss": 1.9348, "mean_token_accuracy": 0.4841133117675781, "step": 81375 }, { "epoch": 0.08196680032351605, "grad_norm": 11.726860739314754, "learning_rate": 4.987440457639554e-05, "loss": 2.7666, "mean_token_accuracy": 0.3620689630508423, "step": 81380 }, { "epoch": 0.08197183637662023, "grad_norm": 9.475170778713064, "learning_rate": 4.9874365036333684e-05, "loss": 2.1273, "mean_token_accuracy": 0.475274395942688, "step": 81385 }, { "epoch": 0.0819768724297244, "grad_norm": 11.093570151975394, "learning_rate": 4.9874325490066205e-05, "loss": 2.2124, "mean_token_accuracy": 0.47586206197738645, "step": 81390 }, { "epoch": 0.08198190848282857, "grad_norm": 12.448440988637792, "learning_rate": 4.9874285937593126e-05, "loss": 2.4409, "mean_token_accuracy": 0.4034482717514038, "step": 81395 }, { "epoch": 0.08198694453593275, "grad_norm": 13.249236770524718, "learning_rate": 4.987424637891446e-05, "loss": 2.7062, "mean_token_accuracy": 0.39655172228813174, "step": 81400 }, { "epoch": 0.08199198058903691, "grad_norm": 10.82851623829469, "learning_rate": 4.9874206814030207e-05, "loss": 2.6589, "mean_token_accuracy": 0.3827586233615875, "step": 81405 }, { "epoch": 0.08199701664214108, "grad_norm": 10.449966397354737, "learning_rate": 4.987416724294039e-05, "loss": 2.4951, "mean_token_accuracy": 0.4068965494632721, "step": 81410 }, { "epoch": 0.08200205269524526, "grad_norm": 9.507344802305338, "learning_rate": 4.987412766564501e-05, "loss": 2.8058, "mean_token_accuracy": 0.4020568609237671, "step": 81415 }, { "epoch": 0.08200708874834943, "grad_norm": 13.236516356606005, "learning_rate": 4.9874088082144074e-05, "loss": 2.5722, "mean_token_accuracy": 0.3931034475564957, "step": 81420 }, { "epoch": 0.0820121248014536, "grad_norm": 10.513473207006554, "learning_rate": 4.9874048492437615e-05, "loss": 2.0874, "mean_token_accuracy": 0.4482758641242981, "step": 81425 }, { "epoch": 0.08201716085455778, "grad_norm": 10.152692613411277, "learning_rate": 4.9874008896525624e-05, "loss": 2.1786, "mean_token_accuracy": 0.43297035694122316, "step": 81430 }, { "epoch": 0.08202219690766195, "grad_norm": 10.125156631137767, "learning_rate": 4.987396929440812e-05, "loss": 3.0436, "mean_token_accuracy": 0.3879612863063812, "step": 81435 }, { "epoch": 0.08202723296076612, "grad_norm": 11.240906940550802, "learning_rate": 4.987392968608511e-05, "loss": 2.4295, "mean_token_accuracy": 0.41034482717514037, "step": 81440 }, { "epoch": 0.0820322690138703, "grad_norm": 14.689263014332754, "learning_rate": 4.98738900715566e-05, "loss": 2.4392, "mean_token_accuracy": 0.43793103098869324, "step": 81445 }, { "epoch": 0.08203730506697447, "grad_norm": 13.140273078749582, "learning_rate": 4.987385045082262e-05, "loss": 2.9028, "mean_token_accuracy": 0.38112522959709166, "step": 81450 }, { "epoch": 0.08204234112007865, "grad_norm": 12.178984734427017, "learning_rate": 4.987381082388317e-05, "loss": 2.6935, "mean_token_accuracy": 0.37931033968925476, "step": 81455 }, { "epoch": 0.08204737717318282, "grad_norm": 12.05904797084661, "learning_rate": 4.987377119073825e-05, "loss": 2.6447, "mean_token_accuracy": 0.38275861740112305, "step": 81460 }, { "epoch": 0.082052413226287, "grad_norm": 8.257418900540168, "learning_rate": 4.987373155138789e-05, "loss": 2.0941, "mean_token_accuracy": 0.49655172824859617, "step": 81465 }, { "epoch": 0.08205744927939117, "grad_norm": 12.347566438871892, "learning_rate": 4.987369190583208e-05, "loss": 2.6886, "mean_token_accuracy": 0.4034482777118683, "step": 81470 }, { "epoch": 0.08206248533249533, "grad_norm": 10.50822540173339, "learning_rate": 4.987365225407085e-05, "loss": 2.5617, "mean_token_accuracy": 0.39655172228813174, "step": 81475 }, { "epoch": 0.0820675213855995, "grad_norm": 17.157931285693447, "learning_rate": 4.9873612596104216e-05, "loss": 2.5032, "mean_token_accuracy": 0.4068965494632721, "step": 81480 }, { "epoch": 0.08207255743870367, "grad_norm": 11.236838451856055, "learning_rate": 4.9873572931932164e-05, "loss": 2.2415, "mean_token_accuracy": 0.43793103098869324, "step": 81485 }, { "epoch": 0.08207759349180785, "grad_norm": 11.985930629826742, "learning_rate": 4.9873533261554714e-05, "loss": 2.7209, "mean_token_accuracy": 0.379310342669487, "step": 81490 }, { "epoch": 0.08208262954491202, "grad_norm": 10.388284944476947, "learning_rate": 4.9873493584971894e-05, "loss": 2.7286, "mean_token_accuracy": 0.3896551728248596, "step": 81495 }, { "epoch": 0.0820876655980162, "grad_norm": 13.085601354797138, "learning_rate": 4.987345390218369e-05, "loss": 2.5696, "mean_token_accuracy": 0.41034482717514037, "step": 81500 }, { "epoch": 0.08209270165112037, "grad_norm": 13.915127084540062, "learning_rate": 4.987341421319014e-05, "loss": 2.581, "mean_token_accuracy": 0.37241379618644715, "step": 81505 }, { "epoch": 0.08209773770422454, "grad_norm": 12.721252808747396, "learning_rate": 4.987337451799122e-05, "loss": 2.3806, "mean_token_accuracy": 0.4448275983333588, "step": 81510 }, { "epoch": 0.08210277375732872, "grad_norm": 10.936574024571263, "learning_rate": 4.9873334816586973e-05, "loss": 2.5832, "mean_token_accuracy": 0.38965516686439516, "step": 81515 }, { "epoch": 0.08210780981043289, "grad_norm": 11.902919384748596, "learning_rate": 4.9873295108977404e-05, "loss": 2.6235, "mean_token_accuracy": 0.3827586263418198, "step": 81520 }, { "epoch": 0.08211284586353707, "grad_norm": 9.358887264122853, "learning_rate": 4.987325539516251e-05, "loss": 2.3114, "mean_token_accuracy": 0.44827587008476255, "step": 81525 }, { "epoch": 0.08211788191664124, "grad_norm": 11.517641638446594, "learning_rate": 4.987321567514231e-05, "loss": 2.6765, "mean_token_accuracy": 0.36896551847457887, "step": 81530 }, { "epoch": 0.08212291796974541, "grad_norm": 11.280099649574352, "learning_rate": 4.9873175948916815e-05, "loss": 2.5016, "mean_token_accuracy": 0.4586206912994385, "step": 81535 }, { "epoch": 0.08212795402284959, "grad_norm": 10.617870411712952, "learning_rate": 4.9873136216486036e-05, "loss": 2.3628, "mean_token_accuracy": 0.43103448748588563, "step": 81540 }, { "epoch": 0.08213299007595375, "grad_norm": 10.20799085076727, "learning_rate": 4.987309647784999e-05, "loss": 2.3382, "mean_token_accuracy": 0.3981851160526276, "step": 81545 }, { "epoch": 0.08213802612905792, "grad_norm": 11.518904351515944, "learning_rate": 4.987305673300868e-05, "loss": 2.5585, "mean_token_accuracy": 0.39310344457626345, "step": 81550 }, { "epoch": 0.0821430621821621, "grad_norm": 15.303858250456429, "learning_rate": 4.9873016981962116e-05, "loss": 2.8698, "mean_token_accuracy": 0.3620689660310745, "step": 81555 }, { "epoch": 0.08214809823526627, "grad_norm": 15.498837697743825, "learning_rate": 4.987297722471032e-05, "loss": 2.6036, "mean_token_accuracy": 0.3620689630508423, "step": 81560 }, { "epoch": 0.08215313428837044, "grad_norm": 11.756474935370587, "learning_rate": 4.9872937461253285e-05, "loss": 2.497, "mean_token_accuracy": 0.4052026689052582, "step": 81565 }, { "epoch": 0.08215817034147462, "grad_norm": 14.289805961167959, "learning_rate": 4.987289769159104e-05, "loss": 2.3978, "mean_token_accuracy": 0.45517241954803467, "step": 81570 }, { "epoch": 0.08216320639457879, "grad_norm": 12.28272871661099, "learning_rate": 4.987285791572358e-05, "loss": 2.7464, "mean_token_accuracy": 0.36896551847457887, "step": 81575 }, { "epoch": 0.08216824244768296, "grad_norm": 10.699613980549676, "learning_rate": 4.987281813365093e-05, "loss": 2.5454, "mean_token_accuracy": 0.39310344457626345, "step": 81580 }, { "epoch": 0.08217327850078714, "grad_norm": 11.685694707438968, "learning_rate": 4.98727783453731e-05, "loss": 2.5342, "mean_token_accuracy": 0.4000000059604645, "step": 81585 }, { "epoch": 0.08217831455389131, "grad_norm": 23.34661911038643, "learning_rate": 4.987273855089009e-05, "loss": 2.7759, "mean_token_accuracy": 0.3965517163276672, "step": 81590 }, { "epoch": 0.08218335060699548, "grad_norm": 13.067482193216819, "learning_rate": 4.987269875020191e-05, "loss": 2.6956, "mean_token_accuracy": 0.4034482717514038, "step": 81595 }, { "epoch": 0.08218838666009966, "grad_norm": 10.427030374886247, "learning_rate": 4.9872658943308595e-05, "loss": 2.6692, "mean_token_accuracy": 0.36896551847457887, "step": 81600 }, { "epoch": 0.08219342271320383, "grad_norm": 12.070745073352324, "learning_rate": 4.987261913021013e-05, "loss": 2.5313, "mean_token_accuracy": 0.4068965554237366, "step": 81605 }, { "epoch": 0.082198458766308, "grad_norm": 13.252374773247201, "learning_rate": 4.987257931090654e-05, "loss": 2.6322, "mean_token_accuracy": 0.3965517163276672, "step": 81610 }, { "epoch": 0.08220349481941217, "grad_norm": 10.865300776872681, "learning_rate": 4.987253948539783e-05, "loss": 2.5031, "mean_token_accuracy": 0.3965517163276672, "step": 81615 }, { "epoch": 0.08220853087251634, "grad_norm": 10.05526959551301, "learning_rate": 4.987249965368401e-05, "loss": 2.4438, "mean_token_accuracy": 0.41034482717514037, "step": 81620 }, { "epoch": 0.08221356692562051, "grad_norm": 11.626754750788221, "learning_rate": 4.9872459815765094e-05, "loss": 2.3676, "mean_token_accuracy": 0.44482758045196535, "step": 81625 }, { "epoch": 0.08221860297872469, "grad_norm": 9.994483452143669, "learning_rate": 4.9872419971641095e-05, "loss": 2.1402, "mean_token_accuracy": 0.44482758045196535, "step": 81630 }, { "epoch": 0.08222363903182886, "grad_norm": 14.135512749617225, "learning_rate": 4.987238012131202e-05, "loss": 2.7536, "mean_token_accuracy": 0.37931033968925476, "step": 81635 }, { "epoch": 0.08222867508493303, "grad_norm": 11.11952855182373, "learning_rate": 4.987234026477788e-05, "loss": 2.6523, "mean_token_accuracy": 0.4379310429096222, "step": 81640 }, { "epoch": 0.08223371113803721, "grad_norm": 10.84750169684057, "learning_rate": 4.987230040203869e-05, "loss": 2.4172, "mean_token_accuracy": 0.4103448212146759, "step": 81645 }, { "epoch": 0.08223874719114138, "grad_norm": 10.89622725244941, "learning_rate": 4.987226053309446e-05, "loss": 2.4901, "mean_token_accuracy": 0.42758620977401735, "step": 81650 }, { "epoch": 0.08224378324424556, "grad_norm": 14.345987262528412, "learning_rate": 4.98722206579452e-05, "loss": 2.6574, "mean_token_accuracy": 0.37586206793785093, "step": 81655 }, { "epoch": 0.08224881929734973, "grad_norm": 10.496282400150783, "learning_rate": 4.987218077659093e-05, "loss": 2.4616, "mean_token_accuracy": 0.441379314661026, "step": 81660 }, { "epoch": 0.0822538553504539, "grad_norm": 12.776862838804494, "learning_rate": 4.9872140889031635e-05, "loss": 2.7211, "mean_token_accuracy": 0.37586206793785093, "step": 81665 }, { "epoch": 0.08225889140355808, "grad_norm": 11.222362330731583, "learning_rate": 4.987210099526735e-05, "loss": 2.2948, "mean_token_accuracy": 0.42758620381355283, "step": 81670 }, { "epoch": 0.08226392745666225, "grad_norm": 11.672519332361157, "learning_rate": 4.9872061095298084e-05, "loss": 2.4291, "mean_token_accuracy": 0.46031458377838136, "step": 81675 }, { "epoch": 0.08226896350976642, "grad_norm": 10.197068900600106, "learning_rate": 4.987202118912384e-05, "loss": 2.4444, "mean_token_accuracy": 0.39310344457626345, "step": 81680 }, { "epoch": 0.08227399956287058, "grad_norm": 12.669812135959248, "learning_rate": 4.987198127674464e-05, "loss": 2.6298, "mean_token_accuracy": 0.35862069129943847, "step": 81685 }, { "epoch": 0.08227903561597476, "grad_norm": 10.173115569780382, "learning_rate": 4.987194135816048e-05, "loss": 2.3556, "mean_token_accuracy": 0.42758620381355283, "step": 81690 }, { "epoch": 0.08228407166907893, "grad_norm": 10.16899322916087, "learning_rate": 4.987190143337138e-05, "loss": 2.5863, "mean_token_accuracy": 0.36551724672317504, "step": 81695 }, { "epoch": 0.0822891077221831, "grad_norm": 10.727720209556825, "learning_rate": 4.987186150237734e-05, "loss": 2.3479, "mean_token_accuracy": 0.42758620977401735, "step": 81700 }, { "epoch": 0.08229414377528728, "grad_norm": 10.525654188297539, "learning_rate": 4.9871821565178394e-05, "loss": 2.2155, "mean_token_accuracy": 0.47241379618644713, "step": 81705 }, { "epoch": 0.08229917982839145, "grad_norm": 10.37779794688868, "learning_rate": 4.987178162177453e-05, "loss": 2.5508, "mean_token_accuracy": 0.38965516686439516, "step": 81710 }, { "epoch": 0.08230421588149563, "grad_norm": 10.865760064256426, "learning_rate": 4.987174167216578e-05, "loss": 2.6565, "mean_token_accuracy": 0.38620689511299133, "step": 81715 }, { "epoch": 0.0823092519345998, "grad_norm": 9.424715212643328, "learning_rate": 4.987170171635214e-05, "loss": 2.8124, "mean_token_accuracy": 0.37241377830505373, "step": 81720 }, { "epoch": 0.08231428798770397, "grad_norm": 11.474249265987654, "learning_rate": 4.987166175433361e-05, "loss": 2.6937, "mean_token_accuracy": 0.36896551847457887, "step": 81725 }, { "epoch": 0.08231932404080815, "grad_norm": 14.227867848242319, "learning_rate": 4.987162178611024e-05, "loss": 2.2906, "mean_token_accuracy": 0.41379310488700866, "step": 81730 }, { "epoch": 0.08232436009391232, "grad_norm": 14.620717487780217, "learning_rate": 4.9871581811682014e-05, "loss": 2.7476, "mean_token_accuracy": 0.36206896901130675, "step": 81735 }, { "epoch": 0.0823293961470165, "grad_norm": 10.673865826163299, "learning_rate": 4.987154183104893e-05, "loss": 2.2111, "mean_token_accuracy": 0.41724138259887694, "step": 81740 }, { "epoch": 0.08233443220012067, "grad_norm": 10.240199485641792, "learning_rate": 4.987150184421102e-05, "loss": 2.8363, "mean_token_accuracy": 0.3689655214548111, "step": 81745 }, { "epoch": 0.08233946825322484, "grad_norm": 14.392806777788987, "learning_rate": 4.98714618511683e-05, "loss": 2.7746, "mean_token_accuracy": 0.3896551728248596, "step": 81750 }, { "epoch": 0.082344504306329, "grad_norm": 10.61370188078038, "learning_rate": 4.987142185192078e-05, "loss": 2.7205, "mean_token_accuracy": 0.38747731447219846, "step": 81755 }, { "epoch": 0.08234954035943318, "grad_norm": 12.577605056208963, "learning_rate": 4.9871381846468443e-05, "loss": 3.0185, "mean_token_accuracy": 0.3482758581638336, "step": 81760 }, { "epoch": 0.08235457641253735, "grad_norm": 10.137700347648314, "learning_rate": 4.987134183481132e-05, "loss": 2.1899, "mean_token_accuracy": 0.4034482777118683, "step": 81765 }, { "epoch": 0.08235961246564152, "grad_norm": 12.604500774433353, "learning_rate": 4.987130181694943e-05, "loss": 2.3482, "mean_token_accuracy": 0.4413793087005615, "step": 81770 }, { "epoch": 0.0823646485187457, "grad_norm": 9.320877205301201, "learning_rate": 4.987126179288277e-05, "loss": 1.9868, "mean_token_accuracy": 0.48620688915252686, "step": 81775 }, { "epoch": 0.08236968457184987, "grad_norm": 11.342020207347884, "learning_rate": 4.987122176261136e-05, "loss": 2.6241, "mean_token_accuracy": 0.36896551251411436, "step": 81780 }, { "epoch": 0.08237472062495405, "grad_norm": 11.416143382131429, "learning_rate": 4.9871181726135216e-05, "loss": 2.8221, "mean_token_accuracy": 0.4, "step": 81785 }, { "epoch": 0.08237975667805822, "grad_norm": 11.532615481089342, "learning_rate": 4.9871141683454334e-05, "loss": 2.6775, "mean_token_accuracy": 0.42068964838981626, "step": 81790 }, { "epoch": 0.08238479273116239, "grad_norm": 11.167263412310776, "learning_rate": 4.9871101634568726e-05, "loss": 2.4119, "mean_token_accuracy": 0.43103447556495667, "step": 81795 }, { "epoch": 0.08238982878426657, "grad_norm": 9.038242018533337, "learning_rate": 4.9871061579478425e-05, "loss": 2.7853, "mean_token_accuracy": 0.3896551728248596, "step": 81800 }, { "epoch": 0.08239486483737074, "grad_norm": 13.83529029876891, "learning_rate": 4.987102151818341e-05, "loss": 2.8959, "mean_token_accuracy": 0.417241370677948, "step": 81805 }, { "epoch": 0.08239990089047491, "grad_norm": 10.071578066235098, "learning_rate": 4.9870981450683726e-05, "loss": 2.4057, "mean_token_accuracy": 0.3965517282485962, "step": 81810 }, { "epoch": 0.08240493694357909, "grad_norm": 8.780635461193746, "learning_rate": 4.9870941376979355e-05, "loss": 2.1417, "mean_token_accuracy": 0.5055051445960999, "step": 81815 }, { "epoch": 0.08240997299668326, "grad_norm": 11.764658599877748, "learning_rate": 4.987090129707032e-05, "loss": 2.1675, "mean_token_accuracy": 0.47586206197738645, "step": 81820 }, { "epoch": 0.08241500904978742, "grad_norm": 10.50080843552927, "learning_rate": 4.9870861210956636e-05, "loss": 2.3203, "mean_token_accuracy": 0.4206896543502808, "step": 81825 }, { "epoch": 0.0824200451028916, "grad_norm": 10.950683623588455, "learning_rate": 4.987082111863831e-05, "loss": 2.5013, "mean_token_accuracy": 0.4862068951129913, "step": 81830 }, { "epoch": 0.08242508115599577, "grad_norm": 9.728412376229704, "learning_rate": 4.987078102011535e-05, "loss": 2.9069, "mean_token_accuracy": 0.3931034505367279, "step": 81835 }, { "epoch": 0.08243011720909994, "grad_norm": 10.352568670066955, "learning_rate": 4.987074091538777e-05, "loss": 2.8801, "mean_token_accuracy": 0.42413792610168455, "step": 81840 }, { "epoch": 0.08243515326220412, "grad_norm": 10.76995682927042, "learning_rate": 4.9870700804455586e-05, "loss": 2.5537, "mean_token_accuracy": 0.3999999940395355, "step": 81845 }, { "epoch": 0.08244018931530829, "grad_norm": 14.49241323472634, "learning_rate": 4.987066068731881e-05, "loss": 2.785, "mean_token_accuracy": 0.38275861740112305, "step": 81850 }, { "epoch": 0.08244522536841246, "grad_norm": 10.51078363191253, "learning_rate": 4.987062056397743e-05, "loss": 2.7253, "mean_token_accuracy": 0.3379310339689255, "step": 81855 }, { "epoch": 0.08245026142151664, "grad_norm": 12.917833298624373, "learning_rate": 4.987058043443149e-05, "loss": 2.4394, "mean_token_accuracy": 0.4382335126399994, "step": 81860 }, { "epoch": 0.08245529747462081, "grad_norm": 14.729484288394806, "learning_rate": 4.987054029868098e-05, "loss": 2.8656, "mean_token_accuracy": 0.4049606740474701, "step": 81865 }, { "epoch": 0.08246033352772499, "grad_norm": 10.457324489532406, "learning_rate": 4.987050015672592e-05, "loss": 2.2777, "mean_token_accuracy": 0.4413793087005615, "step": 81870 }, { "epoch": 0.08246536958082916, "grad_norm": 11.187049451528614, "learning_rate": 4.9870460008566315e-05, "loss": 2.4473, "mean_token_accuracy": 0.4344827592372894, "step": 81875 }, { "epoch": 0.08247040563393333, "grad_norm": 10.217322086023508, "learning_rate": 4.987041985420218e-05, "loss": 2.3903, "mean_token_accuracy": 0.35862069129943847, "step": 81880 }, { "epoch": 0.0824754416870375, "grad_norm": 10.32136740904889, "learning_rate": 4.987037969363354e-05, "loss": 2.3202, "mean_token_accuracy": 0.4724137902259827, "step": 81885 }, { "epoch": 0.08248047774014168, "grad_norm": 12.401026039110102, "learning_rate": 4.9870339526860364e-05, "loss": 2.2188, "mean_token_accuracy": 0.4758620738983154, "step": 81890 }, { "epoch": 0.08248551379324584, "grad_norm": 9.001122061774357, "learning_rate": 4.987029935388271e-05, "loss": 2.5788, "mean_token_accuracy": 0.36896551847457887, "step": 81895 }, { "epoch": 0.08249054984635001, "grad_norm": 9.872967106819132, "learning_rate": 4.9870259174700576e-05, "loss": 3.1086, "mean_token_accuracy": 0.36896551847457887, "step": 81900 }, { "epoch": 0.08249558589945419, "grad_norm": 10.563606782590925, "learning_rate": 4.987021898931395e-05, "loss": 2.3515, "mean_token_accuracy": 0.4068965494632721, "step": 81905 }, { "epoch": 0.08250062195255836, "grad_norm": 10.618502337308142, "learning_rate": 4.9870178797722865e-05, "loss": 2.6355, "mean_token_accuracy": 0.41379310488700866, "step": 81910 }, { "epoch": 0.08250565800566254, "grad_norm": 10.174453803502516, "learning_rate": 4.987013859992733e-05, "loss": 2.338, "mean_token_accuracy": 0.43793103098869324, "step": 81915 }, { "epoch": 0.08251069405876671, "grad_norm": 9.687709146429334, "learning_rate": 4.9870098395927354e-05, "loss": 2.4959, "mean_token_accuracy": 0.38275861740112305, "step": 81920 }, { "epoch": 0.08251573011187088, "grad_norm": 10.633532069546701, "learning_rate": 4.987005818572295e-05, "loss": 2.435, "mean_token_accuracy": 0.3931034505367279, "step": 81925 }, { "epoch": 0.08252076616497506, "grad_norm": 8.707036057138948, "learning_rate": 4.987001796931412e-05, "loss": 2.4132, "mean_token_accuracy": 0.4620689630508423, "step": 81930 }, { "epoch": 0.08252580221807923, "grad_norm": 12.281550205326257, "learning_rate": 4.9869977746700884e-05, "loss": 1.981, "mean_token_accuracy": 0.48965516686439514, "step": 81935 }, { "epoch": 0.0825308382711834, "grad_norm": 11.734018832835128, "learning_rate": 4.986993751788326e-05, "loss": 2.7577, "mean_token_accuracy": 0.3482758581638336, "step": 81940 }, { "epoch": 0.08253587432428758, "grad_norm": 14.694972533357946, "learning_rate": 4.986989728286124e-05, "loss": 2.7719, "mean_token_accuracy": 0.36551723480224607, "step": 81945 }, { "epoch": 0.08254091037739175, "grad_norm": 11.969210016525244, "learning_rate": 4.986985704163485e-05, "loss": 2.3409, "mean_token_accuracy": 0.4344827592372894, "step": 81950 }, { "epoch": 0.08254594643049593, "grad_norm": 10.201358146922619, "learning_rate": 4.98698167942041e-05, "loss": 2.3967, "mean_token_accuracy": 0.4655172526836395, "step": 81955 }, { "epoch": 0.0825509824836001, "grad_norm": 12.047767625250296, "learning_rate": 4.986977654056899e-05, "loss": 2.9362, "mean_token_accuracy": 0.32758620381355286, "step": 81960 }, { "epoch": 0.08255601853670426, "grad_norm": 10.21606920406701, "learning_rate": 4.986973628072954e-05, "loss": 2.3996, "mean_token_accuracy": 0.4, "step": 81965 }, { "epoch": 0.08256105458980843, "grad_norm": 20.35672984140577, "learning_rate": 4.9869696014685766e-05, "loss": 3.4923, "mean_token_accuracy": 0.3517241418361664, "step": 81970 }, { "epoch": 0.0825660906429126, "grad_norm": 11.084608581569237, "learning_rate": 4.986965574243767e-05, "loss": 2.2638, "mean_token_accuracy": 0.44482759237289426, "step": 81975 }, { "epoch": 0.08257112669601678, "grad_norm": 18.233912395461548, "learning_rate": 4.986961546398527e-05, "loss": 2.6584, "mean_token_accuracy": 0.40689654350280763, "step": 81980 }, { "epoch": 0.08257616274912095, "grad_norm": 11.384874414194416, "learning_rate": 4.986957517932857e-05, "loss": 2.1579, "mean_token_accuracy": 0.49165154695510865, "step": 81985 }, { "epoch": 0.08258119880222513, "grad_norm": 12.94025713846464, "learning_rate": 4.9869534888467584e-05, "loss": 2.5271, "mean_token_accuracy": 0.38965516686439516, "step": 81990 }, { "epoch": 0.0825862348553293, "grad_norm": 11.449083551705915, "learning_rate": 4.9869494591402325e-05, "loss": 2.6842, "mean_token_accuracy": 0.36896551847457887, "step": 81995 }, { "epoch": 0.08259127090843348, "grad_norm": 13.377856212639013, "learning_rate": 4.986945428813281e-05, "loss": 2.6191, "mean_token_accuracy": 0.4034482717514038, "step": 82000 }, { "epoch": 0.08259630696153765, "grad_norm": 11.388965822941854, "learning_rate": 4.986941397865903e-05, "loss": 2.5936, "mean_token_accuracy": 0.36551723480224607, "step": 82005 }, { "epoch": 0.08260134301464182, "grad_norm": 11.286257685456427, "learning_rate": 4.986937366298102e-05, "loss": 2.3464, "mean_token_accuracy": 0.3896551787853241, "step": 82010 }, { "epoch": 0.082606379067746, "grad_norm": 9.294967098094592, "learning_rate": 4.9869333341098776e-05, "loss": 2.1069, "mean_token_accuracy": 0.5000000059604645, "step": 82015 }, { "epoch": 0.08261141512085017, "grad_norm": 11.199128741270535, "learning_rate": 4.986929301301232e-05, "loss": 2.5909, "mean_token_accuracy": 0.38620689511299133, "step": 82020 }, { "epoch": 0.08261645117395434, "grad_norm": 14.091829685915462, "learning_rate": 4.986925267872165e-05, "loss": 2.875, "mean_token_accuracy": 0.3310344755649567, "step": 82025 }, { "epoch": 0.08262148722705852, "grad_norm": 11.736163944762179, "learning_rate": 4.986921233822678e-05, "loss": 2.731, "mean_token_accuracy": 0.34137930870056155, "step": 82030 }, { "epoch": 0.08262652328016268, "grad_norm": 13.325874216542108, "learning_rate": 4.9869171991527736e-05, "loss": 2.5499, "mean_token_accuracy": 0.3965517282485962, "step": 82035 }, { "epoch": 0.08263155933326685, "grad_norm": 11.927570398095131, "learning_rate": 4.9869131638624514e-05, "loss": 2.5634, "mean_token_accuracy": 0.41379311084747317, "step": 82040 }, { "epoch": 0.08263659538637103, "grad_norm": 11.239408786162556, "learning_rate": 4.9869091279517134e-05, "loss": 2.5338, "mean_token_accuracy": 0.4206896543502808, "step": 82045 }, { "epoch": 0.0826416314394752, "grad_norm": 12.245354198665796, "learning_rate": 4.98690509142056e-05, "loss": 2.4502, "mean_token_accuracy": 0.38620689511299133, "step": 82050 }, { "epoch": 0.08264666749257937, "grad_norm": 10.740303854170767, "learning_rate": 4.9869010542689924e-05, "loss": 2.6616, "mean_token_accuracy": 0.4068965494632721, "step": 82055 }, { "epoch": 0.08265170354568355, "grad_norm": 11.809428567162636, "learning_rate": 4.986897016497012e-05, "loss": 2.3787, "mean_token_accuracy": 0.4, "step": 82060 }, { "epoch": 0.08265673959878772, "grad_norm": 9.6422481385882, "learning_rate": 4.9868929781046206e-05, "loss": 2.377, "mean_token_accuracy": 0.4434966742992401, "step": 82065 }, { "epoch": 0.0826617756518919, "grad_norm": 10.301294175866536, "learning_rate": 4.9868889390918176e-05, "loss": 2.1604, "mean_token_accuracy": 0.43272837400436404, "step": 82070 }, { "epoch": 0.08266681170499607, "grad_norm": 10.072941030867309, "learning_rate": 4.986884899458606e-05, "loss": 2.7969, "mean_token_accuracy": 0.38620689511299133, "step": 82075 }, { "epoch": 0.08267184775810024, "grad_norm": 10.914905542380184, "learning_rate": 4.9868808592049855e-05, "loss": 2.4481, "mean_token_accuracy": 0.4328493714332581, "step": 82080 }, { "epoch": 0.08267688381120442, "grad_norm": 12.500324915024295, "learning_rate": 4.986876818330958e-05, "loss": 2.5834, "mean_token_accuracy": 0.36206896901130675, "step": 82085 }, { "epoch": 0.08268191986430859, "grad_norm": 11.138756071296362, "learning_rate": 4.9868727768365236e-05, "loss": 2.4558, "mean_token_accuracy": 0.39679802060127256, "step": 82090 }, { "epoch": 0.08268695591741276, "grad_norm": 9.875283062794104, "learning_rate": 4.9868687347216854e-05, "loss": 2.2244, "mean_token_accuracy": 0.39655172228813174, "step": 82095 }, { "epoch": 0.08269199197051694, "grad_norm": 9.371911844877621, "learning_rate": 4.986864691986442e-05, "loss": 2.5634, "mean_token_accuracy": 0.4103448331356049, "step": 82100 }, { "epoch": 0.0826970280236211, "grad_norm": 11.787498245271317, "learning_rate": 4.986860648630797e-05, "loss": 2.496, "mean_token_accuracy": 0.4417487621307373, "step": 82105 }, { "epoch": 0.08270206407672527, "grad_norm": 8.76541442878178, "learning_rate": 4.9868566046547495e-05, "loss": 2.2096, "mean_token_accuracy": 0.43103448748588563, "step": 82110 }, { "epoch": 0.08270710012982944, "grad_norm": 11.857872328799202, "learning_rate": 4.986852560058302e-05, "loss": 2.1527, "mean_token_accuracy": 0.49171203970909116, "step": 82115 }, { "epoch": 0.08271213618293362, "grad_norm": 12.348058283588381, "learning_rate": 4.986848514841455e-05, "loss": 2.0096, "mean_token_accuracy": 0.5413793087005615, "step": 82120 }, { "epoch": 0.08271717223603779, "grad_norm": 12.29504213267743, "learning_rate": 4.98684446900421e-05, "loss": 2.278, "mean_token_accuracy": 0.41034482717514037, "step": 82125 }, { "epoch": 0.08272220828914197, "grad_norm": 15.624190110093977, "learning_rate": 4.986840422546567e-05, "loss": 2.1763, "mean_token_accuracy": 0.4758620738983154, "step": 82130 }, { "epoch": 0.08272724434224614, "grad_norm": 12.995336919819847, "learning_rate": 4.9868363754685286e-05, "loss": 2.5681, "mean_token_accuracy": 0.36206897497177126, "step": 82135 }, { "epoch": 0.08273228039535031, "grad_norm": 11.552659013863321, "learning_rate": 4.986832327770096e-05, "loss": 2.5575, "mean_token_accuracy": 0.4034482777118683, "step": 82140 }, { "epoch": 0.08273731644845449, "grad_norm": 14.038935445953188, "learning_rate": 4.9868282794512685e-05, "loss": 2.9021, "mean_token_accuracy": 0.4013309121131897, "step": 82145 }, { "epoch": 0.08274235250155866, "grad_norm": 10.669460757107403, "learning_rate": 4.9868242305120485e-05, "loss": 2.7179, "mean_token_accuracy": 0.3379310369491577, "step": 82150 }, { "epoch": 0.08274738855466283, "grad_norm": 14.202824286472207, "learning_rate": 4.986820180952437e-05, "loss": 3.0896, "mean_token_accuracy": 0.337931028008461, "step": 82155 }, { "epoch": 0.08275242460776701, "grad_norm": 11.511900297238604, "learning_rate": 4.986816130772435e-05, "loss": 2.37, "mean_token_accuracy": 0.4448275864124298, "step": 82160 }, { "epoch": 0.08275746066087118, "grad_norm": 12.12579496229365, "learning_rate": 4.986812079972044e-05, "loss": 3.1261, "mean_token_accuracy": 0.31379310190677645, "step": 82165 }, { "epoch": 0.08276249671397536, "grad_norm": 11.069243108656195, "learning_rate": 4.9868080285512645e-05, "loss": 2.4434, "mean_token_accuracy": 0.4275861978530884, "step": 82170 }, { "epoch": 0.08276753276707952, "grad_norm": 10.270612461211211, "learning_rate": 4.986803976510098e-05, "loss": 2.5181, "mean_token_accuracy": 0.4517241358757019, "step": 82175 }, { "epoch": 0.08277256882018369, "grad_norm": 16.201747485654018, "learning_rate": 4.9867999238485455e-05, "loss": 2.2256, "mean_token_accuracy": 0.46551724076271056, "step": 82180 }, { "epoch": 0.08277760487328786, "grad_norm": 12.974753120540193, "learning_rate": 4.9867958705666085e-05, "loss": 2.3936, "mean_token_accuracy": 0.4344827592372894, "step": 82185 }, { "epoch": 0.08278264092639204, "grad_norm": 11.66416062296085, "learning_rate": 4.986791816664288e-05, "loss": 2.7619, "mean_token_accuracy": 0.431280779838562, "step": 82190 }, { "epoch": 0.08278767697949621, "grad_norm": 11.579143840457157, "learning_rate": 4.986787762141584e-05, "loss": 2.477, "mean_token_accuracy": 0.47496975064277647, "step": 82195 }, { "epoch": 0.08279271303260038, "grad_norm": 11.042978300523743, "learning_rate": 4.9867837069985e-05, "loss": 2.4684, "mean_token_accuracy": 0.3965517282485962, "step": 82200 }, { "epoch": 0.08279774908570456, "grad_norm": 11.23755340544159, "learning_rate": 4.986779651235035e-05, "loss": 2.5728, "mean_token_accuracy": 0.43448275327682495, "step": 82205 }, { "epoch": 0.08280278513880873, "grad_norm": 10.10717044253312, "learning_rate": 4.9867755948511906e-05, "loss": 2.3824, "mean_token_accuracy": 0.40169388651847837, "step": 82210 }, { "epoch": 0.0828078211919129, "grad_norm": 9.12514766672047, "learning_rate": 4.986771537846968e-05, "loss": 2.2814, "mean_token_accuracy": 0.4482758641242981, "step": 82215 }, { "epoch": 0.08281285724501708, "grad_norm": 11.593216219970484, "learning_rate": 4.986767480222369e-05, "loss": 2.1515, "mean_token_accuracy": 0.46206897497177124, "step": 82220 }, { "epoch": 0.08281789329812125, "grad_norm": 10.792839425321397, "learning_rate": 4.986763421977394e-05, "loss": 2.1905, "mean_token_accuracy": 0.46551724076271056, "step": 82225 }, { "epoch": 0.08282292935122543, "grad_norm": 9.24773944485411, "learning_rate": 4.986759363112044e-05, "loss": 2.3699, "mean_token_accuracy": 0.38965516686439516, "step": 82230 }, { "epoch": 0.0828279654043296, "grad_norm": 19.49526015743598, "learning_rate": 4.986755303626321e-05, "loss": 3.0063, "mean_token_accuracy": 0.3895320177078247, "step": 82235 }, { "epoch": 0.08283300145743377, "grad_norm": 11.23071830529706, "learning_rate": 4.986751243520225e-05, "loss": 2.427, "mean_token_accuracy": 0.4172413766384125, "step": 82240 }, { "epoch": 0.08283803751053793, "grad_norm": 13.545678706775375, "learning_rate": 4.9867471827937576e-05, "loss": 2.7822, "mean_token_accuracy": 0.3965517163276672, "step": 82245 }, { "epoch": 0.08284307356364211, "grad_norm": 21.45802840981344, "learning_rate": 4.986743121446921e-05, "loss": 2.6908, "mean_token_accuracy": 0.4, "step": 82250 }, { "epoch": 0.08284810961674628, "grad_norm": 17.136502283038606, "learning_rate": 4.986739059479715e-05, "loss": 2.5603, "mean_token_accuracy": 0.38620689511299133, "step": 82255 }, { "epoch": 0.08285314566985046, "grad_norm": 15.125587180355422, "learning_rate": 4.98673499689214e-05, "loss": 2.8803, "mean_token_accuracy": 0.3482758700847626, "step": 82260 }, { "epoch": 0.08285818172295463, "grad_norm": 15.590895857715374, "learning_rate": 4.986730933684199e-05, "loss": 2.8564, "mean_token_accuracy": 0.3620689630508423, "step": 82265 }, { "epoch": 0.0828632177760588, "grad_norm": 12.05218254791798, "learning_rate": 4.9867268698558936e-05, "loss": 2.5547, "mean_token_accuracy": 0.42758620977401735, "step": 82270 }, { "epoch": 0.08286825382916298, "grad_norm": 11.379713224787885, "learning_rate": 4.986722805407222e-05, "loss": 2.7613, "mean_token_accuracy": 0.3896551787853241, "step": 82275 }, { "epoch": 0.08287328988226715, "grad_norm": 15.58638918653899, "learning_rate": 4.986718740338187e-05, "loss": 2.5878, "mean_token_accuracy": 0.441379314661026, "step": 82280 }, { "epoch": 0.08287832593537132, "grad_norm": 13.41194509230148, "learning_rate": 4.98671467464879e-05, "loss": 2.8009, "mean_token_accuracy": 0.34252873361110686, "step": 82285 }, { "epoch": 0.0828833619884755, "grad_norm": 10.568042948199562, "learning_rate": 4.9867106083390325e-05, "loss": 2.4719, "mean_token_accuracy": 0.4275861978530884, "step": 82290 }, { "epoch": 0.08288839804157967, "grad_norm": 10.72876268931549, "learning_rate": 4.9867065414089146e-05, "loss": 2.3343, "mean_token_accuracy": 0.458620685338974, "step": 82295 }, { "epoch": 0.08289343409468385, "grad_norm": 12.059655562712353, "learning_rate": 4.986702473858437e-05, "loss": 2.6755, "mean_token_accuracy": 0.36896551251411436, "step": 82300 }, { "epoch": 0.08289847014778802, "grad_norm": 11.985264149627843, "learning_rate": 4.986698405687603e-05, "loss": 2.0459, "mean_token_accuracy": 0.47761645913124084, "step": 82305 }, { "epoch": 0.0829035062008922, "grad_norm": 10.939417492358507, "learning_rate": 4.986694336896411e-05, "loss": 2.3665, "mean_token_accuracy": 0.43793103098869324, "step": 82310 }, { "epoch": 0.08290854225399635, "grad_norm": 11.373189070197762, "learning_rate": 4.986690267484864e-05, "loss": 2.2552, "mean_token_accuracy": 0.43448275327682495, "step": 82315 }, { "epoch": 0.08291357830710053, "grad_norm": 11.206124896487783, "learning_rate": 4.9866861974529636e-05, "loss": 2.1531, "mean_token_accuracy": 0.4379310369491577, "step": 82320 }, { "epoch": 0.0829186143602047, "grad_norm": 12.015208372476694, "learning_rate": 4.9866821268007085e-05, "loss": 2.4326, "mean_token_accuracy": 0.41724138259887694, "step": 82325 }, { "epoch": 0.08292365041330887, "grad_norm": 11.283109762344512, "learning_rate": 4.9866780555281014e-05, "loss": 2.1348, "mean_token_accuracy": 0.44827585816383364, "step": 82330 }, { "epoch": 0.08292868646641305, "grad_norm": 10.400926014463256, "learning_rate": 4.9866739836351436e-05, "loss": 2.5959, "mean_token_accuracy": 0.42758620381355283, "step": 82335 }, { "epoch": 0.08293372251951722, "grad_norm": 10.264456714365265, "learning_rate": 4.9866699111218366e-05, "loss": 2.5533, "mean_token_accuracy": 0.3793103456497192, "step": 82340 }, { "epoch": 0.0829387585726214, "grad_norm": 11.470615730278977, "learning_rate": 4.986665837988181e-05, "loss": 2.6789, "mean_token_accuracy": 0.37241379618644715, "step": 82345 }, { "epoch": 0.08294379462572557, "grad_norm": 11.418931579053867, "learning_rate": 4.9866617642341765e-05, "loss": 2.7184, "mean_token_accuracy": 0.3620689630508423, "step": 82350 }, { "epoch": 0.08294883067882974, "grad_norm": 9.797881937803332, "learning_rate": 4.986657689859826e-05, "loss": 2.4456, "mean_token_accuracy": 0.3827586233615875, "step": 82355 }, { "epoch": 0.08295386673193392, "grad_norm": 13.552657740748518, "learning_rate": 4.98665361486513e-05, "loss": 2.6914, "mean_token_accuracy": 0.36896551847457887, "step": 82360 }, { "epoch": 0.08295890278503809, "grad_norm": 10.309420123735132, "learning_rate": 4.9866495392500906e-05, "loss": 2.4082, "mean_token_accuracy": 0.4206896543502808, "step": 82365 }, { "epoch": 0.08296393883814226, "grad_norm": 10.242104718101537, "learning_rate": 4.986645463014707e-05, "loss": 2.4196, "mean_token_accuracy": 0.4517241418361664, "step": 82370 }, { "epoch": 0.08296897489124644, "grad_norm": 10.258008269604604, "learning_rate": 4.986641386158982e-05, "loss": 1.7767, "mean_token_accuracy": 0.510344821214676, "step": 82375 }, { "epoch": 0.08297401094435061, "grad_norm": 13.11827123491122, "learning_rate": 4.9866373086829165e-05, "loss": 2.7863, "mean_token_accuracy": 0.37241379022598264, "step": 82380 }, { "epoch": 0.08297904699745477, "grad_norm": 8.276961334496558, "learning_rate": 4.986633230586511e-05, "loss": 2.033, "mean_token_accuracy": 0.4689655125141144, "step": 82385 }, { "epoch": 0.08298408305055895, "grad_norm": 11.129658267833584, "learning_rate": 4.9866291518697663e-05, "loss": 2.4358, "mean_token_accuracy": 0.4172413766384125, "step": 82390 }, { "epoch": 0.08298911910366312, "grad_norm": 11.311449258103943, "learning_rate": 4.986625072532685e-05, "loss": 2.4932, "mean_token_accuracy": 0.4103448331356049, "step": 82395 }, { "epoch": 0.0829941551567673, "grad_norm": 10.540236135203783, "learning_rate": 4.986620992575268e-05, "loss": 2.4471, "mean_token_accuracy": 0.4068965494632721, "step": 82400 }, { "epoch": 0.08299919120987147, "grad_norm": 10.80015560392916, "learning_rate": 4.986616911997514e-05, "loss": 2.2977, "mean_token_accuracy": 0.4517241418361664, "step": 82405 }, { "epoch": 0.08300422726297564, "grad_norm": 10.115286718282931, "learning_rate": 4.9866128307994275e-05, "loss": 2.3791, "mean_token_accuracy": 0.4068965494632721, "step": 82410 }, { "epoch": 0.08300926331607981, "grad_norm": 17.769597000061392, "learning_rate": 4.986608748981007e-05, "loss": 2.1898, "mean_token_accuracy": 0.5000000059604645, "step": 82415 }, { "epoch": 0.08301429936918399, "grad_norm": 11.613024122986275, "learning_rate": 4.986604666542255e-05, "loss": 2.6336, "mean_token_accuracy": 0.39310344457626345, "step": 82420 }, { "epoch": 0.08301933542228816, "grad_norm": 11.693155807341295, "learning_rate": 4.986600583483173e-05, "loss": 2.5195, "mean_token_accuracy": 0.3999999940395355, "step": 82425 }, { "epoch": 0.08302437147539234, "grad_norm": 11.018920325517906, "learning_rate": 4.9865964998037605e-05, "loss": 3.0672, "mean_token_accuracy": 0.35862068831920624, "step": 82430 }, { "epoch": 0.08302940752849651, "grad_norm": 10.247827377121986, "learning_rate": 4.9865924155040205e-05, "loss": 2.4922, "mean_token_accuracy": 0.4034482717514038, "step": 82435 }, { "epoch": 0.08303444358160068, "grad_norm": 12.609278637767037, "learning_rate": 4.986588330583953e-05, "loss": 2.5104, "mean_token_accuracy": 0.4413793087005615, "step": 82440 }, { "epoch": 0.08303947963470486, "grad_norm": 11.592829360869892, "learning_rate": 4.986584245043559e-05, "loss": 2.3057, "mean_token_accuracy": 0.4517241299152374, "step": 82445 }, { "epoch": 0.08304451568780903, "grad_norm": 10.442397922436294, "learning_rate": 4.9865801588828406e-05, "loss": 2.3694, "mean_token_accuracy": 0.38965516686439516, "step": 82450 }, { "epoch": 0.08304955174091319, "grad_norm": 10.234166271392697, "learning_rate": 4.986576072101798e-05, "loss": 2.5674, "mean_token_accuracy": 0.43629764318466185, "step": 82455 }, { "epoch": 0.08305458779401736, "grad_norm": 12.070769226955278, "learning_rate": 4.986571984700433e-05, "loss": 2.4629, "mean_token_accuracy": 0.4517241418361664, "step": 82460 }, { "epoch": 0.08305962384712154, "grad_norm": 10.769423826566602, "learning_rate": 4.986567896678746e-05, "loss": 2.2905, "mean_token_accuracy": 0.4551724135875702, "step": 82465 }, { "epoch": 0.08306465990022571, "grad_norm": 12.663762146462663, "learning_rate": 4.9865638080367384e-05, "loss": 2.3578, "mean_token_accuracy": 0.4137930989265442, "step": 82470 }, { "epoch": 0.08306969595332989, "grad_norm": 12.634583502315403, "learning_rate": 4.9865597187744125e-05, "loss": 2.5017, "mean_token_accuracy": 0.4034482777118683, "step": 82475 }, { "epoch": 0.08307473200643406, "grad_norm": 10.270875818208602, "learning_rate": 4.986555628891767e-05, "loss": 2.7021, "mean_token_accuracy": 0.35172414779663086, "step": 82480 }, { "epoch": 0.08307976805953823, "grad_norm": 10.042170572891742, "learning_rate": 4.986551538388806e-05, "loss": 2.4761, "mean_token_accuracy": 0.4379310250282288, "step": 82485 }, { "epoch": 0.08308480411264241, "grad_norm": 12.715301454530257, "learning_rate": 4.986547447265527e-05, "loss": 2.6405, "mean_token_accuracy": 0.45172413885593415, "step": 82490 }, { "epoch": 0.08308984016574658, "grad_norm": 9.966131609728272, "learning_rate": 4.986543355521935e-05, "loss": 2.8094, "mean_token_accuracy": 0.39310344457626345, "step": 82495 }, { "epoch": 0.08309487621885076, "grad_norm": 10.081602601673959, "learning_rate": 4.986539263158029e-05, "loss": 2.3483, "mean_token_accuracy": 0.5103448271751404, "step": 82500 }, { "epoch": 0.08309991227195493, "grad_norm": 11.672426357501793, "learning_rate": 4.98653517017381e-05, "loss": 2.6305, "mean_token_accuracy": 0.37241379022598264, "step": 82505 }, { "epoch": 0.0831049483250591, "grad_norm": 10.545887748686948, "learning_rate": 4.98653107656928e-05, "loss": 2.2094, "mean_token_accuracy": 0.41034482717514037, "step": 82510 }, { "epoch": 0.08310998437816328, "grad_norm": 14.434290881032153, "learning_rate": 4.98652698234444e-05, "loss": 3.1225, "mean_token_accuracy": 0.32758620381355286, "step": 82515 }, { "epoch": 0.08311502043126745, "grad_norm": 10.939133217799744, "learning_rate": 4.98652288749929e-05, "loss": 2.6625, "mean_token_accuracy": 0.3517241358757019, "step": 82520 }, { "epoch": 0.08312005648437161, "grad_norm": 9.127346029818197, "learning_rate": 4.986518792033833e-05, "loss": 2.3697, "mean_token_accuracy": 0.46896551847457885, "step": 82525 }, { "epoch": 0.08312509253747578, "grad_norm": 9.41830891574775, "learning_rate": 4.986514695948069e-05, "loss": 2.2095, "mean_token_accuracy": 0.4724137902259827, "step": 82530 }, { "epoch": 0.08313012859057996, "grad_norm": 11.182816224776074, "learning_rate": 4.9865105992419985e-05, "loss": 2.2927, "mean_token_accuracy": 0.46551724076271056, "step": 82535 }, { "epoch": 0.08313516464368413, "grad_norm": 12.831482045296415, "learning_rate": 4.986506501915625e-05, "loss": 2.6978, "mean_token_accuracy": 0.3737447053194046, "step": 82540 }, { "epoch": 0.0831402006967883, "grad_norm": 9.21525293783706, "learning_rate": 4.986502403968947e-05, "loss": 2.8101, "mean_token_accuracy": 0.35172414481639863, "step": 82545 }, { "epoch": 0.08314523674989248, "grad_norm": 9.180655615267488, "learning_rate": 4.986498305401967e-05, "loss": 2.162, "mean_token_accuracy": 0.4310344815254211, "step": 82550 }, { "epoch": 0.08315027280299665, "grad_norm": 13.102176631612645, "learning_rate": 4.9864942062146854e-05, "loss": 2.8436, "mean_token_accuracy": 0.3517241358757019, "step": 82555 }, { "epoch": 0.08315530885610083, "grad_norm": 11.055183765231957, "learning_rate": 4.986490106407104e-05, "loss": 2.435, "mean_token_accuracy": 0.4034482717514038, "step": 82560 }, { "epoch": 0.083160344909205, "grad_norm": 11.00716294889182, "learning_rate": 4.9864860059792233e-05, "loss": 2.5923, "mean_token_accuracy": 0.3931034505367279, "step": 82565 }, { "epoch": 0.08316538096230917, "grad_norm": 8.597140943359383, "learning_rate": 4.986481904931046e-05, "loss": 2.2193, "mean_token_accuracy": 0.4814881980419159, "step": 82570 }, { "epoch": 0.08317041701541335, "grad_norm": 11.88079292483633, "learning_rate": 4.9864778032625714e-05, "loss": 2.3942, "mean_token_accuracy": 0.4241379380226135, "step": 82575 }, { "epoch": 0.08317545306851752, "grad_norm": 10.130839029263173, "learning_rate": 4.986473700973802e-05, "loss": 2.4907, "mean_token_accuracy": 0.43793103098869324, "step": 82580 }, { "epoch": 0.0831804891216217, "grad_norm": 11.058094850283616, "learning_rate": 4.986469598064738e-05, "loss": 2.6836, "mean_token_accuracy": 0.3793103456497192, "step": 82585 }, { "epoch": 0.08318552517472587, "grad_norm": 10.541970667913601, "learning_rate": 4.986465494535379e-05, "loss": 2.367, "mean_token_accuracy": 0.47931034564971925, "step": 82590 }, { "epoch": 0.08319056122783003, "grad_norm": 10.298919433337783, "learning_rate": 4.98646139038573e-05, "loss": 2.4729, "mean_token_accuracy": 0.4206896543502808, "step": 82595 }, { "epoch": 0.0831955972809342, "grad_norm": 10.396375609137843, "learning_rate": 4.98645728561579e-05, "loss": 2.3314, "mean_token_accuracy": 0.42068964838981626, "step": 82600 }, { "epoch": 0.08320063333403838, "grad_norm": 11.24772062722133, "learning_rate": 4.9864531802255596e-05, "loss": 2.2713, "mean_token_accuracy": 0.4379310369491577, "step": 82605 }, { "epoch": 0.08320566938714255, "grad_norm": 12.439737389563897, "learning_rate": 4.98644907421504e-05, "loss": 2.2975, "mean_token_accuracy": 0.4206896543502808, "step": 82610 }, { "epoch": 0.08321070544024672, "grad_norm": 11.990642166322042, "learning_rate": 4.986444967584235e-05, "loss": 2.4687, "mean_token_accuracy": 0.3655172407627106, "step": 82615 }, { "epoch": 0.0832157414933509, "grad_norm": 10.643555331061288, "learning_rate": 4.9864408603331416e-05, "loss": 2.9553, "mean_token_accuracy": 0.3703569293022156, "step": 82620 }, { "epoch": 0.08322077754645507, "grad_norm": 11.394726795178814, "learning_rate": 4.986436752461764e-05, "loss": 2.3523, "mean_token_accuracy": 0.47029643058776854, "step": 82625 }, { "epoch": 0.08322581359955925, "grad_norm": 9.645019771558594, "learning_rate": 4.986432643970102e-05, "loss": 2.3189, "mean_token_accuracy": 0.4, "step": 82630 }, { "epoch": 0.08323084965266342, "grad_norm": 9.794633256007705, "learning_rate": 4.9864285348581565e-05, "loss": 2.4805, "mean_token_accuracy": 0.4137930989265442, "step": 82635 }, { "epoch": 0.08323588570576759, "grad_norm": 9.415129871817223, "learning_rate": 4.98642442512593e-05, "loss": 2.5274, "mean_token_accuracy": 0.499939501285553, "step": 82640 }, { "epoch": 0.08324092175887177, "grad_norm": 9.681877618821478, "learning_rate": 4.9864203147734226e-05, "loss": 2.5213, "mean_token_accuracy": 0.36896551847457887, "step": 82645 }, { "epoch": 0.08324595781197594, "grad_norm": 11.284220978964598, "learning_rate": 4.9864162038006354e-05, "loss": 2.3518, "mean_token_accuracy": 0.42413792610168455, "step": 82650 }, { "epoch": 0.08325099386508011, "grad_norm": 12.299902521987429, "learning_rate": 4.98641209220757e-05, "loss": 2.3673, "mean_token_accuracy": 0.42758620977401735, "step": 82655 }, { "epoch": 0.08325602991818429, "grad_norm": 14.359614690450313, "learning_rate": 4.9864079799942284e-05, "loss": 2.7329, "mean_token_accuracy": 0.36896551251411436, "step": 82660 }, { "epoch": 0.08326106597128845, "grad_norm": 10.127414832485723, "learning_rate": 4.9864038671606094e-05, "loss": 2.1576, "mean_token_accuracy": 0.4241379380226135, "step": 82665 }, { "epoch": 0.08326610202439262, "grad_norm": 15.438443729499125, "learning_rate": 4.986399753706716e-05, "loss": 2.7622, "mean_token_accuracy": 0.4172413766384125, "step": 82670 }, { "epoch": 0.0832711380774968, "grad_norm": 15.747582974856801, "learning_rate": 4.9863956396325485e-05, "loss": 2.4678, "mean_token_accuracy": 0.42758620977401735, "step": 82675 }, { "epoch": 0.08327617413060097, "grad_norm": 11.205610932179288, "learning_rate": 4.9863915249381085e-05, "loss": 2.7303, "mean_token_accuracy": 0.3862068921327591, "step": 82680 }, { "epoch": 0.08328121018370514, "grad_norm": 10.75878697001982, "learning_rate": 4.986387409623397e-05, "loss": 2.5576, "mean_token_accuracy": 0.38275861740112305, "step": 82685 }, { "epoch": 0.08328624623680932, "grad_norm": 10.329895116175349, "learning_rate": 4.9863832936884145e-05, "loss": 2.1803, "mean_token_accuracy": 0.47586206197738645, "step": 82690 }, { "epoch": 0.08329128228991349, "grad_norm": 11.408368361612242, "learning_rate": 4.986379177133163e-05, "loss": 2.3212, "mean_token_accuracy": 0.44137930274009707, "step": 82695 }, { "epoch": 0.08329631834301766, "grad_norm": 11.429149217489485, "learning_rate": 4.9863750599576444e-05, "loss": 2.6446, "mean_token_accuracy": 0.4068965554237366, "step": 82700 }, { "epoch": 0.08330135439612184, "grad_norm": 9.610288105148737, "learning_rate": 4.986370942161858e-05, "loss": 2.3372, "mean_token_accuracy": 0.4103448212146759, "step": 82705 }, { "epoch": 0.08330639044922601, "grad_norm": 12.099681776037754, "learning_rate": 4.986366823745806e-05, "loss": 2.649, "mean_token_accuracy": 0.4413793087005615, "step": 82710 }, { "epoch": 0.08331142650233019, "grad_norm": 12.518378304491279, "learning_rate": 4.986362704709489e-05, "loss": 2.5691, "mean_token_accuracy": 0.4103448212146759, "step": 82715 }, { "epoch": 0.08331646255543436, "grad_norm": 10.520785558671852, "learning_rate": 4.986358585052908e-05, "loss": 1.9944, "mean_token_accuracy": 0.47241379618644713, "step": 82720 }, { "epoch": 0.08332149860853853, "grad_norm": 10.79122326106139, "learning_rate": 4.9863544647760655e-05, "loss": 2.1735, "mean_token_accuracy": 0.4310344815254211, "step": 82725 }, { "epoch": 0.0833265346616427, "grad_norm": 11.989668210155806, "learning_rate": 4.986350343878962e-05, "loss": 2.3488, "mean_token_accuracy": 0.4413793087005615, "step": 82730 }, { "epoch": 0.08333157071474687, "grad_norm": 10.370662232070792, "learning_rate": 4.986346222361598e-05, "loss": 2.6357, "mean_token_accuracy": 0.3571082890033722, "step": 82735 }, { "epoch": 0.08333660676785104, "grad_norm": 12.986089711490512, "learning_rate": 4.986342100223974e-05, "loss": 2.6291, "mean_token_accuracy": 0.4103448331356049, "step": 82740 }, { "epoch": 0.08334164282095521, "grad_norm": 9.429207836036195, "learning_rate": 4.986337977466094e-05, "loss": 2.4347, "mean_token_accuracy": 0.41379310488700866, "step": 82745 }, { "epoch": 0.08334667887405939, "grad_norm": 11.71014257183348, "learning_rate": 4.986333854087956e-05, "loss": 2.3085, "mean_token_accuracy": 0.42758620381355283, "step": 82750 }, { "epoch": 0.08335171492716356, "grad_norm": 8.865422849601968, "learning_rate": 4.9863297300895626e-05, "loss": 2.5315, "mean_token_accuracy": 0.39999998807907106, "step": 82755 }, { "epoch": 0.08335675098026774, "grad_norm": 10.843287885844795, "learning_rate": 4.986325605470916e-05, "loss": 2.2121, "mean_token_accuracy": 0.482758617401123, "step": 82760 }, { "epoch": 0.08336178703337191, "grad_norm": 9.85226027274634, "learning_rate": 4.986321480232015e-05, "loss": 2.4583, "mean_token_accuracy": 0.41034482717514037, "step": 82765 }, { "epoch": 0.08336682308647608, "grad_norm": 11.139862959308285, "learning_rate": 4.986317354372862e-05, "loss": 2.6922, "mean_token_accuracy": 0.3482758641242981, "step": 82770 }, { "epoch": 0.08337185913958026, "grad_norm": 11.661981576904845, "learning_rate": 4.9863132278934585e-05, "loss": 2.5937, "mean_token_accuracy": 0.38965516686439516, "step": 82775 }, { "epoch": 0.08337689519268443, "grad_norm": 10.575097645057724, "learning_rate": 4.986309100793804e-05, "loss": 2.3208, "mean_token_accuracy": 0.3931034505367279, "step": 82780 }, { "epoch": 0.0833819312457886, "grad_norm": 9.377124098019275, "learning_rate": 4.986304973073902e-05, "loss": 2.3938, "mean_token_accuracy": 0.41379310488700866, "step": 82785 }, { "epoch": 0.08338696729889278, "grad_norm": 9.59378778467062, "learning_rate": 4.986300844733752e-05, "loss": 2.1794, "mean_token_accuracy": 0.47586206197738645, "step": 82790 }, { "epoch": 0.08339200335199695, "grad_norm": 11.162354912547405, "learning_rate": 4.9862967157733564e-05, "loss": 2.7136, "mean_token_accuracy": 0.3758620649576187, "step": 82795 }, { "epoch": 0.08339703940510113, "grad_norm": 9.670769318582886, "learning_rate": 4.986292586192714e-05, "loss": 2.2376, "mean_token_accuracy": 0.45396249890327456, "step": 82800 }, { "epoch": 0.08340207545820529, "grad_norm": 12.352703104482522, "learning_rate": 4.9862884559918285e-05, "loss": 2.5833, "mean_token_accuracy": 0.4278325140476227, "step": 82805 }, { "epoch": 0.08340711151130946, "grad_norm": 11.10472151922161, "learning_rate": 4.986284325170701e-05, "loss": 2.3118, "mean_token_accuracy": 0.42068964838981626, "step": 82810 }, { "epoch": 0.08341214756441363, "grad_norm": 14.819942600467058, "learning_rate": 4.98628019372933e-05, "loss": 2.9864, "mean_token_accuracy": 0.33448275923728943, "step": 82815 }, { "epoch": 0.0834171836175178, "grad_norm": 11.216083681449321, "learning_rate": 4.98627606166772e-05, "loss": 2.6831, "mean_token_accuracy": 0.4034482717514038, "step": 82820 }, { "epoch": 0.08342221967062198, "grad_norm": 8.698610313581378, "learning_rate": 4.98627192898587e-05, "loss": 2.02, "mean_token_accuracy": 0.46896552443504336, "step": 82825 }, { "epoch": 0.08342725572372615, "grad_norm": 10.275269383389702, "learning_rate": 4.98626779568378e-05, "loss": 2.4617, "mean_token_accuracy": 0.45517240166664125, "step": 82830 }, { "epoch": 0.08343229177683033, "grad_norm": 13.307403436034104, "learning_rate": 4.9862636617614546e-05, "loss": 2.4454, "mean_token_accuracy": 0.38620689511299133, "step": 82835 }, { "epoch": 0.0834373278299345, "grad_norm": 11.33589318710027, "learning_rate": 4.9862595272188914e-05, "loss": 2.0192, "mean_token_accuracy": 0.4793103516101837, "step": 82840 }, { "epoch": 0.08344236388303868, "grad_norm": 12.468456769436129, "learning_rate": 4.986255392056095e-05, "loss": 2.389, "mean_token_accuracy": 0.4517241418361664, "step": 82845 }, { "epoch": 0.08344739993614285, "grad_norm": 9.401665585925628, "learning_rate": 4.986251256273064e-05, "loss": 2.2423, "mean_token_accuracy": 0.47586206197738645, "step": 82850 }, { "epoch": 0.08345243598924702, "grad_norm": 10.67840874556472, "learning_rate": 4.9862471198698005e-05, "loss": 2.4844, "mean_token_accuracy": 0.41724138259887694, "step": 82855 }, { "epoch": 0.0834574720423512, "grad_norm": 11.146497111660969, "learning_rate": 4.9862429828463056e-05, "loss": 2.7394, "mean_token_accuracy": 0.41724138259887694, "step": 82860 }, { "epoch": 0.08346250809545537, "grad_norm": 11.339253852790847, "learning_rate": 4.986238845202581e-05, "loss": 2.1454, "mean_token_accuracy": 0.4586206912994385, "step": 82865 }, { "epoch": 0.08346754414855954, "grad_norm": 10.638422928250272, "learning_rate": 4.986234706938626e-05, "loss": 2.1987, "mean_token_accuracy": 0.4517241418361664, "step": 82870 }, { "epoch": 0.0834725802016637, "grad_norm": 11.373663882252632, "learning_rate": 4.986230568054444e-05, "loss": 2.8942, "mean_token_accuracy": 0.39310343861579894, "step": 82875 }, { "epoch": 0.08347761625476788, "grad_norm": 14.98459242917824, "learning_rate": 4.986226428550034e-05, "loss": 2.6782, "mean_token_accuracy": 0.3793103456497192, "step": 82880 }, { "epoch": 0.08348265230787205, "grad_norm": 11.847385312192364, "learning_rate": 4.986222288425399e-05, "loss": 2.2851, "mean_token_accuracy": 0.42758620381355283, "step": 82885 }, { "epoch": 0.08348768836097623, "grad_norm": 9.533590532619968, "learning_rate": 4.9862181476805396e-05, "loss": 2.3982, "mean_token_accuracy": 0.441379314661026, "step": 82890 }, { "epoch": 0.0834927244140804, "grad_norm": 14.374307413990477, "learning_rate": 4.986214006315456e-05, "loss": 2.8215, "mean_token_accuracy": 0.36551724672317504, "step": 82895 }, { "epoch": 0.08349776046718457, "grad_norm": 8.214865386659008, "learning_rate": 4.986209864330151e-05, "loss": 1.9521, "mean_token_accuracy": 0.482758617401123, "step": 82900 }, { "epoch": 0.08350279652028875, "grad_norm": 13.981274575220105, "learning_rate": 4.986205721724625e-05, "loss": 2.9051, "mean_token_accuracy": 0.34137930870056155, "step": 82905 }, { "epoch": 0.08350783257339292, "grad_norm": 13.087368983137498, "learning_rate": 4.986201578498878e-05, "loss": 2.5624, "mean_token_accuracy": 0.4, "step": 82910 }, { "epoch": 0.0835128686264971, "grad_norm": 12.38059333661824, "learning_rate": 4.986197434652912e-05, "loss": 2.4325, "mean_token_accuracy": 0.42758620381355283, "step": 82915 }, { "epoch": 0.08351790467960127, "grad_norm": 10.42108401114212, "learning_rate": 4.986193290186729e-05, "loss": 2.3919, "mean_token_accuracy": 0.4620689690113068, "step": 82920 }, { "epoch": 0.08352294073270544, "grad_norm": 10.809411618508351, "learning_rate": 4.9861891451003296e-05, "loss": 2.1537, "mean_token_accuracy": 0.4275861978530884, "step": 82925 }, { "epoch": 0.08352797678580962, "grad_norm": 14.837942985273731, "learning_rate": 4.986184999393715e-05, "loss": 2.761, "mean_token_accuracy": 0.42413793206214906, "step": 82930 }, { "epoch": 0.08353301283891379, "grad_norm": 11.506331624579325, "learning_rate": 4.986180853066885e-05, "loss": 2.3066, "mean_token_accuracy": 0.41379310488700866, "step": 82935 }, { "epoch": 0.08353804889201796, "grad_norm": 11.318133632255877, "learning_rate": 4.986176706119843e-05, "loss": 2.2004, "mean_token_accuracy": 0.458620685338974, "step": 82940 }, { "epoch": 0.08354308494512212, "grad_norm": 14.286271040364733, "learning_rate": 4.986172558552588e-05, "loss": 2.5893, "mean_token_accuracy": 0.4448275864124298, "step": 82945 }, { "epoch": 0.0835481209982263, "grad_norm": 10.307457282477662, "learning_rate": 4.986168410365123e-05, "loss": 2.4582, "mean_token_accuracy": 0.41724138855934145, "step": 82950 }, { "epoch": 0.08355315705133047, "grad_norm": 11.865857535641164, "learning_rate": 4.9861642615574485e-05, "loss": 2.4798, "mean_token_accuracy": 0.4137930989265442, "step": 82955 }, { "epoch": 0.08355819310443464, "grad_norm": 10.689273741160822, "learning_rate": 4.986160112129565e-05, "loss": 2.3675, "mean_token_accuracy": 0.4068965524435043, "step": 82960 }, { "epoch": 0.08356322915753882, "grad_norm": 7.188308982097855, "learning_rate": 4.9861559620814745e-05, "loss": 2.0852, "mean_token_accuracy": 0.5160098552703858, "step": 82965 }, { "epoch": 0.08356826521064299, "grad_norm": 11.229698860769501, "learning_rate": 4.9861518114131773e-05, "loss": 3.0638, "mean_token_accuracy": 0.3517241358757019, "step": 82970 }, { "epoch": 0.08357330126374717, "grad_norm": 11.158785103427302, "learning_rate": 4.986147660124676e-05, "loss": 2.4454, "mean_token_accuracy": 0.42758620381355283, "step": 82975 }, { "epoch": 0.08357833731685134, "grad_norm": 9.751250228631148, "learning_rate": 4.98614350821597e-05, "loss": 2.3353, "mean_token_accuracy": 0.46896551847457885, "step": 82980 }, { "epoch": 0.08358337336995551, "grad_norm": 11.965337134827106, "learning_rate": 4.9861393556870614e-05, "loss": 2.7998, "mean_token_accuracy": 0.32068965435028074, "step": 82985 }, { "epoch": 0.08358840942305969, "grad_norm": 12.468862950010557, "learning_rate": 4.986135202537951e-05, "loss": 2.8967, "mean_token_accuracy": 0.4034482717514038, "step": 82990 }, { "epoch": 0.08359344547616386, "grad_norm": 11.889973584537154, "learning_rate": 4.98613104876864e-05, "loss": 2.514, "mean_token_accuracy": 0.41524500846862794, "step": 82995 }, { "epoch": 0.08359848152926803, "grad_norm": 10.39654880073071, "learning_rate": 4.9861268943791305e-05, "loss": 2.1639, "mean_token_accuracy": 0.46079854369163514, "step": 83000 }, { "epoch": 0.08360351758237221, "grad_norm": 10.37141449284123, "learning_rate": 4.986122739369423e-05, "loss": 2.8829, "mean_token_accuracy": 0.4034482777118683, "step": 83005 }, { "epoch": 0.08360855363547638, "grad_norm": 10.105373484060541, "learning_rate": 4.986118583739518e-05, "loss": 2.5437, "mean_token_accuracy": 0.4396249294281006, "step": 83010 }, { "epoch": 0.08361358968858054, "grad_norm": 13.239378237296776, "learning_rate": 4.986114427489416e-05, "loss": 2.6328, "mean_token_accuracy": 0.39310344457626345, "step": 83015 }, { "epoch": 0.08361862574168472, "grad_norm": 10.676070233725856, "learning_rate": 4.986110270619122e-05, "loss": 2.241, "mean_token_accuracy": 0.42758620381355283, "step": 83020 }, { "epoch": 0.08362366179478889, "grad_norm": 11.342473487449631, "learning_rate": 4.986106113128632e-05, "loss": 2.3042, "mean_token_accuracy": 0.4517241358757019, "step": 83025 }, { "epoch": 0.08362869784789306, "grad_norm": 11.464720293349988, "learning_rate": 4.9861019550179505e-05, "loss": 2.7189, "mean_token_accuracy": 0.4034482717514038, "step": 83030 }, { "epoch": 0.08363373390099724, "grad_norm": 11.219807923161536, "learning_rate": 4.986097796287078e-05, "loss": 2.263, "mean_token_accuracy": 0.4517241299152374, "step": 83035 }, { "epoch": 0.08363876995410141, "grad_norm": 11.582436365012168, "learning_rate": 4.9860936369360156e-05, "loss": 2.2919, "mean_token_accuracy": 0.42758620381355283, "step": 83040 }, { "epoch": 0.08364380600720558, "grad_norm": 26.07383653524439, "learning_rate": 4.986089476964764e-05, "loss": 2.9936, "mean_token_accuracy": 0.39655171930789945, "step": 83045 }, { "epoch": 0.08364884206030976, "grad_norm": 14.741572202758435, "learning_rate": 4.9860853163733244e-05, "loss": 2.572, "mean_token_accuracy": 0.40689656138420105, "step": 83050 }, { "epoch": 0.08365387811341393, "grad_norm": 13.26011684732726, "learning_rate": 4.986081155161698e-05, "loss": 2.538, "mean_token_accuracy": 0.3862068891525269, "step": 83055 }, { "epoch": 0.0836589141665181, "grad_norm": 10.208249407708344, "learning_rate": 4.986076993329887e-05, "loss": 3.1153, "mean_token_accuracy": 0.36896551549434664, "step": 83060 }, { "epoch": 0.08366395021962228, "grad_norm": 10.538724381325476, "learning_rate": 4.986072830877891e-05, "loss": 2.3663, "mean_token_accuracy": 0.4448275864124298, "step": 83065 }, { "epoch": 0.08366898627272645, "grad_norm": 11.11193206991651, "learning_rate": 4.986068667805712e-05, "loss": 2.6124, "mean_token_accuracy": 0.3862068891525269, "step": 83070 }, { "epoch": 0.08367402232583063, "grad_norm": 10.677506839797585, "learning_rate": 4.986064504113351e-05, "loss": 2.5111, "mean_token_accuracy": 0.41724138557910917, "step": 83075 }, { "epoch": 0.0836790583789348, "grad_norm": 10.256579178278194, "learning_rate": 4.98606033980081e-05, "loss": 2.4099, "mean_token_accuracy": 0.43284936547279357, "step": 83080 }, { "epoch": 0.08368409443203896, "grad_norm": 11.957307785556674, "learning_rate": 4.9860561748680884e-05, "loss": 2.6292, "mean_token_accuracy": 0.3827586144208908, "step": 83085 }, { "epoch": 0.08368913048514313, "grad_norm": 14.171708756823811, "learning_rate": 4.986052009315189e-05, "loss": 2.748, "mean_token_accuracy": 0.4137930929660797, "step": 83090 }, { "epoch": 0.08369416653824731, "grad_norm": 9.34690622564794, "learning_rate": 4.9860478431421115e-05, "loss": 2.6633, "mean_token_accuracy": 0.38965517580509185, "step": 83095 }, { "epoch": 0.08369920259135148, "grad_norm": 11.211966450382377, "learning_rate": 4.986043676348858e-05, "loss": 2.6758, "mean_token_accuracy": 0.37586207389831544, "step": 83100 }, { "epoch": 0.08370423864445566, "grad_norm": 11.322832974585808, "learning_rate": 4.986039508935429e-05, "loss": 2.7295, "mean_token_accuracy": 0.4110837459564209, "step": 83105 }, { "epoch": 0.08370927469755983, "grad_norm": 9.845022844999637, "learning_rate": 4.9860353409018264e-05, "loss": 2.4116, "mean_token_accuracy": 0.4068965524435043, "step": 83110 }, { "epoch": 0.083714310750664, "grad_norm": 9.593409045018639, "learning_rate": 4.986031172248052e-05, "loss": 2.5494, "mean_token_accuracy": 0.46031458377838136, "step": 83115 }, { "epoch": 0.08371934680376818, "grad_norm": 11.460032509523739, "learning_rate": 4.9860270029741054e-05, "loss": 2.229, "mean_token_accuracy": 0.4310344815254211, "step": 83120 }, { "epoch": 0.08372438285687235, "grad_norm": 8.595612385810396, "learning_rate": 4.986022833079987e-05, "loss": 2.1374, "mean_token_accuracy": 0.4460375070571899, "step": 83125 }, { "epoch": 0.08372941890997652, "grad_norm": 10.51958155458344, "learning_rate": 4.986018662565701e-05, "loss": 2.9014, "mean_token_accuracy": 0.41034482717514037, "step": 83130 }, { "epoch": 0.0837344549630807, "grad_norm": 10.963323145351263, "learning_rate": 4.9860144914312463e-05, "loss": 2.8365, "mean_token_accuracy": 0.3896551728248596, "step": 83135 }, { "epoch": 0.08373949101618487, "grad_norm": 12.475843505631406, "learning_rate": 4.986010319676625e-05, "loss": 2.4888, "mean_token_accuracy": 0.4034482717514038, "step": 83140 }, { "epoch": 0.08374452706928905, "grad_norm": 10.852976164484186, "learning_rate": 4.986006147301838e-05, "loss": 2.7658, "mean_token_accuracy": 0.38620689511299133, "step": 83145 }, { "epoch": 0.08374956312239322, "grad_norm": 12.625646125963442, "learning_rate": 4.986001974306886e-05, "loss": 2.5164, "mean_token_accuracy": 0.41724138557910917, "step": 83150 }, { "epoch": 0.08375459917549738, "grad_norm": 9.073883619489958, "learning_rate": 4.9859978006917705e-05, "loss": 2.6762, "mean_token_accuracy": 0.4103448331356049, "step": 83155 }, { "epoch": 0.08375963522860155, "grad_norm": 9.87505331301339, "learning_rate": 4.9859936264564926e-05, "loss": 2.7848, "mean_token_accuracy": 0.3482758581638336, "step": 83160 }, { "epoch": 0.08376467128170573, "grad_norm": 8.741733627967973, "learning_rate": 4.985989451601054e-05, "loss": 1.9837, "mean_token_accuracy": 0.544367390871048, "step": 83165 }, { "epoch": 0.0837697073348099, "grad_norm": 10.613866469402993, "learning_rate": 4.9859852761254546e-05, "loss": 2.6031, "mean_token_accuracy": 0.3482758581638336, "step": 83170 }, { "epoch": 0.08377474338791407, "grad_norm": 11.341855242737278, "learning_rate": 4.9859811000296966e-05, "loss": 2.7525, "mean_token_accuracy": 0.3689655244350433, "step": 83175 }, { "epoch": 0.08377977944101825, "grad_norm": 9.587401421148714, "learning_rate": 4.985976923313782e-05, "loss": 2.5865, "mean_token_accuracy": 0.4068965554237366, "step": 83180 }, { "epoch": 0.08378481549412242, "grad_norm": 16.58057132708195, "learning_rate": 4.9859727459777094e-05, "loss": 2.4006, "mean_token_accuracy": 0.4482758641242981, "step": 83185 }, { "epoch": 0.0837898515472266, "grad_norm": 13.07455993981948, "learning_rate": 4.9859685680214815e-05, "loss": 2.6401, "mean_token_accuracy": 0.3827586263418198, "step": 83190 }, { "epoch": 0.08379488760033077, "grad_norm": 12.093203659277888, "learning_rate": 4.9859643894451e-05, "loss": 2.6058, "mean_token_accuracy": 0.415365993976593, "step": 83195 }, { "epoch": 0.08379992365343494, "grad_norm": 12.405668138378134, "learning_rate": 4.985960210248565e-05, "loss": 2.5005, "mean_token_accuracy": 0.42413793206214906, "step": 83200 }, { "epoch": 0.08380495970653912, "grad_norm": 11.061046364004444, "learning_rate": 4.985956030431879e-05, "loss": 2.2196, "mean_token_accuracy": 0.41379310190677643, "step": 83205 }, { "epoch": 0.08380999575964329, "grad_norm": 15.416844184669113, "learning_rate": 4.985951849995041e-05, "loss": 2.4872, "mean_token_accuracy": 0.4000000059604645, "step": 83210 }, { "epoch": 0.08381503181274746, "grad_norm": 11.737768805503919, "learning_rate": 4.985947668938054e-05, "loss": 2.5916, "mean_token_accuracy": 0.3896551728248596, "step": 83215 }, { "epoch": 0.08382006786585164, "grad_norm": 11.40953277103002, "learning_rate": 4.985943487260919e-05, "loss": 3.1841, "mean_token_accuracy": 0.38620689511299133, "step": 83220 }, { "epoch": 0.0838251039189558, "grad_norm": 11.680256934264111, "learning_rate": 4.985939304963636e-05, "loss": 2.4775, "mean_token_accuracy": 0.3655172407627106, "step": 83225 }, { "epoch": 0.08383013997205997, "grad_norm": 9.689944282517445, "learning_rate": 4.985935122046207e-05, "loss": 2.381, "mean_token_accuracy": 0.4448275864124298, "step": 83230 }, { "epoch": 0.08383517602516415, "grad_norm": 8.095969975617331, "learning_rate": 4.985930938508634e-05, "loss": 2.1982, "mean_token_accuracy": 0.47241379618644713, "step": 83235 }, { "epoch": 0.08384021207826832, "grad_norm": 10.029520564491996, "learning_rate": 4.9859267543509155e-05, "loss": 2.1695, "mean_token_accuracy": 0.4482758641242981, "step": 83240 }, { "epoch": 0.0838452481313725, "grad_norm": 11.99928970860891, "learning_rate": 4.9859225695730556e-05, "loss": 2.4908, "mean_token_accuracy": 0.4206896543502808, "step": 83245 }, { "epoch": 0.08385028418447667, "grad_norm": 10.5699337411793, "learning_rate": 4.985918384175054e-05, "loss": 2.4345, "mean_token_accuracy": 0.45862067937850953, "step": 83250 }, { "epoch": 0.08385532023758084, "grad_norm": 11.697460338333602, "learning_rate": 4.985914198156912e-05, "loss": 2.0608, "mean_token_accuracy": 0.43448275327682495, "step": 83255 }, { "epoch": 0.08386035629068501, "grad_norm": 9.588130151317326, "learning_rate": 4.985910011518631e-05, "loss": 2.308, "mean_token_accuracy": 0.4172413766384125, "step": 83260 }, { "epoch": 0.08386539234378919, "grad_norm": 9.66939931499166, "learning_rate": 4.985905824260212e-05, "loss": 2.3924, "mean_token_accuracy": 0.42758620977401735, "step": 83265 }, { "epoch": 0.08387042839689336, "grad_norm": 13.407319088290151, "learning_rate": 4.985901636381656e-05, "loss": 3.0651, "mean_token_accuracy": 0.38275861740112305, "step": 83270 }, { "epoch": 0.08387546444999754, "grad_norm": 11.570050876747313, "learning_rate": 4.985897447882964e-05, "loss": 2.674, "mean_token_accuracy": 0.3482758581638336, "step": 83275 }, { "epoch": 0.08388050050310171, "grad_norm": 12.402931613388107, "learning_rate": 4.9858932587641385e-05, "loss": 2.3106, "mean_token_accuracy": 0.44137930274009707, "step": 83280 }, { "epoch": 0.08388553655620588, "grad_norm": 12.485987034234936, "learning_rate": 4.9858890690251794e-05, "loss": 2.3368, "mean_token_accuracy": 0.41379310488700866, "step": 83285 }, { "epoch": 0.08389057260931006, "grad_norm": 10.333118726636856, "learning_rate": 4.985884878666088e-05, "loss": 2.518, "mean_token_accuracy": 0.4310344815254211, "step": 83290 }, { "epoch": 0.08389560866241422, "grad_norm": 9.538062171671852, "learning_rate": 4.9858806876868654e-05, "loss": 2.4016, "mean_token_accuracy": 0.44137930274009707, "step": 83295 }, { "epoch": 0.08390064471551839, "grad_norm": 10.074059507341508, "learning_rate": 4.985876496087512e-05, "loss": 2.3968, "mean_token_accuracy": 0.42068966031074523, "step": 83300 }, { "epoch": 0.08390568076862256, "grad_norm": 11.876317798531739, "learning_rate": 4.985872303868032e-05, "loss": 2.397, "mean_token_accuracy": 0.4344827711582184, "step": 83305 }, { "epoch": 0.08391071682172674, "grad_norm": 10.28582294796457, "learning_rate": 4.985868111028423e-05, "loss": 2.4197, "mean_token_accuracy": 0.42413793206214906, "step": 83310 }, { "epoch": 0.08391575287483091, "grad_norm": 12.01814970649126, "learning_rate": 4.9858639175686875e-05, "loss": 2.4169, "mean_token_accuracy": 0.42413793206214906, "step": 83315 }, { "epoch": 0.08392078892793509, "grad_norm": 11.01635476178143, "learning_rate": 4.9858597234888276e-05, "loss": 2.071, "mean_token_accuracy": 0.4675741076469421, "step": 83320 }, { "epoch": 0.08392582498103926, "grad_norm": 10.758304955650187, "learning_rate": 4.9858555287888434e-05, "loss": 2.5264, "mean_token_accuracy": 0.42068966031074523, "step": 83325 }, { "epoch": 0.08393086103414343, "grad_norm": 11.71823484031071, "learning_rate": 4.985851333468737e-05, "loss": 2.8669, "mean_token_accuracy": 0.3310344755649567, "step": 83330 }, { "epoch": 0.08393589708724761, "grad_norm": 10.834454257246085, "learning_rate": 4.9858471375285074e-05, "loss": 2.3563, "mean_token_accuracy": 0.3965517282485962, "step": 83335 }, { "epoch": 0.08394093314035178, "grad_norm": 10.126775380980824, "learning_rate": 4.985842940968158e-05, "loss": 2.236, "mean_token_accuracy": 0.40852994918823243, "step": 83340 }, { "epoch": 0.08394596919345595, "grad_norm": 10.8159106229767, "learning_rate": 4.9858387437876895e-05, "loss": 2.3857, "mean_token_accuracy": 0.4413793087005615, "step": 83345 }, { "epoch": 0.08395100524656013, "grad_norm": 11.342946971137298, "learning_rate": 4.985834545987103e-05, "loss": 2.8594, "mean_token_accuracy": 0.3241379290819168, "step": 83350 }, { "epoch": 0.0839560412996643, "grad_norm": 15.755150936394266, "learning_rate": 4.9858303475663994e-05, "loss": 2.497, "mean_token_accuracy": 0.417241370677948, "step": 83355 }, { "epoch": 0.08396107735276848, "grad_norm": 11.182494798203983, "learning_rate": 4.9858261485255794e-05, "loss": 2.99, "mean_token_accuracy": 0.3758620619773865, "step": 83360 }, { "epoch": 0.08396611340587264, "grad_norm": 13.5176061440183, "learning_rate": 4.985821948864644e-05, "loss": 2.3469, "mean_token_accuracy": 0.4675136148929596, "step": 83365 }, { "epoch": 0.08397114945897681, "grad_norm": 9.869952761977242, "learning_rate": 4.985817748583596e-05, "loss": 2.248, "mean_token_accuracy": 0.43448275327682495, "step": 83370 }, { "epoch": 0.08397618551208098, "grad_norm": 10.083993494759188, "learning_rate": 4.9858135476824355e-05, "loss": 2.3983, "mean_token_accuracy": 0.4586206912994385, "step": 83375 }, { "epoch": 0.08398122156518516, "grad_norm": 12.032422385467223, "learning_rate": 4.985809346161164e-05, "loss": 2.4095, "mean_token_accuracy": 0.4448275983333588, "step": 83380 }, { "epoch": 0.08398625761828933, "grad_norm": 13.460077153896455, "learning_rate": 4.985805144019782e-05, "loss": 2.4692, "mean_token_accuracy": 0.47241379618644713, "step": 83385 }, { "epoch": 0.0839912936713935, "grad_norm": 10.482575455635631, "learning_rate": 4.985800941258291e-05, "loss": 2.3637, "mean_token_accuracy": 0.4189957737922668, "step": 83390 }, { "epoch": 0.08399632972449768, "grad_norm": 10.403140586478827, "learning_rate": 4.985796737876693e-05, "loss": 2.3694, "mean_token_accuracy": 0.4551724076271057, "step": 83395 }, { "epoch": 0.08400136577760185, "grad_norm": 18.558199476790985, "learning_rate": 4.985792533874988e-05, "loss": 2.5721, "mean_token_accuracy": 0.4103448331356049, "step": 83400 }, { "epoch": 0.08400640183070603, "grad_norm": 10.78607015806668, "learning_rate": 4.985788329253177e-05, "loss": 2.533, "mean_token_accuracy": 0.43793103098869324, "step": 83405 }, { "epoch": 0.0840114378838102, "grad_norm": 10.016962450835825, "learning_rate": 4.9857841240112625e-05, "loss": 2.4461, "mean_token_accuracy": 0.42758620977401735, "step": 83410 }, { "epoch": 0.08401647393691437, "grad_norm": 10.761255996201754, "learning_rate": 4.985779918149245e-05, "loss": 2.5842, "mean_token_accuracy": 0.3551724225282669, "step": 83415 }, { "epoch": 0.08402150999001855, "grad_norm": 12.575771029547626, "learning_rate": 4.985775711667125e-05, "loss": 2.4588, "mean_token_accuracy": 0.4344827592372894, "step": 83420 }, { "epoch": 0.08402654604312272, "grad_norm": 10.217571490778349, "learning_rate": 4.985771504564905e-05, "loss": 2.5513, "mean_token_accuracy": 0.3931034505367279, "step": 83425 }, { "epoch": 0.0840315820962269, "grad_norm": 9.37783715546154, "learning_rate": 4.985767296842585e-05, "loss": 2.207, "mean_token_accuracy": 0.47931034564971925, "step": 83430 }, { "epoch": 0.08403661814933105, "grad_norm": 12.324104982332953, "learning_rate": 4.9857630885001664e-05, "loss": 2.5952, "mean_token_accuracy": 0.39038112163543703, "step": 83435 }, { "epoch": 0.08404165420243523, "grad_norm": 9.411516401782722, "learning_rate": 4.985758879537651e-05, "loss": 2.3721, "mean_token_accuracy": 0.4344827592372894, "step": 83440 }, { "epoch": 0.0840466902555394, "grad_norm": 14.129667545512621, "learning_rate": 4.98575466995504e-05, "loss": 2.9329, "mean_token_accuracy": 0.3689655244350433, "step": 83445 }, { "epoch": 0.08405172630864358, "grad_norm": 8.680730247975015, "learning_rate": 4.985750459752333e-05, "loss": 2.1472, "mean_token_accuracy": 0.47241378426551817, "step": 83450 }, { "epoch": 0.08405676236174775, "grad_norm": 12.679439612347666, "learning_rate": 4.985746248929533e-05, "loss": 2.8274, "mean_token_accuracy": 0.37725347876548765, "step": 83455 }, { "epoch": 0.08406179841485192, "grad_norm": 9.79999333168821, "learning_rate": 4.98574203748664e-05, "loss": 2.407, "mean_token_accuracy": 0.44137930274009707, "step": 83460 }, { "epoch": 0.0840668344679561, "grad_norm": 9.100487384750533, "learning_rate": 4.985737825423656e-05, "loss": 2.1897, "mean_token_accuracy": 0.4517241358757019, "step": 83465 }, { "epoch": 0.08407187052106027, "grad_norm": 12.288822990260268, "learning_rate": 4.985733612740581e-05, "loss": 2.778, "mean_token_accuracy": 0.382758629322052, "step": 83470 }, { "epoch": 0.08407690657416445, "grad_norm": 9.627459132036227, "learning_rate": 4.9857293994374175e-05, "loss": 2.0448, "mean_token_accuracy": 0.4793103516101837, "step": 83475 }, { "epoch": 0.08408194262726862, "grad_norm": 15.442524535760647, "learning_rate": 4.985725185514166e-05, "loss": 2.8338, "mean_token_accuracy": 0.38275861740112305, "step": 83480 }, { "epoch": 0.08408697868037279, "grad_norm": 10.60837560632719, "learning_rate": 4.985720970970828e-05, "loss": 2.4749, "mean_token_accuracy": 0.39655172228813174, "step": 83485 }, { "epoch": 0.08409201473347697, "grad_norm": 10.268959755285113, "learning_rate": 4.9857167558074045e-05, "loss": 2.2077, "mean_token_accuracy": 0.4674531161785126, "step": 83490 }, { "epoch": 0.08409705078658114, "grad_norm": 8.579567045135438, "learning_rate": 4.985712540023896e-05, "loss": 2.6732, "mean_token_accuracy": 0.3896551728248596, "step": 83495 }, { "epoch": 0.08410208683968531, "grad_norm": 10.863909003507919, "learning_rate": 4.985708323620305e-05, "loss": 2.3116, "mean_token_accuracy": 0.4206896543502808, "step": 83500 }, { "epoch": 0.08410712289278947, "grad_norm": 20.447739638391393, "learning_rate": 4.9857041065966315e-05, "loss": 2.7044, "mean_token_accuracy": 0.4431941986083984, "step": 83505 }, { "epoch": 0.08411215894589365, "grad_norm": 14.0968070478773, "learning_rate": 4.9856998889528775e-05, "loss": 2.5483, "mean_token_accuracy": 0.4034482717514038, "step": 83510 }, { "epoch": 0.08411719499899782, "grad_norm": 10.669455041495757, "learning_rate": 4.985695670689044e-05, "loss": 2.3945, "mean_token_accuracy": 0.4448275864124298, "step": 83515 }, { "epoch": 0.084122231052102, "grad_norm": 10.679598527568388, "learning_rate": 4.985691451805131e-05, "loss": 2.2929, "mean_token_accuracy": 0.4344827592372894, "step": 83520 }, { "epoch": 0.08412726710520617, "grad_norm": 12.933184483045327, "learning_rate": 4.985687232301141e-05, "loss": 2.5172, "mean_token_accuracy": 0.4413793087005615, "step": 83525 }, { "epoch": 0.08413230315831034, "grad_norm": 11.56637720851547, "learning_rate": 4.985683012177075e-05, "loss": 2.7266, "mean_token_accuracy": 0.324137932062149, "step": 83530 }, { "epoch": 0.08413733921141452, "grad_norm": 9.070347912354439, "learning_rate": 4.985678791432935e-05, "loss": 2.4695, "mean_token_accuracy": 0.42068966031074523, "step": 83535 }, { "epoch": 0.08414237526451869, "grad_norm": 11.508919523569414, "learning_rate": 4.98567457006872e-05, "loss": 2.2462, "mean_token_accuracy": 0.42413793206214906, "step": 83540 }, { "epoch": 0.08414741131762286, "grad_norm": 10.757802391824914, "learning_rate": 4.9856703480844315e-05, "loss": 2.362, "mean_token_accuracy": 0.41724138259887694, "step": 83545 }, { "epoch": 0.08415244737072704, "grad_norm": 12.755158349735522, "learning_rate": 4.985666125480073e-05, "loss": 2.4378, "mean_token_accuracy": 0.42758620977401735, "step": 83550 }, { "epoch": 0.08415748342383121, "grad_norm": 12.063317805286587, "learning_rate": 4.985661902255643e-05, "loss": 2.1539, "mean_token_accuracy": 0.44827587008476255, "step": 83555 }, { "epoch": 0.08416251947693539, "grad_norm": 20.29228412387696, "learning_rate": 4.985657678411145e-05, "loss": 2.6971, "mean_token_accuracy": 0.41724138259887694, "step": 83560 }, { "epoch": 0.08416755553003956, "grad_norm": 10.257929939489882, "learning_rate": 4.985653453946578e-05, "loss": 2.1913, "mean_token_accuracy": 0.42758620977401735, "step": 83565 }, { "epoch": 0.08417259158314373, "grad_norm": 14.383276524396509, "learning_rate": 4.985649228861945e-05, "loss": 2.948, "mean_token_accuracy": 0.3827586233615875, "step": 83570 }, { "epoch": 0.08417762763624789, "grad_norm": 10.30648390708572, "learning_rate": 4.985645003157246e-05, "loss": 2.4667, "mean_token_accuracy": 0.4482758641242981, "step": 83575 }, { "epoch": 0.08418266368935207, "grad_norm": 10.249379067291272, "learning_rate": 4.985640776832483e-05, "loss": 2.4525, "mean_token_accuracy": 0.41034482717514037, "step": 83580 }, { "epoch": 0.08418769974245624, "grad_norm": 10.855730506101349, "learning_rate": 4.985636549887655e-05, "loss": 2.6559, "mean_token_accuracy": 0.3793103456497192, "step": 83585 }, { "epoch": 0.08419273579556041, "grad_norm": 9.661826057351298, "learning_rate": 4.985632322322766e-05, "loss": 2.6538, "mean_token_accuracy": 0.41034482717514037, "step": 83590 }, { "epoch": 0.08419777184866459, "grad_norm": 12.333247295666878, "learning_rate": 4.985628094137817e-05, "loss": 2.9186, "mean_token_accuracy": 0.33448275923728943, "step": 83595 }, { "epoch": 0.08420280790176876, "grad_norm": 9.861304076755584, "learning_rate": 4.985623865332807e-05, "loss": 2.8294, "mean_token_accuracy": 0.3896551698446274, "step": 83600 }, { "epoch": 0.08420784395487294, "grad_norm": 8.767464209051358, "learning_rate": 4.9856196359077386e-05, "loss": 2.0935, "mean_token_accuracy": 0.4310344815254211, "step": 83605 }, { "epoch": 0.08421288000797711, "grad_norm": 9.222419076136434, "learning_rate": 4.985615405862613e-05, "loss": 2.2168, "mean_token_accuracy": 0.43103448748588563, "step": 83610 }, { "epoch": 0.08421791606108128, "grad_norm": 10.9804586818531, "learning_rate": 4.9856111751974304e-05, "loss": 2.5717, "mean_token_accuracy": 0.40689654350280763, "step": 83615 }, { "epoch": 0.08422295211418546, "grad_norm": 12.711654891537941, "learning_rate": 4.985606943912193e-05, "loss": 2.5657, "mean_token_accuracy": 0.3655172407627106, "step": 83620 }, { "epoch": 0.08422798816728963, "grad_norm": 9.780426719619351, "learning_rate": 4.985602712006902e-05, "loss": 2.3777, "mean_token_accuracy": 0.4413793087005615, "step": 83625 }, { "epoch": 0.0842330242203938, "grad_norm": 12.01340667319542, "learning_rate": 4.985598479481558e-05, "loss": 2.699, "mean_token_accuracy": 0.3551724076271057, "step": 83630 }, { "epoch": 0.08423806027349798, "grad_norm": 9.300339252866545, "learning_rate": 4.9855942463361635e-05, "loss": 2.0927, "mean_token_accuracy": 0.5051724135875701, "step": 83635 }, { "epoch": 0.08424309632660215, "grad_norm": 13.691430369832881, "learning_rate": 4.9855900125707175e-05, "loss": 3.0368, "mean_token_accuracy": 0.34482758343219755, "step": 83640 }, { "epoch": 0.08424813237970631, "grad_norm": 10.211380844463578, "learning_rate": 4.9855857781852224e-05, "loss": 2.0413, "mean_token_accuracy": 0.506896561384201, "step": 83645 }, { "epoch": 0.08425316843281049, "grad_norm": 11.607502455241056, "learning_rate": 4.9855815431796796e-05, "loss": 2.1468, "mean_token_accuracy": 0.4535995125770569, "step": 83650 }, { "epoch": 0.08425820448591466, "grad_norm": 9.650552742950898, "learning_rate": 4.9855773075540896e-05, "loss": 2.6727, "mean_token_accuracy": 0.38275861740112305, "step": 83655 }, { "epoch": 0.08426324053901883, "grad_norm": 11.50047321352506, "learning_rate": 4.985573071308454e-05, "loss": 2.2598, "mean_token_accuracy": 0.38965516686439516, "step": 83660 }, { "epoch": 0.084268276592123, "grad_norm": 10.018957259214215, "learning_rate": 4.985568834442773e-05, "loss": 2.0793, "mean_token_accuracy": 0.47586206793785096, "step": 83665 }, { "epoch": 0.08427331264522718, "grad_norm": 15.339343266534113, "learning_rate": 4.9855645969570494e-05, "loss": 2.7098, "mean_token_accuracy": 0.38427101969718935, "step": 83670 }, { "epoch": 0.08427834869833135, "grad_norm": 10.151174820132887, "learning_rate": 4.9855603588512846e-05, "loss": 2.23, "mean_token_accuracy": 0.4379310250282288, "step": 83675 }, { "epoch": 0.08428338475143553, "grad_norm": 11.190103851056818, "learning_rate": 4.985556120125477e-05, "loss": 2.7104, "mean_token_accuracy": 0.3551724016666412, "step": 83680 }, { "epoch": 0.0842884208045397, "grad_norm": 11.45299452698718, "learning_rate": 4.98555188077963e-05, "loss": 2.7769, "mean_token_accuracy": 0.3620689570903778, "step": 83685 }, { "epoch": 0.08429345685764388, "grad_norm": 8.633813503853581, "learning_rate": 4.985547640813745e-05, "loss": 2.1723, "mean_token_accuracy": 0.4551724135875702, "step": 83690 }, { "epoch": 0.08429849291074805, "grad_norm": 12.423418933991139, "learning_rate": 4.985543400227823e-05, "loss": 2.7155, "mean_token_accuracy": 0.3896551728248596, "step": 83695 }, { "epoch": 0.08430352896385222, "grad_norm": 14.56417705586622, "learning_rate": 4.985539159021863e-05, "loss": 3.1379, "mean_token_accuracy": 0.37241379022598264, "step": 83700 }, { "epoch": 0.0843085650169564, "grad_norm": 10.59480027066553, "learning_rate": 4.985534917195869e-05, "loss": 2.1449, "mean_token_accuracy": 0.4517241418361664, "step": 83705 }, { "epoch": 0.08431360107006057, "grad_norm": 11.772442408747924, "learning_rate": 4.985530674749842e-05, "loss": 2.5681, "mean_token_accuracy": 0.36551723480224607, "step": 83710 }, { "epoch": 0.08431863712316473, "grad_norm": 12.439573023320447, "learning_rate": 4.98552643168378e-05, "loss": 2.5881, "mean_token_accuracy": 0.42068965137004855, "step": 83715 }, { "epoch": 0.0843236731762689, "grad_norm": 16.36763947195971, "learning_rate": 4.9855221879976885e-05, "loss": 2.6941, "mean_token_accuracy": 0.3999999940395355, "step": 83720 }, { "epoch": 0.08432870922937308, "grad_norm": 13.275772855999945, "learning_rate": 4.985517943691565e-05, "loss": 2.3868, "mean_token_accuracy": 0.4429521977901459, "step": 83725 }, { "epoch": 0.08433374528247725, "grad_norm": 10.251653508773567, "learning_rate": 4.985513698765413e-05, "loss": 3.1793, "mean_token_accuracy": 0.2931034505367279, "step": 83730 }, { "epoch": 0.08433878133558143, "grad_norm": 10.395316168758846, "learning_rate": 4.9855094532192325e-05, "loss": 2.3831, "mean_token_accuracy": 0.482758617401123, "step": 83735 }, { "epoch": 0.0843438173886856, "grad_norm": 11.100283423538864, "learning_rate": 4.9855052070530264e-05, "loss": 2.5817, "mean_token_accuracy": 0.3896551728248596, "step": 83740 }, { "epoch": 0.08434885344178977, "grad_norm": 11.055251755741168, "learning_rate": 4.9855009602667934e-05, "loss": 2.4166, "mean_token_accuracy": 0.4241379201412201, "step": 83745 }, { "epoch": 0.08435388949489395, "grad_norm": 9.972814619484913, "learning_rate": 4.985496712860536e-05, "loss": 2.3152, "mean_token_accuracy": 0.4620689630508423, "step": 83750 }, { "epoch": 0.08435892554799812, "grad_norm": 16.024162181669592, "learning_rate": 4.985492464834255e-05, "loss": 2.589, "mean_token_accuracy": 0.4620689630508423, "step": 83755 }, { "epoch": 0.0843639616011023, "grad_norm": 11.163473070194046, "learning_rate": 4.9854882161879516e-05, "loss": 2.7607, "mean_token_accuracy": 0.3758620619773865, "step": 83760 }, { "epoch": 0.08436899765420647, "grad_norm": 9.880954849879211, "learning_rate": 4.985483966921628e-05, "loss": 2.6464, "mean_token_accuracy": 0.37241379022598264, "step": 83765 }, { "epoch": 0.08437403370731064, "grad_norm": 10.94050574802675, "learning_rate": 4.985479717035285e-05, "loss": 2.3329, "mean_token_accuracy": 0.4137930989265442, "step": 83770 }, { "epoch": 0.08437906976041482, "grad_norm": 12.557494976653757, "learning_rate": 4.9854754665289225e-05, "loss": 2.7767, "mean_token_accuracy": 0.3758620649576187, "step": 83775 }, { "epoch": 0.08438410581351899, "grad_norm": 12.66255309221867, "learning_rate": 4.985471215402542e-05, "loss": 2.8687, "mean_token_accuracy": 0.4034482717514038, "step": 83780 }, { "epoch": 0.08438914186662315, "grad_norm": 10.057855018967532, "learning_rate": 4.9854669636561466e-05, "loss": 2.409, "mean_token_accuracy": 0.42244404554367065, "step": 83785 }, { "epoch": 0.08439417791972732, "grad_norm": 12.208123004929682, "learning_rate": 4.985462711289736e-05, "loss": 2.3515, "mean_token_accuracy": 0.4344827592372894, "step": 83790 }, { "epoch": 0.0843992139728315, "grad_norm": 12.147974165143134, "learning_rate": 4.98545845830331e-05, "loss": 2.3825, "mean_token_accuracy": 0.4344827592372894, "step": 83795 }, { "epoch": 0.08440425002593567, "grad_norm": 11.813704677351128, "learning_rate": 4.9854542046968726e-05, "loss": 2.6515, "mean_token_accuracy": 0.37241379618644715, "step": 83800 }, { "epoch": 0.08440928607903984, "grad_norm": 10.602102671869622, "learning_rate": 4.985449950470423e-05, "loss": 2.6925, "mean_token_accuracy": 0.4, "step": 83805 }, { "epoch": 0.08441432213214402, "grad_norm": 11.398026330272291, "learning_rate": 4.985445695623963e-05, "loss": 2.4871, "mean_token_accuracy": 0.44482759237289426, "step": 83810 }, { "epoch": 0.08441935818524819, "grad_norm": 12.717923909937156, "learning_rate": 4.985441440157494e-05, "loss": 3.0629, "mean_token_accuracy": 0.4286945790052414, "step": 83815 }, { "epoch": 0.08442439423835237, "grad_norm": 12.245332608468338, "learning_rate": 4.9854371840710174e-05, "loss": 2.3668, "mean_token_accuracy": 0.4603750824928284, "step": 83820 }, { "epoch": 0.08442943029145654, "grad_norm": 12.599469001959527, "learning_rate": 4.985432927364533e-05, "loss": 2.2551, "mean_token_accuracy": 0.45517241954803467, "step": 83825 }, { "epoch": 0.08443446634456071, "grad_norm": 13.254328463078659, "learning_rate": 4.9854286700380444e-05, "loss": 2.6073, "mean_token_accuracy": 0.4068965494632721, "step": 83830 }, { "epoch": 0.08443950239766489, "grad_norm": 13.8261174746677, "learning_rate": 4.98542441209155e-05, "loss": 2.5303, "mean_token_accuracy": 0.4034482717514038, "step": 83835 }, { "epoch": 0.08444453845076906, "grad_norm": 8.34441362785068, "learning_rate": 4.9854201535250525e-05, "loss": 1.9624, "mean_token_accuracy": 0.5344827592372894, "step": 83840 }, { "epoch": 0.08444957450387323, "grad_norm": 17.064341031092617, "learning_rate": 4.985415894338553e-05, "loss": 2.8597, "mean_token_accuracy": 0.36206896901130675, "step": 83845 }, { "epoch": 0.08445461055697741, "grad_norm": 10.152699338881392, "learning_rate": 4.985411634532053e-05, "loss": 2.5955, "mean_token_accuracy": 0.4172413766384125, "step": 83850 }, { "epoch": 0.08445964661008157, "grad_norm": 12.417441380586595, "learning_rate": 4.9854073741055526e-05, "loss": 2.6186, "mean_token_accuracy": 0.4344827502965927, "step": 83855 }, { "epoch": 0.08446468266318574, "grad_norm": 13.26591848561438, "learning_rate": 4.985403113059054e-05, "loss": 2.6703, "mean_token_accuracy": 0.41724138557910917, "step": 83860 }, { "epoch": 0.08446971871628992, "grad_norm": 12.022348160886756, "learning_rate": 4.985398851392558e-05, "loss": 2.4453, "mean_token_accuracy": 0.38965517580509185, "step": 83865 }, { "epoch": 0.08447475476939409, "grad_norm": 10.89647471812784, "learning_rate": 4.985394589106066e-05, "loss": 2.7098, "mean_token_accuracy": 0.38275861740112305, "step": 83870 }, { "epoch": 0.08447979082249826, "grad_norm": 10.76333245874957, "learning_rate": 4.985390326199579e-05, "loss": 2.7949, "mean_token_accuracy": 0.37241379022598264, "step": 83875 }, { "epoch": 0.08448482687560244, "grad_norm": 10.398702424385043, "learning_rate": 4.985386062673098e-05, "loss": 2.3506, "mean_token_accuracy": 0.4482758641242981, "step": 83880 }, { "epoch": 0.08448986292870661, "grad_norm": 13.013179038133718, "learning_rate": 4.985381798526624e-05, "loss": 2.7344, "mean_token_accuracy": 0.38275861740112305, "step": 83885 }, { "epoch": 0.08449489898181078, "grad_norm": 11.727675745059933, "learning_rate": 4.985377533760158e-05, "loss": 2.4588, "mean_token_accuracy": 0.4103448152542114, "step": 83890 }, { "epoch": 0.08449993503491496, "grad_norm": 11.952265164870486, "learning_rate": 4.985373268373703e-05, "loss": 2.4226, "mean_token_accuracy": 0.44482758045196535, "step": 83895 }, { "epoch": 0.08450497108801913, "grad_norm": 11.32807850503098, "learning_rate": 4.985369002367258e-05, "loss": 2.4614, "mean_token_accuracy": 0.42758620381355283, "step": 83900 }, { "epoch": 0.0845100071411233, "grad_norm": 11.632994481702731, "learning_rate": 4.985364735740826e-05, "loss": 2.6008, "mean_token_accuracy": 0.4344827592372894, "step": 83905 }, { "epoch": 0.08451504319422748, "grad_norm": 11.551755796900668, "learning_rate": 4.985360468494407e-05, "loss": 2.5523, "mean_token_accuracy": 0.4241379380226135, "step": 83910 }, { "epoch": 0.08452007924733165, "grad_norm": 11.4252815344893, "learning_rate": 4.985356200628001e-05, "loss": 2.4624, "mean_token_accuracy": 0.3896551787853241, "step": 83915 }, { "epoch": 0.08452511530043583, "grad_norm": 12.283142337152427, "learning_rate": 4.985351932141612e-05, "loss": 2.346, "mean_token_accuracy": 0.42758620977401735, "step": 83920 }, { "epoch": 0.08453015135353999, "grad_norm": 11.443495033579033, "learning_rate": 4.98534766303524e-05, "loss": 2.6946, "mean_token_accuracy": 0.32758620381355286, "step": 83925 }, { "epoch": 0.08453518740664416, "grad_norm": 13.267505861650257, "learning_rate": 4.985343393308885e-05, "loss": 2.8911, "mean_token_accuracy": 0.3705989122390747, "step": 83930 }, { "epoch": 0.08454022345974833, "grad_norm": 9.973610948780596, "learning_rate": 4.98533912296255e-05, "loss": 2.6067, "mean_token_accuracy": 0.35862068831920624, "step": 83935 }, { "epoch": 0.08454525951285251, "grad_norm": 12.205285253287201, "learning_rate": 4.985334851996235e-05, "loss": 2.5644, "mean_token_accuracy": 0.3896551728248596, "step": 83940 }, { "epoch": 0.08455029556595668, "grad_norm": 15.360173552977056, "learning_rate": 4.985330580409941e-05, "loss": 3.1112, "mean_token_accuracy": 0.33103448152542114, "step": 83945 }, { "epoch": 0.08455533161906086, "grad_norm": 12.476658347866254, "learning_rate": 4.9853263082036704e-05, "loss": 2.6085, "mean_token_accuracy": 0.3551724076271057, "step": 83950 }, { "epoch": 0.08456036767216503, "grad_norm": 10.232198684314515, "learning_rate": 4.985322035377424e-05, "loss": 2.5336, "mean_token_accuracy": 0.4551724076271057, "step": 83955 }, { "epoch": 0.0845654037252692, "grad_norm": 12.87012878214802, "learning_rate": 4.985317761931202e-05, "loss": 2.3909, "mean_token_accuracy": 0.4172413766384125, "step": 83960 }, { "epoch": 0.08457043977837338, "grad_norm": 10.973916665385671, "learning_rate": 4.985313487865006e-05, "loss": 2.3329, "mean_token_accuracy": 0.46551724672317507, "step": 83965 }, { "epoch": 0.08457547583147755, "grad_norm": 10.253114698910295, "learning_rate": 4.985309213178839e-05, "loss": 2.4348, "mean_token_accuracy": 0.41379310488700866, "step": 83970 }, { "epoch": 0.08458051188458172, "grad_norm": 12.071961388193635, "learning_rate": 4.985304937872699e-05, "loss": 2.7512, "mean_token_accuracy": 0.3743496656417847, "step": 83975 }, { "epoch": 0.0845855479376859, "grad_norm": 13.324347642940728, "learning_rate": 4.985300661946589e-05, "loss": 2.6727, "mean_token_accuracy": 0.379310342669487, "step": 83980 }, { "epoch": 0.08459058399079007, "grad_norm": 11.411819037813563, "learning_rate": 4.985296385400512e-05, "loss": 2.7982, "mean_token_accuracy": 0.36896551251411436, "step": 83985 }, { "epoch": 0.08459562004389425, "grad_norm": 11.257611565136207, "learning_rate": 4.985292108234465e-05, "loss": 2.3297, "mean_token_accuracy": 0.37586206793785093, "step": 83990 }, { "epoch": 0.0846006560969984, "grad_norm": 13.537053775996858, "learning_rate": 4.985287830448452e-05, "loss": 2.5451, "mean_token_accuracy": 0.38965516686439516, "step": 83995 }, { "epoch": 0.08460569215010258, "grad_norm": 13.906050299881006, "learning_rate": 4.985283552042474e-05, "loss": 2.7442, "mean_token_accuracy": 0.3793103337287903, "step": 84000 }, { "epoch": 0.08461072820320675, "grad_norm": 10.419281836334983, "learning_rate": 4.9852792730165314e-05, "loss": 2.6115, "mean_token_accuracy": 0.4068965494632721, "step": 84005 }, { "epoch": 0.08461576425631093, "grad_norm": 9.158864964003985, "learning_rate": 4.9852749933706256e-05, "loss": 2.1702, "mean_token_accuracy": 0.46551724672317507, "step": 84010 }, { "epoch": 0.0846208003094151, "grad_norm": 11.763344817770335, "learning_rate": 4.9852707131047575e-05, "loss": 2.6802, "mean_token_accuracy": 0.38620689511299133, "step": 84015 }, { "epoch": 0.08462583636251927, "grad_norm": 11.895008350369881, "learning_rate": 4.98526643221893e-05, "loss": 2.4321, "mean_token_accuracy": 0.44827585816383364, "step": 84020 }, { "epoch": 0.08463087241562345, "grad_norm": 12.071503916305106, "learning_rate": 4.985262150713142e-05, "loss": 2.491, "mean_token_accuracy": 0.42413793206214906, "step": 84025 }, { "epoch": 0.08463590846872762, "grad_norm": 10.246187236831082, "learning_rate": 4.985257868587396e-05, "loss": 2.5981, "mean_token_accuracy": 0.3862068891525269, "step": 84030 }, { "epoch": 0.0846409445218318, "grad_norm": 13.257689269394195, "learning_rate": 4.9852535858416934e-05, "loss": 2.5774, "mean_token_accuracy": 0.43103448748588563, "step": 84035 }, { "epoch": 0.08464598057493597, "grad_norm": 12.95358319162498, "learning_rate": 4.985249302476035e-05, "loss": 2.8369, "mean_token_accuracy": 0.3620689630508423, "step": 84040 }, { "epoch": 0.08465101662804014, "grad_norm": 10.245910768807207, "learning_rate": 4.985245018490421e-05, "loss": 2.628, "mean_token_accuracy": 0.37241379022598264, "step": 84045 }, { "epoch": 0.08465605268114432, "grad_norm": 14.578725142548901, "learning_rate": 4.985240733884854e-05, "loss": 2.9109, "mean_token_accuracy": 0.35172413289546967, "step": 84050 }, { "epoch": 0.08466108873424849, "grad_norm": 10.383156952358455, "learning_rate": 4.985236448659334e-05, "loss": 2.4021, "mean_token_accuracy": 0.39655172228813174, "step": 84055 }, { "epoch": 0.08466612478735266, "grad_norm": 12.575567199223936, "learning_rate": 4.985232162813863e-05, "loss": 2.2968, "mean_token_accuracy": 0.39655172228813174, "step": 84060 }, { "epoch": 0.08467116084045682, "grad_norm": 12.513828451956813, "learning_rate": 4.9852278763484426e-05, "loss": 2.4196, "mean_token_accuracy": 0.4344827651977539, "step": 84065 }, { "epoch": 0.084676196893561, "grad_norm": 9.393905132363841, "learning_rate": 4.9852235892630736e-05, "loss": 2.5785, "mean_token_accuracy": 0.4482758641242981, "step": 84070 }, { "epoch": 0.08468123294666517, "grad_norm": 12.269920962193842, "learning_rate": 4.9852193015577564e-05, "loss": 2.7835, "mean_token_accuracy": 0.4103448212146759, "step": 84075 }, { "epoch": 0.08468626899976935, "grad_norm": 9.79828248148721, "learning_rate": 4.9852150132324924e-05, "loss": 2.5029, "mean_token_accuracy": 0.3758620649576187, "step": 84080 }, { "epoch": 0.08469130505287352, "grad_norm": 10.583222648786517, "learning_rate": 4.985210724287284e-05, "loss": 2.2096, "mean_token_accuracy": 0.44718694090843203, "step": 84085 }, { "epoch": 0.0846963411059777, "grad_norm": 12.602681685972973, "learning_rate": 4.9852064347221314e-05, "loss": 2.5306, "mean_token_accuracy": 0.4275861978530884, "step": 84090 }, { "epoch": 0.08470137715908187, "grad_norm": 10.647772372043272, "learning_rate": 4.9852021445370365e-05, "loss": 2.6933, "mean_token_accuracy": 0.39655172228813174, "step": 84095 }, { "epoch": 0.08470641321218604, "grad_norm": 11.40920354490283, "learning_rate": 4.985197853731999e-05, "loss": 2.3704, "mean_token_accuracy": 0.42413792610168455, "step": 84100 }, { "epoch": 0.08471144926529021, "grad_norm": 15.877702334359258, "learning_rate": 4.9851935623070204e-05, "loss": 2.3639, "mean_token_accuracy": 0.43950392603874205, "step": 84105 }, { "epoch": 0.08471648531839439, "grad_norm": 11.707089327675579, "learning_rate": 4.985189270262104e-05, "loss": 2.2041, "mean_token_accuracy": 0.4448275864124298, "step": 84110 }, { "epoch": 0.08472152137149856, "grad_norm": 9.87025037329012, "learning_rate": 4.9851849775972484e-05, "loss": 2.2986, "mean_token_accuracy": 0.4310344815254211, "step": 84115 }, { "epoch": 0.08472655742460274, "grad_norm": 11.815281298484093, "learning_rate": 4.985180684312457e-05, "loss": 2.3183, "mean_token_accuracy": 0.4984876036643982, "step": 84120 }, { "epoch": 0.08473159347770691, "grad_norm": 10.829481199935287, "learning_rate": 4.985176390407729e-05, "loss": 2.4824, "mean_token_accuracy": 0.43448275327682495, "step": 84125 }, { "epoch": 0.08473662953081108, "grad_norm": 10.874685122677445, "learning_rate": 4.9851720958830675e-05, "loss": 2.421, "mean_token_accuracy": 0.4517241299152374, "step": 84130 }, { "epoch": 0.08474166558391524, "grad_norm": 17.465566195839703, "learning_rate": 4.985167800738471e-05, "loss": 2.7733, "mean_token_accuracy": 0.3931034505367279, "step": 84135 }, { "epoch": 0.08474670163701942, "grad_norm": 12.40572395076069, "learning_rate": 4.985163504973944e-05, "loss": 2.4605, "mean_token_accuracy": 0.37586207389831544, "step": 84140 }, { "epoch": 0.08475173769012359, "grad_norm": 11.207461417053574, "learning_rate": 4.9851592085894854e-05, "loss": 2.5483, "mean_token_accuracy": 0.4689655125141144, "step": 84145 }, { "epoch": 0.08475677374322776, "grad_norm": 13.662482575849292, "learning_rate": 4.985154911585097e-05, "loss": 2.7909, "mean_token_accuracy": 0.3827586203813553, "step": 84150 }, { "epoch": 0.08476180979633194, "grad_norm": 13.355202985175895, "learning_rate": 4.98515061396078e-05, "loss": 2.7766, "mean_token_accuracy": 0.39310344457626345, "step": 84155 }, { "epoch": 0.08476684584943611, "grad_norm": 11.200713467896184, "learning_rate": 4.9851463157165365e-05, "loss": 2.3058, "mean_token_accuracy": 0.4655172348022461, "step": 84160 }, { "epoch": 0.08477188190254029, "grad_norm": 14.286598093369648, "learning_rate": 4.985142016852366e-05, "loss": 2.9078, "mean_token_accuracy": 0.42413793206214906, "step": 84165 }, { "epoch": 0.08477691795564446, "grad_norm": 13.078357659040503, "learning_rate": 4.98513771736827e-05, "loss": 2.7162, "mean_token_accuracy": 0.39655172228813174, "step": 84170 }, { "epoch": 0.08478195400874863, "grad_norm": 9.648647865007197, "learning_rate": 4.9851334172642514e-05, "loss": 2.7713, "mean_token_accuracy": 0.41034482419490814, "step": 84175 }, { "epoch": 0.08478699006185281, "grad_norm": 18.711205379361328, "learning_rate": 4.985129116540309e-05, "loss": 2.7187, "mean_token_accuracy": 0.3999999940395355, "step": 84180 }, { "epoch": 0.08479202611495698, "grad_norm": 11.151462122417966, "learning_rate": 4.9851248151964466e-05, "loss": 2.6574, "mean_token_accuracy": 0.3965517282485962, "step": 84185 }, { "epoch": 0.08479706216806115, "grad_norm": 13.566341440282317, "learning_rate": 4.9851205132326626e-05, "loss": 2.8484, "mean_token_accuracy": 0.358620685338974, "step": 84190 }, { "epoch": 0.08480209822116533, "grad_norm": 14.286713248963341, "learning_rate": 4.985116210648961e-05, "loss": 2.7069, "mean_token_accuracy": 0.39655172228813174, "step": 84195 }, { "epoch": 0.0848071342742695, "grad_norm": 10.611132602784021, "learning_rate": 4.98511190744534e-05, "loss": 2.3405, "mean_token_accuracy": 0.4310344815254211, "step": 84200 }, { "epoch": 0.08481217032737366, "grad_norm": 11.03091011470549, "learning_rate": 4.985107603621803e-05, "loss": 2.766, "mean_token_accuracy": 0.358620685338974, "step": 84205 }, { "epoch": 0.08481720638047784, "grad_norm": 11.096924314656782, "learning_rate": 4.9851032991783514e-05, "loss": 2.1868, "mean_token_accuracy": 0.4275861978530884, "step": 84210 }, { "epoch": 0.08482224243358201, "grad_norm": 13.203385357226272, "learning_rate": 4.985098994114985e-05, "loss": 2.3506, "mean_token_accuracy": 0.47586206197738645, "step": 84215 }, { "epoch": 0.08482727848668618, "grad_norm": 20.08745848220974, "learning_rate": 4.9850946884317054e-05, "loss": 2.484, "mean_token_accuracy": 0.46031458377838136, "step": 84220 }, { "epoch": 0.08483231453979036, "grad_norm": 12.186828164252939, "learning_rate": 4.985090382128515e-05, "loss": 2.6952, "mean_token_accuracy": 0.34137930274009703, "step": 84225 }, { "epoch": 0.08483735059289453, "grad_norm": 11.552932838614483, "learning_rate": 4.9850860752054124e-05, "loss": 2.8502, "mean_token_accuracy": 0.4068965494632721, "step": 84230 }, { "epoch": 0.0848423866459987, "grad_norm": 9.979725244695615, "learning_rate": 4.9850817676624006e-05, "loss": 2.8145, "mean_token_accuracy": 0.39655172228813174, "step": 84235 }, { "epoch": 0.08484742269910288, "grad_norm": 9.994969684032503, "learning_rate": 4.985077459499481e-05, "loss": 2.3201, "mean_token_accuracy": 0.43103448748588563, "step": 84240 }, { "epoch": 0.08485245875220705, "grad_norm": 36.515186336496186, "learning_rate": 4.985073150716654e-05, "loss": 2.8818, "mean_token_accuracy": 0.4034482777118683, "step": 84245 }, { "epoch": 0.08485749480531123, "grad_norm": 11.10316859874001, "learning_rate": 4.985068841313921e-05, "loss": 2.521, "mean_token_accuracy": 0.4034482717514038, "step": 84250 }, { "epoch": 0.0848625308584154, "grad_norm": 9.287877523729954, "learning_rate": 4.985064531291284e-05, "loss": 2.5038, "mean_token_accuracy": 0.36896551847457887, "step": 84255 }, { "epoch": 0.08486756691151957, "grad_norm": 9.536114648483075, "learning_rate": 4.9850602206487434e-05, "loss": 2.0329, "mean_token_accuracy": 0.4517241418361664, "step": 84260 }, { "epoch": 0.08487260296462375, "grad_norm": 11.6830558221404, "learning_rate": 4.9850559093862995e-05, "loss": 2.5461, "mean_token_accuracy": 0.4206896543502808, "step": 84265 }, { "epoch": 0.08487763901772791, "grad_norm": 9.273789837989519, "learning_rate": 4.9850515975039555e-05, "loss": 2.3566, "mean_token_accuracy": 0.4103448212146759, "step": 84270 }, { "epoch": 0.08488267507083208, "grad_norm": 11.391562241937056, "learning_rate": 4.985047285001712e-05, "loss": 2.1968, "mean_token_accuracy": 0.4655172348022461, "step": 84275 }, { "epoch": 0.08488771112393625, "grad_norm": 11.030895412866492, "learning_rate": 4.985042971879569e-05, "loss": 2.7381, "mean_token_accuracy": 0.4068965554237366, "step": 84280 }, { "epoch": 0.08489274717704043, "grad_norm": 11.100432651413575, "learning_rate": 4.985038658137529e-05, "loss": 2.3755, "mean_token_accuracy": 0.4344827651977539, "step": 84285 }, { "epoch": 0.0848977832301446, "grad_norm": 11.103488017606288, "learning_rate": 4.985034343775592e-05, "loss": 2.4156, "mean_token_accuracy": 0.41034482717514037, "step": 84290 }, { "epoch": 0.08490281928324878, "grad_norm": 16.16668250795376, "learning_rate": 4.985030028793761e-05, "loss": 2.4185, "mean_token_accuracy": 0.4068965494632721, "step": 84295 }, { "epoch": 0.08490785533635295, "grad_norm": 12.333367533235188, "learning_rate": 4.985025713192035e-05, "loss": 2.4736, "mean_token_accuracy": 0.42413792610168455, "step": 84300 }, { "epoch": 0.08491289138945712, "grad_norm": 9.978560169014587, "learning_rate": 4.985021396970416e-05, "loss": 2.3768, "mean_token_accuracy": 0.3931034505367279, "step": 84305 }, { "epoch": 0.0849179274425613, "grad_norm": 11.769228079090027, "learning_rate": 4.985017080128906e-05, "loss": 2.4486, "mean_token_accuracy": 0.41929823756217954, "step": 84310 }, { "epoch": 0.08492296349566547, "grad_norm": 8.574489023713143, "learning_rate": 4.9850127626675055e-05, "loss": 2.3457, "mean_token_accuracy": 0.4862068951129913, "step": 84315 }, { "epoch": 0.08492799954876964, "grad_norm": 10.470837457361215, "learning_rate": 4.985008444586216e-05, "loss": 2.4462, "mean_token_accuracy": 0.4068965494632721, "step": 84320 }, { "epoch": 0.08493303560187382, "grad_norm": 16.492463480945514, "learning_rate": 4.985004125885039e-05, "loss": 2.5968, "mean_token_accuracy": 0.4172413766384125, "step": 84325 }, { "epoch": 0.08493807165497799, "grad_norm": 11.094778413128005, "learning_rate": 4.9849998065639754e-05, "loss": 2.558, "mean_token_accuracy": 0.4344827592372894, "step": 84330 }, { "epoch": 0.08494310770808217, "grad_norm": 12.184278906935136, "learning_rate": 4.984995486623026e-05, "loss": 2.3188, "mean_token_accuracy": 0.4517241299152374, "step": 84335 }, { "epoch": 0.08494814376118633, "grad_norm": 11.600304892147106, "learning_rate": 4.9849911660621916e-05, "loss": 2.1649, "mean_token_accuracy": 0.48088324069976807, "step": 84340 }, { "epoch": 0.0849531798142905, "grad_norm": 10.19200883249514, "learning_rate": 4.984986844881474e-05, "loss": 2.7399, "mean_token_accuracy": 0.42891711592674253, "step": 84345 }, { "epoch": 0.08495821586739467, "grad_norm": 9.686585989275923, "learning_rate": 4.984982523080876e-05, "loss": 2.8912, "mean_token_accuracy": 0.35862068831920624, "step": 84350 }, { "epoch": 0.08496325192049885, "grad_norm": 11.453765416333471, "learning_rate": 4.984978200660396e-05, "loss": 2.3594, "mean_token_accuracy": 0.41724138259887694, "step": 84355 }, { "epoch": 0.08496828797360302, "grad_norm": 12.184808333398058, "learning_rate": 4.984973877620037e-05, "loss": 2.7556, "mean_token_accuracy": 0.38275861740112305, "step": 84360 }, { "epoch": 0.0849733240267072, "grad_norm": 10.48616267662966, "learning_rate": 4.984969553959799e-05, "loss": 2.4211, "mean_token_accuracy": 0.41724138259887694, "step": 84365 }, { "epoch": 0.08497836007981137, "grad_norm": 10.767591480089358, "learning_rate": 4.984965229679684e-05, "loss": 2.4245, "mean_token_accuracy": 0.40689654648303986, "step": 84370 }, { "epoch": 0.08498339613291554, "grad_norm": 12.828653247519341, "learning_rate": 4.984960904779693e-05, "loss": 2.5812, "mean_token_accuracy": 0.41379310190677643, "step": 84375 }, { "epoch": 0.08498843218601972, "grad_norm": 11.287974775453847, "learning_rate": 4.984956579259828e-05, "loss": 2.4971, "mean_token_accuracy": 0.4137930989265442, "step": 84380 }, { "epoch": 0.08499346823912389, "grad_norm": 8.766901434050418, "learning_rate": 4.9849522531200895e-05, "loss": 2.4971, "mean_token_accuracy": 0.3987295746803284, "step": 84385 }, { "epoch": 0.08499850429222806, "grad_norm": 9.485667330991594, "learning_rate": 4.984947926360478e-05, "loss": 2.2096, "mean_token_accuracy": 0.48620688915252686, "step": 84390 }, { "epoch": 0.08500354034533224, "grad_norm": 11.78892736914468, "learning_rate": 4.984943598980996e-05, "loss": 2.4543, "mean_token_accuracy": 0.4172413766384125, "step": 84395 }, { "epoch": 0.08500857639843641, "grad_norm": 10.194695042178273, "learning_rate": 4.984939270981643e-05, "loss": 2.1158, "mean_token_accuracy": 0.47931034564971925, "step": 84400 }, { "epoch": 0.08501361245154058, "grad_norm": 11.861296369831022, "learning_rate": 4.984934942362422e-05, "loss": 2.3728, "mean_token_accuracy": 0.42758620977401735, "step": 84405 }, { "epoch": 0.08501864850464474, "grad_norm": 10.151868496312817, "learning_rate": 4.984930613123333e-05, "loss": 2.6983, "mean_token_accuracy": 0.4482758641242981, "step": 84410 }, { "epoch": 0.08502368455774892, "grad_norm": 13.26141666607643, "learning_rate": 4.984926283264378e-05, "loss": 2.7361, "mean_token_accuracy": 0.3724137872457504, "step": 84415 }, { "epoch": 0.08502872061085309, "grad_norm": 12.346749844357271, "learning_rate": 4.984921952785558e-05, "loss": 2.2865, "mean_token_accuracy": 0.4758620738983154, "step": 84420 }, { "epoch": 0.08503375666395727, "grad_norm": 12.020528042669813, "learning_rate": 4.984917621686874e-05, "loss": 2.6498, "mean_token_accuracy": 0.3758620619773865, "step": 84425 }, { "epoch": 0.08503879271706144, "grad_norm": 14.00184951981294, "learning_rate": 4.984913289968327e-05, "loss": 2.7715, "mean_token_accuracy": 0.40689654350280763, "step": 84430 }, { "epoch": 0.08504382877016561, "grad_norm": 10.131981225888152, "learning_rate": 4.984908957629919e-05, "loss": 2.6893, "mean_token_accuracy": 0.3862069010734558, "step": 84435 }, { "epoch": 0.08504886482326979, "grad_norm": 13.688999439861478, "learning_rate": 4.9849046246716495e-05, "loss": 2.8097, "mean_token_accuracy": 0.4137930989265442, "step": 84440 }, { "epoch": 0.08505390087637396, "grad_norm": 13.240859668973329, "learning_rate": 4.9849002910935216e-05, "loss": 2.7108, "mean_token_accuracy": 0.334482753276825, "step": 84445 }, { "epoch": 0.08505893692947814, "grad_norm": 10.47270945730837, "learning_rate": 4.984895956895536e-05, "loss": 2.6691, "mean_token_accuracy": 0.4275862067937851, "step": 84450 }, { "epoch": 0.08506397298258231, "grad_norm": 11.94512787235383, "learning_rate": 4.984891622077693e-05, "loss": 2.3838, "mean_token_accuracy": 0.4103448331356049, "step": 84455 }, { "epoch": 0.08506900903568648, "grad_norm": 11.341801043786331, "learning_rate": 4.9848872866399956e-05, "loss": 2.5576, "mean_token_accuracy": 0.37586206793785093, "step": 84460 }, { "epoch": 0.08507404508879066, "grad_norm": 12.43020649005942, "learning_rate": 4.984882950582443e-05, "loss": 2.412, "mean_token_accuracy": 0.45172414779663084, "step": 84465 }, { "epoch": 0.08507908114189483, "grad_norm": 11.199267528472951, "learning_rate": 4.9848786139050377e-05, "loss": 2.2614, "mean_token_accuracy": 0.441379314661026, "step": 84470 }, { "epoch": 0.085084117194999, "grad_norm": 15.572892566192156, "learning_rate": 4.98487427660778e-05, "loss": 2.8474, "mean_token_accuracy": 0.4137930989265442, "step": 84475 }, { "epoch": 0.08508915324810316, "grad_norm": 11.538591631311911, "learning_rate": 4.9848699386906713e-05, "loss": 2.2946, "mean_token_accuracy": 0.4206896543502808, "step": 84480 }, { "epoch": 0.08509418930120734, "grad_norm": 10.315776318851695, "learning_rate": 4.9848656001537143e-05, "loss": 2.5654, "mean_token_accuracy": 0.4379310250282288, "step": 84485 }, { "epoch": 0.08509922535431151, "grad_norm": 15.212140708893696, "learning_rate": 4.984861260996908e-05, "loss": 2.4587, "mean_token_accuracy": 0.4137930929660797, "step": 84490 }, { "epoch": 0.08510426140741569, "grad_norm": 12.003674766681122, "learning_rate": 4.984856921220255e-05, "loss": 2.4464, "mean_token_accuracy": 0.4551724076271057, "step": 84495 }, { "epoch": 0.08510929746051986, "grad_norm": 8.968695648919995, "learning_rate": 4.984852580823756e-05, "loss": 2.2852, "mean_token_accuracy": 0.42413792610168455, "step": 84500 }, { "epoch": 0.08511433351362403, "grad_norm": 12.589572264812752, "learning_rate": 4.984848239807412e-05, "loss": 2.2461, "mean_token_accuracy": 0.41379310488700866, "step": 84505 }, { "epoch": 0.0851193695667282, "grad_norm": 15.8679545111173, "learning_rate": 4.9848438981712256e-05, "loss": 2.4241, "mean_token_accuracy": 0.4103448212146759, "step": 84510 }, { "epoch": 0.08512440561983238, "grad_norm": 39.63071783502666, "learning_rate": 4.984839555915196e-05, "loss": 2.4455, "mean_token_accuracy": 0.43448275327682495, "step": 84515 }, { "epoch": 0.08512944167293655, "grad_norm": 10.151088486100734, "learning_rate": 4.984835213039325e-05, "loss": 2.5042, "mean_token_accuracy": 0.4620689630508423, "step": 84520 }, { "epoch": 0.08513447772604073, "grad_norm": 12.41790032572079, "learning_rate": 4.9848308695436144e-05, "loss": 2.4006, "mean_token_accuracy": 0.4172413766384125, "step": 84525 }, { "epoch": 0.0851395137791449, "grad_norm": 10.877655507858771, "learning_rate": 4.984826525428066e-05, "loss": 2.2611, "mean_token_accuracy": 0.43103447556495667, "step": 84530 }, { "epoch": 0.08514454983224908, "grad_norm": 9.97160224644332, "learning_rate": 4.984822180692679e-05, "loss": 2.6174, "mean_token_accuracy": 0.3862068891525269, "step": 84535 }, { "epoch": 0.08514958588535325, "grad_norm": 11.921062781809043, "learning_rate": 4.984817835337456e-05, "loss": 2.7481, "mean_token_accuracy": 0.36896550953388213, "step": 84540 }, { "epoch": 0.08515462193845742, "grad_norm": 11.668033587057701, "learning_rate": 4.984813489362398e-05, "loss": 2.4834, "mean_token_accuracy": 0.4226860165596008, "step": 84545 }, { "epoch": 0.08515965799156158, "grad_norm": 12.55272835602653, "learning_rate": 4.9848091427675056e-05, "loss": 2.3746, "mean_token_accuracy": 0.38275861740112305, "step": 84550 }, { "epoch": 0.08516469404466576, "grad_norm": 11.237504842676676, "learning_rate": 4.9848047955527815e-05, "loss": 2.5704, "mean_token_accuracy": 0.44827585816383364, "step": 84555 }, { "epoch": 0.08516973009776993, "grad_norm": 12.23209827172702, "learning_rate": 4.984800447718226e-05, "loss": 2.3089, "mean_token_accuracy": 0.4586206912994385, "step": 84560 }, { "epoch": 0.0851747661508741, "grad_norm": 10.903491627479307, "learning_rate": 4.984796099263839e-05, "loss": 2.4635, "mean_token_accuracy": 0.4448275864124298, "step": 84565 }, { "epoch": 0.08517980220397828, "grad_norm": 10.474051700439304, "learning_rate": 4.984791750189625e-05, "loss": 2.598, "mean_token_accuracy": 0.3586206793785095, "step": 84570 }, { "epoch": 0.08518483825708245, "grad_norm": 19.794011059465998, "learning_rate": 4.984787400495582e-05, "loss": 2.5507, "mean_token_accuracy": 0.41379310488700866, "step": 84575 }, { "epoch": 0.08518987431018663, "grad_norm": 10.280470348202181, "learning_rate": 4.984783050181712e-05, "loss": 2.5124, "mean_token_accuracy": 0.4310344815254211, "step": 84580 }, { "epoch": 0.0851949103632908, "grad_norm": 11.475574558126407, "learning_rate": 4.984778699248017e-05, "loss": 2.86, "mean_token_accuracy": 0.3275862067937851, "step": 84585 }, { "epoch": 0.08519994641639497, "grad_norm": 10.03683621786851, "learning_rate": 4.984774347694498e-05, "loss": 2.4673, "mean_token_accuracy": 0.43793103098869324, "step": 84590 }, { "epoch": 0.08520498246949915, "grad_norm": 12.12989956981753, "learning_rate": 4.9847699955211555e-05, "loss": 2.7586, "mean_token_accuracy": 0.3517241388559341, "step": 84595 }, { "epoch": 0.08521001852260332, "grad_norm": 10.244175830724057, "learning_rate": 4.984765642727991e-05, "loss": 2.5346, "mean_token_accuracy": 0.3862068891525269, "step": 84600 }, { "epoch": 0.0852150545757075, "grad_norm": 14.628615663778005, "learning_rate": 4.984761289315006e-05, "loss": 2.4964, "mean_token_accuracy": 0.4137930989265442, "step": 84605 }, { "epoch": 0.08522009062881167, "grad_norm": 9.870934901720963, "learning_rate": 4.984756935282203e-05, "loss": 2.6974, "mean_token_accuracy": 0.39999999701976774, "step": 84610 }, { "epoch": 0.08522512668191584, "grad_norm": 11.01603438631754, "learning_rate": 4.98475258062958e-05, "loss": 2.1647, "mean_token_accuracy": 0.45517241954803467, "step": 84615 }, { "epoch": 0.08523016273502, "grad_norm": 11.524847961866984, "learning_rate": 4.984748225357141e-05, "loss": 2.2512, "mean_token_accuracy": 0.47586206197738645, "step": 84620 }, { "epoch": 0.08523519878812418, "grad_norm": 13.4895765933074, "learning_rate": 4.984743869464887e-05, "loss": 2.3777, "mean_token_accuracy": 0.43793103098869324, "step": 84625 }, { "epoch": 0.08524023484122835, "grad_norm": 10.225665315704582, "learning_rate": 4.984739512952817e-05, "loss": 2.5444, "mean_token_accuracy": 0.4068965494632721, "step": 84630 }, { "epoch": 0.08524527089433252, "grad_norm": 10.278993971954705, "learning_rate": 4.984735155820934e-05, "loss": 2.6896, "mean_token_accuracy": 0.4034482717514038, "step": 84635 }, { "epoch": 0.0852503069474367, "grad_norm": 10.670394700520866, "learning_rate": 4.984730798069239e-05, "loss": 2.3579, "mean_token_accuracy": 0.41724138259887694, "step": 84640 }, { "epoch": 0.08525534300054087, "grad_norm": 11.55058103019508, "learning_rate": 4.984726439697733e-05, "loss": 2.8365, "mean_token_accuracy": 0.4068965554237366, "step": 84645 }, { "epoch": 0.08526037905364504, "grad_norm": 9.32338116219339, "learning_rate": 4.9847220807064174e-05, "loss": 2.3138, "mean_token_accuracy": 0.43793103098869324, "step": 84650 }, { "epoch": 0.08526541510674922, "grad_norm": 12.316541387402355, "learning_rate": 4.9847177210952935e-05, "loss": 2.4248, "mean_token_accuracy": 0.4172413766384125, "step": 84655 }, { "epoch": 0.08527045115985339, "grad_norm": 7.241387908224625, "learning_rate": 4.984713360864362e-05, "loss": 2.2264, "mean_token_accuracy": 0.5020935952663421, "step": 84660 }, { "epoch": 0.08527548721295757, "grad_norm": 14.523320856703261, "learning_rate": 4.984709000013625e-05, "loss": 2.9934, "mean_token_accuracy": 0.3620689630508423, "step": 84665 }, { "epoch": 0.08528052326606174, "grad_norm": 9.015426040179431, "learning_rate": 4.984704638543082e-05, "loss": 2.0629, "mean_token_accuracy": 0.47586206197738645, "step": 84670 }, { "epoch": 0.08528555931916591, "grad_norm": 13.302223112506663, "learning_rate": 4.984700276452737e-05, "loss": 2.2786, "mean_token_accuracy": 0.458620685338974, "step": 84675 }, { "epoch": 0.08529059537227009, "grad_norm": 11.16595514248166, "learning_rate": 4.984695913742588e-05, "loss": 2.4225, "mean_token_accuracy": 0.4137930989265442, "step": 84680 }, { "epoch": 0.08529563142537426, "grad_norm": 11.354800401263947, "learning_rate": 4.9846915504126376e-05, "loss": 2.6468, "mean_token_accuracy": 0.458620685338974, "step": 84685 }, { "epoch": 0.08530066747847842, "grad_norm": 11.583386402158622, "learning_rate": 4.9846871864628884e-05, "loss": 1.9683, "mean_token_accuracy": 0.5172413766384125, "step": 84690 }, { "epoch": 0.0853057035315826, "grad_norm": 10.327043853429629, "learning_rate": 4.98468282189334e-05, "loss": 2.1179, "mean_token_accuracy": 0.4620689570903778, "step": 84695 }, { "epoch": 0.08531073958468677, "grad_norm": 12.983142422137602, "learning_rate": 4.984678456703994e-05, "loss": 2.2929, "mean_token_accuracy": 0.4310344815254211, "step": 84700 }, { "epoch": 0.08531577563779094, "grad_norm": 8.933503006154062, "learning_rate": 4.984674090894851e-05, "loss": 2.5905, "mean_token_accuracy": 0.417241370677948, "step": 84705 }, { "epoch": 0.08532081169089512, "grad_norm": 12.272507498438783, "learning_rate": 4.984669724465913e-05, "loss": 2.7015, "mean_token_accuracy": 0.3517241358757019, "step": 84710 }, { "epoch": 0.08532584774399929, "grad_norm": 25.551052579900965, "learning_rate": 4.984665357417182e-05, "loss": 2.3321, "mean_token_accuracy": 0.45862067937850953, "step": 84715 }, { "epoch": 0.08533088379710346, "grad_norm": 10.29086593150973, "learning_rate": 4.984660989748657e-05, "loss": 2.4128, "mean_token_accuracy": 0.37586206793785093, "step": 84720 }, { "epoch": 0.08533591985020764, "grad_norm": 8.840366702981505, "learning_rate": 4.9846566214603405e-05, "loss": 2.4247, "mean_token_accuracy": 0.4103448182344437, "step": 84725 }, { "epoch": 0.08534095590331181, "grad_norm": 13.020275954691687, "learning_rate": 4.9846522525522346e-05, "loss": 2.2968, "mean_token_accuracy": 0.4344827592372894, "step": 84730 }, { "epoch": 0.08534599195641598, "grad_norm": 14.529412065382648, "learning_rate": 4.984647883024339e-05, "loss": 2.4133, "mean_token_accuracy": 0.4262552857398987, "step": 84735 }, { "epoch": 0.08535102800952016, "grad_norm": 9.838055254839905, "learning_rate": 4.984643512876655e-05, "loss": 2.6812, "mean_token_accuracy": 0.4019358724355698, "step": 84740 }, { "epoch": 0.08535606406262433, "grad_norm": 10.668436783535542, "learning_rate": 4.984639142109185e-05, "loss": 2.3161, "mean_token_accuracy": 0.44482758045196535, "step": 84745 }, { "epoch": 0.0853611001157285, "grad_norm": 10.318678538761349, "learning_rate": 4.984634770721929e-05, "loss": 2.4585, "mean_token_accuracy": 0.4344827592372894, "step": 84750 }, { "epoch": 0.08536613616883268, "grad_norm": 10.741265438412023, "learning_rate": 4.984630398714889e-05, "loss": 2.5932, "mean_token_accuracy": 0.34827586114406583, "step": 84755 }, { "epoch": 0.08537117222193684, "grad_norm": 10.263368501053987, "learning_rate": 4.984626026088066e-05, "loss": 2.4388, "mean_token_accuracy": 0.4172413766384125, "step": 84760 }, { "epoch": 0.08537620827504101, "grad_norm": 12.316449344697741, "learning_rate": 4.9846216528414614e-05, "loss": 2.5857, "mean_token_accuracy": 0.42758620381355283, "step": 84765 }, { "epoch": 0.08538124432814519, "grad_norm": 12.786897991712799, "learning_rate": 4.984617278975075e-05, "loss": 2.3611, "mean_token_accuracy": 0.41724138259887694, "step": 84770 }, { "epoch": 0.08538628038124936, "grad_norm": 18.4907771248876, "learning_rate": 4.9846129044889104e-05, "loss": 2.5688, "mean_token_accuracy": 0.4586206912994385, "step": 84775 }, { "epoch": 0.08539131643435353, "grad_norm": 11.127326146716031, "learning_rate": 4.984608529382967e-05, "loss": 2.3356, "mean_token_accuracy": 0.43448275327682495, "step": 84780 }, { "epoch": 0.08539635248745771, "grad_norm": 13.730937723790454, "learning_rate": 4.9846041536572464e-05, "loss": 2.6096, "mean_token_accuracy": 0.3827586233615875, "step": 84785 }, { "epoch": 0.08540138854056188, "grad_norm": 13.329863010094043, "learning_rate": 4.98459977731175e-05, "loss": 2.8554, "mean_token_accuracy": 0.36551723480224607, "step": 84790 }, { "epoch": 0.08540642459366606, "grad_norm": 9.54327958102799, "learning_rate": 4.98459540034648e-05, "loss": 2.531, "mean_token_accuracy": 0.4, "step": 84795 }, { "epoch": 0.08541146064677023, "grad_norm": 10.822723935559171, "learning_rate": 4.984591022761436e-05, "loss": 2.4074, "mean_token_accuracy": 0.42413793206214906, "step": 84800 }, { "epoch": 0.0854164966998744, "grad_norm": 10.337298328923886, "learning_rate": 4.9845866445566195e-05, "loss": 2.9763, "mean_token_accuracy": 0.35862069129943847, "step": 84805 }, { "epoch": 0.08542153275297858, "grad_norm": 11.754754415851192, "learning_rate": 4.9845822657320325e-05, "loss": 2.6188, "mean_token_accuracy": 0.4, "step": 84810 }, { "epoch": 0.08542656880608275, "grad_norm": 10.776760190852373, "learning_rate": 4.984577886287675e-05, "loss": 2.1525, "mean_token_accuracy": 0.4463054269552231, "step": 84815 }, { "epoch": 0.08543160485918692, "grad_norm": 8.700307520826124, "learning_rate": 4.98457350622355e-05, "loss": 2.1411, "mean_token_accuracy": 0.458620685338974, "step": 84820 }, { "epoch": 0.0854366409122911, "grad_norm": 14.362223184421186, "learning_rate": 4.984569125539657e-05, "loss": 2.2321, "mean_token_accuracy": 0.4344827592372894, "step": 84825 }, { "epoch": 0.08544167696539526, "grad_norm": 9.604512842635893, "learning_rate": 4.984564744235998e-05, "loss": 2.6096, "mean_token_accuracy": 0.41379310488700866, "step": 84830 }, { "epoch": 0.08544671301849943, "grad_norm": 12.749780105578493, "learning_rate": 4.984560362312575e-05, "loss": 2.2819, "mean_token_accuracy": 0.4327283680438995, "step": 84835 }, { "epoch": 0.0854517490716036, "grad_norm": 11.016649748836276, "learning_rate": 4.984555979769387e-05, "loss": 2.7468, "mean_token_accuracy": 0.39310344457626345, "step": 84840 }, { "epoch": 0.08545678512470778, "grad_norm": 11.22140505825962, "learning_rate": 4.9845515966064383e-05, "loss": 2.3012, "mean_token_accuracy": 0.44482758045196535, "step": 84845 }, { "epoch": 0.08546182117781195, "grad_norm": 14.846301264081985, "learning_rate": 4.984547212823727e-05, "loss": 2.5009, "mean_token_accuracy": 0.44827585816383364, "step": 84850 }, { "epoch": 0.08546685723091613, "grad_norm": 10.941981200457525, "learning_rate": 4.984542828421256e-05, "loss": 2.3513, "mean_token_accuracy": 0.4379310429096222, "step": 84855 }, { "epoch": 0.0854718932840203, "grad_norm": 10.20418301186699, "learning_rate": 4.984538443399026e-05, "loss": 2.1964, "mean_token_accuracy": 0.4761645495891571, "step": 84860 }, { "epoch": 0.08547692933712447, "grad_norm": 10.3312941527418, "learning_rate": 4.984534057757039e-05, "loss": 2.5541, "mean_token_accuracy": 0.4103448212146759, "step": 84865 }, { "epoch": 0.08548196539022865, "grad_norm": 11.719371097398584, "learning_rate": 4.9845296714952956e-05, "loss": 2.3445, "mean_token_accuracy": 0.41034482717514037, "step": 84870 }, { "epoch": 0.08548700144333282, "grad_norm": 11.559634627596225, "learning_rate": 4.984525284613796e-05, "loss": 2.4386, "mean_token_accuracy": 0.42952207922935487, "step": 84875 }, { "epoch": 0.085492037496437, "grad_norm": 10.366322945678176, "learning_rate": 4.984520897112543e-05, "loss": 2.3779, "mean_token_accuracy": 0.36896551251411436, "step": 84880 }, { "epoch": 0.08549707354954117, "grad_norm": 10.931057461523595, "learning_rate": 4.9845165089915386e-05, "loss": 2.2369, "mean_token_accuracy": 0.441379314661026, "step": 84885 }, { "epoch": 0.08550210960264534, "grad_norm": 9.658941064382661, "learning_rate": 4.984512120250781e-05, "loss": 2.2516, "mean_token_accuracy": 0.4310344815254211, "step": 84890 }, { "epoch": 0.08550714565574952, "grad_norm": 10.563480312026174, "learning_rate": 4.9845077308902737e-05, "loss": 2.3477, "mean_token_accuracy": 0.5015728950500489, "step": 84895 }, { "epoch": 0.08551218170885368, "grad_norm": 10.397525702347464, "learning_rate": 4.984503340910018e-05, "loss": 2.6549, "mean_token_accuracy": 0.3896551728248596, "step": 84900 }, { "epoch": 0.08551721776195785, "grad_norm": 9.583679109625598, "learning_rate": 4.9844989503100136e-05, "loss": 2.6838, "mean_token_accuracy": 0.34482758641242983, "step": 84905 }, { "epoch": 0.08552225381506202, "grad_norm": 10.539810370061335, "learning_rate": 4.984494559090263e-05, "loss": 2.515, "mean_token_accuracy": 0.4358136713504791, "step": 84910 }, { "epoch": 0.0855272898681662, "grad_norm": 12.121793066494048, "learning_rate": 4.984490167250767e-05, "loss": 2.797, "mean_token_accuracy": 0.3827586233615875, "step": 84915 }, { "epoch": 0.08553232592127037, "grad_norm": 12.837429395372114, "learning_rate": 4.984485774791526e-05, "loss": 2.6255, "mean_token_accuracy": 0.4344827651977539, "step": 84920 }, { "epoch": 0.08553736197437455, "grad_norm": 14.972352772295832, "learning_rate": 4.984481381712543e-05, "loss": 2.2976, "mean_token_accuracy": 0.4986085891723633, "step": 84925 }, { "epoch": 0.08554239802747872, "grad_norm": 12.282918672086751, "learning_rate": 4.984476988013818e-05, "loss": 2.5349, "mean_token_accuracy": 0.3827586233615875, "step": 84930 }, { "epoch": 0.0855474340805829, "grad_norm": 11.976660683360974, "learning_rate": 4.984472593695353e-05, "loss": 2.5609, "mean_token_accuracy": 0.39310344457626345, "step": 84935 }, { "epoch": 0.08555247013368707, "grad_norm": 11.803909869463793, "learning_rate": 4.984468198757147e-05, "loss": 2.5298, "mean_token_accuracy": 0.3896551728248596, "step": 84940 }, { "epoch": 0.08555750618679124, "grad_norm": 13.703224266665037, "learning_rate": 4.9844638031992046e-05, "loss": 2.8073, "mean_token_accuracy": 0.3758620709180832, "step": 84945 }, { "epoch": 0.08556254223989541, "grad_norm": 11.036223547197736, "learning_rate": 4.984459407021525e-05, "loss": 2.4779, "mean_token_accuracy": 0.45640393495559695, "step": 84950 }, { "epoch": 0.08556757829299959, "grad_norm": 10.039615791075628, "learning_rate": 4.9844550102241095e-05, "loss": 2.4055, "mean_token_accuracy": 0.43932244181632996, "step": 84955 }, { "epoch": 0.08557261434610376, "grad_norm": 10.257827787558794, "learning_rate": 4.9844506128069596e-05, "loss": 2.1784, "mean_token_accuracy": 0.441379314661026, "step": 84960 }, { "epoch": 0.08557765039920794, "grad_norm": 9.987932639948705, "learning_rate": 4.984446214770076e-05, "loss": 2.5851, "mean_token_accuracy": 0.417241370677948, "step": 84965 }, { "epoch": 0.0855826864523121, "grad_norm": 9.327547024341568, "learning_rate": 4.984441816113461e-05, "loss": 2.3614, "mean_token_accuracy": 0.41724138259887694, "step": 84970 }, { "epoch": 0.08558772250541627, "grad_norm": 10.509345441916844, "learning_rate": 4.984437416837115e-05, "loss": 2.5987, "mean_token_accuracy": 0.3896551728248596, "step": 84975 }, { "epoch": 0.08559275855852044, "grad_norm": 22.691751866348152, "learning_rate": 4.98443301694104e-05, "loss": 2.9315, "mean_token_accuracy": 0.37586206793785093, "step": 84980 }, { "epoch": 0.08559779461162462, "grad_norm": 12.726756382048482, "learning_rate": 4.9844286164252355e-05, "loss": 2.6035, "mean_token_accuracy": 0.3965517282485962, "step": 84985 }, { "epoch": 0.08560283066472879, "grad_norm": 12.428323106298778, "learning_rate": 4.984424215289705e-05, "loss": 2.4165, "mean_token_accuracy": 0.39655172228813174, "step": 84990 }, { "epoch": 0.08560786671783296, "grad_norm": 10.665648957937009, "learning_rate": 4.984419813534448e-05, "loss": 2.7658, "mean_token_accuracy": 0.37586206793785093, "step": 84995 }, { "epoch": 0.08561290277093714, "grad_norm": 10.993976142502344, "learning_rate": 4.984415411159467e-05, "loss": 2.266, "mean_token_accuracy": 0.4344827592372894, "step": 85000 }, { "epoch": 0.08561793882404131, "grad_norm": 10.852785505734532, "learning_rate": 4.984411008164761e-05, "loss": 2.3338, "mean_token_accuracy": 0.41724138259887694, "step": 85005 }, { "epoch": 0.08562297487714549, "grad_norm": 16.2854306918082, "learning_rate": 4.984406604550335e-05, "loss": 2.7387, "mean_token_accuracy": 0.3965517163276672, "step": 85010 }, { "epoch": 0.08562801093024966, "grad_norm": 10.045389519119256, "learning_rate": 4.9844022003161865e-05, "loss": 3.1191, "mean_token_accuracy": 0.34137930870056155, "step": 85015 }, { "epoch": 0.08563304698335383, "grad_norm": 11.963637450806097, "learning_rate": 4.9843977954623184e-05, "loss": 1.7848, "mean_token_accuracy": 0.4896551787853241, "step": 85020 }, { "epoch": 0.08563808303645801, "grad_norm": 11.725593220763537, "learning_rate": 4.984393389988732e-05, "loss": 3.1554, "mean_token_accuracy": 0.2999999910593033, "step": 85025 }, { "epoch": 0.08564311908956218, "grad_norm": 10.814965414121675, "learning_rate": 4.984388983895428e-05, "loss": 2.3534, "mean_token_accuracy": 0.4344827592372894, "step": 85030 }, { "epoch": 0.08564815514266635, "grad_norm": 14.875319527171524, "learning_rate": 4.984384577182408e-05, "loss": 2.536, "mean_token_accuracy": 0.4344827592372894, "step": 85035 }, { "epoch": 0.08565319119577051, "grad_norm": 13.70196564985981, "learning_rate": 4.9843801698496736e-05, "loss": 2.5501, "mean_token_accuracy": 0.41034482717514037, "step": 85040 }, { "epoch": 0.08565822724887469, "grad_norm": 10.37800883297888, "learning_rate": 4.9843757618972255e-05, "loss": 2.4787, "mean_token_accuracy": 0.4068965494632721, "step": 85045 }, { "epoch": 0.08566326330197886, "grad_norm": 9.468077476166412, "learning_rate": 4.9843713533250636e-05, "loss": 2.1058, "mean_token_accuracy": 0.46406533718109133, "step": 85050 }, { "epoch": 0.08566829935508304, "grad_norm": 10.636380153051858, "learning_rate": 4.984366944133192e-05, "loss": 2.5486, "mean_token_accuracy": 0.41379310488700866, "step": 85055 }, { "epoch": 0.08567333540818721, "grad_norm": 11.265505901346586, "learning_rate": 4.98436253432161e-05, "loss": 2.3632, "mean_token_accuracy": 0.36551723480224607, "step": 85060 }, { "epoch": 0.08567837146129138, "grad_norm": 10.095387213446813, "learning_rate": 4.984358123890319e-05, "loss": 2.3107, "mean_token_accuracy": 0.4310344815254211, "step": 85065 }, { "epoch": 0.08568340751439556, "grad_norm": 11.512709355570946, "learning_rate": 4.9843537128393206e-05, "loss": 2.4333, "mean_token_accuracy": 0.4551724076271057, "step": 85070 }, { "epoch": 0.08568844356749973, "grad_norm": 11.044413263613098, "learning_rate": 4.9843493011686147e-05, "loss": 2.5651, "mean_token_accuracy": 0.4034482777118683, "step": 85075 }, { "epoch": 0.0856934796206039, "grad_norm": 10.054607931507087, "learning_rate": 4.9843448888782045e-05, "loss": 2.5286, "mean_token_accuracy": 0.4034482777118683, "step": 85080 }, { "epoch": 0.08569851567370808, "grad_norm": 10.322332338126126, "learning_rate": 4.9843404759680915e-05, "loss": 2.8437, "mean_token_accuracy": 0.3620689630508423, "step": 85085 }, { "epoch": 0.08570355172681225, "grad_norm": 10.145294778338041, "learning_rate": 4.984336062438275e-05, "loss": 2.3634, "mean_token_accuracy": 0.4601330816745758, "step": 85090 }, { "epoch": 0.08570858777991643, "grad_norm": 10.914833485392618, "learning_rate": 4.984331648288756e-05, "loss": 2.8339, "mean_token_accuracy": 0.39655172228813174, "step": 85095 }, { "epoch": 0.0857136238330206, "grad_norm": 16.277723299251342, "learning_rate": 4.984327233519539e-05, "loss": 2.7048, "mean_token_accuracy": 0.40344826579093934, "step": 85100 }, { "epoch": 0.08571865988612477, "grad_norm": 9.859197820580183, "learning_rate": 4.984322818130621e-05, "loss": 2.6799, "mean_token_accuracy": 0.3896551728248596, "step": 85105 }, { "epoch": 0.08572369593922893, "grad_norm": 11.117010409719517, "learning_rate": 4.984318402122006e-05, "loss": 2.2761, "mean_token_accuracy": 0.4517241418361664, "step": 85110 }, { "epoch": 0.08572873199233311, "grad_norm": 9.05985384918835, "learning_rate": 4.984313985493694e-05, "loss": 2.3284, "mean_token_accuracy": 0.43103447556495667, "step": 85115 }, { "epoch": 0.08573376804543728, "grad_norm": 10.60185069467593, "learning_rate": 4.984309568245688e-05, "loss": 2.4633, "mean_token_accuracy": 0.4399273991584778, "step": 85120 }, { "epoch": 0.08573880409854145, "grad_norm": 10.281534014023256, "learning_rate": 4.984305150377986e-05, "loss": 2.917, "mean_token_accuracy": 0.3827586233615875, "step": 85125 }, { "epoch": 0.08574384015164563, "grad_norm": 11.942132248683746, "learning_rate": 4.984300731890593e-05, "loss": 2.2177, "mean_token_accuracy": 0.4413793087005615, "step": 85130 }, { "epoch": 0.0857488762047498, "grad_norm": 15.441877972568376, "learning_rate": 4.9842963127835075e-05, "loss": 2.4032, "mean_token_accuracy": 0.4, "step": 85135 }, { "epoch": 0.08575391225785398, "grad_norm": 9.825619475064997, "learning_rate": 4.9842918930567316e-05, "loss": 2.5206, "mean_token_accuracy": 0.4189957737922668, "step": 85140 }, { "epoch": 0.08575894831095815, "grad_norm": 11.588200095330597, "learning_rate": 4.984287472710267e-05, "loss": 2.6097, "mean_token_accuracy": 0.4517241358757019, "step": 85145 }, { "epoch": 0.08576398436406232, "grad_norm": 10.736010995061317, "learning_rate": 4.984283051744114e-05, "loss": 2.4531, "mean_token_accuracy": 0.42413792610168455, "step": 85150 }, { "epoch": 0.0857690204171665, "grad_norm": 15.14280086212328, "learning_rate": 4.984278630158273e-05, "loss": 3.0132, "mean_token_accuracy": 0.3448275923728943, "step": 85155 }, { "epoch": 0.08577405647027067, "grad_norm": 14.294142789283367, "learning_rate": 4.984274207952748e-05, "loss": 2.6942, "mean_token_accuracy": 0.38620689511299133, "step": 85160 }, { "epoch": 0.08577909252337484, "grad_norm": 10.396403966585945, "learning_rate": 4.984269785127539e-05, "loss": 2.3815, "mean_token_accuracy": 0.4448275864124298, "step": 85165 }, { "epoch": 0.08578412857647902, "grad_norm": 18.176901069389736, "learning_rate": 4.9842653616826456e-05, "loss": 3.205, "mean_token_accuracy": 0.3999999940395355, "step": 85170 }, { "epoch": 0.08578916462958319, "grad_norm": 11.11701505904349, "learning_rate": 4.9842609376180716e-05, "loss": 2.5131, "mean_token_accuracy": 0.403448274731636, "step": 85175 }, { "epoch": 0.08579420068268735, "grad_norm": 10.079025940852375, "learning_rate": 4.9842565129338164e-05, "loss": 2.3245, "mean_token_accuracy": 0.43793103098869324, "step": 85180 }, { "epoch": 0.08579923673579153, "grad_norm": 10.94634268586728, "learning_rate": 4.984252087629882e-05, "loss": 2.378, "mean_token_accuracy": 0.4344827592372894, "step": 85185 }, { "epoch": 0.0858042727888957, "grad_norm": 12.858617851379925, "learning_rate": 4.984247661706269e-05, "loss": 2.6084, "mean_token_accuracy": 0.4551724135875702, "step": 85190 }, { "epoch": 0.08580930884199987, "grad_norm": 14.985527937415704, "learning_rate": 4.98424323516298e-05, "loss": 2.5601, "mean_token_accuracy": 0.4499697506427765, "step": 85195 }, { "epoch": 0.08581434489510405, "grad_norm": 11.18939523392338, "learning_rate": 4.984238808000015e-05, "loss": 2.9234, "mean_token_accuracy": 0.40145190358161925, "step": 85200 }, { "epoch": 0.08581938094820822, "grad_norm": 11.111570224074125, "learning_rate": 4.984234380217375e-05, "loss": 3.2566, "mean_token_accuracy": 0.4000000059604645, "step": 85205 }, { "epoch": 0.0858244170013124, "grad_norm": 11.4627003574483, "learning_rate": 4.984229951815062e-05, "loss": 2.2581, "mean_token_accuracy": 0.4379310250282288, "step": 85210 }, { "epoch": 0.08582945305441657, "grad_norm": 11.13292601703709, "learning_rate": 4.984225522793077e-05, "loss": 2.4919, "mean_token_accuracy": 0.43793103098869324, "step": 85215 }, { "epoch": 0.08583448910752074, "grad_norm": 11.206306358297843, "learning_rate": 4.9842210931514214e-05, "loss": 2.3423, "mean_token_accuracy": 0.4310344815254211, "step": 85220 }, { "epoch": 0.08583952516062492, "grad_norm": 13.484362749937183, "learning_rate": 4.9842166628900966e-05, "loss": 2.6471, "mean_token_accuracy": 0.3689655214548111, "step": 85225 }, { "epoch": 0.08584456121372909, "grad_norm": 11.365625490029073, "learning_rate": 4.984212232009102e-05, "loss": 2.328, "mean_token_accuracy": 0.43103448748588563, "step": 85230 }, { "epoch": 0.08584959726683326, "grad_norm": 9.932729322464247, "learning_rate": 4.984207800508442e-05, "loss": 2.2987, "mean_token_accuracy": 0.4206896543502808, "step": 85235 }, { "epoch": 0.08585463331993744, "grad_norm": 13.909141499203182, "learning_rate": 4.984203368388115e-05, "loss": 2.9652, "mean_token_accuracy": 0.31724137663841245, "step": 85240 }, { "epoch": 0.08585966937304161, "grad_norm": 11.153437789051395, "learning_rate": 4.984198935648124e-05, "loss": 2.2758, "mean_token_accuracy": 0.4344827592372894, "step": 85245 }, { "epoch": 0.08586470542614577, "grad_norm": 11.7182021532364, "learning_rate": 4.9841945022884693e-05, "loss": 2.4504, "mean_token_accuracy": 0.4310344815254211, "step": 85250 }, { "epoch": 0.08586974147924994, "grad_norm": 10.576830298391938, "learning_rate": 4.984190068309153e-05, "loss": 2.5444, "mean_token_accuracy": 0.4, "step": 85255 }, { "epoch": 0.08587477753235412, "grad_norm": 11.739602391987388, "learning_rate": 4.984185633710175e-05, "loss": 2.7259, "mean_token_accuracy": 0.38275861740112305, "step": 85260 }, { "epoch": 0.08587981358545829, "grad_norm": 10.869190652738734, "learning_rate": 4.9841811984915374e-05, "loss": 2.247, "mean_token_accuracy": 0.4310344815254211, "step": 85265 }, { "epoch": 0.08588484963856247, "grad_norm": 10.187997515510254, "learning_rate": 4.984176762653241e-05, "loss": 2.2519, "mean_token_accuracy": 0.44827585816383364, "step": 85270 }, { "epoch": 0.08588988569166664, "grad_norm": 10.63250994867134, "learning_rate": 4.9841723261952885e-05, "loss": 2.3426, "mean_token_accuracy": 0.42413793206214906, "step": 85275 }, { "epoch": 0.08589492174477081, "grad_norm": 9.660467397302225, "learning_rate": 4.984167889117679e-05, "loss": 2.4858, "mean_token_accuracy": 0.41379310488700866, "step": 85280 }, { "epoch": 0.08589995779787499, "grad_norm": 13.710668627505783, "learning_rate": 4.984163451420415e-05, "loss": 2.7851, "mean_token_accuracy": 0.3482758581638336, "step": 85285 }, { "epoch": 0.08590499385097916, "grad_norm": 8.799701314304338, "learning_rate": 4.9841590131034974e-05, "loss": 2.1146, "mean_token_accuracy": 0.4551724076271057, "step": 85290 }, { "epoch": 0.08591002990408333, "grad_norm": 9.630625662645343, "learning_rate": 4.9841545741669276e-05, "loss": 2.3573, "mean_token_accuracy": 0.4172413766384125, "step": 85295 }, { "epoch": 0.08591506595718751, "grad_norm": 10.001590124425215, "learning_rate": 4.9841501346107064e-05, "loss": 2.1674, "mean_token_accuracy": 0.4310344815254211, "step": 85300 }, { "epoch": 0.08592010201029168, "grad_norm": 13.609977664991971, "learning_rate": 4.984145694434835e-05, "loss": 2.6749, "mean_token_accuracy": 0.3965517163276672, "step": 85305 }, { "epoch": 0.08592513806339586, "grad_norm": 12.239505555693631, "learning_rate": 4.9841412536393155e-05, "loss": 2.594, "mean_token_accuracy": 0.37241379022598264, "step": 85310 }, { "epoch": 0.08593017411650003, "grad_norm": 16.714787973824745, "learning_rate": 4.984136812224149e-05, "loss": 2.8628, "mean_token_accuracy": 0.3241379290819168, "step": 85315 }, { "epoch": 0.08593521016960419, "grad_norm": 11.161491601939215, "learning_rate": 4.984132370189335e-05, "loss": 2.6454, "mean_token_accuracy": 0.3896551728248596, "step": 85320 }, { "epoch": 0.08594024622270836, "grad_norm": 10.39513729267292, "learning_rate": 4.984127927534878e-05, "loss": 2.8, "mean_token_accuracy": 0.3620689630508423, "step": 85325 }, { "epoch": 0.08594528227581254, "grad_norm": 10.97565235178341, "learning_rate": 4.9841234842607755e-05, "loss": 2.4869, "mean_token_accuracy": 0.4517241358757019, "step": 85330 }, { "epoch": 0.08595031832891671, "grad_norm": 13.094679448185023, "learning_rate": 4.9841190403670315e-05, "loss": 2.3968, "mean_token_accuracy": 0.43617664873600004, "step": 85335 }, { "epoch": 0.08595535438202088, "grad_norm": 9.268381841700993, "learning_rate": 4.9841145958536455e-05, "loss": 2.4659, "mean_token_accuracy": 0.37586206793785093, "step": 85340 }, { "epoch": 0.08596039043512506, "grad_norm": 10.754550261288411, "learning_rate": 4.98411015072062e-05, "loss": 3.1551, "mean_token_accuracy": 0.3068965494632721, "step": 85345 }, { "epoch": 0.08596542648822923, "grad_norm": 12.350153453574032, "learning_rate": 4.984105704967956e-05, "loss": 2.4209, "mean_token_accuracy": 0.441379314661026, "step": 85350 }, { "epoch": 0.0859704625413334, "grad_norm": 12.896445346898673, "learning_rate": 4.9841012585956534e-05, "loss": 2.6002, "mean_token_accuracy": 0.3517241418361664, "step": 85355 }, { "epoch": 0.08597549859443758, "grad_norm": 11.133722761190292, "learning_rate": 4.9840968116037156e-05, "loss": 2.5355, "mean_token_accuracy": 0.4068965494632721, "step": 85360 }, { "epoch": 0.08598053464754175, "grad_norm": 10.917072446468302, "learning_rate": 4.984092363992142e-05, "loss": 2.4788, "mean_token_accuracy": 0.3862068891525269, "step": 85365 }, { "epoch": 0.08598557070064593, "grad_norm": 9.424491955677622, "learning_rate": 4.984087915760935e-05, "loss": 2.1365, "mean_token_accuracy": 0.46551724672317507, "step": 85370 }, { "epoch": 0.0859906067537501, "grad_norm": 12.778505686665666, "learning_rate": 4.9840834669100956e-05, "loss": 2.9923, "mean_token_accuracy": 0.3896551728248596, "step": 85375 }, { "epoch": 0.08599564280685427, "grad_norm": 9.726605335924999, "learning_rate": 4.984079017439624e-05, "loss": 2.3379, "mean_token_accuracy": 0.41548699140548706, "step": 85380 }, { "epoch": 0.08600067885995845, "grad_norm": 10.513691817665345, "learning_rate": 4.9840745673495224e-05, "loss": 2.4615, "mean_token_accuracy": 0.4379310429096222, "step": 85385 }, { "epoch": 0.08600571491306261, "grad_norm": 11.08152069332261, "learning_rate": 4.984070116639792e-05, "loss": 2.3981, "mean_token_accuracy": 0.43793103098869324, "step": 85390 }, { "epoch": 0.08601075096616678, "grad_norm": 9.407697331771768, "learning_rate": 4.9840656653104346e-05, "loss": 2.4266, "mean_token_accuracy": 0.4379310369491577, "step": 85395 }, { "epoch": 0.08601578701927096, "grad_norm": 11.360659170297724, "learning_rate": 4.98406121336145e-05, "loss": 2.2574, "mean_token_accuracy": 0.4448275864124298, "step": 85400 }, { "epoch": 0.08602082307237513, "grad_norm": 11.062678540631175, "learning_rate": 4.984056760792841e-05, "loss": 2.7794, "mean_token_accuracy": 0.3758620619773865, "step": 85405 }, { "epoch": 0.0860258591254793, "grad_norm": 15.597091022284994, "learning_rate": 4.984052307604607e-05, "loss": 2.0737, "mean_token_accuracy": 0.4815486967563629, "step": 85410 }, { "epoch": 0.08603089517858348, "grad_norm": 14.38673739807895, "learning_rate": 4.98404785379675e-05, "loss": 2.8609, "mean_token_accuracy": 0.3517241358757019, "step": 85415 }, { "epoch": 0.08603593123168765, "grad_norm": 9.681448442049616, "learning_rate": 4.9840433993692726e-05, "loss": 2.4233, "mean_token_accuracy": 0.4365396201610565, "step": 85420 }, { "epoch": 0.08604096728479182, "grad_norm": 13.878501551785694, "learning_rate": 4.984038944322175e-05, "loss": 2.3872, "mean_token_accuracy": 0.422202056646347, "step": 85425 }, { "epoch": 0.086046003337896, "grad_norm": 11.807575497010157, "learning_rate": 4.984034488655458e-05, "loss": 2.7204, "mean_token_accuracy": 0.3689655065536499, "step": 85430 }, { "epoch": 0.08605103939100017, "grad_norm": 12.546339320064558, "learning_rate": 4.984030032369123e-05, "loss": 2.3969, "mean_token_accuracy": 0.4379310429096222, "step": 85435 }, { "epoch": 0.08605607544410435, "grad_norm": 11.113824852858606, "learning_rate": 4.984025575463172e-05, "loss": 2.7767, "mean_token_accuracy": 0.39655172228813174, "step": 85440 }, { "epoch": 0.08606111149720852, "grad_norm": 10.351716956585157, "learning_rate": 4.984021117937605e-05, "loss": 2.5045, "mean_token_accuracy": 0.41379310488700866, "step": 85445 }, { "epoch": 0.0860661475503127, "grad_norm": 14.0565772878961, "learning_rate": 4.9840166597924235e-05, "loss": 2.5841, "mean_token_accuracy": 0.4103448331356049, "step": 85450 }, { "epoch": 0.08607118360341687, "grad_norm": 11.39834192620065, "learning_rate": 4.98401220102763e-05, "loss": 2.5898, "mean_token_accuracy": 0.3551724135875702, "step": 85455 }, { "epoch": 0.08607621965652103, "grad_norm": 11.159408865013248, "learning_rate": 4.9840077416432244e-05, "loss": 2.5626, "mean_token_accuracy": 0.4379310369491577, "step": 85460 }, { "epoch": 0.0860812557096252, "grad_norm": 11.162242629254587, "learning_rate": 4.9840032816392084e-05, "loss": 1.9189, "mean_token_accuracy": 0.4724137902259827, "step": 85465 }, { "epoch": 0.08608629176272937, "grad_norm": 12.65093655706823, "learning_rate": 4.9839988210155845e-05, "loss": 2.9522, "mean_token_accuracy": 0.3379310339689255, "step": 85470 }, { "epoch": 0.08609132781583355, "grad_norm": 12.16896884426746, "learning_rate": 4.983994359772351e-05, "loss": 2.4605, "mean_token_accuracy": 0.4000000059604645, "step": 85475 }, { "epoch": 0.08609636386893772, "grad_norm": 11.75199572193061, "learning_rate": 4.983989897909512e-05, "loss": 2.4777, "mean_token_accuracy": 0.4448275864124298, "step": 85480 }, { "epoch": 0.0861013999220419, "grad_norm": 14.382335485108813, "learning_rate": 4.983985435427067e-05, "loss": 2.7906, "mean_token_accuracy": 0.39655172228813174, "step": 85485 }, { "epoch": 0.08610643597514607, "grad_norm": 9.945878369688634, "learning_rate": 4.9839809723250176e-05, "loss": 2.3956, "mean_token_accuracy": 0.4275861978530884, "step": 85490 }, { "epoch": 0.08611147202825024, "grad_norm": 13.850151860829053, "learning_rate": 4.983976508603365e-05, "loss": 2.623, "mean_token_accuracy": 0.39655172228813174, "step": 85495 }, { "epoch": 0.08611650808135442, "grad_norm": 11.612715385123675, "learning_rate": 4.983972044262112e-05, "loss": 2.3115, "mean_token_accuracy": 0.47447065711021424, "step": 85500 }, { "epoch": 0.08612154413445859, "grad_norm": 10.950592942516883, "learning_rate": 4.9839675793012576e-05, "loss": 2.4697, "mean_token_accuracy": 0.4122807025909424, "step": 85505 }, { "epoch": 0.08612658018756277, "grad_norm": 9.935897196464015, "learning_rate": 4.983963113720804e-05, "loss": 2.3464, "mean_token_accuracy": 0.4482758641242981, "step": 85510 }, { "epoch": 0.08613161624066694, "grad_norm": 10.300166444493629, "learning_rate": 4.9839586475207525e-05, "loss": 2.5632, "mean_token_accuracy": 0.4, "step": 85515 }, { "epoch": 0.08613665229377111, "grad_norm": 19.439069266082544, "learning_rate": 4.983954180701104e-05, "loss": 2.9965, "mean_token_accuracy": 0.37931033968925476, "step": 85520 }, { "epoch": 0.08614168834687529, "grad_norm": 11.741771304225871, "learning_rate": 4.983949713261861e-05, "loss": 2.4968, "mean_token_accuracy": 0.4034482717514038, "step": 85525 }, { "epoch": 0.08614672439997945, "grad_norm": 11.682057675342719, "learning_rate": 4.983945245203023e-05, "loss": 2.8479, "mean_token_accuracy": 0.3999999940395355, "step": 85530 }, { "epoch": 0.08615176045308362, "grad_norm": 9.507023362124603, "learning_rate": 4.9839407765245914e-05, "loss": 2.3831, "mean_token_accuracy": 0.42413793206214906, "step": 85535 }, { "epoch": 0.0861567965061878, "grad_norm": 23.008108641110102, "learning_rate": 4.983936307226568e-05, "loss": 3.7925, "mean_token_accuracy": 0.31724137514829637, "step": 85540 }, { "epoch": 0.08616183255929197, "grad_norm": 9.494186571883864, "learning_rate": 4.983931837308955e-05, "loss": 2.2603, "mean_token_accuracy": 0.4551724135875702, "step": 85545 }, { "epoch": 0.08616686861239614, "grad_norm": 10.707906647919105, "learning_rate": 4.9839273667717524e-05, "loss": 2.8206, "mean_token_accuracy": 0.3896551787853241, "step": 85550 }, { "epoch": 0.08617190466550032, "grad_norm": 12.001503670005464, "learning_rate": 4.983922895614961e-05, "loss": 2.5182, "mean_token_accuracy": 0.36551723480224607, "step": 85555 }, { "epoch": 0.08617694071860449, "grad_norm": 10.22107580929728, "learning_rate": 4.9839184238385834e-05, "loss": 2.3689, "mean_token_accuracy": 0.41724138259887694, "step": 85560 }, { "epoch": 0.08618197677170866, "grad_norm": 9.56316902610241, "learning_rate": 4.98391395144262e-05, "loss": 2.4329, "mean_token_accuracy": 0.3931034505367279, "step": 85565 }, { "epoch": 0.08618701282481284, "grad_norm": 11.36748756237982, "learning_rate": 4.9839094784270725e-05, "loss": 2.4364, "mean_token_accuracy": 0.43448275327682495, "step": 85570 }, { "epoch": 0.08619204887791701, "grad_norm": 12.713213482473531, "learning_rate": 4.983905004791942e-05, "loss": 2.8481, "mean_token_accuracy": 0.3551724016666412, "step": 85575 }, { "epoch": 0.08619708493102118, "grad_norm": 10.562700749580577, "learning_rate": 4.9839005305372286e-05, "loss": 2.3568, "mean_token_accuracy": 0.44827585816383364, "step": 85580 }, { "epoch": 0.08620212098412536, "grad_norm": 11.202289890344549, "learning_rate": 4.983896055662935e-05, "loss": 2.5404, "mean_token_accuracy": 0.4344827651977539, "step": 85585 }, { "epoch": 0.08620715703722953, "grad_norm": 11.118000911378077, "learning_rate": 4.983891580169062e-05, "loss": 2.2792, "mean_token_accuracy": 0.44482759237289426, "step": 85590 }, { "epoch": 0.0862121930903337, "grad_norm": 11.95394681498992, "learning_rate": 4.983887104055611e-05, "loss": 2.8942, "mean_token_accuracy": 0.34482758641242983, "step": 85595 }, { "epoch": 0.08621722914343787, "grad_norm": 10.893648767087248, "learning_rate": 4.983882627322584e-05, "loss": 2.6495, "mean_token_accuracy": 0.37241379022598264, "step": 85600 }, { "epoch": 0.08622226519654204, "grad_norm": 9.821974307133512, "learning_rate": 4.9838781499699805e-05, "loss": 2.2899, "mean_token_accuracy": 0.44827585816383364, "step": 85605 }, { "epoch": 0.08622730124964621, "grad_norm": 11.041534966239618, "learning_rate": 4.9838736719978025e-05, "loss": 2.9946, "mean_token_accuracy": 0.3637023627758026, "step": 85610 }, { "epoch": 0.08623233730275039, "grad_norm": 10.722951042243706, "learning_rate": 4.983869193406051e-05, "loss": 2.4547, "mean_token_accuracy": 0.4189957737922668, "step": 85615 }, { "epoch": 0.08623737335585456, "grad_norm": 10.83658484455416, "learning_rate": 4.983864714194728e-05, "loss": 2.3429, "mean_token_accuracy": 0.4532365381717682, "step": 85620 }, { "epoch": 0.08624240940895873, "grad_norm": 11.518934069709971, "learning_rate": 4.983860234363833e-05, "loss": 2.6994, "mean_token_accuracy": 0.39310345649719236, "step": 85625 }, { "epoch": 0.08624744546206291, "grad_norm": 11.98270298914532, "learning_rate": 4.98385575391337e-05, "loss": 2.6463, "mean_token_accuracy": 0.41724138259887694, "step": 85630 }, { "epoch": 0.08625248151516708, "grad_norm": 11.742933095477468, "learning_rate": 4.9838512728433386e-05, "loss": 2.7388, "mean_token_accuracy": 0.3517241358757019, "step": 85635 }, { "epoch": 0.08625751756827126, "grad_norm": 10.641977258120052, "learning_rate": 4.9838467911537403e-05, "loss": 2.3493, "mean_token_accuracy": 0.41724138259887694, "step": 85640 }, { "epoch": 0.08626255362137543, "grad_norm": 10.493203291432566, "learning_rate": 4.9838423088445754e-05, "loss": 2.7103, "mean_token_accuracy": 0.3931034505367279, "step": 85645 }, { "epoch": 0.0862675896744796, "grad_norm": 14.192394149965299, "learning_rate": 4.983837825915847e-05, "loss": 2.6765, "mean_token_accuracy": 0.36896551251411436, "step": 85650 }, { "epoch": 0.08627262572758378, "grad_norm": 12.54586541101411, "learning_rate": 4.983833342367555e-05, "loss": 2.8413, "mean_token_accuracy": 0.39310344457626345, "step": 85655 }, { "epoch": 0.08627766178068795, "grad_norm": 10.185967309541747, "learning_rate": 4.983828858199701e-05, "loss": 2.6258, "mean_token_accuracy": 0.3551724076271057, "step": 85660 }, { "epoch": 0.08628269783379212, "grad_norm": 12.258931326834743, "learning_rate": 4.9838243734122863e-05, "loss": 2.6827, "mean_token_accuracy": 0.4000000059604645, "step": 85665 }, { "epoch": 0.08628773388689628, "grad_norm": 10.17082181339242, "learning_rate": 4.983819888005312e-05, "loss": 1.9994, "mean_token_accuracy": 0.4517241418361664, "step": 85670 }, { "epoch": 0.08629276994000046, "grad_norm": 10.99133632123915, "learning_rate": 4.9838154019787795e-05, "loss": 2.5435, "mean_token_accuracy": 0.4034482777118683, "step": 85675 }, { "epoch": 0.08629780599310463, "grad_norm": 10.781972261556966, "learning_rate": 4.983810915332689e-05, "loss": 2.1627, "mean_token_accuracy": 0.46551724076271056, "step": 85680 }, { "epoch": 0.0863028420462088, "grad_norm": 10.973681971932715, "learning_rate": 4.983806428067045e-05, "loss": 2.3818, "mean_token_accuracy": 0.4241379201412201, "step": 85685 }, { "epoch": 0.08630787809931298, "grad_norm": 10.35220906443794, "learning_rate": 4.983801940181844e-05, "loss": 2.7237, "mean_token_accuracy": 0.3793103456497192, "step": 85690 }, { "epoch": 0.08631291415241715, "grad_norm": 14.417212321958647, "learning_rate": 4.983797451677091e-05, "loss": 2.81, "mean_token_accuracy": 0.4275861978530884, "step": 85695 }, { "epoch": 0.08631795020552133, "grad_norm": 11.864233491738585, "learning_rate": 4.983792962552786e-05, "loss": 2.3464, "mean_token_accuracy": 0.42909860610961914, "step": 85700 }, { "epoch": 0.0863229862586255, "grad_norm": 10.344941526930626, "learning_rate": 4.9837884728089297e-05, "loss": 2.2091, "mean_token_accuracy": 0.4862068951129913, "step": 85705 }, { "epoch": 0.08632802231172967, "grad_norm": 14.738853829836518, "learning_rate": 4.9837839824455236e-05, "loss": 2.2436, "mean_token_accuracy": 0.4931034564971924, "step": 85710 }, { "epoch": 0.08633305836483385, "grad_norm": 10.69648651783352, "learning_rate": 4.98377949146257e-05, "loss": 2.5748, "mean_token_accuracy": 0.39310343861579894, "step": 85715 }, { "epoch": 0.08633809441793802, "grad_norm": 14.469735797404404, "learning_rate": 4.9837749998600684e-05, "loss": 2.5056, "mean_token_accuracy": 0.3862069010734558, "step": 85720 }, { "epoch": 0.0863431304710422, "grad_norm": 10.548277186525102, "learning_rate": 4.983770507638021e-05, "loss": 2.2705, "mean_token_accuracy": 0.3862069010734558, "step": 85725 }, { "epoch": 0.08634816652414637, "grad_norm": 10.644048411011205, "learning_rate": 4.98376601479643e-05, "loss": 2.1422, "mean_token_accuracy": 0.5103448271751404, "step": 85730 }, { "epoch": 0.08635320257725054, "grad_norm": 8.763835568818326, "learning_rate": 4.9837615213352956e-05, "loss": 2.1488, "mean_token_accuracy": 0.4551724135875702, "step": 85735 }, { "epoch": 0.0863582386303547, "grad_norm": 16.035138850691173, "learning_rate": 4.983757027254618e-05, "loss": 2.437, "mean_token_accuracy": 0.4103448331356049, "step": 85740 }, { "epoch": 0.08636327468345888, "grad_norm": 10.99502836984945, "learning_rate": 4.9837525325544e-05, "loss": 2.6739, "mean_token_accuracy": 0.3827586233615875, "step": 85745 }, { "epoch": 0.08636831073656305, "grad_norm": 11.144157045446766, "learning_rate": 4.983748037234643e-05, "loss": 2.459, "mean_token_accuracy": 0.3862069010734558, "step": 85750 }, { "epoch": 0.08637334678966722, "grad_norm": 10.839074288340393, "learning_rate": 4.983743541295347e-05, "loss": 2.5423, "mean_token_accuracy": 0.42413793206214906, "step": 85755 }, { "epoch": 0.0863783828427714, "grad_norm": 18.0758760556222, "learning_rate": 4.9837390447365145e-05, "loss": 2.4696, "mean_token_accuracy": 0.37931033968925476, "step": 85760 }, { "epoch": 0.08638341889587557, "grad_norm": 10.514795273276423, "learning_rate": 4.9837345475581456e-05, "loss": 2.4579, "mean_token_accuracy": 0.42758620977401735, "step": 85765 }, { "epoch": 0.08638845494897975, "grad_norm": 12.49302638113191, "learning_rate": 4.9837300497602425e-05, "loss": 2.7628, "mean_token_accuracy": 0.36896551549434664, "step": 85770 }, { "epoch": 0.08639349100208392, "grad_norm": 10.324372238504612, "learning_rate": 4.9837255513428054e-05, "loss": 2.8566, "mean_token_accuracy": 0.3896551728248596, "step": 85775 }, { "epoch": 0.08639852705518809, "grad_norm": 9.816893236187164, "learning_rate": 4.983721052305837e-05, "loss": 2.2771, "mean_token_accuracy": 0.4551724076271057, "step": 85780 }, { "epoch": 0.08640356310829227, "grad_norm": 9.302876224937018, "learning_rate": 4.983716552649337e-05, "loss": 2.337, "mean_token_accuracy": 0.4534785270690918, "step": 85785 }, { "epoch": 0.08640859916139644, "grad_norm": 16.174493748462172, "learning_rate": 4.9837120523733074e-05, "loss": 2.704, "mean_token_accuracy": 0.39618874788284303, "step": 85790 }, { "epoch": 0.08641363521450061, "grad_norm": 10.08429382932957, "learning_rate": 4.983707551477751e-05, "loss": 2.5566, "mean_token_accuracy": 0.42758620977401735, "step": 85795 }, { "epoch": 0.08641867126760479, "grad_norm": 11.289616944867047, "learning_rate": 4.983703049962665e-05, "loss": 2.4404, "mean_token_accuracy": 0.4034482717514038, "step": 85800 }, { "epoch": 0.08642370732070896, "grad_norm": 11.926243148678576, "learning_rate": 4.9836985478280545e-05, "loss": 2.4055, "mean_token_accuracy": 0.4724137902259827, "step": 85805 }, { "epoch": 0.08642874337381312, "grad_norm": 10.053274195140387, "learning_rate": 4.983694045073919e-05, "loss": 2.8584, "mean_token_accuracy": 0.38275861740112305, "step": 85810 }, { "epoch": 0.0864337794269173, "grad_norm": 10.356995719112014, "learning_rate": 4.9836895417002604e-05, "loss": 2.4488, "mean_token_accuracy": 0.42758620381355283, "step": 85815 }, { "epoch": 0.08643881548002147, "grad_norm": 11.475080760975624, "learning_rate": 4.98368503770708e-05, "loss": 2.4335, "mean_token_accuracy": 0.403448274731636, "step": 85820 }, { "epoch": 0.08644385153312564, "grad_norm": 14.292287719772636, "learning_rate": 4.983680533094378e-05, "loss": 2.5817, "mean_token_accuracy": 0.4206896543502808, "step": 85825 }, { "epoch": 0.08644888758622982, "grad_norm": 10.267525702771389, "learning_rate": 4.983676027862156e-05, "loss": 2.2198, "mean_token_accuracy": 0.44827585220336913, "step": 85830 }, { "epoch": 0.08645392363933399, "grad_norm": 14.629452839284372, "learning_rate": 4.983671522010417e-05, "loss": 2.3362, "mean_token_accuracy": 0.4241379380226135, "step": 85835 }, { "epoch": 0.08645895969243816, "grad_norm": 10.576145367635034, "learning_rate": 4.9836670155391604e-05, "loss": 2.4545, "mean_token_accuracy": 0.3896551728248596, "step": 85840 }, { "epoch": 0.08646399574554234, "grad_norm": 12.40155451548599, "learning_rate": 4.983662508448388e-05, "loss": 2.541, "mean_token_accuracy": 0.358620685338974, "step": 85845 }, { "epoch": 0.08646903179864651, "grad_norm": 11.685705279560947, "learning_rate": 4.9836580007381006e-05, "loss": 2.6927, "mean_token_accuracy": 0.4310344815254211, "step": 85850 }, { "epoch": 0.08647406785175069, "grad_norm": 9.25899390114931, "learning_rate": 4.9836534924083e-05, "loss": 2.3112, "mean_token_accuracy": 0.43103448748588563, "step": 85855 }, { "epoch": 0.08647910390485486, "grad_norm": 10.821564865270874, "learning_rate": 4.9836489834589863e-05, "loss": 2.4348, "mean_token_accuracy": 0.4137930989265442, "step": 85860 }, { "epoch": 0.08648413995795903, "grad_norm": 11.850768211085699, "learning_rate": 4.983644473890163e-05, "loss": 2.3831, "mean_token_accuracy": 0.40689654350280763, "step": 85865 }, { "epoch": 0.0864891760110632, "grad_norm": 11.385951252408212, "learning_rate": 4.98363996370183e-05, "loss": 2.3363, "mean_token_accuracy": 0.43448275327682495, "step": 85870 }, { "epoch": 0.08649421206416738, "grad_norm": 15.483238949539185, "learning_rate": 4.9836354528939885e-05, "loss": 2.6649, "mean_token_accuracy": 0.37586207389831544, "step": 85875 }, { "epoch": 0.08649924811727154, "grad_norm": 13.627558359624706, "learning_rate": 4.9836309414666396e-05, "loss": 2.3306, "mean_token_accuracy": 0.45862067937850953, "step": 85880 }, { "epoch": 0.08650428417037571, "grad_norm": 9.667361053916233, "learning_rate": 4.9836264294197844e-05, "loss": 2.4061, "mean_token_accuracy": 0.43103448748588563, "step": 85885 }, { "epoch": 0.08650932022347989, "grad_norm": 11.93158573649606, "learning_rate": 4.9836219167534255e-05, "loss": 2.4194, "mean_token_accuracy": 0.4137930989265442, "step": 85890 }, { "epoch": 0.08651435627658406, "grad_norm": 13.411382955143935, "learning_rate": 4.983617403467562e-05, "loss": 2.4228, "mean_token_accuracy": 0.4620689570903778, "step": 85895 }, { "epoch": 0.08651939232968824, "grad_norm": 12.003036849684241, "learning_rate": 4.9836128895621975e-05, "loss": 2.8335, "mean_token_accuracy": 0.38275861740112305, "step": 85900 }, { "epoch": 0.08652442838279241, "grad_norm": 11.77528367765045, "learning_rate": 4.9836083750373324e-05, "loss": 2.8777, "mean_token_accuracy": 0.34827586710453035, "step": 85905 }, { "epoch": 0.08652946443589658, "grad_norm": 11.427256847570119, "learning_rate": 4.983603859892967e-05, "loss": 2.6068, "mean_token_accuracy": 0.3655172407627106, "step": 85910 }, { "epoch": 0.08653450048900076, "grad_norm": 10.672561919520579, "learning_rate": 4.983599344129103e-05, "loss": 2.3823, "mean_token_accuracy": 0.441379314661026, "step": 85915 }, { "epoch": 0.08653953654210493, "grad_norm": 8.087882048585717, "learning_rate": 4.9835948277457425e-05, "loss": 2.1931, "mean_token_accuracy": 0.48965516686439514, "step": 85920 }, { "epoch": 0.0865445725952091, "grad_norm": 9.61838684599959, "learning_rate": 4.983590310742886e-05, "loss": 2.3961, "mean_token_accuracy": 0.4551724076271057, "step": 85925 }, { "epoch": 0.08654960864831328, "grad_norm": 10.421914128119445, "learning_rate": 4.9835857931205346e-05, "loss": 2.3432, "mean_token_accuracy": 0.41203871965408323, "step": 85930 }, { "epoch": 0.08655464470141745, "grad_norm": 11.950647950209852, "learning_rate": 4.9835812748786905e-05, "loss": 2.6569, "mean_token_accuracy": 0.4103448331356049, "step": 85935 }, { "epoch": 0.08655968075452163, "grad_norm": 12.671659578020153, "learning_rate": 4.983576756017353e-05, "loss": 2.4077, "mean_token_accuracy": 0.4034482777118683, "step": 85940 }, { "epoch": 0.0865647168076258, "grad_norm": 9.61105772479072, "learning_rate": 4.983572236536526e-05, "loss": 2.4934, "mean_token_accuracy": 0.3896551787853241, "step": 85945 }, { "epoch": 0.08656975286072996, "grad_norm": 9.555263617199538, "learning_rate": 4.983567716436208e-05, "loss": 2.3492, "mean_token_accuracy": 0.4413793087005615, "step": 85950 }, { "epoch": 0.08657478891383413, "grad_norm": 18.792391372218272, "learning_rate": 4.983563195716403e-05, "loss": 2.8506, "mean_token_accuracy": 0.4068965554237366, "step": 85955 }, { "epoch": 0.0865798249669383, "grad_norm": 11.848153592222557, "learning_rate": 4.983558674377111e-05, "loss": 2.4634, "mean_token_accuracy": 0.4034482717514038, "step": 85960 }, { "epoch": 0.08658486102004248, "grad_norm": 12.3563194987663, "learning_rate": 4.9835541524183324e-05, "loss": 2.6842, "mean_token_accuracy": 0.4068965554237366, "step": 85965 }, { "epoch": 0.08658989707314665, "grad_norm": 11.69272996333826, "learning_rate": 4.983549629840069e-05, "loss": 2.2356, "mean_token_accuracy": 0.44482758045196535, "step": 85970 }, { "epoch": 0.08659493312625083, "grad_norm": 10.874992142249097, "learning_rate": 4.9835451066423226e-05, "loss": 2.0792, "mean_token_accuracy": 0.4655172526836395, "step": 85975 }, { "epoch": 0.086599969179355, "grad_norm": 20.365826147334218, "learning_rate": 4.983540582825095e-05, "loss": 2.7875, "mean_token_accuracy": 0.39310345351696013, "step": 85980 }, { "epoch": 0.08660500523245918, "grad_norm": 9.49120931981935, "learning_rate": 4.983536058388385e-05, "loss": 2.2274, "mean_token_accuracy": 0.4379310250282288, "step": 85985 }, { "epoch": 0.08661004128556335, "grad_norm": 9.965378216462042, "learning_rate": 4.983531533332196e-05, "loss": 2.2841, "mean_token_accuracy": 0.4259528160095215, "step": 85990 }, { "epoch": 0.08661507733866752, "grad_norm": 12.958903556615356, "learning_rate": 4.98352700765653e-05, "loss": 3.0026, "mean_token_accuracy": 0.3103448271751404, "step": 85995 }, { "epoch": 0.0866201133917717, "grad_norm": 11.869039519069343, "learning_rate": 4.983522481361386e-05, "loss": 2.5708, "mean_token_accuracy": 0.38620689511299133, "step": 86000 }, { "epoch": 0.08662514944487587, "grad_norm": 12.047713454749866, "learning_rate": 4.983517954446766e-05, "loss": 2.6233, "mean_token_accuracy": 0.3793103456497192, "step": 86005 }, { "epoch": 0.08663018549798004, "grad_norm": 10.008161366499168, "learning_rate": 4.983513426912671e-05, "loss": 2.2921, "mean_token_accuracy": 0.44137930274009707, "step": 86010 }, { "epoch": 0.08663522155108422, "grad_norm": 9.912010630812357, "learning_rate": 4.983508898759104e-05, "loss": 2.6927, "mean_token_accuracy": 0.4310344815254211, "step": 86015 }, { "epoch": 0.08664025760418838, "grad_norm": 12.046994306364532, "learning_rate": 4.983504369986064e-05, "loss": 2.9386, "mean_token_accuracy": 0.38275861740112305, "step": 86020 }, { "epoch": 0.08664529365729255, "grad_norm": 11.289340462478492, "learning_rate": 4.983499840593553e-05, "loss": 2.5038, "mean_token_accuracy": 0.4448275864124298, "step": 86025 }, { "epoch": 0.08665032971039673, "grad_norm": 9.533670788355828, "learning_rate": 4.983495310581574e-05, "loss": 2.6954, "mean_token_accuracy": 0.34482758641242983, "step": 86030 }, { "epoch": 0.0866553657635009, "grad_norm": 11.75650870858756, "learning_rate": 4.983490779950126e-05, "loss": 2.4145, "mean_token_accuracy": 0.4816696882247925, "step": 86035 }, { "epoch": 0.08666040181660507, "grad_norm": 9.657465516571568, "learning_rate": 4.98348624869921e-05, "loss": 2.6209, "mean_token_accuracy": 0.4206896543502808, "step": 86040 }, { "epoch": 0.08666543786970925, "grad_norm": 13.663554580295168, "learning_rate": 4.983481716828829e-05, "loss": 3.2414, "mean_token_accuracy": 0.31724137663841245, "step": 86045 }, { "epoch": 0.08667047392281342, "grad_norm": 13.839672324889879, "learning_rate": 4.983477184338984e-05, "loss": 2.4131, "mean_token_accuracy": 0.42758620977401735, "step": 86050 }, { "epoch": 0.0866755099759176, "grad_norm": 9.915769522595859, "learning_rate": 4.9834726512296745e-05, "loss": 2.1302, "mean_token_accuracy": 0.5021173536777497, "step": 86055 }, { "epoch": 0.08668054602902177, "grad_norm": 10.133407924491985, "learning_rate": 4.983468117500904e-05, "loss": 2.6179, "mean_token_accuracy": 0.36551724672317504, "step": 86060 }, { "epoch": 0.08668558208212594, "grad_norm": 11.542681569791872, "learning_rate": 4.983463583152673e-05, "loss": 2.6594, "mean_token_accuracy": 0.4068965554237366, "step": 86065 }, { "epoch": 0.08669061813523012, "grad_norm": 11.148843643515967, "learning_rate": 4.983459048184982e-05, "loss": 2.3945, "mean_token_accuracy": 0.4482758641242981, "step": 86070 }, { "epoch": 0.08669565418833429, "grad_norm": 11.10073841352252, "learning_rate": 4.983454512597833e-05, "loss": 2.8404, "mean_token_accuracy": 0.3931034505367279, "step": 86075 }, { "epoch": 0.08670069024143846, "grad_norm": 13.213615982641578, "learning_rate": 4.9834499763912265e-05, "loss": 2.5827, "mean_token_accuracy": 0.42413793206214906, "step": 86080 }, { "epoch": 0.08670572629454264, "grad_norm": 9.953562392049552, "learning_rate": 4.983445439565165e-05, "loss": 2.4707, "mean_token_accuracy": 0.45698729157447815, "step": 86085 }, { "epoch": 0.0867107623476468, "grad_norm": 11.19288200020367, "learning_rate": 4.9834409021196494e-05, "loss": 2.8666, "mean_token_accuracy": 0.37586206793785093, "step": 86090 }, { "epoch": 0.08671579840075097, "grad_norm": 16.79206967013229, "learning_rate": 4.9834363640546797e-05, "loss": 2.4193, "mean_token_accuracy": 0.4600725889205933, "step": 86095 }, { "epoch": 0.08672083445385514, "grad_norm": 10.40342554165823, "learning_rate": 4.9834318253702584e-05, "loss": 2.6385, "mean_token_accuracy": 0.3965517282485962, "step": 86100 }, { "epoch": 0.08672587050695932, "grad_norm": 11.514682486430118, "learning_rate": 4.9834272860663864e-05, "loss": 2.5171, "mean_token_accuracy": 0.4379310369491577, "step": 86105 }, { "epoch": 0.08673090656006349, "grad_norm": 10.742702192675887, "learning_rate": 4.9834227461430656e-05, "loss": 2.942, "mean_token_accuracy": 0.34482758641242983, "step": 86110 }, { "epoch": 0.08673594261316767, "grad_norm": 9.636867658145365, "learning_rate": 4.983418205600297e-05, "loss": 2.358, "mean_token_accuracy": 0.4344827592372894, "step": 86115 }, { "epoch": 0.08674097866627184, "grad_norm": 11.9567650551864, "learning_rate": 4.9834136644380806e-05, "loss": 2.7101, "mean_token_accuracy": 0.34137930870056155, "step": 86120 }, { "epoch": 0.08674601471937601, "grad_norm": 9.223526399465987, "learning_rate": 4.983409122656419e-05, "loss": 2.5782, "mean_token_accuracy": 0.4124621868133545, "step": 86125 }, { "epoch": 0.08675105077248019, "grad_norm": 9.495158153590731, "learning_rate": 4.983404580255313e-05, "loss": 2.7089, "mean_token_accuracy": 0.38275861740112305, "step": 86130 }, { "epoch": 0.08675608682558436, "grad_norm": 13.040039760771384, "learning_rate": 4.983400037234763e-05, "loss": 2.7491, "mean_token_accuracy": 0.3517241358757019, "step": 86135 }, { "epoch": 0.08676112287868853, "grad_norm": 10.16921181916363, "learning_rate": 4.983395493594772e-05, "loss": 2.3211, "mean_token_accuracy": 0.41034482717514037, "step": 86140 }, { "epoch": 0.08676615893179271, "grad_norm": 12.274446196978882, "learning_rate": 4.98339094933534e-05, "loss": 2.293, "mean_token_accuracy": 0.45033273100852966, "step": 86145 }, { "epoch": 0.08677119498489688, "grad_norm": 12.000737017353911, "learning_rate": 4.983386404456469e-05, "loss": 2.5044, "mean_token_accuracy": 0.36896551847457887, "step": 86150 }, { "epoch": 0.08677623103800106, "grad_norm": 11.042622553676953, "learning_rate": 4.98338185895816e-05, "loss": 2.8826, "mean_token_accuracy": 0.38965516090393065, "step": 86155 }, { "epoch": 0.08678126709110522, "grad_norm": 11.157016237629469, "learning_rate": 4.983377312840414e-05, "loss": 2.5411, "mean_token_accuracy": 0.42413792610168455, "step": 86160 }, { "epoch": 0.08678630314420939, "grad_norm": 10.86416483573507, "learning_rate": 4.983372766103233e-05, "loss": 2.3252, "mean_token_accuracy": 0.4776164650917053, "step": 86165 }, { "epoch": 0.08679133919731356, "grad_norm": 11.811959331255306, "learning_rate": 4.983368218746617e-05, "loss": 2.4912, "mean_token_accuracy": 0.42068965137004855, "step": 86170 }, { "epoch": 0.08679637525041774, "grad_norm": 11.51004545374619, "learning_rate": 4.9833636707705686e-05, "loss": 2.5171, "mean_token_accuracy": 0.42758620381355283, "step": 86175 }, { "epoch": 0.08680141130352191, "grad_norm": 12.039657038884906, "learning_rate": 4.9833591221750887e-05, "loss": 2.6749, "mean_token_accuracy": 0.4034482777118683, "step": 86180 }, { "epoch": 0.08680644735662608, "grad_norm": 11.215071559776549, "learning_rate": 4.9833545729601776e-05, "loss": 2.4918, "mean_token_accuracy": 0.441379314661026, "step": 86185 }, { "epoch": 0.08681148340973026, "grad_norm": 11.197142773621714, "learning_rate": 4.9833500231258375e-05, "loss": 2.4815, "mean_token_accuracy": 0.41034482717514037, "step": 86190 }, { "epoch": 0.08681651946283443, "grad_norm": 8.876092523623365, "learning_rate": 4.9833454726720696e-05, "loss": 2.2781, "mean_token_accuracy": 0.4413793087005615, "step": 86195 }, { "epoch": 0.0868215555159386, "grad_norm": 12.53948120040899, "learning_rate": 4.983340921598875e-05, "loss": 2.3235, "mean_token_accuracy": 0.42413793206214906, "step": 86200 }, { "epoch": 0.08682659156904278, "grad_norm": 9.29443192637752, "learning_rate": 4.983336369906255e-05, "loss": 2.8324, "mean_token_accuracy": 0.3758620649576187, "step": 86205 }, { "epoch": 0.08683162762214695, "grad_norm": 13.324375278355333, "learning_rate": 4.98333181759421e-05, "loss": 2.0264, "mean_token_accuracy": 0.4517241418361664, "step": 86210 }, { "epoch": 0.08683666367525113, "grad_norm": 10.27157071510639, "learning_rate": 4.983327264662743e-05, "loss": 2.4632, "mean_token_accuracy": 0.38965516686439516, "step": 86215 }, { "epoch": 0.0868416997283553, "grad_norm": 10.237486979217078, "learning_rate": 4.983322711111854e-05, "loss": 2.1802, "mean_token_accuracy": 0.49842709898948667, "step": 86220 }, { "epoch": 0.08684673578145947, "grad_norm": 10.600766508349675, "learning_rate": 4.983318156941545e-05, "loss": 2.4158, "mean_token_accuracy": 0.43103448748588563, "step": 86225 }, { "epoch": 0.08685177183456363, "grad_norm": 8.130187666066819, "learning_rate": 4.983313602151817e-05, "loss": 2.2274, "mean_token_accuracy": 0.44827585816383364, "step": 86230 }, { "epoch": 0.08685680788766781, "grad_norm": 12.039287798293214, "learning_rate": 4.9833090467426704e-05, "loss": 2.4962, "mean_token_accuracy": 0.3999999940395355, "step": 86235 }, { "epoch": 0.08686184394077198, "grad_norm": 12.467670197539608, "learning_rate": 4.983304490714108e-05, "loss": 2.2156, "mean_token_accuracy": 0.48148820400238035, "step": 86240 }, { "epoch": 0.08686687999387616, "grad_norm": 15.500244431264585, "learning_rate": 4.9832999340661296e-05, "loss": 2.8387, "mean_token_accuracy": 0.38275861740112305, "step": 86245 }, { "epoch": 0.08687191604698033, "grad_norm": 10.973068833905813, "learning_rate": 4.9832953767987365e-05, "loss": 2.4226, "mean_token_accuracy": 0.42758620977401735, "step": 86250 }, { "epoch": 0.0868769521000845, "grad_norm": 10.12925357349675, "learning_rate": 4.983290818911932e-05, "loss": 2.6403, "mean_token_accuracy": 0.36551723778247835, "step": 86255 }, { "epoch": 0.08688198815318868, "grad_norm": 12.62430868079707, "learning_rate": 4.983286260405716e-05, "loss": 2.558, "mean_token_accuracy": 0.4862069010734558, "step": 86260 }, { "epoch": 0.08688702420629285, "grad_norm": 10.90219902747991, "learning_rate": 4.9832817012800885e-05, "loss": 2.6732, "mean_token_accuracy": 0.4068965494632721, "step": 86265 }, { "epoch": 0.08689206025939702, "grad_norm": 10.065177362608695, "learning_rate": 4.983277141535052e-05, "loss": 2.7294, "mean_token_accuracy": 0.3931034505367279, "step": 86270 }, { "epoch": 0.0868970963125012, "grad_norm": 10.896072485267833, "learning_rate": 4.9832725811706086e-05, "loss": 2.1008, "mean_token_accuracy": 0.4862068951129913, "step": 86275 }, { "epoch": 0.08690213236560537, "grad_norm": 10.444235410185499, "learning_rate": 4.983268020186759e-05, "loss": 2.4354, "mean_token_accuracy": 0.42885662317276, "step": 86280 }, { "epoch": 0.08690716841870955, "grad_norm": 11.4526631570211, "learning_rate": 4.983263458583503e-05, "loss": 2.4047, "mean_token_accuracy": 0.37586206793785093, "step": 86285 }, { "epoch": 0.08691220447181372, "grad_norm": 10.529449793034436, "learning_rate": 4.983258896360844e-05, "loss": 1.9105, "mean_token_accuracy": 0.48275862336158754, "step": 86290 }, { "epoch": 0.0869172405249179, "grad_norm": 10.809918082435273, "learning_rate": 4.983254333518782e-05, "loss": 2.0751, "mean_token_accuracy": 0.48814276456832884, "step": 86295 }, { "epoch": 0.08692227657802205, "grad_norm": 21.292560433174188, "learning_rate": 4.9832497700573185e-05, "loss": 2.5043, "mean_token_accuracy": 0.41724138259887694, "step": 86300 }, { "epoch": 0.08692731263112623, "grad_norm": 12.816673339828267, "learning_rate": 4.983245205976455e-05, "loss": 2.9364, "mean_token_accuracy": 0.3620689660310745, "step": 86305 }, { "epoch": 0.0869323486842304, "grad_norm": 11.833869941073408, "learning_rate": 4.9832406412761924e-05, "loss": 2.9552, "mean_token_accuracy": 0.36660616397857665, "step": 86310 }, { "epoch": 0.08693738473733457, "grad_norm": 13.609791094392282, "learning_rate": 4.983236075956532e-05, "loss": 2.6351, "mean_token_accuracy": 0.39310343861579894, "step": 86315 }, { "epoch": 0.08694242079043875, "grad_norm": 10.176924012645953, "learning_rate": 4.9832315100174767e-05, "loss": 2.5671, "mean_token_accuracy": 0.3827586233615875, "step": 86320 }, { "epoch": 0.08694745684354292, "grad_norm": 12.350553303582004, "learning_rate": 4.983226943459025e-05, "loss": 2.6646, "mean_token_accuracy": 0.39836661219596864, "step": 86325 }, { "epoch": 0.0869524928966471, "grad_norm": 16.064885238913867, "learning_rate": 4.9832223762811794e-05, "loss": 2.2893, "mean_token_accuracy": 0.540350866317749, "step": 86330 }, { "epoch": 0.08695752894975127, "grad_norm": 10.63716420320088, "learning_rate": 4.9832178084839417e-05, "loss": 2.2531, "mean_token_accuracy": 0.3896551728248596, "step": 86335 }, { "epoch": 0.08696256500285544, "grad_norm": 10.309520842853864, "learning_rate": 4.983213240067312e-05, "loss": 2.1544, "mean_token_accuracy": 0.4724137902259827, "step": 86340 }, { "epoch": 0.08696760105595962, "grad_norm": 11.535638647414885, "learning_rate": 4.983208671031293e-05, "loss": 2.3552, "mean_token_accuracy": 0.441379314661026, "step": 86345 }, { "epoch": 0.08697263710906379, "grad_norm": 12.796032050910778, "learning_rate": 4.983204101375885e-05, "loss": 2.0742, "mean_token_accuracy": 0.4758620738983154, "step": 86350 }, { "epoch": 0.08697767316216796, "grad_norm": 10.353262405733663, "learning_rate": 4.9831995311010894e-05, "loss": 2.0933, "mean_token_accuracy": 0.48965518474578856, "step": 86355 }, { "epoch": 0.08698270921527214, "grad_norm": 11.734730262023911, "learning_rate": 4.983194960206908e-05, "loss": 2.5185, "mean_token_accuracy": 0.4188747763633728, "step": 86360 }, { "epoch": 0.08698774526837631, "grad_norm": 14.351259118278191, "learning_rate": 4.983190388693342e-05, "loss": 2.9389, "mean_token_accuracy": 0.3655172407627106, "step": 86365 }, { "epoch": 0.08699278132148047, "grad_norm": 24.74334912592318, "learning_rate": 4.983185816560391e-05, "loss": 2.5021, "mean_token_accuracy": 0.4034482717514038, "step": 86370 }, { "epoch": 0.08699781737458465, "grad_norm": 10.264127803465115, "learning_rate": 4.983181243808058e-05, "loss": 1.9235, "mean_token_accuracy": 0.5119458079338074, "step": 86375 }, { "epoch": 0.08700285342768882, "grad_norm": 11.75127999648214, "learning_rate": 4.983176670436344e-05, "loss": 2.3934, "mean_token_accuracy": 0.45686631202697753, "step": 86380 }, { "epoch": 0.087007889480793, "grad_norm": 14.235735181515555, "learning_rate": 4.9831720964452504e-05, "loss": 2.5501, "mean_token_accuracy": 0.43448275327682495, "step": 86385 }, { "epoch": 0.08701292553389717, "grad_norm": 11.051051473869903, "learning_rate": 4.9831675218347785e-05, "loss": 2.5402, "mean_token_accuracy": 0.40689654350280763, "step": 86390 }, { "epoch": 0.08701796158700134, "grad_norm": 13.455568910844482, "learning_rate": 4.983162946604929e-05, "loss": 2.3657, "mean_token_accuracy": 0.42758620381355283, "step": 86395 }, { "epoch": 0.08702299764010551, "grad_norm": 9.001625401058353, "learning_rate": 4.9831583707557025e-05, "loss": 2.2946, "mean_token_accuracy": 0.4620689690113068, "step": 86400 }, { "epoch": 0.08702803369320969, "grad_norm": 10.509782201886818, "learning_rate": 4.983153794287102e-05, "loss": 2.681, "mean_token_accuracy": 0.4034482717514038, "step": 86405 }, { "epoch": 0.08703306974631386, "grad_norm": 11.021595009635545, "learning_rate": 4.983149217199128e-05, "loss": 2.4339, "mean_token_accuracy": 0.4551724135875702, "step": 86410 }, { "epoch": 0.08703810579941804, "grad_norm": 12.984169552387932, "learning_rate": 4.9831446394917813e-05, "loss": 2.5304, "mean_token_accuracy": 0.4206896543502808, "step": 86415 }, { "epoch": 0.08704314185252221, "grad_norm": 9.813264549211276, "learning_rate": 4.983140061165064e-05, "loss": 2.2901, "mean_token_accuracy": 0.4310344815254211, "step": 86420 }, { "epoch": 0.08704817790562638, "grad_norm": 9.218139689167623, "learning_rate": 4.9831354822189775e-05, "loss": 2.4292, "mean_token_accuracy": 0.38275861740112305, "step": 86425 }, { "epoch": 0.08705321395873056, "grad_norm": 9.699046529322704, "learning_rate": 4.9831309026535214e-05, "loss": 2.5026, "mean_token_accuracy": 0.46412582099437716, "step": 86430 }, { "epoch": 0.08705825001183473, "grad_norm": 9.998793929431956, "learning_rate": 4.983126322468699e-05, "loss": 2.123, "mean_token_accuracy": 0.4705989122390747, "step": 86435 }, { "epoch": 0.08706328606493889, "grad_norm": 12.300134160564687, "learning_rate": 4.98312174166451e-05, "loss": 2.4133, "mean_token_accuracy": 0.4448275864124298, "step": 86440 }, { "epoch": 0.08706832211804306, "grad_norm": 11.727302607047914, "learning_rate": 4.983117160240957e-05, "loss": 2.1086, "mean_token_accuracy": 0.47241380214691164, "step": 86445 }, { "epoch": 0.08707335817114724, "grad_norm": 11.382489306883448, "learning_rate": 4.9831125781980396e-05, "loss": 2.3407, "mean_token_accuracy": 0.46896552443504336, "step": 86450 }, { "epoch": 0.08707839422425141, "grad_norm": 9.486189437630772, "learning_rate": 4.983107995535761e-05, "loss": 2.3655, "mean_token_accuracy": 0.4068965554237366, "step": 86455 }, { "epoch": 0.08708343027735559, "grad_norm": 14.725889154213995, "learning_rate": 4.9831034122541216e-05, "loss": 2.7955, "mean_token_accuracy": 0.4000000059604645, "step": 86460 }, { "epoch": 0.08708846633045976, "grad_norm": 11.08604586911833, "learning_rate": 4.9830988283531226e-05, "loss": 2.8231, "mean_token_accuracy": 0.37586206793785093, "step": 86465 }, { "epoch": 0.08709350238356393, "grad_norm": 13.048654158113271, "learning_rate": 4.983094243832765e-05, "loss": 2.6801, "mean_token_accuracy": 0.3896551728248596, "step": 86470 }, { "epoch": 0.08709853843666811, "grad_norm": 11.60035183850772, "learning_rate": 4.983089658693051e-05, "loss": 2.4768, "mean_token_accuracy": 0.4103448212146759, "step": 86475 }, { "epoch": 0.08710357448977228, "grad_norm": 10.128145077852396, "learning_rate": 4.98308507293398e-05, "loss": 2.214, "mean_token_accuracy": 0.4258318245410919, "step": 86480 }, { "epoch": 0.08710861054287646, "grad_norm": 12.06838822051253, "learning_rate": 4.983080486555556e-05, "loss": 2.8142, "mean_token_accuracy": 0.3896551728248596, "step": 86485 }, { "epoch": 0.08711364659598063, "grad_norm": 11.006822229917786, "learning_rate": 4.9830758995577775e-05, "loss": 2.8015, "mean_token_accuracy": 0.3896551787853241, "step": 86490 }, { "epoch": 0.0871186826490848, "grad_norm": 10.962877414412725, "learning_rate": 4.983071311940648e-05, "loss": 2.6848, "mean_token_accuracy": 0.403448274731636, "step": 86495 }, { "epoch": 0.08712371870218898, "grad_norm": 10.034322644338795, "learning_rate": 4.9830667237041676e-05, "loss": 2.5382, "mean_token_accuracy": 0.4379310429096222, "step": 86500 }, { "epoch": 0.08712875475529315, "grad_norm": 13.701506922557952, "learning_rate": 4.9830621348483376e-05, "loss": 2.2847, "mean_token_accuracy": 0.4620689630508423, "step": 86505 }, { "epoch": 0.08713379080839731, "grad_norm": 10.222796104743601, "learning_rate": 4.983057545373159e-05, "loss": 2.5303, "mean_token_accuracy": 0.3896551638841629, "step": 86510 }, { "epoch": 0.08713882686150148, "grad_norm": 13.420470133741237, "learning_rate": 4.983052955278634e-05, "loss": 2.1636, "mean_token_accuracy": 0.43793103098869324, "step": 86515 }, { "epoch": 0.08714386291460566, "grad_norm": 13.057280724532147, "learning_rate": 4.983048364564764e-05, "loss": 2.4996, "mean_token_accuracy": 0.43968542814254763, "step": 86520 }, { "epoch": 0.08714889896770983, "grad_norm": 12.684822732156206, "learning_rate": 4.983043773231549e-05, "loss": 2.5853, "mean_token_accuracy": 0.4034482777118683, "step": 86525 }, { "epoch": 0.087153935020814, "grad_norm": 10.154302671341128, "learning_rate": 4.983039181278992e-05, "loss": 2.5854, "mean_token_accuracy": 0.3915305495262146, "step": 86530 }, { "epoch": 0.08715897107391818, "grad_norm": 12.384619706703251, "learning_rate": 4.983034588707092e-05, "loss": 2.8389, "mean_token_accuracy": 0.4241379380226135, "step": 86535 }, { "epoch": 0.08716400712702235, "grad_norm": 10.433217851793765, "learning_rate": 4.983029995515852e-05, "loss": 2.579, "mean_token_accuracy": 0.37931033968925476, "step": 86540 }, { "epoch": 0.08716904318012653, "grad_norm": 10.434153679564936, "learning_rate": 4.983025401705273e-05, "loss": 1.9213, "mean_token_accuracy": 0.5034482777118683, "step": 86545 }, { "epoch": 0.0871740792332307, "grad_norm": 9.216721062422392, "learning_rate": 4.983020807275356e-05, "loss": 2.1524, "mean_token_accuracy": 0.4724137902259827, "step": 86550 }, { "epoch": 0.08717911528633487, "grad_norm": 9.728362939193179, "learning_rate": 4.9830162122261026e-05, "loss": 2.4813, "mean_token_accuracy": 0.4103448212146759, "step": 86555 }, { "epoch": 0.08718415133943905, "grad_norm": 10.508825678536265, "learning_rate": 4.983011616557513e-05, "loss": 2.4365, "mean_token_accuracy": 0.4517241358757019, "step": 86560 }, { "epoch": 0.08718918739254322, "grad_norm": 10.507069489212183, "learning_rate": 4.98300702026959e-05, "loss": 2.1034, "mean_token_accuracy": 0.458620685338974, "step": 86565 }, { "epoch": 0.0871942234456474, "grad_norm": 10.097421939080888, "learning_rate": 4.9830024233623337e-05, "loss": 2.2706, "mean_token_accuracy": 0.4068965554237366, "step": 86570 }, { "epoch": 0.08719925949875157, "grad_norm": 12.001618999410463, "learning_rate": 4.9829978258357465e-05, "loss": 2.6023, "mean_token_accuracy": 0.3896551787853241, "step": 86575 }, { "epoch": 0.08720429555185573, "grad_norm": 13.415222099877685, "learning_rate": 4.9829932276898285e-05, "loss": 2.5043, "mean_token_accuracy": 0.41034482717514037, "step": 86580 }, { "epoch": 0.0872093316049599, "grad_norm": 10.559601364458098, "learning_rate": 4.982988628924581e-05, "loss": 2.1764, "mean_token_accuracy": 0.46896551847457885, "step": 86585 }, { "epoch": 0.08721436765806408, "grad_norm": 8.924331869007865, "learning_rate": 4.982984029540007e-05, "loss": 1.9151, "mean_token_accuracy": 0.4778584361076355, "step": 86590 }, { "epoch": 0.08721940371116825, "grad_norm": 10.365479045967056, "learning_rate": 4.982979429536105e-05, "loss": 2.6874, "mean_token_accuracy": 0.41034482717514037, "step": 86595 }, { "epoch": 0.08722443976427242, "grad_norm": 9.124149321907941, "learning_rate": 4.9829748289128794e-05, "loss": 2.8934, "mean_token_accuracy": 0.36551723480224607, "step": 86600 }, { "epoch": 0.0872294758173766, "grad_norm": 13.425100270446846, "learning_rate": 4.9829702276703296e-05, "loss": 2.6568, "mean_token_accuracy": 0.42758620381355283, "step": 86605 }, { "epoch": 0.08723451187048077, "grad_norm": 9.877935617474126, "learning_rate": 4.9829656258084564e-05, "loss": 2.4363, "mean_token_accuracy": 0.42758620381355283, "step": 86610 }, { "epoch": 0.08723954792358495, "grad_norm": 11.059372673197064, "learning_rate": 4.9829610233272625e-05, "loss": 2.4101, "mean_token_accuracy": 0.3931034505367279, "step": 86615 }, { "epoch": 0.08724458397668912, "grad_norm": 10.58638706795638, "learning_rate": 4.9829564202267486e-05, "loss": 2.5312, "mean_token_accuracy": 0.37586206793785093, "step": 86620 }, { "epoch": 0.08724962002979329, "grad_norm": 10.919830328714982, "learning_rate": 4.9829518165069154e-05, "loss": 2.2479, "mean_token_accuracy": 0.41379310488700866, "step": 86625 }, { "epoch": 0.08725465608289747, "grad_norm": 9.771688996947136, "learning_rate": 4.982947212167765e-05, "loss": 2.4804, "mean_token_accuracy": 0.42413793206214906, "step": 86630 }, { "epoch": 0.08725969213600164, "grad_norm": 9.404738746355573, "learning_rate": 4.9829426072092985e-05, "loss": 2.528, "mean_token_accuracy": 0.4172413766384125, "step": 86635 }, { "epoch": 0.08726472818910581, "grad_norm": 9.875264115181432, "learning_rate": 4.982938001631517e-05, "loss": 2.5382, "mean_token_accuracy": 0.40344828367233276, "step": 86640 }, { "epoch": 0.08726976424220999, "grad_norm": 9.505666994960123, "learning_rate": 4.9829333954344206e-05, "loss": 2.3996, "mean_token_accuracy": 0.4724137902259827, "step": 86645 }, { "epoch": 0.08727480029531415, "grad_norm": 9.31943869703653, "learning_rate": 4.982928788618014e-05, "loss": 2.5269, "mean_token_accuracy": 0.4379310369491577, "step": 86650 }, { "epoch": 0.08727983634841832, "grad_norm": 11.538320658559636, "learning_rate": 4.9829241811822946e-05, "loss": 2.4703, "mean_token_accuracy": 0.4103448331356049, "step": 86655 }, { "epoch": 0.0872848724015225, "grad_norm": 11.151273360181728, "learning_rate": 4.982919573127266e-05, "loss": 2.4613, "mean_token_accuracy": 0.44482759237289426, "step": 86660 }, { "epoch": 0.08728990845462667, "grad_norm": 10.596943778836422, "learning_rate": 4.9829149644529285e-05, "loss": 2.5471, "mean_token_accuracy": 0.4206896543502808, "step": 86665 }, { "epoch": 0.08729494450773084, "grad_norm": 10.969281037038321, "learning_rate": 4.9829103551592845e-05, "loss": 2.2822, "mean_token_accuracy": 0.458620685338974, "step": 86670 }, { "epoch": 0.08729998056083502, "grad_norm": 12.733654876446808, "learning_rate": 4.9829057452463326e-05, "loss": 2.1458, "mean_token_accuracy": 0.45396249890327456, "step": 86675 }, { "epoch": 0.08730501661393919, "grad_norm": 10.739116079069797, "learning_rate": 4.9829011347140784e-05, "loss": 2.5958, "mean_token_accuracy": 0.37241379022598264, "step": 86680 }, { "epoch": 0.08731005266704336, "grad_norm": 11.006318469723423, "learning_rate": 4.982896523562519e-05, "loss": 2.1154, "mean_token_accuracy": 0.46551724672317507, "step": 86685 }, { "epoch": 0.08731508872014754, "grad_norm": 10.177031584406697, "learning_rate": 4.982891911791658e-05, "loss": 2.5977, "mean_token_accuracy": 0.3793103516101837, "step": 86690 }, { "epoch": 0.08732012477325171, "grad_norm": 11.789513173750395, "learning_rate": 4.9828872994014966e-05, "loss": 2.4298, "mean_token_accuracy": 0.4137930929660797, "step": 86695 }, { "epoch": 0.08732516082635589, "grad_norm": 11.145334680598076, "learning_rate": 4.982882686392035e-05, "loss": 2.5246, "mean_token_accuracy": 0.43793103098869324, "step": 86700 }, { "epoch": 0.08733019687946006, "grad_norm": 9.739285686700763, "learning_rate": 4.982878072763275e-05, "loss": 2.7633, "mean_token_accuracy": 0.38965516686439516, "step": 86705 }, { "epoch": 0.08733523293256423, "grad_norm": 11.714416685611136, "learning_rate": 4.982873458515218e-05, "loss": 2.3571, "mean_token_accuracy": 0.4586206912994385, "step": 86710 }, { "epoch": 0.0873402689856684, "grad_norm": 10.953880668972772, "learning_rate": 4.9828688436478646e-05, "loss": 2.7275, "mean_token_accuracy": 0.4366606116294861, "step": 86715 }, { "epoch": 0.08734530503877257, "grad_norm": 9.919426386669343, "learning_rate": 4.9828642281612184e-05, "loss": 2.2293, "mean_token_accuracy": 0.4517241418361664, "step": 86720 }, { "epoch": 0.08735034109187674, "grad_norm": 13.089126783304165, "learning_rate": 4.982859612055277e-05, "loss": 2.384, "mean_token_accuracy": 0.42758620381355283, "step": 86725 }, { "epoch": 0.08735537714498091, "grad_norm": 11.603534112298055, "learning_rate": 4.982854995330045e-05, "loss": 2.4878, "mean_token_accuracy": 0.39655172228813174, "step": 86730 }, { "epoch": 0.08736041319808509, "grad_norm": 9.309726994722043, "learning_rate": 4.982850377985522e-05, "loss": 2.2412, "mean_token_accuracy": 0.48275862336158754, "step": 86735 }, { "epoch": 0.08736544925118926, "grad_norm": 8.149088414169967, "learning_rate": 4.9828457600217096e-05, "loss": 2.7559, "mean_token_accuracy": 0.39310343861579894, "step": 86740 }, { "epoch": 0.08737048530429344, "grad_norm": 11.883112089343422, "learning_rate": 4.982841141438609e-05, "loss": 2.2867, "mean_token_accuracy": 0.4689655125141144, "step": 86745 }, { "epoch": 0.08737552135739761, "grad_norm": 12.74491885443032, "learning_rate": 4.982836522236222e-05, "loss": 2.5502, "mean_token_accuracy": 0.4206896543502808, "step": 86750 }, { "epoch": 0.08738055741050178, "grad_norm": 11.418983817165802, "learning_rate": 4.9828319024145485e-05, "loss": 2.4003, "mean_token_accuracy": 0.4034482717514038, "step": 86755 }, { "epoch": 0.08738559346360596, "grad_norm": 10.973314148317188, "learning_rate": 4.982827281973592e-05, "loss": 2.3717, "mean_token_accuracy": 0.4103448212146759, "step": 86760 }, { "epoch": 0.08739062951671013, "grad_norm": 22.53830700427761, "learning_rate": 4.982822660913351e-05, "loss": 2.2425, "mean_token_accuracy": 0.4551724135875702, "step": 86765 }, { "epoch": 0.0873956655698143, "grad_norm": 12.04259844614838, "learning_rate": 4.982818039233829e-05, "loss": 2.6112, "mean_token_accuracy": 0.4, "step": 86770 }, { "epoch": 0.08740070162291848, "grad_norm": 10.039397257311627, "learning_rate": 4.9828134169350277e-05, "loss": 2.6759, "mean_token_accuracy": 0.3793103456497192, "step": 86775 }, { "epoch": 0.08740573767602265, "grad_norm": 15.017689868611921, "learning_rate": 4.982808794016946e-05, "loss": 2.7856, "mean_token_accuracy": 0.3620689630508423, "step": 86780 }, { "epoch": 0.08741077372912683, "grad_norm": 11.048605932412197, "learning_rate": 4.9828041704795865e-05, "loss": 2.4491, "mean_token_accuracy": 0.4206896543502808, "step": 86785 }, { "epoch": 0.08741580978223099, "grad_norm": 10.579281017008608, "learning_rate": 4.98279954632295e-05, "loss": 2.241, "mean_token_accuracy": 0.43448275327682495, "step": 86790 }, { "epoch": 0.08742084583533516, "grad_norm": 10.712214844866077, "learning_rate": 4.9827949215470394e-05, "loss": 2.1358, "mean_token_accuracy": 0.4206896543502808, "step": 86795 }, { "epoch": 0.08742588188843933, "grad_norm": 12.529640105942413, "learning_rate": 4.982790296151855e-05, "loss": 2.5098, "mean_token_accuracy": 0.36896551251411436, "step": 86800 }, { "epoch": 0.0874309179415435, "grad_norm": 9.999315738669619, "learning_rate": 4.982785670137397e-05, "loss": 2.2154, "mean_token_accuracy": 0.42758620977401735, "step": 86805 }, { "epoch": 0.08743595399464768, "grad_norm": 9.88797328963264, "learning_rate": 4.9827810435036676e-05, "loss": 2.6249, "mean_token_accuracy": 0.37241379022598264, "step": 86810 }, { "epoch": 0.08744099004775185, "grad_norm": 10.783193471154624, "learning_rate": 4.982776416250668e-05, "loss": 2.4282, "mean_token_accuracy": 0.4103448212146759, "step": 86815 }, { "epoch": 0.08744602610085603, "grad_norm": 16.876841926864223, "learning_rate": 4.9827717883783996e-05, "loss": 3.1343, "mean_token_accuracy": 0.3137931048870087, "step": 86820 }, { "epoch": 0.0874510621539602, "grad_norm": 9.761759132626144, "learning_rate": 4.982767159886864e-05, "loss": 2.0192, "mean_token_accuracy": 0.441379314661026, "step": 86825 }, { "epoch": 0.08745609820706438, "grad_norm": 10.196037604958772, "learning_rate": 4.982762530776062e-05, "loss": 2.9055, "mean_token_accuracy": 0.37931033968925476, "step": 86830 }, { "epoch": 0.08746113426016855, "grad_norm": 13.017654115123996, "learning_rate": 4.982757901045995e-05, "loss": 2.7014, "mean_token_accuracy": 0.39310344457626345, "step": 86835 }, { "epoch": 0.08746617031327272, "grad_norm": 11.64475351660073, "learning_rate": 4.9827532706966635e-05, "loss": 2.4145, "mean_token_accuracy": 0.3896551698446274, "step": 86840 }, { "epoch": 0.0874712063663769, "grad_norm": 11.971806701511674, "learning_rate": 4.9827486397280704e-05, "loss": 2.3643, "mean_token_accuracy": 0.3931034505367279, "step": 86845 }, { "epoch": 0.08747624241948107, "grad_norm": 9.943546299129222, "learning_rate": 4.982744008140216e-05, "loss": 2.7105, "mean_token_accuracy": 0.37586207389831544, "step": 86850 }, { "epoch": 0.08748127847258524, "grad_norm": 9.24082139907043, "learning_rate": 4.982739375933102e-05, "loss": 2.3129, "mean_token_accuracy": 0.3862069010734558, "step": 86855 }, { "epoch": 0.0874863145256894, "grad_norm": 9.391245096358105, "learning_rate": 4.982734743106728e-05, "loss": 2.0059, "mean_token_accuracy": 0.5018753707408905, "step": 86860 }, { "epoch": 0.08749135057879358, "grad_norm": 8.907376340652393, "learning_rate": 4.9827301096610985e-05, "loss": 1.8845, "mean_token_accuracy": 0.501996374130249, "step": 86865 }, { "epoch": 0.08749638663189775, "grad_norm": 11.261764231638274, "learning_rate": 4.9827254755962124e-05, "loss": 2.7146, "mean_token_accuracy": 0.4206896543502808, "step": 86870 }, { "epoch": 0.08750142268500193, "grad_norm": 12.998363865034651, "learning_rate": 4.9827208409120706e-05, "loss": 2.6097, "mean_token_accuracy": 0.3862069010734558, "step": 86875 }, { "epoch": 0.0875064587381061, "grad_norm": 11.121655010994333, "learning_rate": 4.982716205608677e-05, "loss": 2.6102, "mean_token_accuracy": 0.3896551728248596, "step": 86880 }, { "epoch": 0.08751149479121027, "grad_norm": 14.46171780760077, "learning_rate": 4.98271156968603e-05, "loss": 2.6401, "mean_token_accuracy": 0.38620689511299133, "step": 86885 }, { "epoch": 0.08751653084431445, "grad_norm": 12.159818863566842, "learning_rate": 4.982706933144133e-05, "loss": 2.6378, "mean_token_accuracy": 0.38275861740112305, "step": 86890 }, { "epoch": 0.08752156689741862, "grad_norm": 9.713863270946488, "learning_rate": 4.9827022959829854e-05, "loss": 2.1876, "mean_token_accuracy": 0.4448275864124298, "step": 86895 }, { "epoch": 0.0875266029505228, "grad_norm": 12.78569760820009, "learning_rate": 4.98269765820259e-05, "loss": 2.3564, "mean_token_accuracy": 0.441379314661026, "step": 86900 }, { "epoch": 0.08753163900362697, "grad_norm": 11.14834387010444, "learning_rate": 4.982693019802948e-05, "loss": 2.5375, "mean_token_accuracy": 0.43103448748588563, "step": 86905 }, { "epoch": 0.08753667505673114, "grad_norm": 9.31341223847559, "learning_rate": 4.982688380784059e-05, "loss": 2.3245, "mean_token_accuracy": 0.4344827651977539, "step": 86910 }, { "epoch": 0.08754171110983532, "grad_norm": 9.94550920296541, "learning_rate": 4.982683741145927e-05, "loss": 2.4138, "mean_token_accuracy": 0.4620689630508423, "step": 86915 }, { "epoch": 0.08754674716293949, "grad_norm": 10.891548127398309, "learning_rate": 4.98267910088855e-05, "loss": 2.6362, "mean_token_accuracy": 0.38620689511299133, "step": 86920 }, { "epoch": 0.08755178321604366, "grad_norm": 9.971053290444187, "learning_rate": 4.982674460011933e-05, "loss": 2.466, "mean_token_accuracy": 0.42909860610961914, "step": 86925 }, { "epoch": 0.08755681926914782, "grad_norm": 11.701906139022288, "learning_rate": 4.982669818516075e-05, "loss": 2.4669, "mean_token_accuracy": 0.4448275864124298, "step": 86930 }, { "epoch": 0.087561855322252, "grad_norm": 13.57037109092577, "learning_rate": 4.982665176400977e-05, "loss": 2.6491, "mean_token_accuracy": 0.41724138259887694, "step": 86935 }, { "epoch": 0.08756689137535617, "grad_norm": 10.595351682605596, "learning_rate": 4.982660533666641e-05, "loss": 2.1679, "mean_token_accuracy": 0.4379310369491577, "step": 86940 }, { "epoch": 0.08757192742846034, "grad_norm": 13.863960425673907, "learning_rate": 4.9826558903130686e-05, "loss": 2.6916, "mean_token_accuracy": 0.41379311084747317, "step": 86945 }, { "epoch": 0.08757696348156452, "grad_norm": 10.634173686977757, "learning_rate": 4.9826512463402606e-05, "loss": 2.2498, "mean_token_accuracy": 0.458620685338974, "step": 86950 }, { "epoch": 0.08758199953466869, "grad_norm": 15.582914926598583, "learning_rate": 4.9826466017482186e-05, "loss": 2.7264, "mean_token_accuracy": 0.4103448331356049, "step": 86955 }, { "epoch": 0.08758703558777287, "grad_norm": 12.316523176198718, "learning_rate": 4.982641956536944e-05, "loss": 2.6294, "mean_token_accuracy": 0.41724138259887694, "step": 86960 }, { "epoch": 0.08759207164087704, "grad_norm": 9.060379955347333, "learning_rate": 4.9826373107064376e-05, "loss": 2.393, "mean_token_accuracy": 0.4413793087005615, "step": 86965 }, { "epoch": 0.08759710769398121, "grad_norm": 10.285193891329168, "learning_rate": 4.982632664256702e-05, "loss": 2.3874, "mean_token_accuracy": 0.41034482717514037, "step": 86970 }, { "epoch": 0.08760214374708539, "grad_norm": 10.865535924474674, "learning_rate": 4.982628017187736e-05, "loss": 2.6657, "mean_token_accuracy": 0.4, "step": 86975 }, { "epoch": 0.08760717980018956, "grad_norm": 11.527270722616372, "learning_rate": 4.982623369499542e-05, "loss": 2.6692, "mean_token_accuracy": 0.34827585220336915, "step": 86980 }, { "epoch": 0.08761221585329373, "grad_norm": 14.159475648746131, "learning_rate": 4.9826187211921225e-05, "loss": 2.5246, "mean_token_accuracy": 0.3862068891525269, "step": 86985 }, { "epoch": 0.08761725190639791, "grad_norm": 10.677494534097285, "learning_rate": 4.9826140722654775e-05, "loss": 2.2707, "mean_token_accuracy": 0.45317603945732116, "step": 86990 }, { "epoch": 0.08762228795950208, "grad_norm": 9.682594111536721, "learning_rate": 4.982609422719609e-05, "loss": 2.3598, "mean_token_accuracy": 0.4137930929660797, "step": 86995 }, { "epoch": 0.08762732401260624, "grad_norm": 11.314496390733074, "learning_rate": 4.982604772554517e-05, "loss": 2.5376, "mean_token_accuracy": 0.3965517282485962, "step": 87000 }, { "epoch": 0.08763236006571042, "grad_norm": 10.95452738267262, "learning_rate": 4.982600121770205e-05, "loss": 2.7097, "mean_token_accuracy": 0.3896551728248596, "step": 87005 }, { "epoch": 0.08763739611881459, "grad_norm": 9.182224645586281, "learning_rate": 4.982595470366672e-05, "loss": 2.3037, "mean_token_accuracy": 0.4379310369491577, "step": 87010 }, { "epoch": 0.08764243217191876, "grad_norm": 11.642238156512468, "learning_rate": 4.982590818343921e-05, "loss": 2.4352, "mean_token_accuracy": 0.3931034505367279, "step": 87015 }, { "epoch": 0.08764746822502294, "grad_norm": 11.940889618409653, "learning_rate": 4.982586165701952e-05, "loss": 2.7302, "mean_token_accuracy": 0.41034482717514037, "step": 87020 }, { "epoch": 0.08765250427812711, "grad_norm": 11.44544664641868, "learning_rate": 4.9825815124407674e-05, "loss": 2.3605, "mean_token_accuracy": 0.4068965554237366, "step": 87025 }, { "epoch": 0.08765754033123128, "grad_norm": 10.943955321660269, "learning_rate": 4.982576858560368e-05, "loss": 2.5892, "mean_token_accuracy": 0.37586206793785093, "step": 87030 }, { "epoch": 0.08766257638433546, "grad_norm": 13.232842492203126, "learning_rate": 4.9825722040607545e-05, "loss": 2.3656, "mean_token_accuracy": 0.4034482717514038, "step": 87035 }, { "epoch": 0.08766761243743963, "grad_norm": 12.13484111208282, "learning_rate": 4.9825675489419294e-05, "loss": 2.8652, "mean_token_accuracy": 0.3620689630508423, "step": 87040 }, { "epoch": 0.0876726484905438, "grad_norm": 11.897758290369051, "learning_rate": 4.9825628932038934e-05, "loss": 2.5452, "mean_token_accuracy": 0.4344827651977539, "step": 87045 }, { "epoch": 0.08767768454364798, "grad_norm": 22.21820611773179, "learning_rate": 4.982558236846648e-05, "loss": 2.5833, "mean_token_accuracy": 0.4068965554237366, "step": 87050 }, { "epoch": 0.08768272059675215, "grad_norm": 10.818350400260256, "learning_rate": 4.982553579870193e-05, "loss": 2.6378, "mean_token_accuracy": 0.37241379022598264, "step": 87055 }, { "epoch": 0.08768775664985633, "grad_norm": 9.882879615463496, "learning_rate": 4.982548922274532e-05, "loss": 2.4579, "mean_token_accuracy": 0.4379310429096222, "step": 87060 }, { "epoch": 0.0876927927029605, "grad_norm": 12.040847120836338, "learning_rate": 4.982544264059665e-05, "loss": 2.3141, "mean_token_accuracy": 0.4034482777118683, "step": 87065 }, { "epoch": 0.08769782875606466, "grad_norm": 8.75294841010402, "learning_rate": 4.982539605225594e-05, "loss": 2.2632, "mean_token_accuracy": 0.4344827592372894, "step": 87070 }, { "epoch": 0.08770286480916883, "grad_norm": 10.143227851954201, "learning_rate": 4.982534945772319e-05, "loss": 2.1208, "mean_token_accuracy": 0.44482758045196535, "step": 87075 }, { "epoch": 0.08770790086227301, "grad_norm": 11.00323708617181, "learning_rate": 4.9825302856998416e-05, "loss": 2.6254, "mean_token_accuracy": 0.38275861740112305, "step": 87080 }, { "epoch": 0.08771293691537718, "grad_norm": 12.954254014768114, "learning_rate": 4.9825256250081645e-05, "loss": 2.4369, "mean_token_accuracy": 0.42256503403186796, "step": 87085 }, { "epoch": 0.08771797296848136, "grad_norm": 10.665546905339701, "learning_rate": 4.982520963697288e-05, "loss": 2.4251, "mean_token_accuracy": 0.37931033968925476, "step": 87090 }, { "epoch": 0.08772300902158553, "grad_norm": 11.867585382526197, "learning_rate": 4.982516301767214e-05, "loss": 2.7233, "mean_token_accuracy": 0.3774954676628113, "step": 87095 }, { "epoch": 0.0877280450746897, "grad_norm": 10.937442238322449, "learning_rate": 4.982511639217942e-05, "loss": 2.6178, "mean_token_accuracy": 0.42068964838981626, "step": 87100 }, { "epoch": 0.08773308112779388, "grad_norm": 13.142302888107096, "learning_rate": 4.9825069760494756e-05, "loss": 2.4144, "mean_token_accuracy": 0.3793103456497192, "step": 87105 }, { "epoch": 0.08773811718089805, "grad_norm": 9.594812700444695, "learning_rate": 4.982502312261815e-05, "loss": 2.8426, "mean_token_accuracy": 0.4034482717514038, "step": 87110 }, { "epoch": 0.08774315323400222, "grad_norm": 10.295609570047027, "learning_rate": 4.9824976478549606e-05, "loss": 2.3922, "mean_token_accuracy": 0.4206896543502808, "step": 87115 }, { "epoch": 0.0877481892871064, "grad_norm": 11.520112799592354, "learning_rate": 4.9824929828289156e-05, "loss": 2.4154, "mean_token_accuracy": 0.4413793087005615, "step": 87120 }, { "epoch": 0.08775322534021057, "grad_norm": 10.48388389337717, "learning_rate": 4.98248831718368e-05, "loss": 2.3563, "mean_token_accuracy": 0.42758620381355283, "step": 87125 }, { "epoch": 0.08775826139331475, "grad_norm": 10.266186740548447, "learning_rate": 4.982483650919256e-05, "loss": 2.3745, "mean_token_accuracy": 0.4620689570903778, "step": 87130 }, { "epoch": 0.08776329744641892, "grad_norm": 11.86086897499934, "learning_rate": 4.982478984035644e-05, "loss": 2.2111, "mean_token_accuracy": 0.4241379380226135, "step": 87135 }, { "epoch": 0.08776833349952308, "grad_norm": 10.242304814505573, "learning_rate": 4.9824743165328446e-05, "loss": 2.5535, "mean_token_accuracy": 0.4379310369491577, "step": 87140 }, { "epoch": 0.08777336955262725, "grad_norm": 9.913777131119105, "learning_rate": 4.982469648410861e-05, "loss": 2.3605, "mean_token_accuracy": 0.42068964540958403, "step": 87145 }, { "epoch": 0.08777840560573143, "grad_norm": 10.644775365379722, "learning_rate": 4.9824649796696935e-05, "loss": 2.5673, "mean_token_accuracy": 0.4344827592372894, "step": 87150 }, { "epoch": 0.0877834416588356, "grad_norm": 16.418794025065058, "learning_rate": 4.982460310309344e-05, "loss": 2.2685, "mean_token_accuracy": 0.458620685338974, "step": 87155 }, { "epoch": 0.08778847771193977, "grad_norm": 10.779892780686236, "learning_rate": 4.982455640329813e-05, "loss": 3.0532, "mean_token_accuracy": 0.3517241418361664, "step": 87160 }, { "epoch": 0.08779351376504395, "grad_norm": 11.973887640332048, "learning_rate": 4.9824509697311026e-05, "loss": 2.8534, "mean_token_accuracy": 0.3862068891525269, "step": 87165 }, { "epoch": 0.08779854981814812, "grad_norm": 10.158666604452753, "learning_rate": 4.982446298513213e-05, "loss": 2.694, "mean_token_accuracy": 0.379310342669487, "step": 87170 }, { "epoch": 0.0878035858712523, "grad_norm": 19.113830497743432, "learning_rate": 4.982441626676146e-05, "loss": 2.2325, "mean_token_accuracy": 0.4586206912994385, "step": 87175 }, { "epoch": 0.08780862192435647, "grad_norm": 10.731565663587366, "learning_rate": 4.982436954219903e-05, "loss": 2.3533, "mean_token_accuracy": 0.42758620977401735, "step": 87180 }, { "epoch": 0.08781365797746064, "grad_norm": 10.16734182813282, "learning_rate": 4.982432281144485e-05, "loss": 3.0081, "mean_token_accuracy": 0.36993345618247986, "step": 87185 }, { "epoch": 0.08781869403056482, "grad_norm": 11.817832648405341, "learning_rate": 4.982427607449894e-05, "loss": 2.2116, "mean_token_accuracy": 0.44482758045196535, "step": 87190 }, { "epoch": 0.08782373008366899, "grad_norm": 10.230732061590063, "learning_rate": 4.982422933136131e-05, "loss": 2.4577, "mean_token_accuracy": 0.39655172228813174, "step": 87195 }, { "epoch": 0.08782876613677316, "grad_norm": 15.242906643769517, "learning_rate": 4.982418258203197e-05, "loss": 2.9703, "mean_token_accuracy": 0.3620689630508423, "step": 87200 }, { "epoch": 0.08783380218987734, "grad_norm": 10.460467061839495, "learning_rate": 4.982413582651094e-05, "loss": 2.4856, "mean_token_accuracy": 0.41379310488700866, "step": 87205 }, { "epoch": 0.0878388382429815, "grad_norm": 13.165955247944053, "learning_rate": 4.982408906479821e-05, "loss": 2.2239, "mean_token_accuracy": 0.45517241954803467, "step": 87210 }, { "epoch": 0.08784387429608567, "grad_norm": 13.092107528941291, "learning_rate": 4.982404229689383e-05, "loss": 2.5351, "mean_token_accuracy": 0.38275861740112305, "step": 87215 }, { "epoch": 0.08784891034918985, "grad_norm": 11.707153090853298, "learning_rate": 4.9823995522797784e-05, "loss": 2.3156, "mean_token_accuracy": 0.4724137902259827, "step": 87220 }, { "epoch": 0.08785394640229402, "grad_norm": 11.882739974292752, "learning_rate": 4.98239487425101e-05, "loss": 2.5486, "mean_token_accuracy": 0.4, "step": 87225 }, { "epoch": 0.0878589824553982, "grad_norm": 10.902814296904362, "learning_rate": 4.982390195603078e-05, "loss": 2.7152, "mean_token_accuracy": 0.3793103456497192, "step": 87230 }, { "epoch": 0.08786401850850237, "grad_norm": 9.745525060057464, "learning_rate": 4.9823855163359844e-05, "loss": 2.3072, "mean_token_accuracy": 0.43103448748588563, "step": 87235 }, { "epoch": 0.08786905456160654, "grad_norm": 12.037085323003463, "learning_rate": 4.98238083644973e-05, "loss": 2.3546, "mean_token_accuracy": 0.4172413766384125, "step": 87240 }, { "epoch": 0.08787409061471071, "grad_norm": 9.525344242504563, "learning_rate": 4.982376155944318e-05, "loss": 2.3021, "mean_token_accuracy": 0.4379310369491577, "step": 87245 }, { "epoch": 0.08787912666781489, "grad_norm": 14.751099453954428, "learning_rate": 4.982371474819747e-05, "loss": 2.4847, "mean_token_accuracy": 0.4, "step": 87250 }, { "epoch": 0.08788416272091906, "grad_norm": 11.95270383405833, "learning_rate": 4.9823667930760194e-05, "loss": 2.5688, "mean_token_accuracy": 0.41379311084747317, "step": 87255 }, { "epoch": 0.08788919877402324, "grad_norm": 11.52681163526228, "learning_rate": 4.9823621107131364e-05, "loss": 2.4911, "mean_token_accuracy": 0.4172413766384125, "step": 87260 }, { "epoch": 0.08789423482712741, "grad_norm": 10.79811814801969, "learning_rate": 4.9823574277310995e-05, "loss": 2.2961, "mean_token_accuracy": 0.43448275327682495, "step": 87265 }, { "epoch": 0.08789927088023158, "grad_norm": 13.366674123513226, "learning_rate": 4.98235274412991e-05, "loss": 2.6388, "mean_token_accuracy": 0.412583190202713, "step": 87270 }, { "epoch": 0.08790430693333576, "grad_norm": 17.31629646573536, "learning_rate": 4.98234805990957e-05, "loss": 2.4811, "mean_token_accuracy": 0.37931033968925476, "step": 87275 }, { "epoch": 0.08790934298643992, "grad_norm": 12.282512364194803, "learning_rate": 4.982343375070079e-05, "loss": 2.4888, "mean_token_accuracy": 0.39310345649719236, "step": 87280 }, { "epoch": 0.08791437903954409, "grad_norm": 12.545339487312773, "learning_rate": 4.9823386896114396e-05, "loss": 2.5608, "mean_token_accuracy": 0.41034482717514037, "step": 87285 }, { "epoch": 0.08791941509264826, "grad_norm": 12.963097446800727, "learning_rate": 4.982334003533652e-05, "loss": 2.7561, "mean_token_accuracy": 0.33793103992938994, "step": 87290 }, { "epoch": 0.08792445114575244, "grad_norm": 13.587400378766953, "learning_rate": 4.9823293168367194e-05, "loss": 2.7437, "mean_token_accuracy": 0.38965516686439516, "step": 87295 }, { "epoch": 0.08792948719885661, "grad_norm": 10.430936503568278, "learning_rate": 4.982324629520641e-05, "loss": 2.4984, "mean_token_accuracy": 0.4551724135875702, "step": 87300 }, { "epoch": 0.08793452325196079, "grad_norm": 11.156440753178853, "learning_rate": 4.982319941585419e-05, "loss": 2.4182, "mean_token_accuracy": 0.4206896543502808, "step": 87305 }, { "epoch": 0.08793955930506496, "grad_norm": 8.027622348766691, "learning_rate": 4.982315253031055e-05, "loss": 2.3781, "mean_token_accuracy": 0.44827587008476255, "step": 87310 }, { "epoch": 0.08794459535816913, "grad_norm": 10.954552107193345, "learning_rate": 4.9823105638575504e-05, "loss": 2.5739, "mean_token_accuracy": 0.39655172228813174, "step": 87315 }, { "epoch": 0.08794963141127331, "grad_norm": 9.976398696761777, "learning_rate": 4.982305874064906e-05, "loss": 2.2393, "mean_token_accuracy": 0.44482758045196535, "step": 87320 }, { "epoch": 0.08795466746437748, "grad_norm": 12.339657634043824, "learning_rate": 4.9823011836531233e-05, "loss": 2.8757, "mean_token_accuracy": 0.38965516686439516, "step": 87325 }, { "epoch": 0.08795970351748165, "grad_norm": 12.458222174712473, "learning_rate": 4.982296492622203e-05, "loss": 2.6814, "mean_token_accuracy": 0.36206896901130675, "step": 87330 }, { "epoch": 0.08796473957058583, "grad_norm": 9.570308925693526, "learning_rate": 4.9822918009721475e-05, "loss": 2.6587, "mean_token_accuracy": 0.4034482777118683, "step": 87335 }, { "epoch": 0.08796977562369, "grad_norm": 10.868775942585662, "learning_rate": 4.9822871087029573e-05, "loss": 2.3548, "mean_token_accuracy": 0.46091954708099364, "step": 87340 }, { "epoch": 0.08797481167679418, "grad_norm": 11.555428055826185, "learning_rate": 4.982282415814634e-05, "loss": 2.6108, "mean_token_accuracy": 0.4034482777118683, "step": 87345 }, { "epoch": 0.08797984772989834, "grad_norm": 9.935083830958119, "learning_rate": 4.982277722307179e-05, "loss": 2.2064, "mean_token_accuracy": 0.44827585816383364, "step": 87350 }, { "epoch": 0.08798488378300251, "grad_norm": 10.111592160790885, "learning_rate": 4.982273028180593e-05, "loss": 2.8531, "mean_token_accuracy": 0.35862068831920624, "step": 87355 }, { "epoch": 0.08798991983610668, "grad_norm": 14.254124842018488, "learning_rate": 4.982268333434878e-05, "loss": 2.4886, "mean_token_accuracy": 0.4068965554237366, "step": 87360 }, { "epoch": 0.08799495588921086, "grad_norm": 14.393053490649478, "learning_rate": 4.982263638070035e-05, "loss": 2.511, "mean_token_accuracy": 0.4068965494632721, "step": 87365 }, { "epoch": 0.08799999194231503, "grad_norm": 10.156249388278196, "learning_rate": 4.982258942086066e-05, "loss": 2.2918, "mean_token_accuracy": 0.4310344815254211, "step": 87370 }, { "epoch": 0.0880050279954192, "grad_norm": 12.834463738934561, "learning_rate": 4.982254245482971e-05, "loss": 2.1622, "mean_token_accuracy": 0.4344827592372894, "step": 87375 }, { "epoch": 0.08801006404852338, "grad_norm": 11.13323113480036, "learning_rate": 4.982249548260752e-05, "loss": 2.7082, "mean_token_accuracy": 0.38124621510505674, "step": 87380 }, { "epoch": 0.08801510010162755, "grad_norm": 12.44538439734185, "learning_rate": 4.98224485041941e-05, "loss": 2.7141, "mean_token_accuracy": 0.32413792610168457, "step": 87385 }, { "epoch": 0.08802013615473173, "grad_norm": 12.090517429445391, "learning_rate": 4.982240151958947e-05, "loss": 2.2997, "mean_token_accuracy": 0.4344827651977539, "step": 87390 }, { "epoch": 0.0880251722078359, "grad_norm": 9.802507496600402, "learning_rate": 4.9822354528793636e-05, "loss": 2.437, "mean_token_accuracy": 0.4172413766384125, "step": 87395 }, { "epoch": 0.08803020826094007, "grad_norm": 13.246691159984875, "learning_rate": 4.982230753180662e-05, "loss": 3.0241, "mean_token_accuracy": 0.36551723778247835, "step": 87400 }, { "epoch": 0.08803524431404425, "grad_norm": 8.914377844947316, "learning_rate": 4.982226052862843e-05, "loss": 2.2299, "mean_token_accuracy": 0.4770935833454132, "step": 87405 }, { "epoch": 0.08804028036714842, "grad_norm": 11.888247169072717, "learning_rate": 4.982221351925906e-05, "loss": 2.6188, "mean_token_accuracy": 0.4241379380226135, "step": 87410 }, { "epoch": 0.0880453164202526, "grad_norm": 12.354941749975119, "learning_rate": 4.9822166503698556e-05, "loss": 3.0598, "mean_token_accuracy": 0.37586207687854767, "step": 87415 }, { "epoch": 0.08805035247335675, "grad_norm": 15.932795667046111, "learning_rate": 4.982211948194691e-05, "loss": 2.318, "mean_token_accuracy": 0.4758620738983154, "step": 87420 }, { "epoch": 0.08805538852646093, "grad_norm": 10.011272983179536, "learning_rate": 4.9822072454004146e-05, "loss": 2.4428, "mean_token_accuracy": 0.42413793206214906, "step": 87425 }, { "epoch": 0.0880604245795651, "grad_norm": 12.206825930185158, "learning_rate": 4.982202541987027e-05, "loss": 2.5906, "mean_token_accuracy": 0.40344828069210054, "step": 87430 }, { "epoch": 0.08806546063266928, "grad_norm": 12.026779364915114, "learning_rate": 4.98219783795453e-05, "loss": 2.5482, "mean_token_accuracy": 0.3497277647256851, "step": 87435 }, { "epoch": 0.08807049668577345, "grad_norm": 9.385388861481918, "learning_rate": 4.9821931333029234e-05, "loss": 2.6932, "mean_token_accuracy": 0.43103448748588563, "step": 87440 }, { "epoch": 0.08807553273887762, "grad_norm": 11.135418625599826, "learning_rate": 4.9821884280322106e-05, "loss": 2.6164, "mean_token_accuracy": 0.37586206793785093, "step": 87445 }, { "epoch": 0.0880805687919818, "grad_norm": 11.258497299871738, "learning_rate": 4.9821837221423925e-05, "loss": 2.6222, "mean_token_accuracy": 0.44827585816383364, "step": 87450 }, { "epoch": 0.08808560484508597, "grad_norm": 10.02352636895581, "learning_rate": 4.982179015633469e-05, "loss": 2.597, "mean_token_accuracy": 0.3931034505367279, "step": 87455 }, { "epoch": 0.08809064089819015, "grad_norm": 11.675619599129817, "learning_rate": 4.982174308505443e-05, "loss": 2.2365, "mean_token_accuracy": 0.4655172348022461, "step": 87460 }, { "epoch": 0.08809567695129432, "grad_norm": 11.686532819961682, "learning_rate": 4.982169600758315e-05, "loss": 3.0111, "mean_token_accuracy": 0.358620685338974, "step": 87465 }, { "epoch": 0.08810071300439849, "grad_norm": 10.829387671461287, "learning_rate": 4.9821648923920854e-05, "loss": 2.5108, "mean_token_accuracy": 0.43968542814254763, "step": 87470 }, { "epoch": 0.08810574905750267, "grad_norm": 9.897642529755426, "learning_rate": 4.982160183406758e-05, "loss": 2.5988, "mean_token_accuracy": 0.4103448331356049, "step": 87475 }, { "epoch": 0.08811078511060684, "grad_norm": 11.884456762703117, "learning_rate": 4.9821554738023315e-05, "loss": 2.727, "mean_token_accuracy": 0.39655172228813174, "step": 87480 }, { "epoch": 0.08811582116371101, "grad_norm": 11.68597755685487, "learning_rate": 4.98215076357881e-05, "loss": 2.7716, "mean_token_accuracy": 0.37241379618644715, "step": 87485 }, { "epoch": 0.08812085721681517, "grad_norm": 12.643818332552291, "learning_rate": 4.9821460527361915e-05, "loss": 2.5391, "mean_token_accuracy": 0.4465819776058197, "step": 87490 }, { "epoch": 0.08812589326991935, "grad_norm": 9.679271877094756, "learning_rate": 4.982141341274479e-05, "loss": 2.5196, "mean_token_accuracy": 0.3793103456497192, "step": 87495 }, { "epoch": 0.08813092932302352, "grad_norm": 13.44168573418974, "learning_rate": 4.982136629193674e-05, "loss": 2.5705, "mean_token_accuracy": 0.4068965494632721, "step": 87500 }, { "epoch": 0.0881359653761277, "grad_norm": 9.732729086347861, "learning_rate": 4.982131916493778e-05, "loss": 2.4329, "mean_token_accuracy": 0.3965517282485962, "step": 87505 }, { "epoch": 0.08814100142923187, "grad_norm": 9.844585183354637, "learning_rate": 4.982127203174792e-05, "loss": 2.7978, "mean_token_accuracy": 0.3709013909101486, "step": 87510 }, { "epoch": 0.08814603748233604, "grad_norm": 12.715067642577257, "learning_rate": 4.982122489236717e-05, "loss": 2.2821, "mean_token_accuracy": 0.4655172348022461, "step": 87515 }, { "epoch": 0.08815107353544022, "grad_norm": 13.135338725852302, "learning_rate": 4.9821177746795536e-05, "loss": 2.5331, "mean_token_accuracy": 0.4103448331356049, "step": 87520 }, { "epoch": 0.08815610958854439, "grad_norm": 9.589405395692806, "learning_rate": 4.9821130595033054e-05, "loss": 2.468, "mean_token_accuracy": 0.4344827651977539, "step": 87525 }, { "epoch": 0.08816114564164856, "grad_norm": 11.375059671353128, "learning_rate": 4.982108343707972e-05, "loss": 2.3441, "mean_token_accuracy": 0.4568663060665131, "step": 87530 }, { "epoch": 0.08816618169475274, "grad_norm": 24.812493939893496, "learning_rate": 4.9821036272935545e-05, "loss": 2.482, "mean_token_accuracy": 0.4206896543502808, "step": 87535 }, { "epoch": 0.08817121774785691, "grad_norm": 9.157420465336113, "learning_rate": 4.982098910260055e-05, "loss": 2.5036, "mean_token_accuracy": 0.4401693969964981, "step": 87540 }, { "epoch": 0.08817625380096109, "grad_norm": 10.446821851318393, "learning_rate": 4.982094192607475e-05, "loss": 2.2407, "mean_token_accuracy": 0.4465819776058197, "step": 87545 }, { "epoch": 0.08818128985406526, "grad_norm": 11.642744374592246, "learning_rate": 4.982089474335815e-05, "loss": 2.4322, "mean_token_accuracy": 0.42413793206214906, "step": 87550 }, { "epoch": 0.08818632590716943, "grad_norm": 11.983437453154355, "learning_rate": 4.982084755445077e-05, "loss": 2.8039, "mean_token_accuracy": 0.37586206793785093, "step": 87555 }, { "epoch": 0.08819136196027359, "grad_norm": 13.668340564527643, "learning_rate": 4.9820800359352605e-05, "loss": 2.5894, "mean_token_accuracy": 0.417241370677948, "step": 87560 }, { "epoch": 0.08819639801337777, "grad_norm": 10.479670811162928, "learning_rate": 4.98207531580637e-05, "loss": 2.2537, "mean_token_accuracy": 0.44482759237289426, "step": 87565 }, { "epoch": 0.08820143406648194, "grad_norm": 10.631849719830198, "learning_rate": 4.9820705950584035e-05, "loss": 2.3667, "mean_token_accuracy": 0.42413793206214906, "step": 87570 }, { "epoch": 0.08820647011958611, "grad_norm": 12.904279813597961, "learning_rate": 4.982065873691365e-05, "loss": 2.5433, "mean_token_accuracy": 0.4137930929660797, "step": 87575 }, { "epoch": 0.08821150617269029, "grad_norm": 11.02525058458468, "learning_rate": 4.9820611517052545e-05, "loss": 2.5532, "mean_token_accuracy": 0.3793103456497192, "step": 87580 }, { "epoch": 0.08821654222579446, "grad_norm": 11.504604589686444, "learning_rate": 4.982056429100074e-05, "loss": 2.1735, "mean_token_accuracy": 0.49655171632766726, "step": 87585 }, { "epoch": 0.08822157827889864, "grad_norm": 9.749868773533054, "learning_rate": 4.982051705875823e-05, "loss": 2.164, "mean_token_accuracy": 0.44827585220336913, "step": 87590 }, { "epoch": 0.08822661433200281, "grad_norm": 12.687294243642373, "learning_rate": 4.9820469820325053e-05, "loss": 2.5342, "mean_token_accuracy": 0.4241379380226135, "step": 87595 }, { "epoch": 0.08823165038510698, "grad_norm": 10.388131070098565, "learning_rate": 4.9820422575701205e-05, "loss": 2.5767, "mean_token_accuracy": 0.3999999940395355, "step": 87600 }, { "epoch": 0.08823668643821116, "grad_norm": 18.894800300736478, "learning_rate": 4.982037532488671e-05, "loss": 2.6195, "mean_token_accuracy": 0.39310344457626345, "step": 87605 }, { "epoch": 0.08824172249131533, "grad_norm": 9.997621936202252, "learning_rate": 4.982032806788157e-05, "loss": 2.0228, "mean_token_accuracy": 0.4620689570903778, "step": 87610 }, { "epoch": 0.0882467585444195, "grad_norm": 13.676212609794502, "learning_rate": 4.9820280804685806e-05, "loss": 2.6558, "mean_token_accuracy": 0.4103448331356049, "step": 87615 }, { "epoch": 0.08825179459752368, "grad_norm": 11.467256539505701, "learning_rate": 4.982023353529943e-05, "loss": 2.5336, "mean_token_accuracy": 0.4103448212146759, "step": 87620 }, { "epoch": 0.08825683065062785, "grad_norm": 11.218107029167502, "learning_rate": 4.982018625972244e-05, "loss": 2.3208, "mean_token_accuracy": 0.4103448212146759, "step": 87625 }, { "epoch": 0.08826186670373201, "grad_norm": 9.838688047664524, "learning_rate": 4.982013897795489e-05, "loss": 2.4711, "mean_token_accuracy": 0.38620689511299133, "step": 87630 }, { "epoch": 0.08826690275683619, "grad_norm": 10.528866640887859, "learning_rate": 4.982009168999675e-05, "loss": 2.4343, "mean_token_accuracy": 0.47586206197738645, "step": 87635 }, { "epoch": 0.08827193880994036, "grad_norm": 8.865704197902149, "learning_rate": 4.982004439584805e-05, "loss": 2.2717, "mean_token_accuracy": 0.42413792610168455, "step": 87640 }, { "epoch": 0.08827697486304453, "grad_norm": 15.057659895772288, "learning_rate": 4.9819997095508805e-05, "loss": 2.4646, "mean_token_accuracy": 0.40344828069210054, "step": 87645 }, { "epoch": 0.0882820109161487, "grad_norm": 11.12231753508984, "learning_rate": 4.981994978897902e-05, "loss": 2.5484, "mean_token_accuracy": 0.40689654350280763, "step": 87650 }, { "epoch": 0.08828704696925288, "grad_norm": 12.298926677072737, "learning_rate": 4.9819902476258714e-05, "loss": 2.2015, "mean_token_accuracy": 0.4620689690113068, "step": 87655 }, { "epoch": 0.08829208302235705, "grad_norm": 9.089841585734444, "learning_rate": 4.9819855157347914e-05, "loss": 2.3696, "mean_token_accuracy": 0.4344827592372894, "step": 87660 }, { "epoch": 0.08829711907546123, "grad_norm": 14.201703198175254, "learning_rate": 4.9819807832246604e-05, "loss": 2.3236, "mean_token_accuracy": 0.4551724135875702, "step": 87665 }, { "epoch": 0.0883021551285654, "grad_norm": 11.852600873225015, "learning_rate": 4.981976050095482e-05, "loss": 2.4893, "mean_token_accuracy": 0.3793103456497192, "step": 87670 }, { "epoch": 0.08830719118166958, "grad_norm": 12.506076091065202, "learning_rate": 4.9819713163472564e-05, "loss": 2.4389, "mean_token_accuracy": 0.4376890480518341, "step": 87675 }, { "epoch": 0.08831222723477375, "grad_norm": 10.798763548637718, "learning_rate": 4.9819665819799846e-05, "loss": 2.2858, "mean_token_accuracy": 0.44827587008476255, "step": 87680 }, { "epoch": 0.08831726328787792, "grad_norm": 13.390477937099545, "learning_rate": 4.9819618469936694e-05, "loss": 2.6976, "mean_token_accuracy": 0.41379310488700866, "step": 87685 }, { "epoch": 0.0883222993409821, "grad_norm": 10.88316406307631, "learning_rate": 4.981957111388311e-05, "loss": 2.2735, "mean_token_accuracy": 0.4448275864124298, "step": 87690 }, { "epoch": 0.08832733539408627, "grad_norm": 13.39144109417419, "learning_rate": 4.981952375163911e-05, "loss": 2.5784, "mean_token_accuracy": 0.3655172407627106, "step": 87695 }, { "epoch": 0.08833237144719043, "grad_norm": 11.101355904297504, "learning_rate": 4.9819476383204706e-05, "loss": 2.4605, "mean_token_accuracy": 0.46031458377838136, "step": 87700 }, { "epoch": 0.0883374075002946, "grad_norm": 11.56462169048237, "learning_rate": 4.981942900857991e-05, "loss": 2.5035, "mean_token_accuracy": 0.39655172228813174, "step": 87705 }, { "epoch": 0.08834244355339878, "grad_norm": 10.444161711773909, "learning_rate": 4.981938162776474e-05, "loss": 2.4502, "mean_token_accuracy": 0.4172413766384125, "step": 87710 }, { "epoch": 0.08834747960650295, "grad_norm": 11.47963795079637, "learning_rate": 4.981933424075921e-05, "loss": 2.4509, "mean_token_accuracy": 0.4, "step": 87715 }, { "epoch": 0.08835251565960713, "grad_norm": 10.2852508608451, "learning_rate": 4.9819286847563326e-05, "loss": 2.3855, "mean_token_accuracy": 0.4517241299152374, "step": 87720 }, { "epoch": 0.0883575517127113, "grad_norm": 11.703583073120697, "learning_rate": 4.9819239448177105e-05, "loss": 2.2483, "mean_token_accuracy": 0.42758620977401735, "step": 87725 }, { "epoch": 0.08836258776581547, "grad_norm": 11.829333089207436, "learning_rate": 4.981919204260056e-05, "loss": 2.2301, "mean_token_accuracy": 0.4241379380226135, "step": 87730 }, { "epoch": 0.08836762381891965, "grad_norm": 11.219533018776746, "learning_rate": 4.9819144630833695e-05, "loss": 2.467, "mean_token_accuracy": 0.4310344815254211, "step": 87735 }, { "epoch": 0.08837265987202382, "grad_norm": 11.177485302727815, "learning_rate": 4.9819097212876534e-05, "loss": 2.4254, "mean_token_accuracy": 0.4448275864124298, "step": 87740 }, { "epoch": 0.088377695925128, "grad_norm": 13.004480022597187, "learning_rate": 4.9819049788729094e-05, "loss": 1.9713, "mean_token_accuracy": 0.4931034505367279, "step": 87745 }, { "epoch": 0.08838273197823217, "grad_norm": 13.221198867385475, "learning_rate": 4.981900235839139e-05, "loss": 2.4457, "mean_token_accuracy": 0.42068966031074523, "step": 87750 }, { "epoch": 0.08838776803133634, "grad_norm": 13.253144482229784, "learning_rate": 4.9818954921863415e-05, "loss": 2.3111, "mean_token_accuracy": 0.4448275864124298, "step": 87755 }, { "epoch": 0.08839280408444052, "grad_norm": 9.364057670469068, "learning_rate": 4.98189074791452e-05, "loss": 2.3802, "mean_token_accuracy": 0.4620689630508423, "step": 87760 }, { "epoch": 0.08839784013754469, "grad_norm": 10.418001624689813, "learning_rate": 4.9818860030236754e-05, "loss": 2.0979, "mean_token_accuracy": 0.5034482777118683, "step": 87765 }, { "epoch": 0.08840287619064885, "grad_norm": 12.828709084193635, "learning_rate": 4.9818812575138086e-05, "loss": 2.5804, "mean_token_accuracy": 0.4551724135875702, "step": 87770 }, { "epoch": 0.08840791224375302, "grad_norm": 9.670134148783331, "learning_rate": 4.9818765113849206e-05, "loss": 2.372, "mean_token_accuracy": 0.458620685338974, "step": 87775 }, { "epoch": 0.0884129482968572, "grad_norm": 10.651866321093772, "learning_rate": 4.981871764637014e-05, "loss": 2.901, "mean_token_accuracy": 0.417241370677948, "step": 87780 }, { "epoch": 0.08841798434996137, "grad_norm": 15.820722669884764, "learning_rate": 4.981867017270089e-05, "loss": 2.4162, "mean_token_accuracy": 0.43448275327682495, "step": 87785 }, { "epoch": 0.08842302040306554, "grad_norm": 11.330669954628922, "learning_rate": 4.981862269284148e-05, "loss": 2.7044, "mean_token_accuracy": 0.3655172407627106, "step": 87790 }, { "epoch": 0.08842805645616972, "grad_norm": 10.562349229340056, "learning_rate": 4.9818575206791916e-05, "loss": 2.2856, "mean_token_accuracy": 0.4482758641242981, "step": 87795 }, { "epoch": 0.08843309250927389, "grad_norm": 12.241803030407496, "learning_rate": 4.9818527714552215e-05, "loss": 2.7995, "mean_token_accuracy": 0.35862069129943847, "step": 87800 }, { "epoch": 0.08843812856237807, "grad_norm": 11.690954324615594, "learning_rate": 4.981848021612238e-05, "loss": 2.7606, "mean_token_accuracy": 0.3620689630508423, "step": 87805 }, { "epoch": 0.08844316461548224, "grad_norm": 11.233172023577994, "learning_rate": 4.9818432711502436e-05, "loss": 2.4705, "mean_token_accuracy": 0.4103448331356049, "step": 87810 }, { "epoch": 0.08844820066858641, "grad_norm": 10.480789938385163, "learning_rate": 4.981838520069238e-05, "loss": 2.1786, "mean_token_accuracy": 0.4620689690113068, "step": 87815 }, { "epoch": 0.08845323672169059, "grad_norm": 10.841802129327977, "learning_rate": 4.9818337683692244e-05, "loss": 2.4375, "mean_token_accuracy": 0.4344827651977539, "step": 87820 }, { "epoch": 0.08845827277479476, "grad_norm": 12.848947789351016, "learning_rate": 4.981829016050205e-05, "loss": 2.3929, "mean_token_accuracy": 0.41379310488700866, "step": 87825 }, { "epoch": 0.08846330882789893, "grad_norm": 10.80538439172015, "learning_rate": 4.9818242631121774e-05, "loss": 2.2153, "mean_token_accuracy": 0.48620688915252686, "step": 87830 }, { "epoch": 0.08846834488100311, "grad_norm": 10.187808847368395, "learning_rate": 4.981819509555145e-05, "loss": 2.4428, "mean_token_accuracy": 0.3827586233615875, "step": 87835 }, { "epoch": 0.08847338093410727, "grad_norm": 11.69449228151899, "learning_rate": 4.981814755379111e-05, "loss": 2.6341, "mean_token_accuracy": 0.4137930989265442, "step": 87840 }, { "epoch": 0.08847841698721144, "grad_norm": 10.03887366549081, "learning_rate": 4.981810000584073e-05, "loss": 2.3729, "mean_token_accuracy": 0.42068966031074523, "step": 87845 }, { "epoch": 0.08848345304031562, "grad_norm": 12.747454825514911, "learning_rate": 4.981805245170035e-05, "loss": 2.5979, "mean_token_accuracy": 0.4137930989265442, "step": 87850 }, { "epoch": 0.08848848909341979, "grad_norm": 11.150895883090465, "learning_rate": 4.9818004891369966e-05, "loss": 2.2396, "mean_token_accuracy": 0.4586206912994385, "step": 87855 }, { "epoch": 0.08849352514652396, "grad_norm": 13.141795495815318, "learning_rate": 4.981795732484961e-05, "loss": 2.9082, "mean_token_accuracy": 0.3724137872457504, "step": 87860 }, { "epoch": 0.08849856119962814, "grad_norm": 15.321646771853748, "learning_rate": 4.9817909752139285e-05, "loss": 2.971, "mean_token_accuracy": 0.337931028008461, "step": 87865 }, { "epoch": 0.08850359725273231, "grad_norm": 10.821358503351378, "learning_rate": 4.9817862173239e-05, "loss": 2.1116, "mean_token_accuracy": 0.4620689690113068, "step": 87870 }, { "epoch": 0.08850863330583648, "grad_norm": 12.661220390796624, "learning_rate": 4.981781458814877e-05, "loss": 2.4968, "mean_token_accuracy": 0.417241370677948, "step": 87875 }, { "epoch": 0.08851366935894066, "grad_norm": 17.646373875883018, "learning_rate": 4.981776699686862e-05, "loss": 3.1473, "mean_token_accuracy": 0.3517241358757019, "step": 87880 }, { "epoch": 0.08851870541204483, "grad_norm": 9.587476550573102, "learning_rate": 4.9817719399398546e-05, "loss": 2.7776, "mean_token_accuracy": 0.44137930274009707, "step": 87885 }, { "epoch": 0.088523741465149, "grad_norm": 11.667651580756042, "learning_rate": 4.981767179573857e-05, "loss": 2.6116, "mean_token_accuracy": 0.41379311084747317, "step": 87890 }, { "epoch": 0.08852877751825318, "grad_norm": 10.613058487223562, "learning_rate": 4.9817624185888713e-05, "loss": 2.0334, "mean_token_accuracy": 0.4896551609039307, "step": 87895 }, { "epoch": 0.08853381357135735, "grad_norm": 10.785547654042238, "learning_rate": 4.9817576569848974e-05, "loss": 2.6073, "mean_token_accuracy": 0.42068964838981626, "step": 87900 }, { "epoch": 0.08853884962446153, "grad_norm": 12.023743516054164, "learning_rate": 4.981752894761937e-05, "loss": 2.7896, "mean_token_accuracy": 0.3931034505367279, "step": 87905 }, { "epoch": 0.08854388567756569, "grad_norm": 10.84399122971706, "learning_rate": 4.9817481319199924e-05, "loss": 2.5543, "mean_token_accuracy": 0.4344827592372894, "step": 87910 }, { "epoch": 0.08854892173066986, "grad_norm": 9.180768734603031, "learning_rate": 4.981743368459063e-05, "loss": 2.1375, "mean_token_accuracy": 0.4517241358757019, "step": 87915 }, { "epoch": 0.08855395778377403, "grad_norm": 8.938753631615219, "learning_rate": 4.981738604379151e-05, "loss": 2.2546, "mean_token_accuracy": 0.4771324872970581, "step": 87920 }, { "epoch": 0.08855899383687821, "grad_norm": 13.926193669797161, "learning_rate": 4.981733839680259e-05, "loss": 2.7287, "mean_token_accuracy": 0.4137930989265442, "step": 87925 }, { "epoch": 0.08856402988998238, "grad_norm": 12.325971245645706, "learning_rate": 4.981729074362387e-05, "loss": 2.2006, "mean_token_accuracy": 0.42758620381355283, "step": 87930 }, { "epoch": 0.08856906594308656, "grad_norm": 11.207253569357638, "learning_rate": 4.981724308425537e-05, "loss": 2.6595, "mean_token_accuracy": 0.38620689511299133, "step": 87935 }, { "epoch": 0.08857410199619073, "grad_norm": 14.064535457601599, "learning_rate": 4.98171954186971e-05, "loss": 2.4378, "mean_token_accuracy": 0.4068965494632721, "step": 87940 }, { "epoch": 0.0885791380492949, "grad_norm": 9.960627697770352, "learning_rate": 4.981714774694907e-05, "loss": 2.1481, "mean_token_accuracy": 0.4344827651977539, "step": 87945 }, { "epoch": 0.08858417410239908, "grad_norm": 9.993233894479232, "learning_rate": 4.981710006901129e-05, "loss": 2.509, "mean_token_accuracy": 0.4689655125141144, "step": 87950 }, { "epoch": 0.08858921015550325, "grad_norm": 10.113483608820857, "learning_rate": 4.9817052384883785e-05, "loss": 2.3949, "mean_token_accuracy": 0.40344828069210054, "step": 87955 }, { "epoch": 0.08859424620860742, "grad_norm": 11.393337158619628, "learning_rate": 4.981700469456656e-05, "loss": 2.3427, "mean_token_accuracy": 0.42220205068588257, "step": 87960 }, { "epoch": 0.0885992822617116, "grad_norm": 9.484012663581293, "learning_rate": 4.981695699805963e-05, "loss": 2.596, "mean_token_accuracy": 0.3931034505367279, "step": 87965 }, { "epoch": 0.08860431831481577, "grad_norm": 16.120742144363355, "learning_rate": 4.981690929536301e-05, "loss": 2.6038, "mean_token_accuracy": 0.41724138259887694, "step": 87970 }, { "epoch": 0.08860935436791993, "grad_norm": 9.711929420038334, "learning_rate": 4.981686158647671e-05, "loss": 2.5201, "mean_token_accuracy": 0.38620689511299133, "step": 87975 }, { "epoch": 0.0886143904210241, "grad_norm": 9.248194928750396, "learning_rate": 4.9816813871400746e-05, "loss": 2.442, "mean_token_accuracy": 0.4016333997249603, "step": 87980 }, { "epoch": 0.08861942647412828, "grad_norm": 11.893083325521431, "learning_rate": 4.981676615013513e-05, "loss": 2.4671, "mean_token_accuracy": 0.4310344815254211, "step": 87985 }, { "epoch": 0.08862446252723245, "grad_norm": 10.955280481873618, "learning_rate": 4.981671842267988e-05, "loss": 2.851, "mean_token_accuracy": 0.3896551728248596, "step": 87990 }, { "epoch": 0.08862949858033663, "grad_norm": 12.355012198273926, "learning_rate": 4.9816670689035e-05, "loss": 2.3293, "mean_token_accuracy": 0.43103447556495667, "step": 87995 }, { "epoch": 0.0886345346334408, "grad_norm": 15.01858193412576, "learning_rate": 4.981662294920051e-05, "loss": 2.17, "mean_token_accuracy": 0.4589901328086853, "step": 88000 }, { "epoch": 0.08863957068654497, "grad_norm": 10.262428157981082, "learning_rate": 4.9816575203176416e-05, "loss": 2.2651, "mean_token_accuracy": 0.47931033968925474, "step": 88005 }, { "epoch": 0.08864460673964915, "grad_norm": 9.476088887849924, "learning_rate": 4.981652745096275e-05, "loss": 2.3649, "mean_token_accuracy": 0.4034482777118683, "step": 88010 }, { "epoch": 0.08864964279275332, "grad_norm": 10.96851701050586, "learning_rate": 4.98164796925595e-05, "loss": 2.5077, "mean_token_accuracy": 0.39147005379199984, "step": 88015 }, { "epoch": 0.0886546788458575, "grad_norm": 11.95229470923467, "learning_rate": 4.98164319279667e-05, "loss": 2.3368, "mean_token_accuracy": 0.43103447556495667, "step": 88020 }, { "epoch": 0.08865971489896167, "grad_norm": 12.142336831484991, "learning_rate": 4.981638415718434e-05, "loss": 2.6342, "mean_token_accuracy": 0.3758620649576187, "step": 88025 }, { "epoch": 0.08866475095206584, "grad_norm": 9.37232380597033, "learning_rate": 4.981633638021246e-05, "loss": 2.2108, "mean_token_accuracy": 0.42758620977401735, "step": 88030 }, { "epoch": 0.08866978700517002, "grad_norm": 10.467462267537808, "learning_rate": 4.981628859705106e-05, "loss": 2.6763, "mean_token_accuracy": 0.4034482777118683, "step": 88035 }, { "epoch": 0.08867482305827419, "grad_norm": 13.585630991246987, "learning_rate": 4.981624080770015e-05, "loss": 2.6103, "mean_token_accuracy": 0.41724138259887694, "step": 88040 }, { "epoch": 0.08867985911137835, "grad_norm": 10.910376517535214, "learning_rate": 4.981619301215975e-05, "loss": 2.2711, "mean_token_accuracy": 0.47931033968925474, "step": 88045 }, { "epoch": 0.08868489516448252, "grad_norm": 9.565287758192914, "learning_rate": 4.981614521042987e-05, "loss": 2.1115, "mean_token_accuracy": 0.4655172348022461, "step": 88050 }, { "epoch": 0.0886899312175867, "grad_norm": 10.265469483756734, "learning_rate": 4.981609740251053e-05, "loss": 2.722, "mean_token_accuracy": 0.38620689511299133, "step": 88055 }, { "epoch": 0.08869496727069087, "grad_norm": 13.984695819649867, "learning_rate": 4.981604958840172e-05, "loss": 2.5449, "mean_token_accuracy": 0.4034482717514038, "step": 88060 }, { "epoch": 0.08870000332379505, "grad_norm": 11.490298826910722, "learning_rate": 4.9816001768103485e-05, "loss": 2.5584, "mean_token_accuracy": 0.3793103456497192, "step": 88065 }, { "epoch": 0.08870503937689922, "grad_norm": 10.73307178993154, "learning_rate": 4.9815953941615814e-05, "loss": 2.5555, "mean_token_accuracy": 0.41724138259887694, "step": 88070 }, { "epoch": 0.0887100754300034, "grad_norm": 9.737514866336396, "learning_rate": 4.9815906108938745e-05, "loss": 2.4479, "mean_token_accuracy": 0.46696914434432985, "step": 88075 }, { "epoch": 0.08871511148310757, "grad_norm": 10.049729271602805, "learning_rate": 4.9815858270072264e-05, "loss": 2.3386, "mean_token_accuracy": 0.43103448748588563, "step": 88080 }, { "epoch": 0.08872014753621174, "grad_norm": 10.112141150316083, "learning_rate": 4.98158104250164e-05, "loss": 2.6625, "mean_token_accuracy": 0.4034482777118683, "step": 88085 }, { "epoch": 0.08872518358931591, "grad_norm": 10.9528475206623, "learning_rate": 4.9815762573771155e-05, "loss": 1.8999, "mean_token_accuracy": 0.493103438615799, "step": 88090 }, { "epoch": 0.08873021964242009, "grad_norm": 10.791139790458981, "learning_rate": 4.9815714716336566e-05, "loss": 2.1594, "mean_token_accuracy": 0.4413793087005615, "step": 88095 }, { "epoch": 0.08873525569552426, "grad_norm": 11.287480888915699, "learning_rate": 4.981566685271262e-05, "loss": 2.7638, "mean_token_accuracy": 0.31379309892654417, "step": 88100 }, { "epoch": 0.08874029174862844, "grad_norm": 13.451286924950844, "learning_rate": 4.981561898289934e-05, "loss": 2.2972, "mean_token_accuracy": 0.46551724076271056, "step": 88105 }, { "epoch": 0.08874532780173261, "grad_norm": 11.763888455204093, "learning_rate": 4.981557110689674e-05, "loss": 2.3433, "mean_token_accuracy": 0.4068965554237366, "step": 88110 }, { "epoch": 0.08875036385483677, "grad_norm": 13.99681202925253, "learning_rate": 4.981552322470484e-05, "loss": 2.8444, "mean_token_accuracy": 0.3827586233615875, "step": 88115 }, { "epoch": 0.08875539990794094, "grad_norm": 11.451954376677241, "learning_rate": 4.981547533632364e-05, "loss": 2.548, "mean_token_accuracy": 0.42413793206214906, "step": 88120 }, { "epoch": 0.08876043596104512, "grad_norm": 8.937528006109897, "learning_rate": 4.981542744175316e-05, "loss": 2.2427, "mean_token_accuracy": 0.47586206793785096, "step": 88125 }, { "epoch": 0.08876547201414929, "grad_norm": 15.956200834208557, "learning_rate": 4.981537954099341e-05, "loss": 2.5141, "mean_token_accuracy": 0.45517241954803467, "step": 88130 }, { "epoch": 0.08877050806725346, "grad_norm": 13.398975260285793, "learning_rate": 4.9815331634044414e-05, "loss": 2.9357, "mean_token_accuracy": 0.3655172407627106, "step": 88135 }, { "epoch": 0.08877554412035764, "grad_norm": 12.037393996641288, "learning_rate": 4.9815283720906174e-05, "loss": 2.8407, "mean_token_accuracy": 0.3931034505367279, "step": 88140 }, { "epoch": 0.08878058017346181, "grad_norm": 9.425014503997332, "learning_rate": 4.9815235801578706e-05, "loss": 2.5829, "mean_token_accuracy": 0.4206896543502808, "step": 88145 }, { "epoch": 0.08878561622656599, "grad_norm": 10.755955148242137, "learning_rate": 4.981518787606203e-05, "loss": 2.5614, "mean_token_accuracy": 0.38620689511299133, "step": 88150 }, { "epoch": 0.08879065227967016, "grad_norm": 10.968081030510584, "learning_rate": 4.981513994435614e-05, "loss": 2.4424, "mean_token_accuracy": 0.42758620977401735, "step": 88155 }, { "epoch": 0.08879568833277433, "grad_norm": 11.238602196070923, "learning_rate": 4.981509200646107e-05, "loss": 2.4673, "mean_token_accuracy": 0.43793103098869324, "step": 88160 }, { "epoch": 0.08880072438587851, "grad_norm": 9.20094631117747, "learning_rate": 4.9815044062376825e-05, "loss": 2.16, "mean_token_accuracy": 0.5, "step": 88165 }, { "epoch": 0.08880576043898268, "grad_norm": 11.46798057363033, "learning_rate": 4.981499611210342e-05, "loss": 2.7783, "mean_token_accuracy": 0.34137930870056155, "step": 88170 }, { "epoch": 0.08881079649208685, "grad_norm": 11.340504787504353, "learning_rate": 4.9814948155640875e-05, "loss": 2.4942, "mean_token_accuracy": 0.4275861978530884, "step": 88175 }, { "epoch": 0.08881583254519103, "grad_norm": 16.418668225567245, "learning_rate": 4.981490019298919e-05, "loss": 2.4762, "mean_token_accuracy": 0.41724138259887694, "step": 88180 }, { "epoch": 0.08882086859829519, "grad_norm": 10.471701628887658, "learning_rate": 4.981485222414838e-05, "loss": 2.8361, "mean_token_accuracy": 0.39310344457626345, "step": 88185 }, { "epoch": 0.08882590465139936, "grad_norm": 9.464536189225788, "learning_rate": 4.981480424911846e-05, "loss": 2.3155, "mean_token_accuracy": 0.46412583589553835, "step": 88190 }, { "epoch": 0.08883094070450354, "grad_norm": 11.984185953018105, "learning_rate": 4.981475626789946e-05, "loss": 2.8221, "mean_token_accuracy": 0.4, "step": 88195 }, { "epoch": 0.08883597675760771, "grad_norm": 12.122517540991241, "learning_rate": 4.981470828049137e-05, "loss": 2.483, "mean_token_accuracy": 0.42068966031074523, "step": 88200 }, { "epoch": 0.08884101281071188, "grad_norm": 10.55925564768586, "learning_rate": 4.981466028689421e-05, "loss": 2.2514, "mean_token_accuracy": 0.4689655125141144, "step": 88205 }, { "epoch": 0.08884604886381606, "grad_norm": 11.945111735010908, "learning_rate": 4.9814612287108006e-05, "loss": 2.5595, "mean_token_accuracy": 0.4206896543502808, "step": 88210 }, { "epoch": 0.08885108491692023, "grad_norm": 9.748495150875272, "learning_rate": 4.981456428113275e-05, "loss": 2.3752, "mean_token_accuracy": 0.4172413766384125, "step": 88215 }, { "epoch": 0.0888561209700244, "grad_norm": 13.19253592187581, "learning_rate": 4.981451626896847e-05, "loss": 2.3144, "mean_token_accuracy": 0.43448275327682495, "step": 88220 }, { "epoch": 0.08886115702312858, "grad_norm": 9.833835454451819, "learning_rate": 4.981446825061518e-05, "loss": 2.3138, "mean_token_accuracy": 0.44482759237289426, "step": 88225 }, { "epoch": 0.08886619307623275, "grad_norm": 9.757708002450425, "learning_rate": 4.981442022607288e-05, "loss": 2.2574, "mean_token_accuracy": 0.45347853302955626, "step": 88230 }, { "epoch": 0.08887122912933693, "grad_norm": 9.55098972091099, "learning_rate": 4.9814372195341605e-05, "loss": 2.1682, "mean_token_accuracy": 0.4344827592372894, "step": 88235 }, { "epoch": 0.0888762651824411, "grad_norm": 10.44393236548825, "learning_rate": 4.9814324158421345e-05, "loss": 2.2303, "mean_token_accuracy": 0.47586206197738645, "step": 88240 }, { "epoch": 0.08888130123554527, "grad_norm": 11.81333264912459, "learning_rate": 4.981427611531213e-05, "loss": 2.5796, "mean_token_accuracy": 0.39655172228813174, "step": 88245 }, { "epoch": 0.08888633728864945, "grad_norm": 10.149998411617167, "learning_rate": 4.981422806601396e-05, "loss": 2.1921, "mean_token_accuracy": 0.4620689570903778, "step": 88250 }, { "epoch": 0.08889137334175361, "grad_norm": 10.616800871727099, "learning_rate": 4.981418001052686e-05, "loss": 2.684, "mean_token_accuracy": 0.3655172407627106, "step": 88255 }, { "epoch": 0.08889640939485778, "grad_norm": 14.241874143915625, "learning_rate": 4.981413194885084e-05, "loss": 2.7456, "mean_token_accuracy": 0.36206896007061007, "step": 88260 }, { "epoch": 0.08890144544796195, "grad_norm": 12.79969314837717, "learning_rate": 4.981408388098591e-05, "loss": 2.1283, "mean_token_accuracy": 0.42413792610168455, "step": 88265 }, { "epoch": 0.08890648150106613, "grad_norm": 17.055605273053796, "learning_rate": 4.981403580693208e-05, "loss": 2.418, "mean_token_accuracy": 0.4724137902259827, "step": 88270 }, { "epoch": 0.0889115175541703, "grad_norm": 11.344691468805907, "learning_rate": 4.9813987726689385e-05, "loss": 2.4015, "mean_token_accuracy": 0.41979949474334716, "step": 88275 }, { "epoch": 0.08891655360727448, "grad_norm": 11.564263236144832, "learning_rate": 4.9813939640257806e-05, "loss": 2.4473, "mean_token_accuracy": 0.45517240166664125, "step": 88280 }, { "epoch": 0.08892158966037865, "grad_norm": 14.324921227716956, "learning_rate": 4.9813891547637385e-05, "loss": 2.7665, "mean_token_accuracy": 0.42413793206214906, "step": 88285 }, { "epoch": 0.08892662571348282, "grad_norm": 10.279499215315559, "learning_rate": 4.981384344882812e-05, "loss": 2.4464, "mean_token_accuracy": 0.4344827592372894, "step": 88290 }, { "epoch": 0.088931661766587, "grad_norm": 12.210893266693814, "learning_rate": 4.981379534383002e-05, "loss": 2.3941, "mean_token_accuracy": 0.43793103098869324, "step": 88295 }, { "epoch": 0.08893669781969117, "grad_norm": 12.192906495433, "learning_rate": 4.981374723264311e-05, "loss": 2.267, "mean_token_accuracy": 0.45172413885593415, "step": 88300 }, { "epoch": 0.08894173387279534, "grad_norm": 13.580096100733952, "learning_rate": 4.98136991152674e-05, "loss": 3.1224, "mean_token_accuracy": 0.3448275804519653, "step": 88305 }, { "epoch": 0.08894676992589952, "grad_norm": 11.121474910423105, "learning_rate": 4.98136509917029e-05, "loss": 2.2364, "mean_token_accuracy": 0.4517241418361664, "step": 88310 }, { "epoch": 0.08895180597900369, "grad_norm": 12.726503836828549, "learning_rate": 4.9813602861949623e-05, "loss": 2.8347, "mean_token_accuracy": 0.3482758551836014, "step": 88315 }, { "epoch": 0.08895684203210787, "grad_norm": 11.227749281178523, "learning_rate": 4.981355472600759e-05, "loss": 2.6061, "mean_token_accuracy": 0.3827586114406586, "step": 88320 }, { "epoch": 0.08896187808521203, "grad_norm": 11.796305929356699, "learning_rate": 4.9813506583876815e-05, "loss": 2.7586, "mean_token_accuracy": 0.37586206793785093, "step": 88325 }, { "epoch": 0.0889669141383162, "grad_norm": 8.585885773844021, "learning_rate": 4.9813458435557296e-05, "loss": 2.349, "mean_token_accuracy": 0.42758620381355283, "step": 88330 }, { "epoch": 0.08897195019142037, "grad_norm": 14.752013523218668, "learning_rate": 4.981341028104906e-05, "loss": 2.6184, "mean_token_accuracy": 0.34137930870056155, "step": 88335 }, { "epoch": 0.08897698624452455, "grad_norm": 13.64467951166241, "learning_rate": 4.9813362120352115e-05, "loss": 2.7721, "mean_token_accuracy": 0.37241379618644715, "step": 88340 }, { "epoch": 0.08898202229762872, "grad_norm": 11.182612266176323, "learning_rate": 4.981331395346648e-05, "loss": 2.6699, "mean_token_accuracy": 0.42758620381355283, "step": 88345 }, { "epoch": 0.0889870583507329, "grad_norm": 11.778045830837229, "learning_rate": 4.981326578039216e-05, "loss": 2.8188, "mean_token_accuracy": 0.3571082890033722, "step": 88350 }, { "epoch": 0.08899209440383707, "grad_norm": 13.343582253881378, "learning_rate": 4.981321760112917e-05, "loss": 2.5702, "mean_token_accuracy": 0.42413792610168455, "step": 88355 }, { "epoch": 0.08899713045694124, "grad_norm": 10.803557516748283, "learning_rate": 4.981316941567754e-05, "loss": 2.5504, "mean_token_accuracy": 0.3965517163276672, "step": 88360 }, { "epoch": 0.08900216651004542, "grad_norm": 10.172451433090384, "learning_rate": 4.981312122403725e-05, "loss": 2.5646, "mean_token_accuracy": 0.37931033968925476, "step": 88365 }, { "epoch": 0.08900720256314959, "grad_norm": 12.586700150046994, "learning_rate": 4.9813073026208336e-05, "loss": 2.2526, "mean_token_accuracy": 0.41034482717514037, "step": 88370 }, { "epoch": 0.08901223861625376, "grad_norm": 9.05318852082888, "learning_rate": 4.981302482219082e-05, "loss": 2.108, "mean_token_accuracy": 0.47586206793785096, "step": 88375 }, { "epoch": 0.08901727466935794, "grad_norm": 10.418086001980356, "learning_rate": 4.981297661198469e-05, "loss": 2.3438, "mean_token_accuracy": 0.46551724672317507, "step": 88380 }, { "epoch": 0.08902231072246211, "grad_norm": 12.757490771064086, "learning_rate": 4.981292839558999e-05, "loss": 2.7367, "mean_token_accuracy": 0.3793103337287903, "step": 88385 }, { "epoch": 0.08902734677556629, "grad_norm": 11.454282546502752, "learning_rate": 4.98128801730067e-05, "loss": 2.2747, "mean_token_accuracy": 0.4413793087005615, "step": 88390 }, { "epoch": 0.08903238282867044, "grad_norm": 19.536367320268926, "learning_rate": 4.9812831944234853e-05, "loss": 2.6747, "mean_token_accuracy": 0.4103448212146759, "step": 88395 }, { "epoch": 0.08903741888177462, "grad_norm": 9.838760332683808, "learning_rate": 4.9812783709274455e-05, "loss": 2.4179, "mean_token_accuracy": 0.41185722351074217, "step": 88400 }, { "epoch": 0.08904245493487879, "grad_norm": 8.821647631950514, "learning_rate": 4.981273546812553e-05, "loss": 2.6125, "mean_token_accuracy": 0.4310344815254211, "step": 88405 }, { "epoch": 0.08904749098798297, "grad_norm": 10.797919908335528, "learning_rate": 4.9812687220788085e-05, "loss": 2.1077, "mean_token_accuracy": 0.45862069725990295, "step": 88410 }, { "epoch": 0.08905252704108714, "grad_norm": 10.20014079587903, "learning_rate": 4.9812638967262134e-05, "loss": 2.3504, "mean_token_accuracy": 0.4673926293849945, "step": 88415 }, { "epoch": 0.08905756309419131, "grad_norm": 19.3823305401709, "learning_rate": 4.981259070754768e-05, "loss": 2.4823, "mean_token_accuracy": 0.4517241418361664, "step": 88420 }, { "epoch": 0.08906259914729549, "grad_norm": 10.268804649975353, "learning_rate": 4.981254244164475e-05, "loss": 2.5385, "mean_token_accuracy": 0.4103448212146759, "step": 88425 }, { "epoch": 0.08906763520039966, "grad_norm": 11.306011665288132, "learning_rate": 4.981249416955336e-05, "loss": 2.4799, "mean_token_accuracy": 0.4068965494632721, "step": 88430 }, { "epoch": 0.08907267125350384, "grad_norm": 13.320936019408437, "learning_rate": 4.981244589127351e-05, "loss": 2.7569, "mean_token_accuracy": 0.41034482717514037, "step": 88435 }, { "epoch": 0.08907770730660801, "grad_norm": 9.832002826380656, "learning_rate": 4.9812397606805224e-05, "loss": 2.5133, "mean_token_accuracy": 0.4310344815254211, "step": 88440 }, { "epoch": 0.08908274335971218, "grad_norm": 12.017551111659943, "learning_rate": 4.9812349316148507e-05, "loss": 2.3956, "mean_token_accuracy": 0.4310344815254211, "step": 88445 }, { "epoch": 0.08908777941281636, "grad_norm": 15.94137603383071, "learning_rate": 4.981230101930337e-05, "loss": 2.576, "mean_token_accuracy": 0.4448275864124298, "step": 88450 }, { "epoch": 0.08909281546592053, "grad_norm": 13.717884695676045, "learning_rate": 4.981225271626985e-05, "loss": 2.8414, "mean_token_accuracy": 0.34827586114406583, "step": 88455 }, { "epoch": 0.0890978515190247, "grad_norm": 15.247413543689415, "learning_rate": 4.981220440704794e-05, "loss": 2.5534, "mean_token_accuracy": 0.3724137932062149, "step": 88460 }, { "epoch": 0.08910288757212886, "grad_norm": 9.403994355589962, "learning_rate": 4.9812156091637645e-05, "loss": 2.3149, "mean_token_accuracy": 0.4620689630508423, "step": 88465 }, { "epoch": 0.08910792362523304, "grad_norm": 13.912584185858721, "learning_rate": 4.9812107770039004e-05, "loss": 2.8523, "mean_token_accuracy": 0.38620689511299133, "step": 88470 }, { "epoch": 0.08911295967833721, "grad_norm": 13.406585293328076, "learning_rate": 4.981205944225201e-05, "loss": 2.3045, "mean_token_accuracy": 0.46551724076271056, "step": 88475 }, { "epoch": 0.08911799573144139, "grad_norm": 10.726055731378517, "learning_rate": 4.981201110827668e-05, "loss": 2.3646, "mean_token_accuracy": 0.4413793206214905, "step": 88480 }, { "epoch": 0.08912303178454556, "grad_norm": 8.211322244315545, "learning_rate": 4.981196276811303e-05, "loss": 2.3994, "mean_token_accuracy": 0.4931034505367279, "step": 88485 }, { "epoch": 0.08912806783764973, "grad_norm": 10.091749856479506, "learning_rate": 4.981191442176108e-05, "loss": 2.0191, "mean_token_accuracy": 0.5000000059604645, "step": 88490 }, { "epoch": 0.0891331038907539, "grad_norm": 16.788940807438482, "learning_rate": 4.981186606922083e-05, "loss": 2.4965, "mean_token_accuracy": 0.4517241418361664, "step": 88495 }, { "epoch": 0.08913813994385808, "grad_norm": 10.716928908616541, "learning_rate": 4.981181771049231e-05, "loss": 2.4667, "mean_token_accuracy": 0.40344826579093934, "step": 88500 }, { "epoch": 0.08914317599696225, "grad_norm": 9.822811137664917, "learning_rate": 4.981176934557552e-05, "loss": 2.3252, "mean_token_accuracy": 0.42758620381355283, "step": 88505 }, { "epoch": 0.08914821205006643, "grad_norm": 11.311884010511532, "learning_rate": 4.9811720974470475e-05, "loss": 2.1513, "mean_token_accuracy": 0.495160311460495, "step": 88510 }, { "epoch": 0.0891532481031706, "grad_norm": 11.551471667786691, "learning_rate": 4.981167259717719e-05, "loss": 2.8687, "mean_token_accuracy": 0.34137930572032926, "step": 88515 }, { "epoch": 0.08915828415627478, "grad_norm": 10.224425634835514, "learning_rate": 4.981162421369569e-05, "loss": 2.4248, "mean_token_accuracy": 0.42758620381355283, "step": 88520 }, { "epoch": 0.08916332020937895, "grad_norm": 10.120152346658063, "learning_rate": 4.981157582402596e-05, "loss": 2.5911, "mean_token_accuracy": 0.39461584091186525, "step": 88525 }, { "epoch": 0.08916835626248312, "grad_norm": 9.372358473185935, "learning_rate": 4.981152742816804e-05, "loss": 2.2235, "mean_token_accuracy": 0.46896551847457885, "step": 88530 }, { "epoch": 0.08917339231558728, "grad_norm": 12.72747694610874, "learning_rate": 4.9811479026121945e-05, "loss": 2.4633, "mean_token_accuracy": 0.41034482717514037, "step": 88535 }, { "epoch": 0.08917842836869146, "grad_norm": 11.283195217690508, "learning_rate": 4.981143061788767e-05, "loss": 2.0434, "mean_token_accuracy": 0.441379314661026, "step": 88540 }, { "epoch": 0.08918346442179563, "grad_norm": 11.617391086592413, "learning_rate": 4.981138220346523e-05, "loss": 2.6947, "mean_token_accuracy": 0.4000000059604645, "step": 88545 }, { "epoch": 0.0891885004748998, "grad_norm": 14.80718775723769, "learning_rate": 4.981133378285465e-05, "loss": 2.6486, "mean_token_accuracy": 0.3965517163276672, "step": 88550 }, { "epoch": 0.08919353652800398, "grad_norm": 10.813875284555783, "learning_rate": 4.981128535605594e-05, "loss": 2.2735, "mean_token_accuracy": 0.4586206912994385, "step": 88555 }, { "epoch": 0.08919857258110815, "grad_norm": 11.291826413437617, "learning_rate": 4.981123692306911e-05, "loss": 2.2853, "mean_token_accuracy": 0.45535390377044677, "step": 88560 }, { "epoch": 0.08920360863421233, "grad_norm": 11.697506059245041, "learning_rate": 4.981118848389418e-05, "loss": 2.5882, "mean_token_accuracy": 0.4068965554237366, "step": 88565 }, { "epoch": 0.0892086446873165, "grad_norm": 10.997245902251391, "learning_rate": 4.9811140038531154e-05, "loss": 2.5946, "mean_token_accuracy": 0.417241370677948, "step": 88570 }, { "epoch": 0.08921368074042067, "grad_norm": 10.083958968404207, "learning_rate": 4.9811091586980046e-05, "loss": 2.4107, "mean_token_accuracy": 0.42413792610168455, "step": 88575 }, { "epoch": 0.08921871679352485, "grad_norm": 8.281066632901785, "learning_rate": 4.981104312924088e-05, "loss": 2.486, "mean_token_accuracy": 0.4344827592372894, "step": 88580 }, { "epoch": 0.08922375284662902, "grad_norm": 13.570746231715745, "learning_rate": 4.981099466531366e-05, "loss": 2.7587, "mean_token_accuracy": 0.44827585816383364, "step": 88585 }, { "epoch": 0.0892287888997332, "grad_norm": 11.792382226269343, "learning_rate": 4.9810946195198404e-05, "loss": 2.5379, "mean_token_accuracy": 0.4310344815254211, "step": 88590 }, { "epoch": 0.08923382495283737, "grad_norm": 9.3939267167085, "learning_rate": 4.9810897718895124e-05, "loss": 2.4109, "mean_token_accuracy": 0.43103447556495667, "step": 88595 }, { "epoch": 0.08923886100594154, "grad_norm": 10.139717915996181, "learning_rate": 4.981084923640383e-05, "loss": 2.6587, "mean_token_accuracy": 0.41881427466869353, "step": 88600 }, { "epoch": 0.0892438970590457, "grad_norm": 12.457259495789197, "learning_rate": 4.981080074772454e-05, "loss": 2.0711, "mean_token_accuracy": 0.4862069010734558, "step": 88605 }, { "epoch": 0.08924893311214988, "grad_norm": 11.01616087442303, "learning_rate": 4.9810752252857274e-05, "loss": 2.7153, "mean_token_accuracy": 0.37241379022598264, "step": 88610 }, { "epoch": 0.08925396916525405, "grad_norm": 12.707151120157436, "learning_rate": 4.981070375180203e-05, "loss": 2.4768, "mean_token_accuracy": 0.43793103098869324, "step": 88615 }, { "epoch": 0.08925900521835822, "grad_norm": 11.747031254045655, "learning_rate": 4.9810655244558826e-05, "loss": 2.6156, "mean_token_accuracy": 0.4, "step": 88620 }, { "epoch": 0.0892640412714624, "grad_norm": 12.035667167847626, "learning_rate": 4.9810606731127686e-05, "loss": 2.6258, "mean_token_accuracy": 0.38965516686439516, "step": 88625 }, { "epoch": 0.08926907732456657, "grad_norm": 10.607826688754747, "learning_rate": 4.981055821150861e-05, "loss": 2.5199, "mean_token_accuracy": 0.4256503224372864, "step": 88630 }, { "epoch": 0.08927411337767074, "grad_norm": 11.601784277743983, "learning_rate": 4.981050968570162e-05, "loss": 2.4547, "mean_token_accuracy": 0.42758620977401735, "step": 88635 }, { "epoch": 0.08927914943077492, "grad_norm": 11.320044625146535, "learning_rate": 4.9810461153706726e-05, "loss": 2.5633, "mean_token_accuracy": 0.4000000059604645, "step": 88640 }, { "epoch": 0.08928418548387909, "grad_norm": 16.309084913136417, "learning_rate": 4.981041261552394e-05, "loss": 2.8536, "mean_token_accuracy": 0.36551723480224607, "step": 88645 }, { "epoch": 0.08928922153698327, "grad_norm": 11.73820038773606, "learning_rate": 4.981036407115329e-05, "loss": 2.0621, "mean_token_accuracy": 0.517241370677948, "step": 88650 }, { "epoch": 0.08929425759008744, "grad_norm": 9.873145075753127, "learning_rate": 4.9810315520594765e-05, "loss": 2.1935, "mean_token_accuracy": 0.42413792610168455, "step": 88655 }, { "epoch": 0.08929929364319161, "grad_norm": 12.225217101135746, "learning_rate": 4.981026696384839e-05, "loss": 2.3013, "mean_token_accuracy": 0.4586206912994385, "step": 88660 }, { "epoch": 0.08930432969629579, "grad_norm": 10.782369566066784, "learning_rate": 4.981021840091419e-05, "loss": 2.404, "mean_token_accuracy": 0.42758620381355283, "step": 88665 }, { "epoch": 0.08930936574939996, "grad_norm": 10.959229780250883, "learning_rate": 4.9810169831792156e-05, "loss": 2.3447, "mean_token_accuracy": 0.4206896543502808, "step": 88670 }, { "epoch": 0.08931440180250412, "grad_norm": 10.128813364265458, "learning_rate": 4.9810121256482325e-05, "loss": 2.6536, "mean_token_accuracy": 0.4206896543502808, "step": 88675 }, { "epoch": 0.0893194378556083, "grad_norm": 11.530614708630122, "learning_rate": 4.981007267498469e-05, "loss": 2.4418, "mean_token_accuracy": 0.3896551787853241, "step": 88680 }, { "epoch": 0.08932447390871247, "grad_norm": 13.54442770991281, "learning_rate": 4.981002408729927e-05, "loss": 2.6123, "mean_token_accuracy": 0.3862068891525269, "step": 88685 }, { "epoch": 0.08932950996181664, "grad_norm": 12.256078575058927, "learning_rate": 4.9809975493426085e-05, "loss": 2.0949, "mean_token_accuracy": 0.47241379618644713, "step": 88690 }, { "epoch": 0.08933454601492082, "grad_norm": 11.681332258014175, "learning_rate": 4.980992689336515e-05, "loss": 2.2527, "mean_token_accuracy": 0.443315190076828, "step": 88695 }, { "epoch": 0.08933958206802499, "grad_norm": 19.221952606856398, "learning_rate": 4.9809878287116466e-05, "loss": 2.2652, "mean_token_accuracy": 0.4620689690113068, "step": 88700 }, { "epoch": 0.08934461812112916, "grad_norm": 10.748527206752398, "learning_rate": 4.9809829674680064e-05, "loss": 2.3248, "mean_token_accuracy": 0.42413792610168455, "step": 88705 }, { "epoch": 0.08934965417423334, "grad_norm": 19.02627348692893, "learning_rate": 4.980978105605594e-05, "loss": 2.7091, "mean_token_accuracy": 0.34482758641242983, "step": 88710 }, { "epoch": 0.08935469022733751, "grad_norm": 12.880097884012867, "learning_rate": 4.980973243124412e-05, "loss": 2.594, "mean_token_accuracy": 0.3896551728248596, "step": 88715 }, { "epoch": 0.08935972628044168, "grad_norm": 11.691876961986177, "learning_rate": 4.980968380024461e-05, "loss": 2.2871, "mean_token_accuracy": 0.4344827592372894, "step": 88720 }, { "epoch": 0.08936476233354586, "grad_norm": 9.820351069932935, "learning_rate": 4.980963516305742e-05, "loss": 2.2912, "mean_token_accuracy": 0.5137930929660797, "step": 88725 }, { "epoch": 0.08936979838665003, "grad_norm": 11.811294119579873, "learning_rate": 4.980958651968258e-05, "loss": 2.5051, "mean_token_accuracy": 0.3931034505367279, "step": 88730 }, { "epoch": 0.0893748344397542, "grad_norm": 9.730203203550879, "learning_rate": 4.980953787012008e-05, "loss": 2.0427, "mean_token_accuracy": 0.4931034505367279, "step": 88735 }, { "epoch": 0.08937987049285838, "grad_norm": 25.87116323521781, "learning_rate": 4.980948921436996e-05, "loss": 2.0511, "mean_token_accuracy": 0.47586207985877993, "step": 88740 }, { "epoch": 0.08938490654596254, "grad_norm": 12.112253550446777, "learning_rate": 4.980944055243221e-05, "loss": 2.4128, "mean_token_accuracy": 0.3655172437429428, "step": 88745 }, { "epoch": 0.08938994259906671, "grad_norm": 12.748044329990211, "learning_rate": 4.980939188430686e-05, "loss": 2.509, "mean_token_accuracy": 0.4379310369491577, "step": 88750 }, { "epoch": 0.08939497865217089, "grad_norm": 15.257796320872282, "learning_rate": 4.9809343209993917e-05, "loss": 2.6401, "mean_token_accuracy": 0.39818512797355654, "step": 88755 }, { "epoch": 0.08940001470527506, "grad_norm": 15.912562536718099, "learning_rate": 4.980929452949339e-05, "loss": 2.4125, "mean_token_accuracy": 0.38965516686439516, "step": 88760 }, { "epoch": 0.08940505075837923, "grad_norm": 11.136093316441404, "learning_rate": 4.98092458428053e-05, "loss": 2.7067, "mean_token_accuracy": 0.3551724135875702, "step": 88765 }, { "epoch": 0.08941008681148341, "grad_norm": 11.794137154006105, "learning_rate": 4.980919714992965e-05, "loss": 2.2529, "mean_token_accuracy": 0.4641863226890564, "step": 88770 }, { "epoch": 0.08941512286458758, "grad_norm": 18.425034497237206, "learning_rate": 4.980914845086647e-05, "loss": 2.718, "mean_token_accuracy": 0.42413792610168455, "step": 88775 }, { "epoch": 0.08942015891769176, "grad_norm": 14.373899774130383, "learning_rate": 4.980909974561577e-05, "loss": 2.3775, "mean_token_accuracy": 0.4241379380226135, "step": 88780 }, { "epoch": 0.08942519497079593, "grad_norm": 10.927495392278031, "learning_rate": 4.9809051034177545e-05, "loss": 2.588, "mean_token_accuracy": 0.3517241358757019, "step": 88785 }, { "epoch": 0.0894302310239001, "grad_norm": 10.760302741924633, "learning_rate": 4.980900231655182e-05, "loss": 2.344, "mean_token_accuracy": 0.4034482717514038, "step": 88790 }, { "epoch": 0.08943526707700428, "grad_norm": 11.710142723926088, "learning_rate": 4.980895359273862e-05, "loss": 2.504, "mean_token_accuracy": 0.3931034505367279, "step": 88795 }, { "epoch": 0.08944030313010845, "grad_norm": 11.430697738447106, "learning_rate": 4.980890486273794e-05, "loss": 2.7736, "mean_token_accuracy": 0.3896551728248596, "step": 88800 }, { "epoch": 0.08944533918321262, "grad_norm": 10.420876302055264, "learning_rate": 4.9808856126549815e-05, "loss": 2.4461, "mean_token_accuracy": 0.4034482777118683, "step": 88805 }, { "epoch": 0.0894503752363168, "grad_norm": 9.85966700874723, "learning_rate": 4.9808807384174235e-05, "loss": 2.3485, "mean_token_accuracy": 0.38965516686439516, "step": 88810 }, { "epoch": 0.08945541128942096, "grad_norm": 14.021783881127808, "learning_rate": 4.980875863561122e-05, "loss": 2.5708, "mean_token_accuracy": 0.3827586114406586, "step": 88815 }, { "epoch": 0.08946044734252513, "grad_norm": 10.97386090813227, "learning_rate": 4.98087098808608e-05, "loss": 2.5292, "mean_token_accuracy": 0.4068965494632721, "step": 88820 }, { "epoch": 0.0894654833956293, "grad_norm": 12.39595984720901, "learning_rate": 4.9808661119922975e-05, "loss": 2.5683, "mean_token_accuracy": 0.4034482717514038, "step": 88825 }, { "epoch": 0.08947051944873348, "grad_norm": 9.059698588952815, "learning_rate": 4.980861235279774e-05, "loss": 2.3682, "mean_token_accuracy": 0.41379310488700866, "step": 88830 }, { "epoch": 0.08947555550183765, "grad_norm": 8.812770000059357, "learning_rate": 4.980856357948515e-05, "loss": 2.7104, "mean_token_accuracy": 0.41034482717514037, "step": 88835 }, { "epoch": 0.08948059155494183, "grad_norm": 9.401394732155904, "learning_rate": 4.980851479998519e-05, "loss": 2.6027, "mean_token_accuracy": 0.4206896543502808, "step": 88840 }, { "epoch": 0.089485627608046, "grad_norm": 10.184007204548672, "learning_rate": 4.980846601429788e-05, "loss": 2.1776, "mean_token_accuracy": 0.44482759237289426, "step": 88845 }, { "epoch": 0.08949066366115017, "grad_norm": 11.269501231204929, "learning_rate": 4.980841722242323e-05, "loss": 2.1187, "mean_token_accuracy": 0.4758620738983154, "step": 88850 }, { "epoch": 0.08949569971425435, "grad_norm": 11.630401138207038, "learning_rate": 4.9808368424361265e-05, "loss": 2.305, "mean_token_accuracy": 0.40344826579093934, "step": 88855 }, { "epoch": 0.08950073576735852, "grad_norm": 12.86499098309019, "learning_rate": 4.980831962011198e-05, "loss": 2.5492, "mean_token_accuracy": 0.42413793206214906, "step": 88860 }, { "epoch": 0.0895057718204627, "grad_norm": 10.232198629674695, "learning_rate": 4.9808270809675406e-05, "loss": 2.2401, "mean_token_accuracy": 0.47241379618644713, "step": 88865 }, { "epoch": 0.08951080787356687, "grad_norm": 11.246561723037441, "learning_rate": 4.9808221993051554e-05, "loss": 2.5824, "mean_token_accuracy": 0.3862068921327591, "step": 88870 }, { "epoch": 0.08951584392667104, "grad_norm": 10.776144683650923, "learning_rate": 4.980817317024043e-05, "loss": 2.1402, "mean_token_accuracy": 0.4413793087005615, "step": 88875 }, { "epoch": 0.08952087997977522, "grad_norm": 12.116261601442222, "learning_rate": 4.980812434124204e-05, "loss": 2.6475, "mean_token_accuracy": 0.4, "step": 88880 }, { "epoch": 0.08952591603287938, "grad_norm": 10.781475579518753, "learning_rate": 4.9808075506056426e-05, "loss": 2.8414, "mean_token_accuracy": 0.3517241418361664, "step": 88885 }, { "epoch": 0.08953095208598355, "grad_norm": 9.480294957987562, "learning_rate": 4.980802666468358e-05, "loss": 2.3973, "mean_token_accuracy": 0.4172413766384125, "step": 88890 }, { "epoch": 0.08953598813908772, "grad_norm": 11.729358087061678, "learning_rate": 4.9807977817123514e-05, "loss": 3.0077, "mean_token_accuracy": 0.39655172228813174, "step": 88895 }, { "epoch": 0.0895410241921919, "grad_norm": 12.384990244062646, "learning_rate": 4.980792896337625e-05, "loss": 2.0314, "mean_token_accuracy": 0.44827585816383364, "step": 88900 }, { "epoch": 0.08954606024529607, "grad_norm": 11.668598783205345, "learning_rate": 4.980788010344179e-05, "loss": 2.2008, "mean_token_accuracy": 0.4931034505367279, "step": 88905 }, { "epoch": 0.08955109629840025, "grad_norm": 16.795184590849257, "learning_rate": 4.9807831237320166e-05, "loss": 2.664, "mean_token_accuracy": 0.38620689511299133, "step": 88910 }, { "epoch": 0.08955613235150442, "grad_norm": 12.34637462326204, "learning_rate": 4.9807782365011385e-05, "loss": 2.7931, "mean_token_accuracy": 0.36551723480224607, "step": 88915 }, { "epoch": 0.0895611684046086, "grad_norm": 10.34173261495006, "learning_rate": 4.980773348651545e-05, "loss": 2.5549, "mean_token_accuracy": 0.4379310250282288, "step": 88920 }, { "epoch": 0.08956620445771277, "grad_norm": 10.658613668468902, "learning_rate": 4.980768460183238e-05, "loss": 2.9469, "mean_token_accuracy": 0.37586206793785093, "step": 88925 }, { "epoch": 0.08957124051081694, "grad_norm": 10.716763629797212, "learning_rate": 4.9807635710962196e-05, "loss": 2.6615, "mean_token_accuracy": 0.4034482717514038, "step": 88930 }, { "epoch": 0.08957627656392111, "grad_norm": 11.284583244256021, "learning_rate": 4.980758681390491e-05, "loss": 2.5194, "mean_token_accuracy": 0.4310344815254211, "step": 88935 }, { "epoch": 0.08958131261702529, "grad_norm": 10.856493708208125, "learning_rate": 4.9807537910660526e-05, "loss": 2.4815, "mean_token_accuracy": 0.3896551728248596, "step": 88940 }, { "epoch": 0.08958634867012946, "grad_norm": 11.126605977991746, "learning_rate": 4.9807489001229055e-05, "loss": 2.3011, "mean_token_accuracy": 0.42413793206214906, "step": 88945 }, { "epoch": 0.08959138472323364, "grad_norm": 12.783895315112714, "learning_rate": 4.980744008561053e-05, "loss": 2.2365, "mean_token_accuracy": 0.458620685338974, "step": 88950 }, { "epoch": 0.0895964207763378, "grad_norm": 11.93786668747794, "learning_rate": 4.980739116380495e-05, "loss": 2.8233, "mean_token_accuracy": 0.3862069010734558, "step": 88955 }, { "epoch": 0.08960145682944197, "grad_norm": 9.96566782357653, "learning_rate": 4.9807342235812334e-05, "loss": 2.3387, "mean_token_accuracy": 0.458620685338974, "step": 88960 }, { "epoch": 0.08960649288254614, "grad_norm": 9.591225869909564, "learning_rate": 4.980729330163269e-05, "loss": 2.2193, "mean_token_accuracy": 0.4896551728248596, "step": 88965 }, { "epoch": 0.08961152893565032, "grad_norm": 11.419279517587784, "learning_rate": 4.980724436126604e-05, "loss": 2.6288, "mean_token_accuracy": 0.41034482717514037, "step": 88970 }, { "epoch": 0.08961656498875449, "grad_norm": 9.68558232542842, "learning_rate": 4.9807195414712385e-05, "loss": 1.9563, "mean_token_accuracy": 0.5206896543502808, "step": 88975 }, { "epoch": 0.08962160104185866, "grad_norm": 17.290443635755594, "learning_rate": 4.980714646197175e-05, "loss": 2.2139, "mean_token_accuracy": 0.45517240166664125, "step": 88980 }, { "epoch": 0.08962663709496284, "grad_norm": 11.754790486088517, "learning_rate": 4.980709750304415e-05, "loss": 2.7672, "mean_token_accuracy": 0.39655172228813174, "step": 88985 }, { "epoch": 0.08963167314806701, "grad_norm": 8.548199383403793, "learning_rate": 4.980704853792958e-05, "loss": 2.4781, "mean_token_accuracy": 0.4551724076271057, "step": 88990 }, { "epoch": 0.08963670920117119, "grad_norm": 12.300607751217163, "learning_rate": 4.9806999566628074e-05, "loss": 2.4459, "mean_token_accuracy": 0.40169388651847837, "step": 88995 }, { "epoch": 0.08964174525427536, "grad_norm": 11.928165946405505, "learning_rate": 4.980695058913965e-05, "loss": 2.2292, "mean_token_accuracy": 0.44482758045196535, "step": 89000 }, { "epoch": 0.08964678130737953, "grad_norm": 12.320084509580049, "learning_rate": 4.980690160546429e-05, "loss": 2.4652, "mean_token_accuracy": 0.44682395458221436, "step": 89005 }, { "epoch": 0.08965181736048371, "grad_norm": 12.67278095080595, "learning_rate": 4.980685261560204e-05, "loss": 2.8141, "mean_token_accuracy": 0.3586206823587418, "step": 89010 }, { "epoch": 0.08965685341358788, "grad_norm": 12.892253987390248, "learning_rate": 4.98068036195529e-05, "loss": 2.2555, "mean_token_accuracy": 0.47241379618644713, "step": 89015 }, { "epoch": 0.08966188946669205, "grad_norm": 11.173543877242919, "learning_rate": 4.980675461731689e-05, "loss": 2.7552, "mean_token_accuracy": 0.42413793206214906, "step": 89020 }, { "epoch": 0.08966692551979621, "grad_norm": 14.26831942806606, "learning_rate": 4.9806705608894004e-05, "loss": 2.2437, "mean_token_accuracy": 0.4517241418361664, "step": 89025 }, { "epoch": 0.08967196157290039, "grad_norm": 11.138337558261712, "learning_rate": 4.980665659428428e-05, "loss": 2.8268, "mean_token_accuracy": 0.4103448212146759, "step": 89030 }, { "epoch": 0.08967699762600456, "grad_norm": 10.031194739446683, "learning_rate": 4.980660757348772e-05, "loss": 3.1029, "mean_token_accuracy": 0.3896551728248596, "step": 89035 }, { "epoch": 0.08968203367910874, "grad_norm": 9.442011550358695, "learning_rate": 4.980655854650434e-05, "loss": 2.2487, "mean_token_accuracy": 0.42413793206214906, "step": 89040 }, { "epoch": 0.08968706973221291, "grad_norm": 11.161102510269616, "learning_rate": 4.980650951333414e-05, "loss": 2.6213, "mean_token_accuracy": 0.3931034505367279, "step": 89045 }, { "epoch": 0.08969210578531708, "grad_norm": 10.34183894956065, "learning_rate": 4.980646047397716e-05, "loss": 2.4111, "mean_token_accuracy": 0.41034482419490814, "step": 89050 }, { "epoch": 0.08969714183842126, "grad_norm": 11.031198680088721, "learning_rate": 4.98064114284334e-05, "loss": 2.552, "mean_token_accuracy": 0.3758620649576187, "step": 89055 }, { "epoch": 0.08970217789152543, "grad_norm": 10.513530044492862, "learning_rate": 4.980636237670287e-05, "loss": 2.6727, "mean_token_accuracy": 0.3482758641242981, "step": 89060 }, { "epoch": 0.0897072139446296, "grad_norm": 12.392702818878702, "learning_rate": 4.9806313318785586e-05, "loss": 2.7229, "mean_token_accuracy": 0.41724138855934145, "step": 89065 }, { "epoch": 0.08971224999773378, "grad_norm": 10.058024018084051, "learning_rate": 4.980626425468157e-05, "loss": 2.058, "mean_token_accuracy": 0.4689655125141144, "step": 89070 }, { "epoch": 0.08971728605083795, "grad_norm": 9.750503298341185, "learning_rate": 4.980621518439082e-05, "loss": 2.3217, "mean_token_accuracy": 0.4465819835662842, "step": 89075 }, { "epoch": 0.08972232210394213, "grad_norm": 10.478791689941625, "learning_rate": 4.9806166107913364e-05, "loss": 2.4015, "mean_token_accuracy": 0.41379310488700866, "step": 89080 }, { "epoch": 0.0897273581570463, "grad_norm": 11.212419021102791, "learning_rate": 4.98061170252492e-05, "loss": 2.5125, "mean_token_accuracy": 0.43641863465309144, "step": 89085 }, { "epoch": 0.08973239421015047, "grad_norm": 12.010196342465585, "learning_rate": 4.9806067936398366e-05, "loss": 2.4077, "mean_token_accuracy": 0.44827585816383364, "step": 89090 }, { "epoch": 0.08973743026325463, "grad_norm": 11.746203634177599, "learning_rate": 4.9806018841360855e-05, "loss": 2.4096, "mean_token_accuracy": 0.42758620977401735, "step": 89095 }, { "epoch": 0.08974246631635881, "grad_norm": 11.398230844590623, "learning_rate": 4.9805969740136684e-05, "loss": 2.4469, "mean_token_accuracy": 0.403448274731636, "step": 89100 }, { "epoch": 0.08974750236946298, "grad_norm": 9.80675841969019, "learning_rate": 4.9805920632725865e-05, "loss": 2.1556, "mean_token_accuracy": 0.4517241418361664, "step": 89105 }, { "epoch": 0.08975253842256715, "grad_norm": 10.946226520240662, "learning_rate": 4.980587151912842e-05, "loss": 2.3536, "mean_token_accuracy": 0.44700543880462645, "step": 89110 }, { "epoch": 0.08975757447567133, "grad_norm": 10.053619413286702, "learning_rate": 4.980582239934436e-05, "loss": 2.157, "mean_token_accuracy": 0.4620689690113068, "step": 89115 }, { "epoch": 0.0897626105287755, "grad_norm": 11.446653916110963, "learning_rate": 4.980577327337369e-05, "loss": 2.2358, "mean_token_accuracy": 0.4758620738983154, "step": 89120 }, { "epoch": 0.08976764658187968, "grad_norm": 12.066149768237091, "learning_rate": 4.980572414121644e-05, "loss": 2.8071, "mean_token_accuracy": 0.39655172228813174, "step": 89125 }, { "epoch": 0.08977268263498385, "grad_norm": 9.168009126835535, "learning_rate": 4.980567500287261e-05, "loss": 2.5513, "mean_token_accuracy": 0.3965517282485962, "step": 89130 }, { "epoch": 0.08977771868808802, "grad_norm": 12.152801571607393, "learning_rate": 4.980562585834222e-05, "loss": 2.8497, "mean_token_accuracy": 0.3827586233615875, "step": 89135 }, { "epoch": 0.0897827547411922, "grad_norm": 10.78609264106069, "learning_rate": 4.980557670762528e-05, "loss": 2.3863, "mean_token_accuracy": 0.4537810027599335, "step": 89140 }, { "epoch": 0.08978779079429637, "grad_norm": 12.53664476511771, "learning_rate": 4.9805527550721805e-05, "loss": 2.6806, "mean_token_accuracy": 0.379310342669487, "step": 89145 }, { "epoch": 0.08979282684740054, "grad_norm": 11.653323448303745, "learning_rate": 4.9805478387631803e-05, "loss": 2.5135, "mean_token_accuracy": 0.41724138259887694, "step": 89150 }, { "epoch": 0.08979786290050472, "grad_norm": 11.781907634974027, "learning_rate": 4.9805429218355304e-05, "loss": 2.3832, "mean_token_accuracy": 0.4034482717514038, "step": 89155 }, { "epoch": 0.08980289895360889, "grad_norm": 11.120036669823298, "learning_rate": 4.98053800428923e-05, "loss": 2.2474, "mean_token_accuracy": 0.4610405325889587, "step": 89160 }, { "epoch": 0.08980793500671305, "grad_norm": 12.561784539326514, "learning_rate": 4.9805330861242824e-05, "loss": 2.7697, "mean_token_accuracy": 0.3655172437429428, "step": 89165 }, { "epoch": 0.08981297105981723, "grad_norm": 10.710851267986031, "learning_rate": 4.9805281673406884e-05, "loss": 2.8721, "mean_token_accuracy": 0.3896551728248596, "step": 89170 }, { "epoch": 0.0898180071129214, "grad_norm": 11.43177074189504, "learning_rate": 4.9805232479384486e-05, "loss": 2.2844, "mean_token_accuracy": 0.42413792610168455, "step": 89175 }, { "epoch": 0.08982304316602557, "grad_norm": 10.47288126508769, "learning_rate": 4.980518327917564e-05, "loss": 2.2873, "mean_token_accuracy": 0.458620685338974, "step": 89180 }, { "epoch": 0.08982807921912975, "grad_norm": 10.841134882792245, "learning_rate": 4.980513407278038e-05, "loss": 2.2116, "mean_token_accuracy": 0.42758620977401735, "step": 89185 }, { "epoch": 0.08983311527223392, "grad_norm": 9.69031705929063, "learning_rate": 4.9805084860198704e-05, "loss": 2.7386, "mean_token_accuracy": 0.41379310488700866, "step": 89190 }, { "epoch": 0.0898381513253381, "grad_norm": 9.098758904633891, "learning_rate": 4.980503564143063e-05, "loss": 2.3527, "mean_token_accuracy": 0.4310344785451889, "step": 89195 }, { "epoch": 0.08984318737844227, "grad_norm": 11.033472964786132, "learning_rate": 4.980498641647617e-05, "loss": 2.4341, "mean_token_accuracy": 0.4600121021270752, "step": 89200 }, { "epoch": 0.08984822343154644, "grad_norm": 9.506357489399608, "learning_rate": 4.980493718533534e-05, "loss": 2.1277, "mean_token_accuracy": 0.42068966031074523, "step": 89205 }, { "epoch": 0.08985325948465062, "grad_norm": 10.247112141175057, "learning_rate": 4.980488794800815e-05, "loss": 2.4222, "mean_token_accuracy": 0.4413793087005615, "step": 89210 }, { "epoch": 0.08985829553775479, "grad_norm": 11.062786215659182, "learning_rate": 4.980483870449462e-05, "loss": 2.2784, "mean_token_accuracy": 0.4517241299152374, "step": 89215 }, { "epoch": 0.08986333159085896, "grad_norm": 10.154611166102077, "learning_rate": 4.980478945479476e-05, "loss": 2.5179, "mean_token_accuracy": 0.3724137842655182, "step": 89220 }, { "epoch": 0.08986836764396314, "grad_norm": 10.978198502735562, "learning_rate": 4.980474019890858e-05, "loss": 2.9226, "mean_token_accuracy": 0.4, "step": 89225 }, { "epoch": 0.08987340369706731, "grad_norm": 10.962235650044633, "learning_rate": 4.98046909368361e-05, "loss": 2.4186, "mean_token_accuracy": 0.4034482717514038, "step": 89230 }, { "epoch": 0.08987843975017147, "grad_norm": 11.383121740940902, "learning_rate": 4.980464166857733e-05, "loss": 2.3145, "mean_token_accuracy": 0.3896551728248596, "step": 89235 }, { "epoch": 0.08988347580327564, "grad_norm": 11.139209226289138, "learning_rate": 4.980459239413228e-05, "loss": 2.2044, "mean_token_accuracy": 0.4551724076271057, "step": 89240 }, { "epoch": 0.08988851185637982, "grad_norm": 10.893190142158977, "learning_rate": 4.980454311350097e-05, "loss": 2.3664, "mean_token_accuracy": 0.42758620977401735, "step": 89245 }, { "epoch": 0.08989354790948399, "grad_norm": 11.585124757743001, "learning_rate": 4.9804493826683416e-05, "loss": 3.0074, "mean_token_accuracy": 0.31451905369758604, "step": 89250 }, { "epoch": 0.08989858396258817, "grad_norm": 10.74201603368157, "learning_rate": 4.980444453367962e-05, "loss": 2.1617, "mean_token_accuracy": 0.42068966031074523, "step": 89255 }, { "epoch": 0.08990362001569234, "grad_norm": 11.474935523830943, "learning_rate": 4.980439523448961e-05, "loss": 2.4509, "mean_token_accuracy": 0.4000000059604645, "step": 89260 }, { "epoch": 0.08990865606879651, "grad_norm": 9.877916416891493, "learning_rate": 4.980434592911339e-05, "loss": 1.9656, "mean_token_accuracy": 0.5068014919757843, "step": 89265 }, { "epoch": 0.08991369212190069, "grad_norm": 12.763712341506066, "learning_rate": 4.9804296617550976e-05, "loss": 2.3534, "mean_token_accuracy": 0.417241370677948, "step": 89270 }, { "epoch": 0.08991872817500486, "grad_norm": 13.078230341401856, "learning_rate": 4.980424729980239e-05, "loss": 2.5354, "mean_token_accuracy": 0.4224440515041351, "step": 89275 }, { "epoch": 0.08992376422810903, "grad_norm": 9.49898908395129, "learning_rate": 4.9804197975867626e-05, "loss": 2.5313, "mean_token_accuracy": 0.413793095946312, "step": 89280 }, { "epoch": 0.08992880028121321, "grad_norm": 12.266704101766987, "learning_rate": 4.980414864574672e-05, "loss": 2.4059, "mean_token_accuracy": 0.4672111332416534, "step": 89285 }, { "epoch": 0.08993383633431738, "grad_norm": 9.959348986637753, "learning_rate": 4.9804099309439666e-05, "loss": 2.8252, "mean_token_accuracy": 0.37931033968925476, "step": 89290 }, { "epoch": 0.08993887238742156, "grad_norm": 9.945778559127607, "learning_rate": 4.98040499669465e-05, "loss": 2.3529, "mean_token_accuracy": 0.4000000059604645, "step": 89295 }, { "epoch": 0.08994390844052573, "grad_norm": 14.949670262162178, "learning_rate": 4.98040006182672e-05, "loss": 2.607, "mean_token_accuracy": 0.4034482777118683, "step": 89300 }, { "epoch": 0.08994894449362989, "grad_norm": 11.37624161930812, "learning_rate": 4.980395126340182e-05, "loss": 2.457, "mean_token_accuracy": 0.41724138259887694, "step": 89305 }, { "epoch": 0.08995398054673406, "grad_norm": 12.02775055752868, "learning_rate": 4.9803901902350354e-05, "loss": 2.3368, "mean_token_accuracy": 0.45353902578353883, "step": 89310 }, { "epoch": 0.08995901659983824, "grad_norm": 9.431806944366356, "learning_rate": 4.9803852535112815e-05, "loss": 2.4278, "mean_token_accuracy": 0.4758620738983154, "step": 89315 }, { "epoch": 0.08996405265294241, "grad_norm": 8.607377616206996, "learning_rate": 4.980380316168922e-05, "loss": 2.5854, "mean_token_accuracy": 0.441379314661026, "step": 89320 }, { "epoch": 0.08996908870604658, "grad_norm": 11.406114013034856, "learning_rate": 4.9803753782079585e-05, "loss": 2.4872, "mean_token_accuracy": 0.3931034505367279, "step": 89325 }, { "epoch": 0.08997412475915076, "grad_norm": 12.909625668938764, "learning_rate": 4.9803704396283914e-05, "loss": 2.2817, "mean_token_accuracy": 0.4103448301553726, "step": 89330 }, { "epoch": 0.08997916081225493, "grad_norm": 11.329548408424255, "learning_rate": 4.980365500430223e-05, "loss": 2.3183, "mean_token_accuracy": 0.4261947929859161, "step": 89335 }, { "epoch": 0.0899841968653591, "grad_norm": 13.03113378760136, "learning_rate": 4.980360560613454e-05, "loss": 2.5486, "mean_token_accuracy": 0.42758620977401735, "step": 89340 }, { "epoch": 0.08998923291846328, "grad_norm": 9.738836440166937, "learning_rate": 4.980355620178087e-05, "loss": 2.65, "mean_token_accuracy": 0.3931034505367279, "step": 89345 }, { "epoch": 0.08999426897156745, "grad_norm": 9.312330069141487, "learning_rate": 4.980350679124122e-05, "loss": 2.1247, "mean_token_accuracy": 0.4448275864124298, "step": 89350 }, { "epoch": 0.08999930502467163, "grad_norm": 11.791326709408748, "learning_rate": 4.980345737451561e-05, "loss": 2.4289, "mean_token_accuracy": 0.41034482717514037, "step": 89355 }, { "epoch": 0.0900043410777758, "grad_norm": 12.633562027631791, "learning_rate": 4.980340795160406e-05, "loss": 3.0207, "mean_token_accuracy": 0.29655171632766725, "step": 89360 }, { "epoch": 0.09000937713087998, "grad_norm": 9.34854942967477, "learning_rate": 4.980335852250657e-05, "loss": 2.456, "mean_token_accuracy": 0.4517241358757019, "step": 89365 }, { "epoch": 0.09001441318398415, "grad_norm": 10.922874467958811, "learning_rate": 4.980330908722315e-05, "loss": 2.628, "mean_token_accuracy": 0.4034482777118683, "step": 89370 }, { "epoch": 0.09001944923708831, "grad_norm": 10.59371329513189, "learning_rate": 4.980325964575384e-05, "loss": 2.3972, "mean_token_accuracy": 0.42068966031074523, "step": 89375 }, { "epoch": 0.09002448529019248, "grad_norm": 9.766748815422012, "learning_rate": 4.9803210198098636e-05, "loss": 2.3807, "mean_token_accuracy": 0.4689655125141144, "step": 89380 }, { "epoch": 0.09002952134329666, "grad_norm": 9.475546826070007, "learning_rate": 4.980316074425754e-05, "loss": 2.154, "mean_token_accuracy": 0.441379314661026, "step": 89385 }, { "epoch": 0.09003455739640083, "grad_norm": 11.64292061366294, "learning_rate": 4.9803111284230594e-05, "loss": 2.629, "mean_token_accuracy": 0.4379310429096222, "step": 89390 }, { "epoch": 0.090039593449505, "grad_norm": 10.984056284813985, "learning_rate": 4.980306181801779e-05, "loss": 2.5703, "mean_token_accuracy": 0.42068964838981626, "step": 89395 }, { "epoch": 0.09004462950260918, "grad_norm": 19.264303809883685, "learning_rate": 4.980301234561916e-05, "loss": 2.4223, "mean_token_accuracy": 0.42068964838981626, "step": 89400 }, { "epoch": 0.09004966555571335, "grad_norm": 9.419958913467397, "learning_rate": 4.980296286703469e-05, "loss": 2.2604, "mean_token_accuracy": 0.43793103098869324, "step": 89405 }, { "epoch": 0.09005470160881753, "grad_norm": 9.936090255601128, "learning_rate": 4.980291338226442e-05, "loss": 2.3986, "mean_token_accuracy": 0.41034482717514037, "step": 89410 }, { "epoch": 0.0900597376619217, "grad_norm": 9.96388540448984, "learning_rate": 4.980286389130835e-05, "loss": 2.5763, "mean_token_accuracy": 0.4172413766384125, "step": 89415 }, { "epoch": 0.09006477371502587, "grad_norm": 10.24179014970209, "learning_rate": 4.98028143941665e-05, "loss": 2.7051, "mean_token_accuracy": 0.3551724195480347, "step": 89420 }, { "epoch": 0.09006980976813005, "grad_norm": 12.894785073754278, "learning_rate": 4.980276489083888e-05, "loss": 2.4915, "mean_token_accuracy": 0.42450737953186035, "step": 89425 }, { "epoch": 0.09007484582123422, "grad_norm": 12.648845959972073, "learning_rate": 4.98027153813255e-05, "loss": 2.3551, "mean_token_accuracy": 0.4241379380226135, "step": 89430 }, { "epoch": 0.0900798818743384, "grad_norm": 13.979314309011984, "learning_rate": 4.9802665865626385e-05, "loss": 2.687, "mean_token_accuracy": 0.39655172526836396, "step": 89435 }, { "epoch": 0.09008491792744257, "grad_norm": 9.6190303624526, "learning_rate": 4.9802616343741535e-05, "loss": 2.5995, "mean_token_accuracy": 0.4034482717514038, "step": 89440 }, { "epoch": 0.09008995398054673, "grad_norm": 11.234231625437367, "learning_rate": 4.980256681567098e-05, "loss": 2.4409, "mean_token_accuracy": 0.47931033968925474, "step": 89445 }, { "epoch": 0.0900949900336509, "grad_norm": 11.285223602031163, "learning_rate": 4.9802517281414726e-05, "loss": 2.5347, "mean_token_accuracy": 0.40689654350280763, "step": 89450 }, { "epoch": 0.09010002608675508, "grad_norm": 9.597119945194336, "learning_rate": 4.980246774097278e-05, "loss": 2.6043, "mean_token_accuracy": 0.4724137902259827, "step": 89455 }, { "epoch": 0.09010506213985925, "grad_norm": 9.613447208049644, "learning_rate": 4.980241819434516e-05, "loss": 2.3459, "mean_token_accuracy": 0.4482758641242981, "step": 89460 }, { "epoch": 0.09011009819296342, "grad_norm": 7.8082269457076645, "learning_rate": 4.980236864153188e-05, "loss": 2.0781, "mean_token_accuracy": 0.4758620738983154, "step": 89465 }, { "epoch": 0.0901151342460676, "grad_norm": 10.489823163825507, "learning_rate": 4.980231908253297e-05, "loss": 2.8922, "mean_token_accuracy": 0.37586207389831544, "step": 89470 }, { "epoch": 0.09012017029917177, "grad_norm": 13.498182730425654, "learning_rate": 4.9802269517348416e-05, "loss": 2.5733, "mean_token_accuracy": 0.4, "step": 89475 }, { "epoch": 0.09012520635227594, "grad_norm": 9.595057518077255, "learning_rate": 4.980221994597825e-05, "loss": 2.1619, "mean_token_accuracy": 0.4862068831920624, "step": 89480 }, { "epoch": 0.09013024240538012, "grad_norm": 11.974777938557821, "learning_rate": 4.980217036842246e-05, "loss": 2.5547, "mean_token_accuracy": 0.41893526911735535, "step": 89485 }, { "epoch": 0.09013527845848429, "grad_norm": 15.161573706472566, "learning_rate": 4.98021207846811e-05, "loss": 2.7025, "mean_token_accuracy": 0.3793103337287903, "step": 89490 }, { "epoch": 0.09014031451158847, "grad_norm": 9.78861561822191, "learning_rate": 4.980207119475416e-05, "loss": 2.3206, "mean_token_accuracy": 0.44482759237289426, "step": 89495 }, { "epoch": 0.09014535056469264, "grad_norm": 10.873358152148521, "learning_rate": 4.980202159864166e-05, "loss": 2.407, "mean_token_accuracy": 0.36896551847457887, "step": 89500 }, { "epoch": 0.09015038661779681, "grad_norm": 11.863697391982628, "learning_rate": 4.980197199634361e-05, "loss": 2.1326, "mean_token_accuracy": 0.4950393199920654, "step": 89505 }, { "epoch": 0.09015542267090099, "grad_norm": 24.480723984677372, "learning_rate": 4.980192238786002e-05, "loss": 2.0521, "mean_token_accuracy": 0.49195402264595034, "step": 89510 }, { "epoch": 0.09016045872400515, "grad_norm": 14.897479359287122, "learning_rate": 4.9801872773190904e-05, "loss": 2.8093, "mean_token_accuracy": 0.3724137842655182, "step": 89515 }, { "epoch": 0.09016549477710932, "grad_norm": 10.331445419890413, "learning_rate": 4.980182315233629e-05, "loss": 2.561, "mean_token_accuracy": 0.4068965554237366, "step": 89520 }, { "epoch": 0.0901705308302135, "grad_norm": 10.157616423943821, "learning_rate": 4.9801773525296184e-05, "loss": 2.2291, "mean_token_accuracy": 0.5019963562488556, "step": 89525 }, { "epoch": 0.09017556688331767, "grad_norm": 8.578523808417657, "learning_rate": 4.9801723892070595e-05, "loss": 2.4999, "mean_token_accuracy": 0.4931034445762634, "step": 89530 }, { "epoch": 0.09018060293642184, "grad_norm": 11.199658568549152, "learning_rate": 4.980167425265953e-05, "loss": 2.56, "mean_token_accuracy": 0.4068965554237366, "step": 89535 }, { "epoch": 0.09018563898952602, "grad_norm": 10.81261562574399, "learning_rate": 4.9801624607063025e-05, "loss": 2.5521, "mean_token_accuracy": 0.4068965494632721, "step": 89540 }, { "epoch": 0.09019067504263019, "grad_norm": 12.397684042542485, "learning_rate": 4.980157495528107e-05, "loss": 2.246, "mean_token_accuracy": 0.48275861144065857, "step": 89545 }, { "epoch": 0.09019571109573436, "grad_norm": 10.263558624186125, "learning_rate": 4.9801525297313703e-05, "loss": 2.4698, "mean_token_accuracy": 0.4500302612781525, "step": 89550 }, { "epoch": 0.09020074714883854, "grad_norm": 10.911008053012855, "learning_rate": 4.980147563316092e-05, "loss": 2.4743, "mean_token_accuracy": 0.4034482717514038, "step": 89555 }, { "epoch": 0.09020578320194271, "grad_norm": 10.398200123418599, "learning_rate": 4.980142596282273e-05, "loss": 2.1676, "mean_token_accuracy": 0.46551724076271056, "step": 89560 }, { "epoch": 0.09021081925504688, "grad_norm": 11.741667481138428, "learning_rate": 4.980137628629917e-05, "loss": 2.592, "mean_token_accuracy": 0.4206896543502808, "step": 89565 }, { "epoch": 0.09021585530815106, "grad_norm": 10.278878996890148, "learning_rate": 4.9801326603590235e-05, "loss": 2.4424, "mean_token_accuracy": 0.44137930274009707, "step": 89570 }, { "epoch": 0.09022089136125523, "grad_norm": 9.518535191942501, "learning_rate": 4.9801276914695946e-05, "loss": 2.4707, "mean_token_accuracy": 0.4413793087005615, "step": 89575 }, { "epoch": 0.0902259274143594, "grad_norm": 10.744519140850255, "learning_rate": 4.980122721961631e-05, "loss": 2.3023, "mean_token_accuracy": 0.47428917288780215, "step": 89580 }, { "epoch": 0.09023096346746357, "grad_norm": 11.08300507358741, "learning_rate": 4.9801177518351343e-05, "loss": 2.5837, "mean_token_accuracy": 0.441379314661026, "step": 89585 }, { "epoch": 0.09023599952056774, "grad_norm": 11.068931772379434, "learning_rate": 4.980112781090107e-05, "loss": 2.4723, "mean_token_accuracy": 0.4517241358757019, "step": 89590 }, { "epoch": 0.09024103557367191, "grad_norm": 11.007884296600027, "learning_rate": 4.980107809726549e-05, "loss": 1.9707, "mean_token_accuracy": 0.4551724135875702, "step": 89595 }, { "epoch": 0.09024607162677609, "grad_norm": 9.982572388667446, "learning_rate": 4.980102837744463e-05, "loss": 2.5701, "mean_token_accuracy": 0.458620685338974, "step": 89600 }, { "epoch": 0.09025110767988026, "grad_norm": 12.544098994060565, "learning_rate": 4.980097865143849e-05, "loss": 2.3746, "mean_token_accuracy": 0.4068965584039688, "step": 89605 }, { "epoch": 0.09025614373298443, "grad_norm": 10.963597351737873, "learning_rate": 4.980092891924709e-05, "loss": 2.3418, "mean_token_accuracy": 0.4, "step": 89610 }, { "epoch": 0.09026117978608861, "grad_norm": 10.337950950454628, "learning_rate": 4.9800879180870444e-05, "loss": 1.8491, "mean_token_accuracy": 0.45172414779663084, "step": 89615 }, { "epoch": 0.09026621583919278, "grad_norm": 11.726784907954611, "learning_rate": 4.980082943630857e-05, "loss": 2.3897, "mean_token_accuracy": 0.4068965494632721, "step": 89620 }, { "epoch": 0.09027125189229696, "grad_norm": 9.159151951332664, "learning_rate": 4.980077968556148e-05, "loss": 2.1515, "mean_token_accuracy": 0.46896551847457885, "step": 89625 }, { "epoch": 0.09027628794540113, "grad_norm": 12.791561095039945, "learning_rate": 4.980072992862917e-05, "loss": 2.2198, "mean_token_accuracy": 0.4571687877178192, "step": 89630 }, { "epoch": 0.0902813239985053, "grad_norm": 10.650282470473591, "learning_rate": 4.980068016551168e-05, "loss": 2.3643, "mean_token_accuracy": 0.4068965494632721, "step": 89635 }, { "epoch": 0.09028636005160948, "grad_norm": 9.791089150752125, "learning_rate": 4.980063039620901e-05, "loss": 2.6021, "mean_token_accuracy": 0.3896551728248596, "step": 89640 }, { "epoch": 0.09029139610471365, "grad_norm": 11.23902155681011, "learning_rate": 4.980058062072119e-05, "loss": 2.0895, "mean_token_accuracy": 0.44827587008476255, "step": 89645 }, { "epoch": 0.09029643215781782, "grad_norm": 10.976026087324039, "learning_rate": 4.980053083904821e-05, "loss": 2.5002, "mean_token_accuracy": 0.4172413766384125, "step": 89650 }, { "epoch": 0.09030146821092198, "grad_norm": 9.683779651959597, "learning_rate": 4.98004810511901e-05, "loss": 2.2294, "mean_token_accuracy": 0.43793103098869324, "step": 89655 }, { "epoch": 0.09030650426402616, "grad_norm": 8.858658165494553, "learning_rate": 4.980043125714686e-05, "loss": 2.47, "mean_token_accuracy": 0.4257713258266449, "step": 89660 }, { "epoch": 0.09031154031713033, "grad_norm": 9.620203503659363, "learning_rate": 4.980038145691851e-05, "loss": 2.4133, "mean_token_accuracy": 0.47241378426551817, "step": 89665 }, { "epoch": 0.0903165763702345, "grad_norm": 9.6808906980857, "learning_rate": 4.980033165050508e-05, "loss": 2.338, "mean_token_accuracy": 0.4620689690113068, "step": 89670 }, { "epoch": 0.09032161242333868, "grad_norm": 12.628712858241414, "learning_rate": 4.980028183790657e-05, "loss": 2.3345, "mean_token_accuracy": 0.42758620381355283, "step": 89675 }, { "epoch": 0.09032664847644285, "grad_norm": 10.708773431178253, "learning_rate": 4.980023201912298e-05, "loss": 2.4065, "mean_token_accuracy": 0.42546884417533876, "step": 89680 }, { "epoch": 0.09033168452954703, "grad_norm": 14.110464517957853, "learning_rate": 4.980018219415434e-05, "loss": 2.8139, "mean_token_accuracy": 0.39655172526836396, "step": 89685 }, { "epoch": 0.0903367205826512, "grad_norm": 9.466674212911029, "learning_rate": 4.9800132363000664e-05, "loss": 2.2661, "mean_token_accuracy": 0.46896551847457885, "step": 89690 }, { "epoch": 0.09034175663575537, "grad_norm": 12.009711878692949, "learning_rate": 4.980008252566197e-05, "loss": 2.6866, "mean_token_accuracy": 0.43793103098869324, "step": 89695 }, { "epoch": 0.09034679268885955, "grad_norm": 10.177266035445074, "learning_rate": 4.980003268213826e-05, "loss": 2.8138, "mean_token_accuracy": 0.32758620381355286, "step": 89700 }, { "epoch": 0.09035182874196372, "grad_norm": 14.795475884401593, "learning_rate": 4.9799982832429545e-05, "loss": 2.9559, "mean_token_accuracy": 0.34137930274009703, "step": 89705 }, { "epoch": 0.0903568647950679, "grad_norm": 10.004349344472923, "learning_rate": 4.9799932976535853e-05, "loss": 2.2157, "mean_token_accuracy": 0.4172413766384125, "step": 89710 }, { "epoch": 0.09036190084817207, "grad_norm": 9.373980865264821, "learning_rate": 4.979988311445719e-05, "loss": 2.3224, "mean_token_accuracy": 0.4344827592372894, "step": 89715 }, { "epoch": 0.09036693690127624, "grad_norm": 11.487682078253425, "learning_rate": 4.979983324619357e-05, "loss": 2.5058, "mean_token_accuracy": 0.4206896543502808, "step": 89720 }, { "epoch": 0.0903719729543804, "grad_norm": 11.08545359585476, "learning_rate": 4.979978337174501e-05, "loss": 2.543, "mean_token_accuracy": 0.44482758045196535, "step": 89725 }, { "epoch": 0.09037700900748458, "grad_norm": 9.368259448465437, "learning_rate": 4.979973349111152e-05, "loss": 2.3643, "mean_token_accuracy": 0.4552955687046051, "step": 89730 }, { "epoch": 0.09038204506058875, "grad_norm": 11.056629978553346, "learning_rate": 4.979968360429313e-05, "loss": 2.5208, "mean_token_accuracy": 0.44482759237289426, "step": 89735 }, { "epoch": 0.09038708111369292, "grad_norm": 10.97682821744861, "learning_rate": 4.979963371128982e-05, "loss": 2.1671, "mean_token_accuracy": 0.45396249890327456, "step": 89740 }, { "epoch": 0.0903921171667971, "grad_norm": 14.92401666873045, "learning_rate": 4.979958381210163e-05, "loss": 2.8555, "mean_token_accuracy": 0.37586206793785093, "step": 89745 }, { "epoch": 0.09039715321990127, "grad_norm": 10.490020018437995, "learning_rate": 4.979953390672858e-05, "loss": 2.2008, "mean_token_accuracy": 0.5137930929660797, "step": 89750 }, { "epoch": 0.09040218927300545, "grad_norm": 10.577593088039851, "learning_rate": 4.979948399517065e-05, "loss": 2.8129, "mean_token_accuracy": 0.38965516686439516, "step": 89755 }, { "epoch": 0.09040722532610962, "grad_norm": 13.78668540609991, "learning_rate": 4.979943407742789e-05, "loss": 2.3458, "mean_token_accuracy": 0.4034482717514038, "step": 89760 }, { "epoch": 0.09041226137921379, "grad_norm": 11.242434678654442, "learning_rate": 4.9799384153500286e-05, "loss": 2.3997, "mean_token_accuracy": 0.4344827592372894, "step": 89765 }, { "epoch": 0.09041729743231797, "grad_norm": 10.471158829010824, "learning_rate": 4.979933422338787e-05, "loss": 2.4459, "mean_token_accuracy": 0.41566848158836367, "step": 89770 }, { "epoch": 0.09042233348542214, "grad_norm": 10.466539592990177, "learning_rate": 4.9799284287090656e-05, "loss": 2.5413, "mean_token_accuracy": 0.379310342669487, "step": 89775 }, { "epoch": 0.09042736953852631, "grad_norm": 12.344009992791031, "learning_rate": 4.979923434460865e-05, "loss": 2.818, "mean_token_accuracy": 0.324137932062149, "step": 89780 }, { "epoch": 0.09043240559163049, "grad_norm": 10.85778929331864, "learning_rate": 4.9799184395941864e-05, "loss": 2.2806, "mean_token_accuracy": 0.4310344815254211, "step": 89785 }, { "epoch": 0.09043744164473466, "grad_norm": 10.353984305726781, "learning_rate": 4.9799134441090315e-05, "loss": 2.8948, "mean_token_accuracy": 0.36896551549434664, "step": 89790 }, { "epoch": 0.09044247769783882, "grad_norm": 11.195854668208888, "learning_rate": 4.9799084480054024e-05, "loss": 2.6097, "mean_token_accuracy": 0.4, "step": 89795 }, { "epoch": 0.090447513750943, "grad_norm": 9.865166784270995, "learning_rate": 4.9799034512832996e-05, "loss": 2.3296, "mean_token_accuracy": 0.4034482777118683, "step": 89800 }, { "epoch": 0.09045254980404717, "grad_norm": 9.755263433932416, "learning_rate": 4.979898453942725e-05, "loss": 2.0767, "mean_token_accuracy": 0.5206896483898162, "step": 89805 }, { "epoch": 0.09045758585715134, "grad_norm": 12.596984690778859, "learning_rate": 4.97989345598368e-05, "loss": 2.9297, "mean_token_accuracy": 0.37586206793785093, "step": 89810 }, { "epoch": 0.09046262191025552, "grad_norm": 14.704391942833652, "learning_rate": 4.979888457406165e-05, "loss": 2.7163, "mean_token_accuracy": 0.40471869707107544, "step": 89815 }, { "epoch": 0.09046765796335969, "grad_norm": 11.139590053658273, "learning_rate": 4.9798834582101826e-05, "loss": 2.453, "mean_token_accuracy": 0.3896551728248596, "step": 89820 }, { "epoch": 0.09047269401646386, "grad_norm": 13.086361339131246, "learning_rate": 4.979878458395733e-05, "loss": 2.8472, "mean_token_accuracy": 0.4, "step": 89825 }, { "epoch": 0.09047773006956804, "grad_norm": 12.80889241486848, "learning_rate": 4.979873457962819e-05, "loss": 2.5557, "mean_token_accuracy": 0.36896551251411436, "step": 89830 }, { "epoch": 0.09048276612267221, "grad_norm": 10.906013250754546, "learning_rate": 4.9798684569114415e-05, "loss": 2.7782, "mean_token_accuracy": 0.3620689660310745, "step": 89835 }, { "epoch": 0.09048780217577639, "grad_norm": 10.449931341292485, "learning_rate": 4.979863455241601e-05, "loss": 2.5355, "mean_token_accuracy": 0.37241379618644715, "step": 89840 }, { "epoch": 0.09049283822888056, "grad_norm": 10.725942993950925, "learning_rate": 4.979858452953299e-05, "loss": 2.9839, "mean_token_accuracy": 0.4103448331356049, "step": 89845 }, { "epoch": 0.09049787428198473, "grad_norm": 9.700439098044725, "learning_rate": 4.9798534500465386e-05, "loss": 2.2058, "mean_token_accuracy": 0.46019359230995177, "step": 89850 }, { "epoch": 0.0905029103350889, "grad_norm": 10.185458098897252, "learning_rate": 4.97984844652132e-05, "loss": 2.3467, "mean_token_accuracy": 0.4068965494632721, "step": 89855 }, { "epoch": 0.09050794638819308, "grad_norm": 10.124563778032973, "learning_rate": 4.979843442377644e-05, "loss": 2.5142, "mean_token_accuracy": 0.45172414779663084, "step": 89860 }, { "epoch": 0.09051298244129724, "grad_norm": 9.831461088488423, "learning_rate": 4.979838437615513e-05, "loss": 2.4274, "mean_token_accuracy": 0.3896551728248596, "step": 89865 }, { "epoch": 0.09051801849440141, "grad_norm": 12.343217333677707, "learning_rate": 4.979833432234928e-05, "loss": 2.6617, "mean_token_accuracy": 0.3862068891525269, "step": 89870 }, { "epoch": 0.09052305454750559, "grad_norm": 13.668558250500858, "learning_rate": 4.9798284262358904e-05, "loss": 2.2909, "mean_token_accuracy": 0.5119458138942719, "step": 89875 }, { "epoch": 0.09052809060060976, "grad_norm": 8.997900885936795, "learning_rate": 4.979823419618401e-05, "loss": 2.8476, "mean_token_accuracy": 0.38275861740112305, "step": 89880 }, { "epoch": 0.09053312665371394, "grad_norm": 10.638642856656697, "learning_rate": 4.9798184123824626e-05, "loss": 2.5555, "mean_token_accuracy": 0.37586206793785093, "step": 89885 }, { "epoch": 0.09053816270681811, "grad_norm": 10.930350248426473, "learning_rate": 4.979813404528075e-05, "loss": 2.6318, "mean_token_accuracy": 0.44482759237289426, "step": 89890 }, { "epoch": 0.09054319875992228, "grad_norm": 12.253057347086001, "learning_rate": 4.979808396055241e-05, "loss": 2.6129, "mean_token_accuracy": 0.4206896424293518, "step": 89895 }, { "epoch": 0.09054823481302646, "grad_norm": 10.356642769829785, "learning_rate": 4.97980338696396e-05, "loss": 2.8061, "mean_token_accuracy": 0.3689655065536499, "step": 89900 }, { "epoch": 0.09055327086613063, "grad_norm": 10.957841223937617, "learning_rate": 4.979798377254236e-05, "loss": 2.1052, "mean_token_accuracy": 0.4931034564971924, "step": 89905 }, { "epoch": 0.0905583069192348, "grad_norm": 10.397135914787276, "learning_rate": 4.979793366926069e-05, "loss": 2.6678, "mean_token_accuracy": 0.38275861740112305, "step": 89910 }, { "epoch": 0.09056334297233898, "grad_norm": 8.581688175686919, "learning_rate": 4.97978835597946e-05, "loss": 2.393, "mean_token_accuracy": 0.44482758045196535, "step": 89915 }, { "epoch": 0.09056837902544315, "grad_norm": 13.028720723379386, "learning_rate": 4.979783344414411e-05, "loss": 2.7789, "mean_token_accuracy": 0.4068965554237366, "step": 89920 }, { "epoch": 0.09057341507854733, "grad_norm": 11.497920484227917, "learning_rate": 4.9797783322309236e-05, "loss": 2.6603, "mean_token_accuracy": 0.42758620381355283, "step": 89925 }, { "epoch": 0.0905784511316515, "grad_norm": 11.501101468125015, "learning_rate": 4.979773319428999e-05, "loss": 2.4437, "mean_token_accuracy": 0.48965516686439514, "step": 89930 }, { "epoch": 0.09058348718475566, "grad_norm": 10.944577200199786, "learning_rate": 4.979768306008638e-05, "loss": 2.624, "mean_token_accuracy": 0.4103448212146759, "step": 89935 }, { "epoch": 0.09058852323785983, "grad_norm": 14.702623777716283, "learning_rate": 4.979763291969842e-05, "loss": 2.5248, "mean_token_accuracy": 0.3896551728248596, "step": 89940 }, { "epoch": 0.09059355929096401, "grad_norm": 8.970624052944412, "learning_rate": 4.979758277312614e-05, "loss": 2.4454, "mean_token_accuracy": 0.4793103516101837, "step": 89945 }, { "epoch": 0.09059859534406818, "grad_norm": 10.712482946901611, "learning_rate": 4.979753262036954e-05, "loss": 2.5091, "mean_token_accuracy": 0.43103447556495667, "step": 89950 }, { "epoch": 0.09060363139717235, "grad_norm": 11.831569064849802, "learning_rate": 4.979748246142863e-05, "loss": 2.421, "mean_token_accuracy": 0.3965517282485962, "step": 89955 }, { "epoch": 0.09060866745027653, "grad_norm": 10.11255921332216, "learning_rate": 4.9797432296303434e-05, "loss": 2.3327, "mean_token_accuracy": 0.441379314661026, "step": 89960 }, { "epoch": 0.0906137035033807, "grad_norm": 11.105098978324026, "learning_rate": 4.979738212499396e-05, "loss": 2.2692, "mean_token_accuracy": 0.434664249420166, "step": 89965 }, { "epoch": 0.09061873955648488, "grad_norm": 10.376273405905163, "learning_rate": 4.979733194750022e-05, "loss": 2.6551, "mean_token_accuracy": 0.4103448212146759, "step": 89970 }, { "epoch": 0.09062377560958905, "grad_norm": 10.251359336170374, "learning_rate": 4.979728176382223e-05, "loss": 2.7534, "mean_token_accuracy": 0.4085299462080002, "step": 89975 }, { "epoch": 0.09062881166269322, "grad_norm": 15.203125868701372, "learning_rate": 4.979723157396002e-05, "loss": 2.5864, "mean_token_accuracy": 0.43103447258472444, "step": 89980 }, { "epoch": 0.0906338477157974, "grad_norm": 10.45509224782312, "learning_rate": 4.979718137791358e-05, "loss": 1.9837, "mean_token_accuracy": 0.48620688915252686, "step": 89985 }, { "epoch": 0.09063888376890157, "grad_norm": 11.831825205248354, "learning_rate": 4.979713117568294e-05, "loss": 2.4685, "mean_token_accuracy": 0.4034482777118683, "step": 89990 }, { "epoch": 0.09064391982200574, "grad_norm": 11.2030171028342, "learning_rate": 4.97970809672681e-05, "loss": 2.3308, "mean_token_accuracy": 0.4586206912994385, "step": 89995 }, { "epoch": 0.09064895587510992, "grad_norm": 10.636865517144962, "learning_rate": 4.9797030752669086e-05, "loss": 2.3383, "mean_token_accuracy": 0.44482759237289426, "step": 90000 }, { "epoch": 0.09065399192821408, "grad_norm": 8.417781344952532, "learning_rate": 4.97969805318859e-05, "loss": 2.5096, "mean_token_accuracy": 0.4068965494632721, "step": 90005 }, { "epoch": 0.09065902798131825, "grad_norm": 14.519615140287229, "learning_rate": 4.979693030491857e-05, "loss": 2.5658, "mean_token_accuracy": 0.40459770560264585, "step": 90010 }, { "epoch": 0.09066406403442243, "grad_norm": 12.358196881177804, "learning_rate": 4.97968800717671e-05, "loss": 2.4126, "mean_token_accuracy": 0.4034482717514038, "step": 90015 }, { "epoch": 0.0906691000875266, "grad_norm": 10.484100275508538, "learning_rate": 4.979682983243151e-05, "loss": 2.3515, "mean_token_accuracy": 0.43793103098869324, "step": 90020 }, { "epoch": 0.09067413614063077, "grad_norm": 12.517378663361818, "learning_rate": 4.979677958691181e-05, "loss": 2.3503, "mean_token_accuracy": 0.4603750824928284, "step": 90025 }, { "epoch": 0.09067917219373495, "grad_norm": 11.401731936434388, "learning_rate": 4.979672933520801e-05, "loss": 2.4802, "mean_token_accuracy": 0.43284936547279357, "step": 90030 }, { "epoch": 0.09068420824683912, "grad_norm": 9.21107120337026, "learning_rate": 4.979667907732014e-05, "loss": 2.3437, "mean_token_accuracy": 0.4034482717514038, "step": 90035 }, { "epoch": 0.0906892442999433, "grad_norm": 8.63920093549094, "learning_rate": 4.97966288132482e-05, "loss": 2.6577, "mean_token_accuracy": 0.39310345649719236, "step": 90040 }, { "epoch": 0.09069428035304747, "grad_norm": 9.382052401386913, "learning_rate": 4.97965785429922e-05, "loss": 2.2856, "mean_token_accuracy": 0.4448275864124298, "step": 90045 }, { "epoch": 0.09069931640615164, "grad_norm": 11.874455773091102, "learning_rate": 4.979652826655217e-05, "loss": 2.6981, "mean_token_accuracy": 0.3655172407627106, "step": 90050 }, { "epoch": 0.09070435245925582, "grad_norm": 12.620921942031611, "learning_rate": 4.97964779839281e-05, "loss": 2.2865, "mean_token_accuracy": 0.4344827592372894, "step": 90055 }, { "epoch": 0.09070938851235999, "grad_norm": 11.087335124788046, "learning_rate": 4.979642769512004e-05, "loss": 2.5002, "mean_token_accuracy": 0.43998789191246035, "step": 90060 }, { "epoch": 0.09071442456546416, "grad_norm": 9.944084144460588, "learning_rate": 4.979637740012796e-05, "loss": 2.3205, "mean_token_accuracy": 0.42068964540958403, "step": 90065 }, { "epoch": 0.09071946061856834, "grad_norm": 10.376947689776596, "learning_rate": 4.979632709895191e-05, "loss": 2.0403, "mean_token_accuracy": 0.4871921122074127, "step": 90070 }, { "epoch": 0.0907244966716725, "grad_norm": 14.013078594211372, "learning_rate": 4.979627679159189e-05, "loss": 2.3627, "mean_token_accuracy": 0.45722927451133727, "step": 90075 }, { "epoch": 0.09072953272477667, "grad_norm": 9.749797366255892, "learning_rate": 4.9796226478047906e-05, "loss": 2.7198, "mean_token_accuracy": 0.4034482777118683, "step": 90080 }, { "epoch": 0.09073456877788084, "grad_norm": 17.446174134908766, "learning_rate": 4.979617615831999e-05, "loss": 2.2867, "mean_token_accuracy": 0.44349666833877566, "step": 90085 }, { "epoch": 0.09073960483098502, "grad_norm": 16.443375965076694, "learning_rate": 4.979612583240814e-05, "loss": 2.6136, "mean_token_accuracy": 0.4361766457557678, "step": 90090 }, { "epoch": 0.09074464088408919, "grad_norm": 12.423581880476975, "learning_rate": 4.9796075500312385e-05, "loss": 2.3477, "mean_token_accuracy": 0.41034482717514037, "step": 90095 }, { "epoch": 0.09074967693719337, "grad_norm": 15.382391685590592, "learning_rate": 4.9796025162032724e-05, "loss": 2.6125, "mean_token_accuracy": 0.44827585816383364, "step": 90100 }, { "epoch": 0.09075471299029754, "grad_norm": 10.486824218668685, "learning_rate": 4.9795974817569175e-05, "loss": 2.2152, "mean_token_accuracy": 0.46551724076271056, "step": 90105 }, { "epoch": 0.09075974904340171, "grad_norm": 11.96119698924273, "learning_rate": 4.979592446692175e-05, "loss": 2.2717, "mean_token_accuracy": 0.4448275864124298, "step": 90110 }, { "epoch": 0.09076478509650589, "grad_norm": 11.971491652001868, "learning_rate": 4.979587411009048e-05, "loss": 2.2746, "mean_token_accuracy": 0.4206896543502808, "step": 90115 }, { "epoch": 0.09076982114961006, "grad_norm": 9.120103620342542, "learning_rate": 4.979582374707536e-05, "loss": 2.6215, "mean_token_accuracy": 0.42758620977401735, "step": 90120 }, { "epoch": 0.09077485720271423, "grad_norm": 24.832586891941805, "learning_rate": 4.979577337787641e-05, "loss": 2.4918, "mean_token_accuracy": 0.4517241299152374, "step": 90125 }, { "epoch": 0.09077989325581841, "grad_norm": 13.918599225202675, "learning_rate": 4.979572300249364e-05, "loss": 2.0342, "mean_token_accuracy": 0.4448275864124298, "step": 90130 }, { "epoch": 0.09078492930892258, "grad_norm": 9.695191817223243, "learning_rate": 4.9795672620927075e-05, "loss": 2.6955, "mean_token_accuracy": 0.3689655214548111, "step": 90135 }, { "epoch": 0.09078996536202676, "grad_norm": 11.79130036712235, "learning_rate": 4.979562223317672e-05, "loss": 2.656, "mean_token_accuracy": 0.37241379022598264, "step": 90140 }, { "epoch": 0.09079500141513092, "grad_norm": 11.308758070166531, "learning_rate": 4.979557183924259e-05, "loss": 2.2333, "mean_token_accuracy": 0.480541867017746, "step": 90145 }, { "epoch": 0.09080003746823509, "grad_norm": 10.866395967795059, "learning_rate": 4.97955214391247e-05, "loss": 2.4114, "mean_token_accuracy": 0.432667875289917, "step": 90150 }, { "epoch": 0.09080507352133926, "grad_norm": 9.87271174112646, "learning_rate": 4.9795471032823064e-05, "loss": 2.6152, "mean_token_accuracy": 0.3620689630508423, "step": 90155 }, { "epoch": 0.09081010957444344, "grad_norm": 10.651213614860794, "learning_rate": 4.9795420620337693e-05, "loss": 2.2372, "mean_token_accuracy": 0.4517241358757019, "step": 90160 }, { "epoch": 0.09081514562754761, "grad_norm": 13.321726294103668, "learning_rate": 4.979537020166861e-05, "loss": 2.547, "mean_token_accuracy": 0.3999999940395355, "step": 90165 }, { "epoch": 0.09082018168065178, "grad_norm": 11.77181217350565, "learning_rate": 4.9795319776815826e-05, "loss": 2.2948, "mean_token_accuracy": 0.43103448748588563, "step": 90170 }, { "epoch": 0.09082521773375596, "grad_norm": 11.78587914091947, "learning_rate": 4.979526934577934e-05, "loss": 2.7219, "mean_token_accuracy": 0.334482753276825, "step": 90175 }, { "epoch": 0.09083025378686013, "grad_norm": 9.024357791585787, "learning_rate": 4.979521890855919e-05, "loss": 2.515, "mean_token_accuracy": 0.39310345649719236, "step": 90180 }, { "epoch": 0.0908352898399643, "grad_norm": 9.455658382449057, "learning_rate": 4.979516846515536e-05, "loss": 2.3388, "mean_token_accuracy": 0.42413793206214906, "step": 90185 }, { "epoch": 0.09084032589306848, "grad_norm": 8.943009744953645, "learning_rate": 4.97951180155679e-05, "loss": 2.3904, "mean_token_accuracy": 0.42758620977401735, "step": 90190 }, { "epoch": 0.09084536194617265, "grad_norm": 9.176328295144586, "learning_rate": 4.97950675597968e-05, "loss": 2.209, "mean_token_accuracy": 0.47931034564971925, "step": 90195 }, { "epoch": 0.09085039799927683, "grad_norm": 9.00930930413258, "learning_rate": 4.979501709784208e-05, "loss": 2.2891, "mean_token_accuracy": 0.4882637619972229, "step": 90200 }, { "epoch": 0.090855434052381, "grad_norm": 13.5390219560126, "learning_rate": 4.9794966629703755e-05, "loss": 2.8878, "mean_token_accuracy": 0.39812461137771604, "step": 90205 }, { "epoch": 0.09086047010548517, "grad_norm": 11.348178108166087, "learning_rate": 4.9794916155381835e-05, "loss": 2.4645, "mean_token_accuracy": 0.43103448748588563, "step": 90210 }, { "epoch": 0.09086550615858933, "grad_norm": 11.940855772603383, "learning_rate": 4.979486567487634e-05, "loss": 2.4485, "mean_token_accuracy": 0.3827586233615875, "step": 90215 }, { "epoch": 0.09087054221169351, "grad_norm": 12.71153254281118, "learning_rate": 4.9794815188187276e-05, "loss": 2.777, "mean_token_accuracy": 0.3896551728248596, "step": 90220 }, { "epoch": 0.09087557826479768, "grad_norm": 11.566195167024897, "learning_rate": 4.9794764695314663e-05, "loss": 2.4122, "mean_token_accuracy": 0.3827586233615875, "step": 90225 }, { "epoch": 0.09088061431790186, "grad_norm": 10.03224433079392, "learning_rate": 4.9794714196258514e-05, "loss": 2.3411, "mean_token_accuracy": 0.4310344815254211, "step": 90230 }, { "epoch": 0.09088565037100603, "grad_norm": 9.755984745215926, "learning_rate": 4.979466369101884e-05, "loss": 2.4842, "mean_token_accuracy": 0.482758617401123, "step": 90235 }, { "epoch": 0.0908906864241102, "grad_norm": 11.44696252590824, "learning_rate": 4.979461317959567e-05, "loss": 2.4561, "mean_token_accuracy": 0.41379310488700866, "step": 90240 }, { "epoch": 0.09089572247721438, "grad_norm": 8.635530114473005, "learning_rate": 4.979456266198899e-05, "loss": 2.3462, "mean_token_accuracy": 0.42413793206214906, "step": 90245 }, { "epoch": 0.09090075853031855, "grad_norm": 9.829816833076503, "learning_rate": 4.979451213819884e-05, "loss": 2.3878, "mean_token_accuracy": 0.46206897497177124, "step": 90250 }, { "epoch": 0.09090579458342272, "grad_norm": 10.734777912593835, "learning_rate": 4.979446160822522e-05, "loss": 2.6515, "mean_token_accuracy": 0.4206896543502808, "step": 90255 }, { "epoch": 0.0909108306365269, "grad_norm": 9.240119736500542, "learning_rate": 4.979441107206816e-05, "loss": 2.4341, "mean_token_accuracy": 0.4034482717514038, "step": 90260 }, { "epoch": 0.09091586668963107, "grad_norm": 14.992409494532323, "learning_rate": 4.9794360529727645e-05, "loss": 2.8744, "mean_token_accuracy": 0.36551723480224607, "step": 90265 }, { "epoch": 0.09092090274273525, "grad_norm": 11.038814978434667, "learning_rate": 4.9794309981203715e-05, "loss": 2.4813, "mean_token_accuracy": 0.37586206793785093, "step": 90270 }, { "epoch": 0.09092593879583942, "grad_norm": 11.626340404857135, "learning_rate": 4.979425942649636e-05, "loss": 2.7444, "mean_token_accuracy": 0.29999999701976776, "step": 90275 }, { "epoch": 0.0909309748489436, "grad_norm": 11.933873914580772, "learning_rate": 4.9794208865605636e-05, "loss": 2.7416, "mean_token_accuracy": 0.3655172407627106, "step": 90280 }, { "epoch": 0.09093601090204775, "grad_norm": 10.36206408283393, "learning_rate": 4.979415829853151e-05, "loss": 2.3698, "mean_token_accuracy": 0.42413793206214906, "step": 90285 }, { "epoch": 0.09094104695515193, "grad_norm": 13.638229922322656, "learning_rate": 4.9794107725274025e-05, "loss": 2.5357, "mean_token_accuracy": 0.43448275327682495, "step": 90290 }, { "epoch": 0.0909460830082561, "grad_norm": 9.93508467206544, "learning_rate": 4.9794057145833175e-05, "loss": 2.3269, "mean_token_accuracy": 0.4413793087005615, "step": 90295 }, { "epoch": 0.09095111906136027, "grad_norm": 12.955726782874757, "learning_rate": 4.9794006560209e-05, "loss": 2.4371, "mean_token_accuracy": 0.39310344457626345, "step": 90300 }, { "epoch": 0.09095615511446445, "grad_norm": 11.427335303798769, "learning_rate": 4.979395596840149e-05, "loss": 2.614, "mean_token_accuracy": 0.441379314661026, "step": 90305 }, { "epoch": 0.09096119116756862, "grad_norm": 12.7170375383586, "learning_rate": 4.9793905370410664e-05, "loss": 2.2512, "mean_token_accuracy": 0.4620689690113068, "step": 90310 }, { "epoch": 0.0909662272206728, "grad_norm": 10.663190632699047, "learning_rate": 4.9793854766236547e-05, "loss": 2.2782, "mean_token_accuracy": 0.39310344457626345, "step": 90315 }, { "epoch": 0.09097126327377697, "grad_norm": 14.009714275799094, "learning_rate": 4.979380415587915e-05, "loss": 2.9549, "mean_token_accuracy": 0.3827586233615875, "step": 90320 }, { "epoch": 0.09097629932688114, "grad_norm": 9.563323599008491, "learning_rate": 4.9793753539338474e-05, "loss": 2.4672, "mean_token_accuracy": 0.39999999701976774, "step": 90325 }, { "epoch": 0.09098133537998532, "grad_norm": 10.790800811109358, "learning_rate": 4.979370291661455e-05, "loss": 2.0366, "mean_token_accuracy": 0.5034482717514038, "step": 90330 }, { "epoch": 0.09098637143308949, "grad_norm": 10.01730169428084, "learning_rate": 4.979365228770738e-05, "loss": 2.218, "mean_token_accuracy": 0.42068965137004855, "step": 90335 }, { "epoch": 0.09099140748619367, "grad_norm": 10.114256161866365, "learning_rate": 4.9793601652616985e-05, "loss": 2.0994, "mean_token_accuracy": 0.4793103516101837, "step": 90340 }, { "epoch": 0.09099644353929784, "grad_norm": 11.932799460616552, "learning_rate": 4.979355101134337e-05, "loss": 2.4318, "mean_token_accuracy": 0.4, "step": 90345 }, { "epoch": 0.09100147959240201, "grad_norm": 12.641709572950468, "learning_rate": 4.979350036388656e-05, "loss": 2.5387, "mean_token_accuracy": 0.4172413766384125, "step": 90350 }, { "epoch": 0.09100651564550617, "grad_norm": 12.813043403925558, "learning_rate": 4.979344971024656e-05, "loss": 2.5127, "mean_token_accuracy": 0.45009074807167054, "step": 90355 }, { "epoch": 0.09101155169861035, "grad_norm": 10.79932317974267, "learning_rate": 4.97933990504234e-05, "loss": 2.2764, "mean_token_accuracy": 0.4586206912994385, "step": 90360 }, { "epoch": 0.09101658775171452, "grad_norm": 12.074016544407364, "learning_rate": 4.9793348384417075e-05, "loss": 2.677, "mean_token_accuracy": 0.38965516686439516, "step": 90365 }, { "epoch": 0.0910216238048187, "grad_norm": 10.937451877435155, "learning_rate": 4.979329771222761e-05, "loss": 3.0193, "mean_token_accuracy": 0.3551724135875702, "step": 90370 }, { "epoch": 0.09102665985792287, "grad_norm": 8.539580040942838, "learning_rate": 4.979324703385501e-05, "loss": 2.5895, "mean_token_accuracy": 0.3862069010734558, "step": 90375 }, { "epoch": 0.09103169591102704, "grad_norm": 11.671894538238327, "learning_rate": 4.9793196349299296e-05, "loss": 2.2363, "mean_token_accuracy": 0.4620689630508423, "step": 90380 }, { "epoch": 0.09103673196413122, "grad_norm": 10.886463193863884, "learning_rate": 4.9793145658560486e-05, "loss": 2.2605, "mean_token_accuracy": 0.47090138792991637, "step": 90385 }, { "epoch": 0.09104176801723539, "grad_norm": 12.256596198909342, "learning_rate": 4.9793094961638585e-05, "loss": 2.1743, "mean_token_accuracy": 0.5000000119209289, "step": 90390 }, { "epoch": 0.09104680407033956, "grad_norm": 11.070599545868937, "learning_rate": 4.979304425853361e-05, "loss": 2.5198, "mean_token_accuracy": 0.40344828367233276, "step": 90395 }, { "epoch": 0.09105184012344374, "grad_norm": 12.587351358309391, "learning_rate": 4.979299354924558e-05, "loss": 2.6132, "mean_token_accuracy": 0.42758620977401735, "step": 90400 }, { "epoch": 0.09105687617654791, "grad_norm": 10.271802623825817, "learning_rate": 4.97929428337745e-05, "loss": 2.4358, "mean_token_accuracy": 0.41034482717514037, "step": 90405 }, { "epoch": 0.09106191222965208, "grad_norm": 9.849203672541021, "learning_rate": 4.979289211212039e-05, "loss": 2.2003, "mean_token_accuracy": 0.4448275864124298, "step": 90410 }, { "epoch": 0.09106694828275626, "grad_norm": 9.15943413962576, "learning_rate": 4.979284138428327e-05, "loss": 2.2174, "mean_token_accuracy": 0.43793103098869324, "step": 90415 }, { "epoch": 0.09107198433586043, "grad_norm": 9.932951153609872, "learning_rate": 4.9792790650263136e-05, "loss": 2.1313, "mean_token_accuracy": 0.46896552443504336, "step": 90420 }, { "epoch": 0.09107702038896459, "grad_norm": 11.427666093479466, "learning_rate": 4.979273991006003e-05, "loss": 2.9135, "mean_token_accuracy": 0.39794313311576845, "step": 90425 }, { "epoch": 0.09108205644206877, "grad_norm": 10.427592659859615, "learning_rate": 4.9792689163673937e-05, "loss": 2.2687, "mean_token_accuracy": 0.37586206793785093, "step": 90430 }, { "epoch": 0.09108709249517294, "grad_norm": 12.109536432537869, "learning_rate": 4.979263841110488e-05, "loss": 2.1345, "mean_token_accuracy": 0.4845735013484955, "step": 90435 }, { "epoch": 0.09109212854827711, "grad_norm": 12.145734702257682, "learning_rate": 4.979258765235289e-05, "loss": 2.341, "mean_token_accuracy": 0.4379310369491577, "step": 90440 }, { "epoch": 0.09109716460138129, "grad_norm": 10.95845020122772, "learning_rate": 4.9792536887417953e-05, "loss": 2.3744, "mean_token_accuracy": 0.42413793206214906, "step": 90445 }, { "epoch": 0.09110220065448546, "grad_norm": 10.76244480901666, "learning_rate": 4.979248611630011e-05, "loss": 2.7237, "mean_token_accuracy": 0.36206896007061007, "step": 90450 }, { "epoch": 0.09110723670758963, "grad_norm": 12.123624168753311, "learning_rate": 4.9792435338999354e-05, "loss": 2.3586, "mean_token_accuracy": 0.4517241299152374, "step": 90455 }, { "epoch": 0.09111227276069381, "grad_norm": 13.433875712063838, "learning_rate": 4.979238455551571e-05, "loss": 2.4971, "mean_token_accuracy": 0.4448275864124298, "step": 90460 }, { "epoch": 0.09111730881379798, "grad_norm": 11.954055079329189, "learning_rate": 4.9792333765849194e-05, "loss": 2.1803, "mean_token_accuracy": 0.4482758641242981, "step": 90465 }, { "epoch": 0.09112234486690216, "grad_norm": 11.822011253803288, "learning_rate": 4.979228296999981e-05, "loss": 2.5046, "mean_token_accuracy": 0.4379310369491577, "step": 90470 }, { "epoch": 0.09112738092000633, "grad_norm": 11.42334023357203, "learning_rate": 4.979223216796758e-05, "loss": 2.7535, "mean_token_accuracy": 0.39655172228813174, "step": 90475 }, { "epoch": 0.0911324169731105, "grad_norm": 13.134017112740706, "learning_rate": 4.979218135975253e-05, "loss": 2.4668, "mean_token_accuracy": 0.39655172526836396, "step": 90480 }, { "epoch": 0.09113745302621468, "grad_norm": 11.330034704872727, "learning_rate": 4.979213054535464e-05, "loss": 2.6321, "mean_token_accuracy": 0.38965516686439516, "step": 90485 }, { "epoch": 0.09114248907931885, "grad_norm": 11.917819059138314, "learning_rate": 4.979207972477395e-05, "loss": 2.532, "mean_token_accuracy": 0.4172413766384125, "step": 90490 }, { "epoch": 0.09114752513242301, "grad_norm": 13.918984551781895, "learning_rate": 4.979202889801047e-05, "loss": 2.632, "mean_token_accuracy": 0.41034482717514037, "step": 90495 }, { "epoch": 0.09115256118552718, "grad_norm": 10.06744244741391, "learning_rate": 4.979197806506421e-05, "loss": 2.6835, "mean_token_accuracy": 0.4184512972831726, "step": 90500 }, { "epoch": 0.09115759723863136, "grad_norm": 9.427658596157837, "learning_rate": 4.979192722593519e-05, "loss": 1.9898, "mean_token_accuracy": 0.5332728505134583, "step": 90505 }, { "epoch": 0.09116263329173553, "grad_norm": 10.294488142990467, "learning_rate": 4.9791876380623424e-05, "loss": 2.4381, "mean_token_accuracy": 0.417241370677948, "step": 90510 }, { "epoch": 0.0911676693448397, "grad_norm": 10.380071947045385, "learning_rate": 4.979182552912892e-05, "loss": 2.0485, "mean_token_accuracy": 0.4931034505367279, "step": 90515 }, { "epoch": 0.09117270539794388, "grad_norm": 13.95284837348923, "learning_rate": 4.979177467145169e-05, "loss": 2.9467, "mean_token_accuracy": 0.37241379618644715, "step": 90520 }, { "epoch": 0.09117774145104805, "grad_norm": 13.305801505372806, "learning_rate": 4.979172380759177e-05, "loss": 2.0719, "mean_token_accuracy": 0.5034482717514038, "step": 90525 }, { "epoch": 0.09118277750415223, "grad_norm": 11.51578153973372, "learning_rate": 4.979167293754914e-05, "loss": 2.7472, "mean_token_accuracy": 0.41379310488700866, "step": 90530 }, { "epoch": 0.0911878135572564, "grad_norm": 14.005653518694388, "learning_rate": 4.9791622061323837e-05, "loss": 2.1233, "mean_token_accuracy": 0.4482758641242981, "step": 90535 }, { "epoch": 0.09119284961036057, "grad_norm": 16.603290507121553, "learning_rate": 4.979157117891587e-05, "loss": 2.33, "mean_token_accuracy": 0.44827585816383364, "step": 90540 }, { "epoch": 0.09119788566346475, "grad_norm": 19.344976339485882, "learning_rate": 4.979152029032525e-05, "loss": 2.3921, "mean_token_accuracy": 0.4206896424293518, "step": 90545 }, { "epoch": 0.09120292171656892, "grad_norm": 12.532908099895122, "learning_rate": 4.9791469395552e-05, "loss": 2.7797, "mean_token_accuracy": 0.3482758641242981, "step": 90550 }, { "epoch": 0.0912079577696731, "grad_norm": 11.00086755507857, "learning_rate": 4.9791418494596126e-05, "loss": 2.742, "mean_token_accuracy": 0.32413792610168457, "step": 90555 }, { "epoch": 0.09121299382277727, "grad_norm": 12.889850960941862, "learning_rate": 4.9791367587457636e-05, "loss": 2.6405, "mean_token_accuracy": 0.39310344457626345, "step": 90560 }, { "epoch": 0.09121802987588143, "grad_norm": 8.803262891234478, "learning_rate": 4.9791316674136564e-05, "loss": 2.466, "mean_token_accuracy": 0.41379310488700866, "step": 90565 }, { "epoch": 0.0912230659289856, "grad_norm": 9.790388551810393, "learning_rate": 4.9791265754632904e-05, "loss": 2.2316, "mean_token_accuracy": 0.4620689690113068, "step": 90570 }, { "epoch": 0.09122810198208978, "grad_norm": 11.23008474108112, "learning_rate": 4.9791214828946676e-05, "loss": 2.413, "mean_token_accuracy": 0.4620689690113068, "step": 90575 }, { "epoch": 0.09123313803519395, "grad_norm": 10.128532772528109, "learning_rate": 4.97911638970779e-05, "loss": 2.6874, "mean_token_accuracy": 0.4344827622175217, "step": 90580 }, { "epoch": 0.09123817408829812, "grad_norm": 9.292320863219803, "learning_rate": 4.9791112959026584e-05, "loss": 2.2474, "mean_token_accuracy": 0.42413792610168455, "step": 90585 }, { "epoch": 0.0912432101414023, "grad_norm": 13.28177556868752, "learning_rate": 4.9791062014792754e-05, "loss": 2.891, "mean_token_accuracy": 0.42202056646347047, "step": 90590 }, { "epoch": 0.09124824619450647, "grad_norm": 9.513147556239415, "learning_rate": 4.979101106437641e-05, "loss": 2.4266, "mean_token_accuracy": 0.39310345649719236, "step": 90595 }, { "epoch": 0.09125328224761065, "grad_norm": 13.834591506315695, "learning_rate": 4.9790960107777565e-05, "loss": 2.593, "mean_token_accuracy": 0.4103448212146759, "step": 90600 }, { "epoch": 0.09125831830071482, "grad_norm": 13.569034296855074, "learning_rate": 4.979090914499624e-05, "loss": 2.6677, "mean_token_accuracy": 0.3931034505367279, "step": 90605 }, { "epoch": 0.09126335435381899, "grad_norm": 13.022871098093944, "learning_rate": 4.979085817603245e-05, "loss": 2.5268, "mean_token_accuracy": 0.4344827651977539, "step": 90610 }, { "epoch": 0.09126839040692317, "grad_norm": 9.828888983216695, "learning_rate": 4.9790807200886213e-05, "loss": 2.0444, "mean_token_accuracy": 0.45517241954803467, "step": 90615 }, { "epoch": 0.09127342646002734, "grad_norm": 12.827386655104135, "learning_rate": 4.979075621955753e-05, "loss": 2.7361, "mean_token_accuracy": 0.4344827651977539, "step": 90620 }, { "epoch": 0.09127846251313151, "grad_norm": 14.577570724483381, "learning_rate": 4.979070523204642e-05, "loss": 2.5248, "mean_token_accuracy": 0.3758620649576187, "step": 90625 }, { "epoch": 0.09128349856623569, "grad_norm": 10.604168408774838, "learning_rate": 4.979065423835292e-05, "loss": 2.3437, "mean_token_accuracy": 0.47586206793785096, "step": 90630 }, { "epoch": 0.09128853461933985, "grad_norm": 17.039432442128064, "learning_rate": 4.979060323847701e-05, "loss": 2.9704, "mean_token_accuracy": 0.38965516686439516, "step": 90635 }, { "epoch": 0.09129357067244402, "grad_norm": 11.654590768566218, "learning_rate": 4.9790552232418705e-05, "loss": 3.0543, "mean_token_accuracy": 0.3758620619773865, "step": 90640 }, { "epoch": 0.0912986067255482, "grad_norm": 10.132216565602302, "learning_rate": 4.9790501220178056e-05, "loss": 2.2722, "mean_token_accuracy": 0.4413793206214905, "step": 90645 }, { "epoch": 0.09130364277865237, "grad_norm": 8.865474886685172, "learning_rate": 4.979045020175504e-05, "loss": 2.5735, "mean_token_accuracy": 0.39655172228813174, "step": 90650 }, { "epoch": 0.09130867883175654, "grad_norm": 14.298358312157104, "learning_rate": 4.9790399177149684e-05, "loss": 2.4501, "mean_token_accuracy": 0.4137930929660797, "step": 90655 }, { "epoch": 0.09131371488486072, "grad_norm": 9.560555186015067, "learning_rate": 4.9790348146362e-05, "loss": 2.5912, "mean_token_accuracy": 0.4344827592372894, "step": 90660 }, { "epoch": 0.09131875093796489, "grad_norm": 10.178985971743959, "learning_rate": 4.9790297109392017e-05, "loss": 2.2362, "mean_token_accuracy": 0.41034482717514037, "step": 90665 }, { "epoch": 0.09132378699106906, "grad_norm": 13.399232515024941, "learning_rate": 4.9790246066239734e-05, "loss": 2.1666, "mean_token_accuracy": 0.4620689630508423, "step": 90670 }, { "epoch": 0.09132882304417324, "grad_norm": 10.840527955665955, "learning_rate": 4.979019501690516e-05, "loss": 2.4118, "mean_token_accuracy": 0.4689655125141144, "step": 90675 }, { "epoch": 0.09133385909727741, "grad_norm": 9.436586919515545, "learning_rate": 4.979014396138831e-05, "loss": 2.3065, "mean_token_accuracy": 0.42413792610168455, "step": 90680 }, { "epoch": 0.09133889515038159, "grad_norm": 10.86709552401155, "learning_rate": 4.9790092899689226e-05, "loss": 2.6396, "mean_token_accuracy": 0.4137930989265442, "step": 90685 }, { "epoch": 0.09134393120348576, "grad_norm": 10.32896685249154, "learning_rate": 4.979004183180789e-05, "loss": 2.4058, "mean_token_accuracy": 0.42758620977401735, "step": 90690 }, { "epoch": 0.09134896725658993, "grad_norm": 10.069455114522174, "learning_rate": 4.978999075774434e-05, "loss": 3.1, "mean_token_accuracy": 0.3310344755649567, "step": 90695 }, { "epoch": 0.0913540033096941, "grad_norm": 11.873116990985515, "learning_rate": 4.978993967749856e-05, "loss": 3.0076, "mean_token_accuracy": 0.3275862097740173, "step": 90700 }, { "epoch": 0.09135903936279827, "grad_norm": 12.949946246719957, "learning_rate": 4.978988859107059e-05, "loss": 2.275, "mean_token_accuracy": 0.43103447556495667, "step": 90705 }, { "epoch": 0.09136407541590244, "grad_norm": 14.766304129031719, "learning_rate": 4.9789837498460434e-05, "loss": 2.4875, "mean_token_accuracy": 0.42068964838981626, "step": 90710 }, { "epoch": 0.09136911146900661, "grad_norm": 11.446498943434896, "learning_rate": 4.9789786399668116e-05, "loss": 2.0552, "mean_token_accuracy": 0.41724138855934145, "step": 90715 }, { "epoch": 0.09137414752211079, "grad_norm": 10.215952945408292, "learning_rate": 4.978973529469363e-05, "loss": 2.2684, "mean_token_accuracy": 0.4121597111225128, "step": 90720 }, { "epoch": 0.09137918357521496, "grad_norm": 12.394624234398494, "learning_rate": 4.9789684183537015e-05, "loss": 2.5155, "mean_token_accuracy": 0.41034482717514037, "step": 90725 }, { "epoch": 0.09138421962831914, "grad_norm": 8.94612254831858, "learning_rate": 4.9789633066198266e-05, "loss": 2.6316, "mean_token_accuracy": 0.4275861978530884, "step": 90730 }, { "epoch": 0.09138925568142331, "grad_norm": 9.57877828392471, "learning_rate": 4.9789581942677404e-05, "loss": 2.7033, "mean_token_accuracy": 0.3827586233615875, "step": 90735 }, { "epoch": 0.09139429173452748, "grad_norm": 9.352924585756302, "learning_rate": 4.978953081297445e-05, "loss": 2.1651, "mean_token_accuracy": 0.4620689511299133, "step": 90740 }, { "epoch": 0.09139932778763166, "grad_norm": 13.0342959548774, "learning_rate": 4.9789479677089404e-05, "loss": 2.4411, "mean_token_accuracy": 0.41724138259887694, "step": 90745 }, { "epoch": 0.09140436384073583, "grad_norm": 9.575965786675098, "learning_rate": 4.9789428535022294e-05, "loss": 2.7624, "mean_token_accuracy": 0.4156079888343811, "step": 90750 }, { "epoch": 0.09140939989384, "grad_norm": 10.256049240278756, "learning_rate": 4.9789377386773125e-05, "loss": 2.575, "mean_token_accuracy": 0.4275862008333206, "step": 90755 }, { "epoch": 0.09141443594694418, "grad_norm": 12.969379706655172, "learning_rate": 4.978932623234191e-05, "loss": 2.6783, "mean_token_accuracy": 0.3911675691604614, "step": 90760 }, { "epoch": 0.09141947200004835, "grad_norm": 13.497087733534899, "learning_rate": 4.9789275071728674e-05, "loss": 2.4845, "mean_token_accuracy": 0.3758620619773865, "step": 90765 }, { "epoch": 0.09142450805315253, "grad_norm": 9.117923971815056, "learning_rate": 4.978922390493342e-05, "loss": 2.5062, "mean_token_accuracy": 0.4689655125141144, "step": 90770 }, { "epoch": 0.09142954410625669, "grad_norm": 13.82756726187566, "learning_rate": 4.978917273195617e-05, "loss": 2.5924, "mean_token_accuracy": 0.4482758641242981, "step": 90775 }, { "epoch": 0.09143458015936086, "grad_norm": 10.129820451205916, "learning_rate": 4.978912155279692e-05, "loss": 2.2354, "mean_token_accuracy": 0.4413793087005615, "step": 90780 }, { "epoch": 0.09143961621246503, "grad_norm": 11.054948340056065, "learning_rate": 4.978907036745572e-05, "loss": 2.704, "mean_token_accuracy": 0.4034482717514038, "step": 90785 }, { "epoch": 0.0914446522655692, "grad_norm": 16.174054692559015, "learning_rate": 4.978901917593255e-05, "loss": 2.2476, "mean_token_accuracy": 0.4206896543502808, "step": 90790 }, { "epoch": 0.09144968831867338, "grad_norm": 11.188054268859252, "learning_rate": 4.9788967978227445e-05, "loss": 2.1996, "mean_token_accuracy": 0.4931034445762634, "step": 90795 }, { "epoch": 0.09145472437177755, "grad_norm": 12.159251287221021, "learning_rate": 4.978891677434041e-05, "loss": 2.6476, "mean_token_accuracy": 0.3275862127542496, "step": 90800 }, { "epoch": 0.09145976042488173, "grad_norm": 13.460968075733728, "learning_rate": 4.978886556427146e-05, "loss": 2.3511, "mean_token_accuracy": 0.4034482777118683, "step": 90805 }, { "epoch": 0.0914647964779859, "grad_norm": 9.718573725500347, "learning_rate": 4.978881434802061e-05, "loss": 1.9663, "mean_token_accuracy": 0.47586206197738645, "step": 90810 }, { "epoch": 0.09146983253109008, "grad_norm": 13.677593013482635, "learning_rate": 4.978876312558787e-05, "loss": 2.7577, "mean_token_accuracy": 0.3586206942796707, "step": 90815 }, { "epoch": 0.09147486858419425, "grad_norm": 8.38362387308308, "learning_rate": 4.978871189697326e-05, "loss": 2.3008, "mean_token_accuracy": 0.4310344815254211, "step": 90820 }, { "epoch": 0.09147990463729842, "grad_norm": 14.270403772669965, "learning_rate": 4.97886606621768e-05, "loss": 2.8973, "mean_token_accuracy": 0.4137930989265442, "step": 90825 }, { "epoch": 0.0914849406904026, "grad_norm": 13.636687567834448, "learning_rate": 4.978860942119849e-05, "loss": 2.455, "mean_token_accuracy": 0.44137930274009707, "step": 90830 }, { "epoch": 0.09148997674350677, "grad_norm": 12.822580915724185, "learning_rate": 4.9788558174038353e-05, "loss": 2.2571, "mean_token_accuracy": 0.46551724672317507, "step": 90835 }, { "epoch": 0.09149501279661094, "grad_norm": 10.246435403048995, "learning_rate": 4.9788506920696397e-05, "loss": 2.6831, "mean_token_accuracy": 0.37931033968925476, "step": 90840 }, { "epoch": 0.0915000488497151, "grad_norm": 9.925755719085261, "learning_rate": 4.9788455661172644e-05, "loss": 2.0752, "mean_token_accuracy": 0.46896552443504336, "step": 90845 }, { "epoch": 0.09150508490281928, "grad_norm": 12.083805192232951, "learning_rate": 4.97884043954671e-05, "loss": 2.7616, "mean_token_accuracy": 0.39310344457626345, "step": 90850 }, { "epoch": 0.09151012095592345, "grad_norm": 12.131000154951197, "learning_rate": 4.9788353123579794e-05, "loss": 2.1764, "mean_token_accuracy": 0.4448275864124298, "step": 90855 }, { "epoch": 0.09151515700902763, "grad_norm": 9.923317496449467, "learning_rate": 4.978830184551072e-05, "loss": 2.2892, "mean_token_accuracy": 0.4137930989265442, "step": 90860 }, { "epoch": 0.0915201930621318, "grad_norm": 10.543632140348988, "learning_rate": 4.978825056125992e-05, "loss": 2.3153, "mean_token_accuracy": 0.4379310250282288, "step": 90865 }, { "epoch": 0.09152522911523597, "grad_norm": 8.772785049755147, "learning_rate": 4.9788199270827376e-05, "loss": 2.1919, "mean_token_accuracy": 0.4103448212146759, "step": 90870 }, { "epoch": 0.09153026516834015, "grad_norm": 12.306312426433067, "learning_rate": 4.9788147974213115e-05, "loss": 2.3901, "mean_token_accuracy": 0.44827587008476255, "step": 90875 }, { "epoch": 0.09153530122144432, "grad_norm": 12.371878323458768, "learning_rate": 4.978809667141715e-05, "loss": 2.6801, "mean_token_accuracy": 0.4206896543502808, "step": 90880 }, { "epoch": 0.0915403372745485, "grad_norm": 10.801793776274907, "learning_rate": 4.9788045362439504e-05, "loss": 2.5133, "mean_token_accuracy": 0.3827586114406586, "step": 90885 }, { "epoch": 0.09154537332765267, "grad_norm": 12.998255239260608, "learning_rate": 4.978799404728019e-05, "loss": 2.4602, "mean_token_accuracy": 0.4068965494632721, "step": 90890 }, { "epoch": 0.09155040938075684, "grad_norm": 12.735133937176153, "learning_rate": 4.978794272593921e-05, "loss": 2.0186, "mean_token_accuracy": 0.4816696882247925, "step": 90895 }, { "epoch": 0.09155544543386102, "grad_norm": 11.601192044198942, "learning_rate": 4.97878913984166e-05, "loss": 3.2055, "mean_token_accuracy": 0.34827586114406583, "step": 90900 }, { "epoch": 0.09156048148696519, "grad_norm": 8.181351658599045, "learning_rate": 4.978784006471234e-05, "loss": 2.3685, "mean_token_accuracy": 0.46061705946922304, "step": 90905 }, { "epoch": 0.09156551754006936, "grad_norm": 11.253534394075366, "learning_rate": 4.978778872482648e-05, "loss": 2.7065, "mean_token_accuracy": 0.41554749608039854, "step": 90910 }, { "epoch": 0.09157055359317352, "grad_norm": 10.23745815098917, "learning_rate": 4.978773737875901e-05, "loss": 2.7288, "mean_token_accuracy": 0.4344827711582184, "step": 90915 }, { "epoch": 0.0915755896462777, "grad_norm": 10.87043324931439, "learning_rate": 4.9787686026509955e-05, "loss": 2.1362, "mean_token_accuracy": 0.4551724135875702, "step": 90920 }, { "epoch": 0.09158062569938187, "grad_norm": 13.260782976745794, "learning_rate": 4.9787634668079325e-05, "loss": 2.4237, "mean_token_accuracy": 0.41379310488700866, "step": 90925 }, { "epoch": 0.09158566175248604, "grad_norm": 9.924293225660122, "learning_rate": 4.978758330346714e-05, "loss": 2.4388, "mean_token_accuracy": 0.4, "step": 90930 }, { "epoch": 0.09159069780559022, "grad_norm": 10.772146633205935, "learning_rate": 4.9787531932673415e-05, "loss": 2.3506, "mean_token_accuracy": 0.39655172228813174, "step": 90935 }, { "epoch": 0.09159573385869439, "grad_norm": 11.5605364807525, "learning_rate": 4.978748055569815e-05, "loss": 2.829, "mean_token_accuracy": 0.3827586114406586, "step": 90940 }, { "epoch": 0.09160076991179857, "grad_norm": 9.62955078092501, "learning_rate": 4.978742917254138e-05, "loss": 2.5052, "mean_token_accuracy": 0.3827586233615875, "step": 90945 }, { "epoch": 0.09160580596490274, "grad_norm": 14.336809030837403, "learning_rate": 4.97873777832031e-05, "loss": 2.433, "mean_token_accuracy": 0.38620689511299133, "step": 90950 }, { "epoch": 0.09161084201800691, "grad_norm": 10.668974401281597, "learning_rate": 4.9787326387683336e-05, "loss": 2.861, "mean_token_accuracy": 0.358620685338974, "step": 90955 }, { "epoch": 0.09161587807111109, "grad_norm": 10.689576635563803, "learning_rate": 4.9787274985982094e-05, "loss": 2.9082, "mean_token_accuracy": 0.3706594049930573, "step": 90960 }, { "epoch": 0.09162091412421526, "grad_norm": 13.111652477133445, "learning_rate": 4.97872235780994e-05, "loss": 2.472, "mean_token_accuracy": 0.4517241358757019, "step": 90965 }, { "epoch": 0.09162595017731943, "grad_norm": 11.413221592210212, "learning_rate": 4.9787172164035255e-05, "loss": 2.4236, "mean_token_accuracy": 0.43968542814254763, "step": 90970 }, { "epoch": 0.09163098623042361, "grad_norm": 12.329692031597835, "learning_rate": 4.978712074378969e-05, "loss": 2.3945, "mean_token_accuracy": 0.45069570541381837, "step": 90975 }, { "epoch": 0.09163602228352778, "grad_norm": 10.289655021961428, "learning_rate": 4.978706931736269e-05, "loss": 2.3951, "mean_token_accuracy": 0.42758620977401735, "step": 90980 }, { "epoch": 0.09164105833663194, "grad_norm": 12.282528363652855, "learning_rate": 4.978701788475431e-05, "loss": 2.6259, "mean_token_accuracy": 0.41246218085289, "step": 90985 }, { "epoch": 0.09164609438973612, "grad_norm": 13.890347942870086, "learning_rate": 4.978696644596453e-05, "loss": 2.5411, "mean_token_accuracy": 0.44482759237289426, "step": 90990 }, { "epoch": 0.09165113044284029, "grad_norm": 11.329850069988508, "learning_rate": 4.9786915000993384e-05, "loss": 2.3926, "mean_token_accuracy": 0.39655172228813174, "step": 90995 }, { "epoch": 0.09165616649594446, "grad_norm": 12.067304061637882, "learning_rate": 4.9786863549840875e-05, "loss": 2.3705, "mean_token_accuracy": 0.4586206912994385, "step": 91000 }, { "epoch": 0.09166120254904864, "grad_norm": 12.330877014349838, "learning_rate": 4.978681209250701e-05, "loss": 2.3797, "mean_token_accuracy": 0.3947973310947418, "step": 91005 }, { "epoch": 0.09166623860215281, "grad_norm": 13.164310074296923, "learning_rate": 4.978676062899183e-05, "loss": 2.7087, "mean_token_accuracy": 0.3482758641242981, "step": 91010 }, { "epoch": 0.09167127465525698, "grad_norm": 11.151750244471126, "learning_rate": 4.9786709159295334e-05, "loss": 2.3916, "mean_token_accuracy": 0.4172413766384125, "step": 91015 }, { "epoch": 0.09167631070836116, "grad_norm": 9.213784796293915, "learning_rate": 4.978665768341753e-05, "loss": 2.4231, "mean_token_accuracy": 0.4620689630508423, "step": 91020 }, { "epoch": 0.09168134676146533, "grad_norm": 10.567450431742312, "learning_rate": 4.978660620135844e-05, "loss": 2.4791, "mean_token_accuracy": 0.42068966031074523, "step": 91025 }, { "epoch": 0.0916863828145695, "grad_norm": 9.333370236418645, "learning_rate": 4.978655471311807e-05, "loss": 1.9976, "mean_token_accuracy": 0.5310344755649566, "step": 91030 }, { "epoch": 0.09169141886767368, "grad_norm": 8.572559705260733, "learning_rate": 4.978650321869645e-05, "loss": 2.599, "mean_token_accuracy": 0.42262552976608275, "step": 91035 }, { "epoch": 0.09169645492077785, "grad_norm": 10.57211996119525, "learning_rate": 4.978645171809358e-05, "loss": 2.5638, "mean_token_accuracy": 0.33793103098869326, "step": 91040 }, { "epoch": 0.09170149097388203, "grad_norm": 11.856143308769521, "learning_rate": 4.978640021130948e-05, "loss": 2.6072, "mean_token_accuracy": 0.337931028008461, "step": 91045 }, { "epoch": 0.0917065270269862, "grad_norm": 9.570392324391149, "learning_rate": 4.978634869834417e-05, "loss": 2.3877, "mean_token_accuracy": 0.42758620977401735, "step": 91050 }, { "epoch": 0.09171156308009036, "grad_norm": 11.430163687315764, "learning_rate": 4.9786297179197646e-05, "loss": 2.6755, "mean_token_accuracy": 0.37241379022598264, "step": 91055 }, { "epoch": 0.09171659913319453, "grad_norm": 10.843737410843842, "learning_rate": 4.978624565386995e-05, "loss": 2.4512, "mean_token_accuracy": 0.4310344815254211, "step": 91060 }, { "epoch": 0.09172163518629871, "grad_norm": 11.358222977494112, "learning_rate": 4.978619412236107e-05, "loss": 2.0878, "mean_token_accuracy": 0.49999999403953554, "step": 91065 }, { "epoch": 0.09172667123940288, "grad_norm": 10.162079198746095, "learning_rate": 4.9786142584671033e-05, "loss": 2.4035, "mean_token_accuracy": 0.39655172228813174, "step": 91070 }, { "epoch": 0.09173170729250706, "grad_norm": 14.268007778474047, "learning_rate": 4.9786091040799845e-05, "loss": 2.3024, "mean_token_accuracy": 0.4464004814624786, "step": 91075 }, { "epoch": 0.09173674334561123, "grad_norm": 10.606547059993941, "learning_rate": 4.9786039490747546e-05, "loss": 2.4423, "mean_token_accuracy": 0.3896551787853241, "step": 91080 }, { "epoch": 0.0917417793987154, "grad_norm": 13.216989838962837, "learning_rate": 4.978598793451411e-05, "loss": 2.3964, "mean_token_accuracy": 0.4344827592372894, "step": 91085 }, { "epoch": 0.09174681545181958, "grad_norm": 11.276264329114268, "learning_rate": 4.978593637209958e-05, "loss": 2.5131, "mean_token_accuracy": 0.4068965554237366, "step": 91090 }, { "epoch": 0.09175185150492375, "grad_norm": 12.688798500865797, "learning_rate": 4.978588480350397e-05, "loss": 2.697, "mean_token_accuracy": 0.4087719261646271, "step": 91095 }, { "epoch": 0.09175688755802792, "grad_norm": 12.388882303348021, "learning_rate": 4.978583322872727e-05, "loss": 2.8277, "mean_token_accuracy": 0.37586207389831544, "step": 91100 }, { "epoch": 0.0917619236111321, "grad_norm": 10.238108676625371, "learning_rate": 4.9785781647769525e-05, "loss": 2.2034, "mean_token_accuracy": 0.4746521532535553, "step": 91105 }, { "epoch": 0.09176695966423627, "grad_norm": 10.688883014791124, "learning_rate": 4.978573006063072e-05, "loss": 2.7253, "mean_token_accuracy": 0.38965516686439516, "step": 91110 }, { "epoch": 0.09177199571734045, "grad_norm": 10.101650083739493, "learning_rate": 4.97856784673109e-05, "loss": 2.6327, "mean_token_accuracy": 0.4068965554237366, "step": 91115 }, { "epoch": 0.09177703177044462, "grad_norm": 10.954479480035626, "learning_rate": 4.978562686781007e-05, "loss": 2.5482, "mean_token_accuracy": 0.39655172228813174, "step": 91120 }, { "epoch": 0.09178206782354878, "grad_norm": 11.7706401268676, "learning_rate": 4.9785575262128215e-05, "loss": 2.4409, "mean_token_accuracy": 0.4068965554237366, "step": 91125 }, { "epoch": 0.09178710387665295, "grad_norm": 10.888179909089782, "learning_rate": 4.9785523650265394e-05, "loss": 2.5967, "mean_token_accuracy": 0.3655172407627106, "step": 91130 }, { "epoch": 0.09179213992975713, "grad_norm": 8.097531004249877, "learning_rate": 4.9785472032221584e-05, "loss": 2.6859, "mean_token_accuracy": 0.4020568609237671, "step": 91135 }, { "epoch": 0.0917971759828613, "grad_norm": 9.791141932951607, "learning_rate": 4.9785420407996825e-05, "loss": 2.8971, "mean_token_accuracy": 0.34137930572032926, "step": 91140 }, { "epoch": 0.09180221203596547, "grad_norm": 15.024615932269995, "learning_rate": 4.978536877759112e-05, "loss": 2.5507, "mean_token_accuracy": 0.43103448748588563, "step": 91145 }, { "epoch": 0.09180724808906965, "grad_norm": 11.452345941900216, "learning_rate": 4.9785317141004476e-05, "loss": 3.0245, "mean_token_accuracy": 0.3620689630508423, "step": 91150 }, { "epoch": 0.09181228414217382, "grad_norm": 9.576852377368313, "learning_rate": 4.9785265498236926e-05, "loss": 2.1225, "mean_token_accuracy": 0.47931034564971925, "step": 91155 }, { "epoch": 0.091817320195278, "grad_norm": 10.715093676823798, "learning_rate": 4.978521384928847e-05, "loss": 2.5336, "mean_token_accuracy": 0.3793103516101837, "step": 91160 }, { "epoch": 0.09182235624838217, "grad_norm": 11.649782628733607, "learning_rate": 4.978516219415913e-05, "loss": 2.6976, "mean_token_accuracy": 0.39443435668945315, "step": 91165 }, { "epoch": 0.09182739230148634, "grad_norm": 11.257245827100283, "learning_rate": 4.978511053284892e-05, "loss": 2.3676, "mean_token_accuracy": 0.47241379618644713, "step": 91170 }, { "epoch": 0.09183242835459052, "grad_norm": 12.138631260258938, "learning_rate": 4.9785058865357845e-05, "loss": 2.2924, "mean_token_accuracy": 0.4724137902259827, "step": 91175 }, { "epoch": 0.09183746440769469, "grad_norm": 11.678413161577193, "learning_rate": 4.9785007191685925e-05, "loss": 2.2639, "mean_token_accuracy": 0.4413793087005615, "step": 91180 }, { "epoch": 0.09184250046079886, "grad_norm": 9.64401156495291, "learning_rate": 4.978495551183318e-05, "loss": 2.6117, "mean_token_accuracy": 0.441379314661026, "step": 91185 }, { "epoch": 0.09184753651390304, "grad_norm": 17.846222257789034, "learning_rate": 4.978490382579962e-05, "loss": 2.4625, "mean_token_accuracy": 0.44482758045196535, "step": 91190 }, { "epoch": 0.0918525725670072, "grad_norm": 10.405443464258369, "learning_rate": 4.9784852133585255e-05, "loss": 2.3639, "mean_token_accuracy": 0.4172413766384125, "step": 91195 }, { "epoch": 0.09185760862011137, "grad_norm": 9.00877648839713, "learning_rate": 4.9784800435190104e-05, "loss": 2.5032, "mean_token_accuracy": 0.4172413796186447, "step": 91200 }, { "epoch": 0.09186264467321555, "grad_norm": 11.719650160135206, "learning_rate": 4.978474873061419e-05, "loss": 2.3411, "mean_token_accuracy": 0.46382336020469667, "step": 91205 }, { "epoch": 0.09186768072631972, "grad_norm": 18.081283107055093, "learning_rate": 4.97846970198575e-05, "loss": 2.5881, "mean_token_accuracy": 0.38620689511299133, "step": 91210 }, { "epoch": 0.0918727167794239, "grad_norm": 12.122610555969581, "learning_rate": 4.978464530292008e-05, "loss": 2.3651, "mean_token_accuracy": 0.42758620381355283, "step": 91215 }, { "epoch": 0.09187775283252807, "grad_norm": 10.66503984841771, "learning_rate": 4.978459357980192e-05, "loss": 2.6304, "mean_token_accuracy": 0.3655172407627106, "step": 91220 }, { "epoch": 0.09188278888563224, "grad_norm": 9.235295270831163, "learning_rate": 4.9784541850503055e-05, "loss": 2.194, "mean_token_accuracy": 0.46896551847457885, "step": 91225 }, { "epoch": 0.09188782493873641, "grad_norm": 10.517772011518831, "learning_rate": 4.978449011502348e-05, "loss": 2.2202, "mean_token_accuracy": 0.4551724076271057, "step": 91230 }, { "epoch": 0.09189286099184059, "grad_norm": 12.460092814106488, "learning_rate": 4.978443837336322e-05, "loss": 2.4646, "mean_token_accuracy": 0.36896551847457887, "step": 91235 }, { "epoch": 0.09189789704494476, "grad_norm": 9.828437523843268, "learning_rate": 4.97843866255223e-05, "loss": 2.431, "mean_token_accuracy": 0.46896551847457885, "step": 91240 }, { "epoch": 0.09190293309804894, "grad_norm": 8.31641743333721, "learning_rate": 4.9784334871500704e-05, "loss": 2.0964, "mean_token_accuracy": 0.48275862336158754, "step": 91245 }, { "epoch": 0.09190796915115311, "grad_norm": 12.874378177323996, "learning_rate": 4.978428311129847e-05, "loss": 2.4202, "mean_token_accuracy": 0.37586207389831544, "step": 91250 }, { "epoch": 0.09191300520425728, "grad_norm": 10.731294427394966, "learning_rate": 4.978423134491562e-05, "loss": 2.1245, "mean_token_accuracy": 0.47241378426551817, "step": 91255 }, { "epoch": 0.09191804125736146, "grad_norm": 9.52657583058018, "learning_rate": 4.978417957235215e-05, "loss": 2.302, "mean_token_accuracy": 0.4931034445762634, "step": 91260 }, { "epoch": 0.09192307731046562, "grad_norm": 12.513974946585465, "learning_rate": 4.978412779360807e-05, "loss": 2.3103, "mean_token_accuracy": 0.3999999940395355, "step": 91265 }, { "epoch": 0.09192811336356979, "grad_norm": 9.77231374462633, "learning_rate": 4.978407600868341e-05, "loss": 2.2763, "mean_token_accuracy": 0.4448275864124298, "step": 91270 }, { "epoch": 0.09193314941667396, "grad_norm": 9.313448215191945, "learning_rate": 4.978402421757818e-05, "loss": 2.4836, "mean_token_accuracy": 0.4448275864124298, "step": 91275 }, { "epoch": 0.09193818546977814, "grad_norm": 10.76638902821825, "learning_rate": 4.978397242029239e-05, "loss": 2.3406, "mean_token_accuracy": 0.4396249234676361, "step": 91280 }, { "epoch": 0.09194322152288231, "grad_norm": 13.039744634128711, "learning_rate": 4.978392061682605e-05, "loss": 2.8103, "mean_token_accuracy": 0.42068966031074523, "step": 91285 }, { "epoch": 0.09194825757598649, "grad_norm": 13.512668424616919, "learning_rate": 4.9783868807179195e-05, "loss": 2.3436, "mean_token_accuracy": 0.4137930989265442, "step": 91290 }, { "epoch": 0.09195329362909066, "grad_norm": 9.212437342648288, "learning_rate": 4.978381699135182e-05, "loss": 2.3996, "mean_token_accuracy": 0.4551724135875702, "step": 91295 }, { "epoch": 0.09195832968219483, "grad_norm": 10.786412347645207, "learning_rate": 4.978376516934394e-05, "loss": 2.5634, "mean_token_accuracy": 0.3862068891525269, "step": 91300 }, { "epoch": 0.09196336573529901, "grad_norm": 10.37327827270894, "learning_rate": 4.978371334115559e-05, "loss": 2.7917, "mean_token_accuracy": 0.358620685338974, "step": 91305 }, { "epoch": 0.09196840178840318, "grad_norm": 11.51002076350022, "learning_rate": 4.978366150678675e-05, "loss": 2.4022, "mean_token_accuracy": 0.4137930989265442, "step": 91310 }, { "epoch": 0.09197343784150736, "grad_norm": 11.841280611853335, "learning_rate": 4.9783609666237465e-05, "loss": 2.82, "mean_token_accuracy": 0.37931033968925476, "step": 91315 }, { "epoch": 0.09197847389461153, "grad_norm": 10.058080052906332, "learning_rate": 4.9783557819507735e-05, "loss": 2.9309, "mean_token_accuracy": 0.3862068891525269, "step": 91320 }, { "epoch": 0.0919835099477157, "grad_norm": 9.902150094797275, "learning_rate": 4.978350596659758e-05, "loss": 2.6271, "mean_token_accuracy": 0.36896551251411436, "step": 91325 }, { "epoch": 0.09198854600081988, "grad_norm": 9.574789530730675, "learning_rate": 4.9783454107507003e-05, "loss": 2.4316, "mean_token_accuracy": 0.37586207389831544, "step": 91330 }, { "epoch": 0.09199358205392404, "grad_norm": 12.645337941566169, "learning_rate": 4.978340224223604e-05, "loss": 2.6544, "mean_token_accuracy": 0.37586206793785093, "step": 91335 }, { "epoch": 0.09199861810702821, "grad_norm": 10.022772676418775, "learning_rate": 4.9783350370784687e-05, "loss": 2.3916, "mean_token_accuracy": 0.41034482717514037, "step": 91340 }, { "epoch": 0.09200365416013238, "grad_norm": 13.479686085998255, "learning_rate": 4.978329849315295e-05, "loss": 2.6592, "mean_token_accuracy": 0.3965517282485962, "step": 91345 }, { "epoch": 0.09200869021323656, "grad_norm": 11.761702815337637, "learning_rate": 4.978324660934088e-05, "loss": 2.7515, "mean_token_accuracy": 0.36896551251411436, "step": 91350 }, { "epoch": 0.09201372626634073, "grad_norm": 11.791522341050483, "learning_rate": 4.978319471934845e-05, "loss": 3.1241, "mean_token_accuracy": 0.3379310339689255, "step": 91355 }, { "epoch": 0.0920187623194449, "grad_norm": 9.399245591651717, "learning_rate": 4.97831428231757e-05, "loss": 2.5292, "mean_token_accuracy": 0.42891711592674253, "step": 91360 }, { "epoch": 0.09202379837254908, "grad_norm": 10.27640859674723, "learning_rate": 4.9783090920822636e-05, "loss": 2.0546, "mean_token_accuracy": 0.4862069070339203, "step": 91365 }, { "epoch": 0.09202883442565325, "grad_norm": 12.845246502024796, "learning_rate": 4.978303901228928e-05, "loss": 2.536, "mean_token_accuracy": 0.39310344457626345, "step": 91370 }, { "epoch": 0.09203387047875743, "grad_norm": 11.126769991830244, "learning_rate": 4.9782987097575626e-05, "loss": 2.3211, "mean_token_accuracy": 0.4413793087005615, "step": 91375 }, { "epoch": 0.0920389065318616, "grad_norm": 11.790111470068537, "learning_rate": 4.978293517668172e-05, "loss": 2.558, "mean_token_accuracy": 0.39655172228813174, "step": 91380 }, { "epoch": 0.09204394258496577, "grad_norm": 8.722133410112265, "learning_rate": 4.9782883249607546e-05, "loss": 2.0106, "mean_token_accuracy": 0.49655171036720275, "step": 91385 }, { "epoch": 0.09204897863806995, "grad_norm": 11.546907510635183, "learning_rate": 4.978283131635314e-05, "loss": 2.3021, "mean_token_accuracy": 0.4103448331356049, "step": 91390 }, { "epoch": 0.09205401469117412, "grad_norm": 16.525178089386465, "learning_rate": 4.97827793769185e-05, "loss": 2.3678, "mean_token_accuracy": 0.45335754156112673, "step": 91395 }, { "epoch": 0.0920590507442783, "grad_norm": 11.611752325724158, "learning_rate": 4.978272743130364e-05, "loss": 2.4188, "mean_token_accuracy": 0.4206896543502808, "step": 91400 }, { "epoch": 0.09206408679738246, "grad_norm": 11.816883859746438, "learning_rate": 4.97826754795086e-05, "loss": 2.4548, "mean_token_accuracy": 0.42758620381355283, "step": 91405 }, { "epoch": 0.09206912285048663, "grad_norm": 9.839824991648943, "learning_rate": 4.978262352153337e-05, "loss": 2.7432, "mean_token_accuracy": 0.3827586233615875, "step": 91410 }, { "epoch": 0.0920741589035908, "grad_norm": 12.816350938713398, "learning_rate": 4.978257155737797e-05, "loss": 2.458, "mean_token_accuracy": 0.42413793206214906, "step": 91415 }, { "epoch": 0.09207919495669498, "grad_norm": 10.681758742236385, "learning_rate": 4.978251958704242e-05, "loss": 2.5561, "mean_token_accuracy": 0.39655173420906065, "step": 91420 }, { "epoch": 0.09208423100979915, "grad_norm": 10.462061438455038, "learning_rate": 4.978246761052672e-05, "loss": 2.1847, "mean_token_accuracy": 0.4862068831920624, "step": 91425 }, { "epoch": 0.09208926706290332, "grad_norm": 9.390953316882701, "learning_rate": 4.9782415627830904e-05, "loss": 2.3809, "mean_token_accuracy": 0.4137930989265442, "step": 91430 }, { "epoch": 0.0920943031160075, "grad_norm": 10.039678233060283, "learning_rate": 4.978236363895498e-05, "loss": 2.4045, "mean_token_accuracy": 0.4379310429096222, "step": 91435 }, { "epoch": 0.09209933916911167, "grad_norm": 13.06541766087045, "learning_rate": 4.978231164389895e-05, "loss": 2.0001, "mean_token_accuracy": 0.47241378426551817, "step": 91440 }, { "epoch": 0.09210437522221585, "grad_norm": 9.28070298066796, "learning_rate": 4.9782259642662844e-05, "loss": 2.2202, "mean_token_accuracy": 0.44343616962432864, "step": 91445 }, { "epoch": 0.09210941127532002, "grad_norm": 11.242640551191501, "learning_rate": 4.978220763524666e-05, "loss": 2.6348, "mean_token_accuracy": 0.38106473684310915, "step": 91450 }, { "epoch": 0.09211444732842419, "grad_norm": 10.331647111100947, "learning_rate": 4.978215562165043e-05, "loss": 2.4388, "mean_token_accuracy": 0.41034482717514037, "step": 91455 }, { "epoch": 0.09211948338152837, "grad_norm": 10.853816320991227, "learning_rate": 4.978210360187416e-05, "loss": 2.4833, "mean_token_accuracy": 0.4379310369491577, "step": 91460 }, { "epoch": 0.09212451943463254, "grad_norm": 10.435375444345308, "learning_rate": 4.978205157591787e-05, "loss": 2.1422, "mean_token_accuracy": 0.42758620381355283, "step": 91465 }, { "epoch": 0.09212955548773671, "grad_norm": 12.174594495056327, "learning_rate": 4.978199954378157e-05, "loss": 2.4169, "mean_token_accuracy": 0.38620689511299133, "step": 91470 }, { "epoch": 0.09213459154084087, "grad_norm": 11.430912925076578, "learning_rate": 4.978194750546527e-05, "loss": 2.3814, "mean_token_accuracy": 0.44482758045196535, "step": 91475 }, { "epoch": 0.09213962759394505, "grad_norm": 11.990234621061962, "learning_rate": 4.978189546096899e-05, "loss": 2.5902, "mean_token_accuracy": 0.4297035574913025, "step": 91480 }, { "epoch": 0.09214466364704922, "grad_norm": 12.729142575973905, "learning_rate": 4.978184341029275e-05, "loss": 2.3773, "mean_token_accuracy": 0.441379314661026, "step": 91485 }, { "epoch": 0.0921496997001534, "grad_norm": 10.29821072746131, "learning_rate": 4.978179135343654e-05, "loss": 2.2013, "mean_token_accuracy": 0.4551724135875702, "step": 91490 }, { "epoch": 0.09215473575325757, "grad_norm": 11.741602705842391, "learning_rate": 4.9781739290400406e-05, "loss": 2.8711, "mean_token_accuracy": 0.31034482419490816, "step": 91495 }, { "epoch": 0.09215977180636174, "grad_norm": 10.201847294756833, "learning_rate": 4.978168722118435e-05, "loss": 2.2467, "mean_token_accuracy": 0.4482758641242981, "step": 91500 }, { "epoch": 0.09216480785946592, "grad_norm": 8.356342195898398, "learning_rate": 4.978163514578837e-05, "loss": 1.9762, "mean_token_accuracy": 0.5068965435028077, "step": 91505 }, { "epoch": 0.09216984391257009, "grad_norm": 11.703853903717821, "learning_rate": 4.978158306421251e-05, "loss": 2.4705, "mean_token_accuracy": 0.38965516686439516, "step": 91510 }, { "epoch": 0.09217487996567426, "grad_norm": 12.911967002653734, "learning_rate": 4.9781530976456756e-05, "loss": 2.447, "mean_token_accuracy": 0.3793103456497192, "step": 91515 }, { "epoch": 0.09217991601877844, "grad_norm": 13.121119445270315, "learning_rate": 4.978147888252115e-05, "loss": 2.3268, "mean_token_accuracy": 0.4551724135875702, "step": 91520 }, { "epoch": 0.09218495207188261, "grad_norm": 10.455021403239186, "learning_rate": 4.978142678240569e-05, "loss": 2.6135, "mean_token_accuracy": 0.358620685338974, "step": 91525 }, { "epoch": 0.09218998812498679, "grad_norm": 9.56973498370362, "learning_rate": 4.9781374676110385e-05, "loss": 2.7487, "mean_token_accuracy": 0.36551724672317504, "step": 91530 }, { "epoch": 0.09219502417809096, "grad_norm": 13.85458236160237, "learning_rate": 4.978132256363527e-05, "loss": 2.2364, "mean_token_accuracy": 0.458620685338974, "step": 91535 }, { "epoch": 0.09220006023119513, "grad_norm": 8.693696820227887, "learning_rate": 4.978127044498034e-05, "loss": 1.8794, "mean_token_accuracy": 0.49655171632766726, "step": 91540 }, { "epoch": 0.09220509628429929, "grad_norm": 11.20318368926345, "learning_rate": 4.9781218320145616e-05, "loss": 2.2114, "mean_token_accuracy": 0.44482759237289426, "step": 91545 }, { "epoch": 0.09221013233740347, "grad_norm": 10.661506949574862, "learning_rate": 4.978116618913111e-05, "loss": 2.0798, "mean_token_accuracy": 0.4801724135875702, "step": 91550 }, { "epoch": 0.09221516839050764, "grad_norm": 11.27391694580578, "learning_rate": 4.9781114051936843e-05, "loss": 2.5363, "mean_token_accuracy": 0.3551724076271057, "step": 91555 }, { "epoch": 0.09222020444361181, "grad_norm": 11.420019388772344, "learning_rate": 4.9781061908562834e-05, "loss": 2.7347, "mean_token_accuracy": 0.38620689511299133, "step": 91560 }, { "epoch": 0.09222524049671599, "grad_norm": 11.98924588703446, "learning_rate": 4.978100975900908e-05, "loss": 2.6827, "mean_token_accuracy": 0.3551724076271057, "step": 91565 }, { "epoch": 0.09223027654982016, "grad_norm": 9.197592441243513, "learning_rate": 4.978095760327561e-05, "loss": 2.3162, "mean_token_accuracy": 0.4275862008333206, "step": 91570 }, { "epoch": 0.09223531260292434, "grad_norm": 12.51261344716163, "learning_rate": 4.9780905441362424e-05, "loss": 2.4994, "mean_token_accuracy": 0.4379310369491577, "step": 91575 }, { "epoch": 0.09224034865602851, "grad_norm": 12.749508383699188, "learning_rate": 4.9780853273269554e-05, "loss": 2.173, "mean_token_accuracy": 0.46896552443504336, "step": 91580 }, { "epoch": 0.09224538470913268, "grad_norm": 10.787347778131876, "learning_rate": 4.9780801098997e-05, "loss": 2.238, "mean_token_accuracy": 0.49051724672317504, "step": 91585 }, { "epoch": 0.09225042076223686, "grad_norm": 9.35630598120748, "learning_rate": 4.978074891854479e-05, "loss": 2.0199, "mean_token_accuracy": 0.458620685338974, "step": 91590 }, { "epoch": 0.09225545681534103, "grad_norm": 10.885274669303373, "learning_rate": 4.978069673191292e-05, "loss": 2.8517, "mean_token_accuracy": 0.4034482777118683, "step": 91595 }, { "epoch": 0.0922604928684452, "grad_norm": 9.616921929781528, "learning_rate": 4.978064453910143e-05, "loss": 2.4912, "mean_token_accuracy": 0.4103448331356049, "step": 91600 }, { "epoch": 0.09226552892154938, "grad_norm": 10.062089845878605, "learning_rate": 4.978059234011031e-05, "loss": 2.4715, "mean_token_accuracy": 0.38965516686439516, "step": 91605 }, { "epoch": 0.09227056497465354, "grad_norm": 10.07272620028625, "learning_rate": 4.97805401349396e-05, "loss": 2.1668, "mean_token_accuracy": 0.4528325110673904, "step": 91610 }, { "epoch": 0.09227560102775771, "grad_norm": 14.241190986049615, "learning_rate": 4.978048792358928e-05, "loss": 2.9241, "mean_token_accuracy": 0.3517241388559341, "step": 91615 }, { "epoch": 0.09228063708086189, "grad_norm": 12.457992284389933, "learning_rate": 4.9780435706059395e-05, "loss": 2.4205, "mean_token_accuracy": 0.40157289505004884, "step": 91620 }, { "epoch": 0.09228567313396606, "grad_norm": 14.140774760851768, "learning_rate": 4.978038348234994e-05, "loss": 2.8097, "mean_token_accuracy": 0.39479734003543854, "step": 91625 }, { "epoch": 0.09229070918707023, "grad_norm": 10.81906840772269, "learning_rate": 4.978033125246094e-05, "loss": 2.7073, "mean_token_accuracy": 0.36551723480224607, "step": 91630 }, { "epoch": 0.0922957452401744, "grad_norm": 9.324775650957427, "learning_rate": 4.978027901639241e-05, "loss": 2.7326, "mean_token_accuracy": 0.4206896543502808, "step": 91635 }, { "epoch": 0.09230078129327858, "grad_norm": 9.289029355684029, "learning_rate": 4.978022677414436e-05, "loss": 2.4426, "mean_token_accuracy": 0.4172413766384125, "step": 91640 }, { "epoch": 0.09230581734638275, "grad_norm": 7.782671014750335, "learning_rate": 4.97801745257168e-05, "loss": 2.3891, "mean_token_accuracy": 0.482758629322052, "step": 91645 }, { "epoch": 0.09231085339948693, "grad_norm": 12.689298582568577, "learning_rate": 4.978012227110976e-05, "loss": 2.2271, "mean_token_accuracy": 0.44827587008476255, "step": 91650 }, { "epoch": 0.0923158894525911, "grad_norm": 10.974861990713135, "learning_rate": 4.978007001032324e-05, "loss": 2.5394, "mean_token_accuracy": 0.3896551728248596, "step": 91655 }, { "epoch": 0.09232092550569528, "grad_norm": 10.28262970441023, "learning_rate": 4.978001774335727e-05, "loss": 2.6577, "mean_token_accuracy": 0.39655172228813174, "step": 91660 }, { "epoch": 0.09232596155879945, "grad_norm": 10.440677114680609, "learning_rate": 4.977996547021184e-05, "loss": 2.8434, "mean_token_accuracy": 0.3586206942796707, "step": 91665 }, { "epoch": 0.09233099761190362, "grad_norm": 12.068466683882658, "learning_rate": 4.977991319088698e-05, "loss": 2.3134, "mean_token_accuracy": 0.4137930989265442, "step": 91670 }, { "epoch": 0.0923360336650078, "grad_norm": 11.875474533938775, "learning_rate": 4.977986090538271e-05, "loss": 2.2169, "mean_token_accuracy": 0.4413793087005615, "step": 91675 }, { "epoch": 0.09234106971811196, "grad_norm": 9.975442368986737, "learning_rate": 4.977980861369903e-05, "loss": 2.3594, "mean_token_accuracy": 0.4068965494632721, "step": 91680 }, { "epoch": 0.09234610577121613, "grad_norm": 9.282875046921536, "learning_rate": 4.977975631583596e-05, "loss": 2.3858, "mean_token_accuracy": 0.4103448331356049, "step": 91685 }, { "epoch": 0.0923511418243203, "grad_norm": 10.018154854933718, "learning_rate": 4.977970401179353e-05, "loss": 2.1811, "mean_token_accuracy": 0.4896551728248596, "step": 91690 }, { "epoch": 0.09235617787742448, "grad_norm": 12.707676003404776, "learning_rate": 4.9779651701571735e-05, "loss": 2.531, "mean_token_accuracy": 0.4172413766384125, "step": 91695 }, { "epoch": 0.09236121393052865, "grad_norm": 10.944946015227982, "learning_rate": 4.977959938517059e-05, "loss": 2.4025, "mean_token_accuracy": 0.40689654350280763, "step": 91700 }, { "epoch": 0.09236624998363283, "grad_norm": 9.27722498234549, "learning_rate": 4.977954706259011e-05, "loss": 2.1266, "mean_token_accuracy": 0.4896551728248596, "step": 91705 }, { "epoch": 0.092371286036737, "grad_norm": 11.33304535786818, "learning_rate": 4.977949473383033e-05, "loss": 2.4682, "mean_token_accuracy": 0.4294010877609253, "step": 91710 }, { "epoch": 0.09237632208984117, "grad_norm": 8.75964993407579, "learning_rate": 4.977944239889125e-05, "loss": 2.3978, "mean_token_accuracy": 0.41379310488700866, "step": 91715 }, { "epoch": 0.09238135814294535, "grad_norm": 12.24738178738062, "learning_rate": 4.977939005777287e-05, "loss": 2.7404, "mean_token_accuracy": 0.3896551728248596, "step": 91720 }, { "epoch": 0.09238639419604952, "grad_norm": 8.731044733469064, "learning_rate": 4.977933771047522e-05, "loss": 2.3912, "mean_token_accuracy": 0.4448275864124298, "step": 91725 }, { "epoch": 0.0923914302491537, "grad_norm": 10.509514323265288, "learning_rate": 4.977928535699832e-05, "loss": 2.7638, "mean_token_accuracy": 0.3931034505367279, "step": 91730 }, { "epoch": 0.09239646630225787, "grad_norm": 8.406831182886414, "learning_rate": 4.9779232997342176e-05, "loss": 2.1772, "mean_token_accuracy": 0.46551724672317507, "step": 91735 }, { "epoch": 0.09240150235536204, "grad_norm": 13.073561562939256, "learning_rate": 4.9779180631506794e-05, "loss": 2.4156, "mean_token_accuracy": 0.40344828367233276, "step": 91740 }, { "epoch": 0.09240653840846622, "grad_norm": 10.106868256835467, "learning_rate": 4.9779128259492205e-05, "loss": 2.3699, "mean_token_accuracy": 0.44827585220336913, "step": 91745 }, { "epoch": 0.09241157446157038, "grad_norm": 11.061674537900139, "learning_rate": 4.977907588129842e-05, "loss": 2.5383, "mean_token_accuracy": 0.4259528160095215, "step": 91750 }, { "epoch": 0.09241661051467455, "grad_norm": 11.601144673137227, "learning_rate": 4.9779023496925446e-05, "loss": 2.84, "mean_token_accuracy": 0.358620685338974, "step": 91755 }, { "epoch": 0.09242164656777872, "grad_norm": 8.999449245397361, "learning_rate": 4.9778971106373304e-05, "loss": 2.51, "mean_token_accuracy": 0.4344827592372894, "step": 91760 }, { "epoch": 0.0924266826208829, "grad_norm": 11.407555044080482, "learning_rate": 4.9778918709642e-05, "loss": 2.5785, "mean_token_accuracy": 0.38620689511299133, "step": 91765 }, { "epoch": 0.09243171867398707, "grad_norm": 10.828041041878562, "learning_rate": 4.977886630673156e-05, "loss": 2.4427, "mean_token_accuracy": 0.42413792610168455, "step": 91770 }, { "epoch": 0.09243675472709124, "grad_norm": 10.078295495759738, "learning_rate": 4.9778813897642e-05, "loss": 2.5963, "mean_token_accuracy": 0.4103448212146759, "step": 91775 }, { "epoch": 0.09244179078019542, "grad_norm": 14.683737954380915, "learning_rate": 4.9778761482373315e-05, "loss": 3.0128, "mean_token_accuracy": 0.33448275923728943, "step": 91780 }, { "epoch": 0.09244682683329959, "grad_norm": 12.284099927649612, "learning_rate": 4.9778709060925545e-05, "loss": 2.3091, "mean_token_accuracy": 0.4586206912994385, "step": 91785 }, { "epoch": 0.09245186288640377, "grad_norm": 10.951028182152557, "learning_rate": 4.977865663329868e-05, "loss": 2.4797, "mean_token_accuracy": 0.43448275327682495, "step": 91790 }, { "epoch": 0.09245689893950794, "grad_norm": 12.810140409139485, "learning_rate": 4.9778604199492755e-05, "loss": 2.4609, "mean_token_accuracy": 0.4068965494632721, "step": 91795 }, { "epoch": 0.09246193499261211, "grad_norm": 8.976250219029128, "learning_rate": 4.977855175950777e-05, "loss": 2.0758, "mean_token_accuracy": 0.482758629322052, "step": 91800 }, { "epoch": 0.09246697104571629, "grad_norm": 10.488429325737775, "learning_rate": 4.977849931334375e-05, "loss": 2.0741, "mean_token_accuracy": 0.4620689570903778, "step": 91805 }, { "epoch": 0.09247200709882046, "grad_norm": 15.265020753872792, "learning_rate": 4.97784468610007e-05, "loss": 2.2982, "mean_token_accuracy": 0.4379310369491577, "step": 91810 }, { "epoch": 0.09247704315192463, "grad_norm": 12.302034888542556, "learning_rate": 4.977839440247864e-05, "loss": 3.0835, "mean_token_accuracy": 0.3793103516101837, "step": 91815 }, { "epoch": 0.0924820792050288, "grad_norm": 13.534320079919688, "learning_rate": 4.977834193777759e-05, "loss": 2.5138, "mean_token_accuracy": 0.4586206912994385, "step": 91820 }, { "epoch": 0.09248711525813297, "grad_norm": 9.922570608599559, "learning_rate": 4.977828946689756e-05, "loss": 2.6005, "mean_token_accuracy": 0.44482758045196535, "step": 91825 }, { "epoch": 0.09249215131123714, "grad_norm": 10.145738653780745, "learning_rate": 4.977823698983856e-05, "loss": 2.7858, "mean_token_accuracy": 0.3965517282485962, "step": 91830 }, { "epoch": 0.09249718736434132, "grad_norm": 18.54497894520018, "learning_rate": 4.977818450660061e-05, "loss": 2.5204, "mean_token_accuracy": 0.39999999701976774, "step": 91835 }, { "epoch": 0.09250222341744549, "grad_norm": 16.19976698891755, "learning_rate": 4.977813201718371e-05, "loss": 2.4778, "mean_token_accuracy": 0.3758620649576187, "step": 91840 }, { "epoch": 0.09250725947054966, "grad_norm": 10.165292794722225, "learning_rate": 4.97780795215879e-05, "loss": 2.4351, "mean_token_accuracy": 0.43793103098869324, "step": 91845 }, { "epoch": 0.09251229552365384, "grad_norm": 17.872040505445923, "learning_rate": 4.9778027019813175e-05, "loss": 2.4495, "mean_token_accuracy": 0.4172413766384125, "step": 91850 }, { "epoch": 0.09251733157675801, "grad_norm": 9.677642463929592, "learning_rate": 4.977797451185956e-05, "loss": 2.1387, "mean_token_accuracy": 0.47931034564971925, "step": 91855 }, { "epoch": 0.09252236762986218, "grad_norm": 12.743173363533447, "learning_rate": 4.977792199772707e-05, "loss": 2.521, "mean_token_accuracy": 0.3620689630508423, "step": 91860 }, { "epoch": 0.09252740368296636, "grad_norm": 11.17643234497167, "learning_rate": 4.9777869477415706e-05, "loss": 2.5418, "mean_token_accuracy": 0.41379310488700866, "step": 91865 }, { "epoch": 0.09253243973607053, "grad_norm": 11.979008736123793, "learning_rate": 4.9777816950925496e-05, "loss": 2.5577, "mean_token_accuracy": 0.37241379618644715, "step": 91870 }, { "epoch": 0.0925374757891747, "grad_norm": 10.53161624886413, "learning_rate": 4.977776441825645e-05, "loss": 2.9344, "mean_token_accuracy": 0.3482758641242981, "step": 91875 }, { "epoch": 0.09254251184227888, "grad_norm": 10.437213775023281, "learning_rate": 4.9777711879408586e-05, "loss": 2.4423, "mean_token_accuracy": 0.4034482777118683, "step": 91880 }, { "epoch": 0.09254754789538305, "grad_norm": 12.176535903551, "learning_rate": 4.977765933438191e-05, "loss": 2.4869, "mean_token_accuracy": 0.4275861978530884, "step": 91885 }, { "epoch": 0.09255258394848721, "grad_norm": 11.249453368327826, "learning_rate": 4.977760678317645e-05, "loss": 2.6986, "mean_token_accuracy": 0.39655172228813174, "step": 91890 }, { "epoch": 0.09255762000159139, "grad_norm": 9.249909664423493, "learning_rate": 4.97775542257922e-05, "loss": 2.4171, "mean_token_accuracy": 0.44694494605064394, "step": 91895 }, { "epoch": 0.09256265605469556, "grad_norm": 8.855090959371763, "learning_rate": 4.9777501662229195e-05, "loss": 2.5927, "mean_token_accuracy": 0.4, "step": 91900 }, { "epoch": 0.09256769210779973, "grad_norm": 9.9698017791453, "learning_rate": 4.977744909248744e-05, "loss": 2.0786, "mean_token_accuracy": 0.4896551609039307, "step": 91905 }, { "epoch": 0.09257272816090391, "grad_norm": 10.428980185046118, "learning_rate": 4.977739651656696e-05, "loss": 2.6379, "mean_token_accuracy": 0.39655172228813174, "step": 91910 }, { "epoch": 0.09257776421400808, "grad_norm": 13.786325710956842, "learning_rate": 4.9777343934467747e-05, "loss": 2.6835, "mean_token_accuracy": 0.39655172228813174, "step": 91915 }, { "epoch": 0.09258280026711226, "grad_norm": 13.878426982845411, "learning_rate": 4.977729134618984e-05, "loss": 2.2496, "mean_token_accuracy": 0.42758620381355283, "step": 91920 }, { "epoch": 0.09258783632021643, "grad_norm": 10.888687605112107, "learning_rate": 4.9777238751733235e-05, "loss": 2.3617, "mean_token_accuracy": 0.44482759237289426, "step": 91925 }, { "epoch": 0.0925928723733206, "grad_norm": 10.68763075756421, "learning_rate": 4.977718615109796e-05, "loss": 2.4413, "mean_token_accuracy": 0.3862069010734558, "step": 91930 }, { "epoch": 0.09259790842642478, "grad_norm": 10.353540514424385, "learning_rate": 4.9777133544284024e-05, "loss": 2.3548, "mean_token_accuracy": 0.44137929677963256, "step": 91935 }, { "epoch": 0.09260294447952895, "grad_norm": 10.65611579966718, "learning_rate": 4.9777080931291435e-05, "loss": 2.5851, "mean_token_accuracy": 0.43103448748588563, "step": 91940 }, { "epoch": 0.09260798053263312, "grad_norm": 10.4293616869514, "learning_rate": 4.977702831212023e-05, "loss": 2.1197, "mean_token_accuracy": 0.4793103337287903, "step": 91945 }, { "epoch": 0.0926130165857373, "grad_norm": 10.48002616586928, "learning_rate": 4.9776975686770397e-05, "loss": 2.3118, "mean_token_accuracy": 0.46430732011795045, "step": 91950 }, { "epoch": 0.09261805263884147, "grad_norm": 12.153737526317196, "learning_rate": 4.977692305524196e-05, "loss": 2.7396, "mean_token_accuracy": 0.3793103456497192, "step": 91955 }, { "epoch": 0.09262308869194563, "grad_norm": 10.435647939083914, "learning_rate": 4.977687041753494e-05, "loss": 1.8212, "mean_token_accuracy": 0.482758617401123, "step": 91960 }, { "epoch": 0.0926281247450498, "grad_norm": 11.182411693703138, "learning_rate": 4.9776817773649354e-05, "loss": 2.3089, "mean_token_accuracy": 0.45172414779663084, "step": 91965 }, { "epoch": 0.09263316079815398, "grad_norm": 10.100994992939976, "learning_rate": 4.9776765123585195e-05, "loss": 2.2155, "mean_token_accuracy": 0.47586206793785096, "step": 91970 }, { "epoch": 0.09263819685125815, "grad_norm": 11.24097245390771, "learning_rate": 4.9776712467342494e-05, "loss": 2.5727, "mean_token_accuracy": 0.37241377830505373, "step": 91975 }, { "epoch": 0.09264323290436233, "grad_norm": 9.800373781442817, "learning_rate": 4.977665980492127e-05, "loss": 2.6779, "mean_token_accuracy": 0.35172412991523744, "step": 91980 }, { "epoch": 0.0926482689574665, "grad_norm": 8.997495749366735, "learning_rate": 4.977660713632153e-05, "loss": 2.972, "mean_token_accuracy": 0.4068965554237366, "step": 91985 }, { "epoch": 0.09265330501057067, "grad_norm": 12.638631931501637, "learning_rate": 4.977655446154329e-05, "loss": 2.6116, "mean_token_accuracy": 0.4103448331356049, "step": 91990 }, { "epoch": 0.09265834106367485, "grad_norm": 11.09903312623628, "learning_rate": 4.977650178058656e-05, "loss": 2.4968, "mean_token_accuracy": 0.4379310250282288, "step": 91995 }, { "epoch": 0.09266337711677902, "grad_norm": 10.132028741577532, "learning_rate": 4.977644909345137e-05, "loss": 2.5507, "mean_token_accuracy": 0.42758620381355283, "step": 92000 }, { "epoch": 0.0926684131698832, "grad_norm": 13.763401665158721, "learning_rate": 4.977639640013772e-05, "loss": 2.3868, "mean_token_accuracy": 0.4310344845056534, "step": 92005 }, { "epoch": 0.09267344922298737, "grad_norm": 10.427830041259291, "learning_rate": 4.9776343700645614e-05, "loss": 2.4399, "mean_token_accuracy": 0.43103447556495667, "step": 92010 }, { "epoch": 0.09267848527609154, "grad_norm": 12.422102661140848, "learning_rate": 4.97762909949751e-05, "loss": 3.0089, "mean_token_accuracy": 0.3275862067937851, "step": 92015 }, { "epoch": 0.09268352132919572, "grad_norm": 10.961419462194126, "learning_rate": 4.977623828312617e-05, "loss": 2.4861, "mean_token_accuracy": 0.37931033968925476, "step": 92020 }, { "epoch": 0.09268855738229989, "grad_norm": 10.1723811928474, "learning_rate": 4.977618556509883e-05, "loss": 2.4913, "mean_token_accuracy": 0.4172413766384125, "step": 92025 }, { "epoch": 0.09269359343540405, "grad_norm": 9.29291224249899, "learning_rate": 4.977613284089312e-05, "loss": 2.8625, "mean_token_accuracy": 0.3827586144208908, "step": 92030 }, { "epoch": 0.09269862948850822, "grad_norm": 9.704491513823605, "learning_rate": 4.977608011050903e-05, "loss": 2.2936, "mean_token_accuracy": 0.3827586233615875, "step": 92035 }, { "epoch": 0.0927036655416124, "grad_norm": 9.715265628073409, "learning_rate": 4.9776027373946594e-05, "loss": 2.1545, "mean_token_accuracy": 0.43103448748588563, "step": 92040 }, { "epoch": 0.09270870159471657, "grad_norm": 11.498606343050595, "learning_rate": 4.9775974631205816e-05, "loss": 2.4863, "mean_token_accuracy": 0.4068965554237366, "step": 92045 }, { "epoch": 0.09271373764782075, "grad_norm": 13.063819258469659, "learning_rate": 4.977592188228671e-05, "loss": 2.2778, "mean_token_accuracy": 0.4068965494632721, "step": 92050 }, { "epoch": 0.09271877370092492, "grad_norm": 10.182690010366171, "learning_rate": 4.977586912718931e-05, "loss": 2.0258, "mean_token_accuracy": 0.49655172824859617, "step": 92055 }, { "epoch": 0.0927238097540291, "grad_norm": 9.723184051776537, "learning_rate": 4.9775816365913594e-05, "loss": 2.3052, "mean_token_accuracy": 0.46551724076271056, "step": 92060 }, { "epoch": 0.09272884580713327, "grad_norm": 8.779780888062787, "learning_rate": 4.9775763598459613e-05, "loss": 2.0233, "mean_token_accuracy": 0.4896551549434662, "step": 92065 }, { "epoch": 0.09273388186023744, "grad_norm": 10.213109816403085, "learning_rate": 4.9775710824827354e-05, "loss": 2.35, "mean_token_accuracy": 0.4434361755847931, "step": 92070 }, { "epoch": 0.09273891791334161, "grad_norm": 8.541281858234596, "learning_rate": 4.977565804501685e-05, "loss": 2.0914, "mean_token_accuracy": 0.4758620738983154, "step": 92075 }, { "epoch": 0.09274395396644579, "grad_norm": 11.550270211639331, "learning_rate": 4.97756052590281e-05, "loss": 2.9165, "mean_token_accuracy": 0.3068965494632721, "step": 92080 }, { "epoch": 0.09274899001954996, "grad_norm": 12.056004785210744, "learning_rate": 4.977555246686114e-05, "loss": 2.7581, "mean_token_accuracy": 0.40000000298023225, "step": 92085 }, { "epoch": 0.09275402607265414, "grad_norm": 11.424738048819354, "learning_rate": 4.9775499668515965e-05, "loss": 2.5809, "mean_token_accuracy": 0.37586206793785093, "step": 92090 }, { "epoch": 0.09275906212575831, "grad_norm": 11.131140235142094, "learning_rate": 4.977544686399261e-05, "loss": 2.2902, "mean_token_accuracy": 0.4068965494632721, "step": 92095 }, { "epoch": 0.09276409817886247, "grad_norm": 8.079514965820419, "learning_rate": 4.977539405329106e-05, "loss": 1.9756, "mean_token_accuracy": 0.46896551847457885, "step": 92100 }, { "epoch": 0.09276913423196664, "grad_norm": 11.705788493417108, "learning_rate": 4.977534123641136e-05, "loss": 2.5874, "mean_token_accuracy": 0.3896551728248596, "step": 92105 }, { "epoch": 0.09277417028507082, "grad_norm": 11.018933014607764, "learning_rate": 4.97752884133535e-05, "loss": 2.6168, "mean_token_accuracy": 0.4068965494632721, "step": 92110 }, { "epoch": 0.09277920633817499, "grad_norm": 11.789714327346651, "learning_rate": 4.9775235584117505e-05, "loss": 2.4127, "mean_token_accuracy": 0.42413792610168455, "step": 92115 }, { "epoch": 0.09278424239127916, "grad_norm": 9.193109234180799, "learning_rate": 4.9775182748703394e-05, "loss": 2.3977, "mean_token_accuracy": 0.4172413766384125, "step": 92120 }, { "epoch": 0.09278927844438334, "grad_norm": 11.21024811940659, "learning_rate": 4.977512990711118e-05, "loss": 2.3065, "mean_token_accuracy": 0.42758620977401735, "step": 92125 }, { "epoch": 0.09279431449748751, "grad_norm": 10.58554497592227, "learning_rate": 4.977507705934087e-05, "loss": 2.4399, "mean_token_accuracy": 0.4152450144290924, "step": 92130 }, { "epoch": 0.09279935055059169, "grad_norm": 12.252767802961273, "learning_rate": 4.9775024205392496e-05, "loss": 2.9294, "mean_token_accuracy": 0.3965517282485962, "step": 92135 }, { "epoch": 0.09280438660369586, "grad_norm": 13.29023453607371, "learning_rate": 4.977497134526605e-05, "loss": 2.4149, "mean_token_accuracy": 0.41034482717514037, "step": 92140 }, { "epoch": 0.09280942265680003, "grad_norm": 10.185945496245392, "learning_rate": 4.977491847896156e-05, "loss": 2.4953, "mean_token_accuracy": 0.4034482777118683, "step": 92145 }, { "epoch": 0.09281445870990421, "grad_norm": 10.617728271658123, "learning_rate": 4.977486560647904e-05, "loss": 2.4225, "mean_token_accuracy": 0.37586207389831544, "step": 92150 }, { "epoch": 0.09281949476300838, "grad_norm": 12.479448377256997, "learning_rate": 4.9774812727818504e-05, "loss": 2.3188, "mean_token_accuracy": 0.4448275864124298, "step": 92155 }, { "epoch": 0.09282453081611255, "grad_norm": 8.531249062996755, "learning_rate": 4.977475984297997e-05, "loss": 2.1227, "mean_token_accuracy": 0.4517241299152374, "step": 92160 }, { "epoch": 0.09282956686921673, "grad_norm": 11.376709565160194, "learning_rate": 4.977470695196344e-05, "loss": 2.3685, "mean_token_accuracy": 0.4068965554237366, "step": 92165 }, { "epoch": 0.09283460292232089, "grad_norm": 15.471327162921611, "learning_rate": 4.977465405476894e-05, "loss": 2.6729, "mean_token_accuracy": 0.43793103098869324, "step": 92170 }, { "epoch": 0.09283963897542506, "grad_norm": 11.84291796918407, "learning_rate": 4.977460115139648e-05, "loss": 2.4258, "mean_token_accuracy": 0.44137930274009707, "step": 92175 }, { "epoch": 0.09284467502852924, "grad_norm": 10.573241099918995, "learning_rate": 4.977454824184607e-05, "loss": 2.1563, "mean_token_accuracy": 0.4379310369491577, "step": 92180 }, { "epoch": 0.09284971108163341, "grad_norm": 13.910168303833267, "learning_rate": 4.977449532611774e-05, "loss": 2.3143, "mean_token_accuracy": 0.43793103098869324, "step": 92185 }, { "epoch": 0.09285474713473758, "grad_norm": 12.128865982554586, "learning_rate": 4.97744424042115e-05, "loss": 2.6239, "mean_token_accuracy": 0.382758629322052, "step": 92190 }, { "epoch": 0.09285978318784176, "grad_norm": 18.301792247461226, "learning_rate": 4.977438947612735e-05, "loss": 2.4265, "mean_token_accuracy": 0.42068966031074523, "step": 92195 }, { "epoch": 0.09286481924094593, "grad_norm": 15.053752274233478, "learning_rate": 4.977433654186532e-05, "loss": 2.5898, "mean_token_accuracy": 0.4517241418361664, "step": 92200 }, { "epoch": 0.0928698552940501, "grad_norm": 10.879343736393263, "learning_rate": 4.977428360142541e-05, "loss": 2.2888, "mean_token_accuracy": 0.42758620381355283, "step": 92205 }, { "epoch": 0.09287489134715428, "grad_norm": 11.39817981346126, "learning_rate": 4.977423065480766e-05, "loss": 2.8916, "mean_token_accuracy": 0.3931034505367279, "step": 92210 }, { "epoch": 0.09287992740025845, "grad_norm": 12.344893920747747, "learning_rate": 4.977417770201206e-05, "loss": 2.4541, "mean_token_accuracy": 0.4310344815254211, "step": 92215 }, { "epoch": 0.09288496345336263, "grad_norm": 10.12584355401762, "learning_rate": 4.977412474303864e-05, "loss": 2.4165, "mean_token_accuracy": 0.42413792610168455, "step": 92220 }, { "epoch": 0.0928899995064668, "grad_norm": 11.231283778892562, "learning_rate": 4.977407177788741e-05, "loss": 2.3637, "mean_token_accuracy": 0.45172414779663084, "step": 92225 }, { "epoch": 0.09289503555957097, "grad_norm": 13.572449450162914, "learning_rate": 4.977401880655837e-05, "loss": 2.5735, "mean_token_accuracy": 0.3862068891525269, "step": 92230 }, { "epoch": 0.09290007161267515, "grad_norm": 13.088077005900482, "learning_rate": 4.9773965829051555e-05, "loss": 2.195, "mean_token_accuracy": 0.5084694564342499, "step": 92235 }, { "epoch": 0.09290510766577931, "grad_norm": 9.53956793636164, "learning_rate": 4.977391284536697e-05, "loss": 2.5247, "mean_token_accuracy": 0.37586205899715425, "step": 92240 }, { "epoch": 0.09291014371888348, "grad_norm": 12.331755354295211, "learning_rate": 4.9773859855504634e-05, "loss": 2.6305, "mean_token_accuracy": 0.43448275327682495, "step": 92245 }, { "epoch": 0.09291517977198765, "grad_norm": 8.445318060540886, "learning_rate": 4.977380685946457e-05, "loss": 2.503, "mean_token_accuracy": 0.42758620977401735, "step": 92250 }, { "epoch": 0.09292021582509183, "grad_norm": 13.125980582437558, "learning_rate": 4.977375385724676e-05, "loss": 2.2676, "mean_token_accuracy": 0.4517241299152374, "step": 92255 }, { "epoch": 0.092925251878196, "grad_norm": 11.168617743320192, "learning_rate": 4.977370084885126e-05, "loss": 2.5238, "mean_token_accuracy": 0.36896551251411436, "step": 92260 }, { "epoch": 0.09293028793130018, "grad_norm": 12.623181626605657, "learning_rate": 4.977364783427806e-05, "loss": 2.4592, "mean_token_accuracy": 0.4172413766384125, "step": 92265 }, { "epoch": 0.09293532398440435, "grad_norm": 10.46315237037504, "learning_rate": 4.977359481352718e-05, "loss": 1.9819, "mean_token_accuracy": 0.49655172824859617, "step": 92270 }, { "epoch": 0.09294036003750852, "grad_norm": 9.68745395691892, "learning_rate": 4.9773541786598636e-05, "loss": 2.6187, "mean_token_accuracy": 0.358620685338974, "step": 92275 }, { "epoch": 0.0929453960906127, "grad_norm": 15.587786286686514, "learning_rate": 4.9773488753492444e-05, "loss": 2.1654, "mean_token_accuracy": 0.48275862336158754, "step": 92280 }, { "epoch": 0.09295043214371687, "grad_norm": 10.624603633727313, "learning_rate": 4.9773435714208616e-05, "loss": 2.172, "mean_token_accuracy": 0.4620689690113068, "step": 92285 }, { "epoch": 0.09295546819682105, "grad_norm": 11.818433108672334, "learning_rate": 4.9773382668747166e-05, "loss": 2.3845, "mean_token_accuracy": 0.4393224537372589, "step": 92290 }, { "epoch": 0.09296050424992522, "grad_norm": 12.2907485413273, "learning_rate": 4.977332961710811e-05, "loss": 2.4794, "mean_token_accuracy": 0.39655172228813174, "step": 92295 }, { "epoch": 0.09296554030302939, "grad_norm": 10.636346049314323, "learning_rate": 4.9773276559291465e-05, "loss": 2.6095, "mean_token_accuracy": 0.4896551787853241, "step": 92300 }, { "epoch": 0.09297057635613357, "grad_norm": 10.924448326483795, "learning_rate": 4.977322349529725e-05, "loss": 2.5351, "mean_token_accuracy": 0.42915910482406616, "step": 92305 }, { "epoch": 0.09297561240923773, "grad_norm": 10.208026338177653, "learning_rate": 4.977317042512546e-05, "loss": 2.8844, "mean_token_accuracy": 0.3896551728248596, "step": 92310 }, { "epoch": 0.0929806484623419, "grad_norm": 10.245016768810828, "learning_rate": 4.9773117348776136e-05, "loss": 2.657, "mean_token_accuracy": 0.3931034505367279, "step": 92315 }, { "epoch": 0.09298568451544607, "grad_norm": 13.039856796282708, "learning_rate": 4.977306426624926e-05, "loss": 2.6561, "mean_token_accuracy": 0.3482758581638336, "step": 92320 }, { "epoch": 0.09299072056855025, "grad_norm": 14.861437747977563, "learning_rate": 4.9773011177544885e-05, "loss": 2.2725, "mean_token_accuracy": 0.41379311084747317, "step": 92325 }, { "epoch": 0.09299575662165442, "grad_norm": 11.474356884105152, "learning_rate": 4.9772958082663e-05, "loss": 2.2247, "mean_token_accuracy": 0.4689655125141144, "step": 92330 }, { "epoch": 0.0930007926747586, "grad_norm": 10.888752820123793, "learning_rate": 4.9772904981603636e-05, "loss": 2.2366, "mean_token_accuracy": 0.3931034505367279, "step": 92335 }, { "epoch": 0.09300582872786277, "grad_norm": 9.103565766910908, "learning_rate": 4.977285187436678e-05, "loss": 2.2712, "mean_token_accuracy": 0.44482759237289426, "step": 92340 }, { "epoch": 0.09301086478096694, "grad_norm": 13.376822153954766, "learning_rate": 4.9772798760952485e-05, "loss": 2.617, "mean_token_accuracy": 0.42068964838981626, "step": 92345 }, { "epoch": 0.09301590083407112, "grad_norm": 9.158230367936357, "learning_rate": 4.977274564136073e-05, "loss": 2.2565, "mean_token_accuracy": 0.4413793087005615, "step": 92350 }, { "epoch": 0.09302093688717529, "grad_norm": 10.562088824944562, "learning_rate": 4.9772692515591554e-05, "loss": 2.376, "mean_token_accuracy": 0.43793103098869324, "step": 92355 }, { "epoch": 0.09302597294027946, "grad_norm": 13.52528386028242, "learning_rate": 4.977263938364497e-05, "loss": 2.914, "mean_token_accuracy": 0.3896551728248596, "step": 92360 }, { "epoch": 0.09303100899338364, "grad_norm": 9.776135298229528, "learning_rate": 4.977258624552098e-05, "loss": 2.0694, "mean_token_accuracy": 0.47931034564971925, "step": 92365 }, { "epoch": 0.09303604504648781, "grad_norm": 9.673529790468157, "learning_rate": 4.9772533101219606e-05, "loss": 2.1271, "mean_token_accuracy": 0.4448275864124298, "step": 92370 }, { "epoch": 0.09304108109959199, "grad_norm": 11.27846004579561, "learning_rate": 4.977247995074086e-05, "loss": 2.3891, "mean_token_accuracy": 0.41724138259887694, "step": 92375 }, { "epoch": 0.09304611715269615, "grad_norm": 13.089590304339058, "learning_rate": 4.977242679408476e-05, "loss": 2.3708, "mean_token_accuracy": 0.4344827592372894, "step": 92380 }, { "epoch": 0.09305115320580032, "grad_norm": 11.624101259178094, "learning_rate": 4.9772373631251315e-05, "loss": 2.6765, "mean_token_accuracy": 0.41379310488700866, "step": 92385 }, { "epoch": 0.09305618925890449, "grad_norm": 12.758754334082468, "learning_rate": 4.977232046224055e-05, "loss": 2.6919, "mean_token_accuracy": 0.35172412991523744, "step": 92390 }, { "epoch": 0.09306122531200867, "grad_norm": 13.648556543919906, "learning_rate": 4.9772267287052474e-05, "loss": 2.6681, "mean_token_accuracy": 0.38620689809322356, "step": 92395 }, { "epoch": 0.09306626136511284, "grad_norm": 12.107248006856352, "learning_rate": 4.9772214105687095e-05, "loss": 2.7961, "mean_token_accuracy": 0.39310344457626345, "step": 92400 }, { "epoch": 0.09307129741821701, "grad_norm": 10.430876193386759, "learning_rate": 4.977216091814444e-05, "loss": 2.4226, "mean_token_accuracy": 0.42401694059371947, "step": 92405 }, { "epoch": 0.09307633347132119, "grad_norm": 9.508432308920876, "learning_rate": 4.977210772442452e-05, "loss": 2.7651, "mean_token_accuracy": 0.4206896543502808, "step": 92410 }, { "epoch": 0.09308136952442536, "grad_norm": 10.718863890587022, "learning_rate": 4.9772054524527346e-05, "loss": 2.7359, "mean_token_accuracy": 0.38965516686439516, "step": 92415 }, { "epoch": 0.09308640557752954, "grad_norm": 9.752604468726473, "learning_rate": 4.977200131845293e-05, "loss": 2.5699, "mean_token_accuracy": 0.43448275327682495, "step": 92420 }, { "epoch": 0.09309144163063371, "grad_norm": 12.063498928439017, "learning_rate": 4.97719481062013e-05, "loss": 2.1773, "mean_token_accuracy": 0.42758620381355283, "step": 92425 }, { "epoch": 0.09309647768373788, "grad_norm": 9.955827190444213, "learning_rate": 4.977189488777246e-05, "loss": 2.1902, "mean_token_accuracy": 0.5137930989265442, "step": 92430 }, { "epoch": 0.09310151373684206, "grad_norm": 11.283787100903222, "learning_rate": 4.977184166316642e-05, "loss": 2.6678, "mean_token_accuracy": 0.3965517282485962, "step": 92435 }, { "epoch": 0.09310654978994623, "grad_norm": 12.862959013953216, "learning_rate": 4.9771788432383205e-05, "loss": 2.5473, "mean_token_accuracy": 0.4517241418361664, "step": 92440 }, { "epoch": 0.0931115858430504, "grad_norm": 18.87527828649575, "learning_rate": 4.977173519542282e-05, "loss": 3.0212, "mean_token_accuracy": 0.3586206823587418, "step": 92445 }, { "epoch": 0.09311662189615456, "grad_norm": 12.948380552154804, "learning_rate": 4.97716819522853e-05, "loss": 2.5381, "mean_token_accuracy": 0.41379310488700866, "step": 92450 }, { "epoch": 0.09312165794925874, "grad_norm": 13.459223684172914, "learning_rate": 4.9771628702970633e-05, "loss": 2.5765, "mean_token_accuracy": 0.44482759237289426, "step": 92455 }, { "epoch": 0.09312669400236291, "grad_norm": 9.79990962662244, "learning_rate": 4.9771575447478854e-05, "loss": 2.4353, "mean_token_accuracy": 0.44827585816383364, "step": 92460 }, { "epoch": 0.09313173005546709, "grad_norm": 11.94439005235742, "learning_rate": 4.977152218580997e-05, "loss": 2.6181, "mean_token_accuracy": 0.41034482717514037, "step": 92465 }, { "epoch": 0.09313676610857126, "grad_norm": 11.116290695884967, "learning_rate": 4.977146891796399e-05, "loss": 2.7058, "mean_token_accuracy": 0.4068965554237366, "step": 92470 }, { "epoch": 0.09314180216167543, "grad_norm": 11.675002101422656, "learning_rate": 4.9771415643940945e-05, "loss": 2.6381, "mean_token_accuracy": 0.36206896007061007, "step": 92475 }, { "epoch": 0.0931468382147796, "grad_norm": 11.550484897198121, "learning_rate": 4.9771362363740835e-05, "loss": 2.3384, "mean_token_accuracy": 0.4310344815254211, "step": 92480 }, { "epoch": 0.09315187426788378, "grad_norm": 12.021461095014665, "learning_rate": 4.977130907736367e-05, "loss": 2.3555, "mean_token_accuracy": 0.4310344815254211, "step": 92485 }, { "epoch": 0.09315691032098795, "grad_norm": 9.123185054846829, "learning_rate": 4.977125578480949e-05, "loss": 2.4685, "mean_token_accuracy": 0.3999999940395355, "step": 92490 }, { "epoch": 0.09316194637409213, "grad_norm": 11.88124269712118, "learning_rate": 4.977120248607829e-05, "loss": 2.9889, "mean_token_accuracy": 0.4, "step": 92495 }, { "epoch": 0.0931669824271963, "grad_norm": 9.808831903113491, "learning_rate": 4.977114918117009e-05, "loss": 2.2065, "mean_token_accuracy": 0.42758620381355283, "step": 92500 }, { "epoch": 0.09317201848030048, "grad_norm": 12.290037315707783, "learning_rate": 4.9771095870084896e-05, "loss": 2.285, "mean_token_accuracy": 0.3896551728248596, "step": 92505 }, { "epoch": 0.09317705453340465, "grad_norm": 13.617816476940597, "learning_rate": 4.977104255282274e-05, "loss": 2.7779, "mean_token_accuracy": 0.3999999940395355, "step": 92510 }, { "epoch": 0.09318209058650882, "grad_norm": 10.05176596408957, "learning_rate": 4.977098922938362e-05, "loss": 2.4282, "mean_token_accuracy": 0.412099215388298, "step": 92515 }, { "epoch": 0.09318712663961298, "grad_norm": 11.908948292467882, "learning_rate": 4.9770935899767556e-05, "loss": 2.91, "mean_token_accuracy": 0.3620689570903778, "step": 92520 }, { "epoch": 0.09319216269271716, "grad_norm": 11.298092776541889, "learning_rate": 4.9770882563974566e-05, "loss": 2.5022, "mean_token_accuracy": 0.3965517282485962, "step": 92525 }, { "epoch": 0.09319719874582133, "grad_norm": 12.556377468936898, "learning_rate": 4.977082922200468e-05, "loss": 2.1543, "mean_token_accuracy": 0.5059286117553711, "step": 92530 }, { "epoch": 0.0932022347989255, "grad_norm": 12.00185048374263, "learning_rate": 4.977077587385788e-05, "loss": 2.5706, "mean_token_accuracy": 0.37931033968925476, "step": 92535 }, { "epoch": 0.09320727085202968, "grad_norm": 10.3115550833988, "learning_rate": 4.97707225195342e-05, "loss": 2.5789, "mean_token_accuracy": 0.38965517580509185, "step": 92540 }, { "epoch": 0.09321230690513385, "grad_norm": 12.468289418287181, "learning_rate": 4.977066915903365e-05, "loss": 2.3518, "mean_token_accuracy": 0.4620689690113068, "step": 92545 }, { "epoch": 0.09321734295823803, "grad_norm": 12.026601684565376, "learning_rate": 4.9770615792356254e-05, "loss": 2.3714, "mean_token_accuracy": 0.42413793206214906, "step": 92550 }, { "epoch": 0.0932223790113422, "grad_norm": 11.397426746458457, "learning_rate": 4.977056241950201e-05, "loss": 2.6073, "mean_token_accuracy": 0.4137930989265442, "step": 92555 }, { "epoch": 0.09322741506444637, "grad_norm": 12.522030582816551, "learning_rate": 4.977050904047095e-05, "loss": 2.9241, "mean_token_accuracy": 0.36896551251411436, "step": 92560 }, { "epoch": 0.09323245111755055, "grad_norm": 8.975437569726779, "learning_rate": 4.977045565526308e-05, "loss": 2.6703, "mean_token_accuracy": 0.3827586233615875, "step": 92565 }, { "epoch": 0.09323748717065472, "grad_norm": 11.967474950934282, "learning_rate": 4.9770402263878416e-05, "loss": 2.3723, "mean_token_accuracy": 0.415426504611969, "step": 92570 }, { "epoch": 0.0932425232237589, "grad_norm": 9.45573059764412, "learning_rate": 4.9770348866316965e-05, "loss": 2.5565, "mean_token_accuracy": 0.41034482717514037, "step": 92575 }, { "epoch": 0.09324755927686307, "grad_norm": 11.261449647949393, "learning_rate": 4.977029546257876e-05, "loss": 2.4819, "mean_token_accuracy": 0.46896551847457885, "step": 92580 }, { "epoch": 0.09325259532996724, "grad_norm": 10.71339535193657, "learning_rate": 4.9770242052663794e-05, "loss": 2.8431, "mean_token_accuracy": 0.3909255862236023, "step": 92585 }, { "epoch": 0.0932576313830714, "grad_norm": 9.737008574500482, "learning_rate": 4.977018863657211e-05, "loss": 2.4798, "mean_token_accuracy": 0.44482759237289426, "step": 92590 }, { "epoch": 0.09326266743617558, "grad_norm": 11.497102399346923, "learning_rate": 4.9770135214303695e-05, "loss": 2.4002, "mean_token_accuracy": 0.41034482717514037, "step": 92595 }, { "epoch": 0.09326770348927975, "grad_norm": 12.329289841573527, "learning_rate": 4.9770081785858576e-05, "loss": 2.4525, "mean_token_accuracy": 0.4551724135875702, "step": 92600 }, { "epoch": 0.09327273954238392, "grad_norm": 11.056950583508407, "learning_rate": 4.977002835123677e-05, "loss": 2.4617, "mean_token_accuracy": 0.4482758641242981, "step": 92605 }, { "epoch": 0.0932777755954881, "grad_norm": 12.087564256325242, "learning_rate": 4.976997491043829e-05, "loss": 2.5137, "mean_token_accuracy": 0.4172413766384125, "step": 92610 }, { "epoch": 0.09328281164859227, "grad_norm": 9.06300492334387, "learning_rate": 4.976992146346314e-05, "loss": 2.4869, "mean_token_accuracy": 0.42928009629249575, "step": 92615 }, { "epoch": 0.09328784770169644, "grad_norm": 8.153345723630702, "learning_rate": 4.9769868010311344e-05, "loss": 2.4661, "mean_token_accuracy": 0.41034482717514037, "step": 92620 }, { "epoch": 0.09329288375480062, "grad_norm": 10.361493382764031, "learning_rate": 4.976981455098292e-05, "loss": 2.4599, "mean_token_accuracy": 0.37241379618644715, "step": 92625 }, { "epoch": 0.09329791980790479, "grad_norm": 11.815837297641073, "learning_rate": 4.976976108547789e-05, "loss": 2.7318, "mean_token_accuracy": 0.42758620977401735, "step": 92630 }, { "epoch": 0.09330295586100897, "grad_norm": 10.588833025503037, "learning_rate": 4.976970761379625e-05, "loss": 2.1989, "mean_token_accuracy": 0.42413792610168455, "step": 92635 }, { "epoch": 0.09330799191411314, "grad_norm": 9.68139490931494, "learning_rate": 4.976965413593803e-05, "loss": 2.6461, "mean_token_accuracy": 0.4369026005268097, "step": 92640 }, { "epoch": 0.09331302796721731, "grad_norm": 10.080561392583725, "learning_rate": 4.976960065190323e-05, "loss": 2.4649, "mean_token_accuracy": 0.46896551847457885, "step": 92645 }, { "epoch": 0.09331806402032149, "grad_norm": 10.266502420945113, "learning_rate": 4.976954716169187e-05, "loss": 2.3872, "mean_token_accuracy": 0.4620689570903778, "step": 92650 }, { "epoch": 0.09332310007342566, "grad_norm": 10.865245518004352, "learning_rate": 4.9769493665303986e-05, "loss": 2.7558, "mean_token_accuracy": 0.38620689511299133, "step": 92655 }, { "epoch": 0.09332813612652982, "grad_norm": 12.255538383261452, "learning_rate": 4.976944016273956e-05, "loss": 3.0601, "mean_token_accuracy": 0.36896551251411436, "step": 92660 }, { "epoch": 0.093333172179634, "grad_norm": 13.479489374202345, "learning_rate": 4.976938665399863e-05, "loss": 2.5504, "mean_token_accuracy": 0.38965516686439516, "step": 92665 }, { "epoch": 0.09333820823273817, "grad_norm": 10.751500446597479, "learning_rate": 4.9769333139081185e-05, "loss": 2.0808, "mean_token_accuracy": 0.48965516686439514, "step": 92670 }, { "epoch": 0.09334324428584234, "grad_norm": 10.649360807449133, "learning_rate": 4.976927961798728e-05, "loss": 2.1683, "mean_token_accuracy": 0.4896551728248596, "step": 92675 }, { "epoch": 0.09334828033894652, "grad_norm": 11.226245054681119, "learning_rate": 4.97692260907169e-05, "loss": 2.5563, "mean_token_accuracy": 0.39655172228813174, "step": 92680 }, { "epoch": 0.09335331639205069, "grad_norm": 15.789296700315713, "learning_rate": 4.976917255727006e-05, "loss": 2.6985, "mean_token_accuracy": 0.4068965494632721, "step": 92685 }, { "epoch": 0.09335835244515486, "grad_norm": 9.815690936516095, "learning_rate": 4.976911901764679e-05, "loss": 2.4069, "mean_token_accuracy": 0.43103447556495667, "step": 92690 }, { "epoch": 0.09336338849825904, "grad_norm": 30.93793282628475, "learning_rate": 4.9769065471847084e-05, "loss": 3.825, "mean_token_accuracy": 0.2655172377824783, "step": 92695 }, { "epoch": 0.09336842455136321, "grad_norm": 11.34813796181201, "learning_rate": 4.976901191987098e-05, "loss": 2.4703, "mean_token_accuracy": 0.4, "step": 92700 }, { "epoch": 0.09337346060446738, "grad_norm": 11.939390201434765, "learning_rate": 4.976895836171849e-05, "loss": 2.4997, "mean_token_accuracy": 0.4137930989265442, "step": 92705 }, { "epoch": 0.09337849665757156, "grad_norm": 19.794229274360738, "learning_rate": 4.97689047973896e-05, "loss": 2.5721, "mean_token_accuracy": 0.38620689511299133, "step": 92710 }, { "epoch": 0.09338353271067573, "grad_norm": 9.290166109030958, "learning_rate": 4.976885122688436e-05, "loss": 2.6266, "mean_token_accuracy": 0.4137930929660797, "step": 92715 }, { "epoch": 0.0933885687637799, "grad_norm": 9.06630074574957, "learning_rate": 4.976879765020277e-05, "loss": 2.2635, "mean_token_accuracy": 0.4448275864124298, "step": 92720 }, { "epoch": 0.09339360481688408, "grad_norm": 11.956370372052069, "learning_rate": 4.976874406734485e-05, "loss": 2.5353, "mean_token_accuracy": 0.493103438615799, "step": 92725 }, { "epoch": 0.09339864086998824, "grad_norm": 9.328049034772038, "learning_rate": 4.9768690478310604e-05, "loss": 2.2657, "mean_token_accuracy": 0.4448275864124298, "step": 92730 }, { "epoch": 0.09340367692309241, "grad_norm": 9.81283040580341, "learning_rate": 4.976863688310006e-05, "loss": 2.3843, "mean_token_accuracy": 0.41379311084747317, "step": 92735 }, { "epoch": 0.09340871297619659, "grad_norm": 11.683928363714813, "learning_rate": 4.976858328171322e-05, "loss": 2.9107, "mean_token_accuracy": 0.37241379618644715, "step": 92740 }, { "epoch": 0.09341374902930076, "grad_norm": 9.134145965917087, "learning_rate": 4.97685296741501e-05, "loss": 2.4425, "mean_token_accuracy": 0.47453115582466127, "step": 92745 }, { "epoch": 0.09341878508240493, "grad_norm": 10.768601625409739, "learning_rate": 4.976847606041073e-05, "loss": 2.8465, "mean_token_accuracy": 0.34827585220336915, "step": 92750 }, { "epoch": 0.09342382113550911, "grad_norm": 14.582288336736537, "learning_rate": 4.9768422440495116e-05, "loss": 2.891, "mean_token_accuracy": 0.3655172437429428, "step": 92755 }, { "epoch": 0.09342885718861328, "grad_norm": 9.542700587414329, "learning_rate": 4.976836881440327e-05, "loss": 2.2164, "mean_token_accuracy": 0.41016334295272827, "step": 92760 }, { "epoch": 0.09343389324171746, "grad_norm": 15.739911789447238, "learning_rate": 4.976831518213521e-05, "loss": 2.6229, "mean_token_accuracy": 0.4586206912994385, "step": 92765 }, { "epoch": 0.09343892929482163, "grad_norm": 12.337118209961616, "learning_rate": 4.976826154369095e-05, "loss": 2.2329, "mean_token_accuracy": 0.4379310369491577, "step": 92770 }, { "epoch": 0.0934439653479258, "grad_norm": 11.795702055333203, "learning_rate": 4.97682078990705e-05, "loss": 2.9782, "mean_token_accuracy": 0.3655172437429428, "step": 92775 }, { "epoch": 0.09344900140102998, "grad_norm": 13.525298118143013, "learning_rate": 4.976815424827388e-05, "loss": 2.7375, "mean_token_accuracy": 0.34137930870056155, "step": 92780 }, { "epoch": 0.09345403745413415, "grad_norm": 9.915901863999816, "learning_rate": 4.9768100591301106e-05, "loss": 2.4255, "mean_token_accuracy": 0.382758629322052, "step": 92785 }, { "epoch": 0.09345907350723832, "grad_norm": 11.610746273242707, "learning_rate": 4.976804692815219e-05, "loss": 2.3981, "mean_token_accuracy": 0.37241379022598264, "step": 92790 }, { "epoch": 0.0934641095603425, "grad_norm": 9.929665599632138, "learning_rate": 4.976799325882715e-05, "loss": 2.5908, "mean_token_accuracy": 0.41724138259887694, "step": 92795 }, { "epoch": 0.09346914561344666, "grad_norm": 12.502597543460368, "learning_rate": 4.9767939583326e-05, "loss": 2.6876, "mean_token_accuracy": 0.35172413289546967, "step": 92800 }, { "epoch": 0.09347418166655083, "grad_norm": 12.0264402983314, "learning_rate": 4.9767885901648745e-05, "loss": 2.7972, "mean_token_accuracy": 0.3896551787853241, "step": 92805 }, { "epoch": 0.093479217719655, "grad_norm": 12.872803196102195, "learning_rate": 4.976783221379542e-05, "loss": 2.5473, "mean_token_accuracy": 0.3931034505367279, "step": 92810 }, { "epoch": 0.09348425377275918, "grad_norm": 11.218713767646067, "learning_rate": 4.976777851976603e-05, "loss": 2.4669, "mean_token_accuracy": 0.4551724135875702, "step": 92815 }, { "epoch": 0.09348928982586335, "grad_norm": 11.21067105207157, "learning_rate": 4.976772481956057e-05, "loss": 2.7913, "mean_token_accuracy": 0.337931028008461, "step": 92820 }, { "epoch": 0.09349432587896753, "grad_norm": 11.5304409438593, "learning_rate": 4.9767671113179094e-05, "loss": 2.6654, "mean_token_accuracy": 0.3655172407627106, "step": 92825 }, { "epoch": 0.0934993619320717, "grad_norm": 10.100310297208768, "learning_rate": 4.9767617400621594e-05, "loss": 2.4878, "mean_token_accuracy": 0.4172413766384125, "step": 92830 }, { "epoch": 0.09350439798517587, "grad_norm": 15.031401884169995, "learning_rate": 4.976756368188808e-05, "loss": 2.9683, "mean_token_accuracy": 0.358620685338974, "step": 92835 }, { "epoch": 0.09350943403828005, "grad_norm": 11.074894011660747, "learning_rate": 4.9767509956978575e-05, "loss": 2.4594, "mean_token_accuracy": 0.44482758045196535, "step": 92840 }, { "epoch": 0.09351447009138422, "grad_norm": 10.790180562306132, "learning_rate": 4.9767456225893096e-05, "loss": 2.6397, "mean_token_accuracy": 0.4103448212146759, "step": 92845 }, { "epoch": 0.0935195061444884, "grad_norm": 9.971107304948978, "learning_rate": 4.976740248863165e-05, "loss": 2.2712, "mean_token_accuracy": 0.4586206912994385, "step": 92850 }, { "epoch": 0.09352454219759257, "grad_norm": 14.923968101643517, "learning_rate": 4.9767348745194256e-05, "loss": 2.6099, "mean_token_accuracy": 0.40344826579093934, "step": 92855 }, { "epoch": 0.09352957825069674, "grad_norm": 10.257936206929745, "learning_rate": 4.9767294995580934e-05, "loss": 2.337, "mean_token_accuracy": 0.4086509466171265, "step": 92860 }, { "epoch": 0.09353461430380092, "grad_norm": 11.451341204745237, "learning_rate": 4.976724123979169e-05, "loss": 2.3821, "mean_token_accuracy": 0.42758620977401735, "step": 92865 }, { "epoch": 0.09353965035690508, "grad_norm": 10.386684890816936, "learning_rate": 4.976718747782656e-05, "loss": 2.2279, "mean_token_accuracy": 0.47126436829566953, "step": 92870 }, { "epoch": 0.09354468641000925, "grad_norm": 11.245326408676142, "learning_rate": 4.976713370968553e-05, "loss": 2.2516, "mean_token_accuracy": 0.4344827592372894, "step": 92875 }, { "epoch": 0.09354972246311342, "grad_norm": 11.85728772701635, "learning_rate": 4.976707993536863e-05, "loss": 2.4737, "mean_token_accuracy": 0.4448275864124298, "step": 92880 }, { "epoch": 0.0935547585162176, "grad_norm": 9.382579490391777, "learning_rate": 4.9767026154875865e-05, "loss": 2.3123, "mean_token_accuracy": 0.47586206197738645, "step": 92885 }, { "epoch": 0.09355979456932177, "grad_norm": 9.820522558071156, "learning_rate": 4.976697236820726e-05, "loss": 2.2382, "mean_token_accuracy": 0.4275861978530884, "step": 92890 }, { "epoch": 0.09356483062242595, "grad_norm": 11.095776261192862, "learning_rate": 4.976691857536283e-05, "loss": 2.2401, "mean_token_accuracy": 0.5085299432277679, "step": 92895 }, { "epoch": 0.09356986667553012, "grad_norm": 11.237277379479858, "learning_rate": 4.9766864776342594e-05, "loss": 2.1821, "mean_token_accuracy": 0.46896551847457885, "step": 92900 }, { "epoch": 0.0935749027286343, "grad_norm": 10.882568940158816, "learning_rate": 4.976681097114655e-05, "loss": 2.6394, "mean_token_accuracy": 0.42758620381355283, "step": 92905 }, { "epoch": 0.09357993878173847, "grad_norm": 12.376169467126376, "learning_rate": 4.976675715977472e-05, "loss": 2.3338, "mean_token_accuracy": 0.4689655125141144, "step": 92910 }, { "epoch": 0.09358497483484264, "grad_norm": 11.95347216128273, "learning_rate": 4.9766703342227136e-05, "loss": 2.2289, "mean_token_accuracy": 0.4914700508117676, "step": 92915 }, { "epoch": 0.09359001088794681, "grad_norm": 10.60752367373058, "learning_rate": 4.976664951850379e-05, "loss": 2.1172, "mean_token_accuracy": 0.46551724076271056, "step": 92920 }, { "epoch": 0.09359504694105099, "grad_norm": 12.280786195925531, "learning_rate": 4.97665956886047e-05, "loss": 2.2369, "mean_token_accuracy": 0.44482759237289426, "step": 92925 }, { "epoch": 0.09360008299415516, "grad_norm": 13.230850490859646, "learning_rate": 4.976654185252989e-05, "loss": 2.8799, "mean_token_accuracy": 0.3793103456497192, "step": 92930 }, { "epoch": 0.09360511904725934, "grad_norm": 8.986820056734857, "learning_rate": 4.9766488010279374e-05, "loss": 1.8635, "mean_token_accuracy": 0.5329703569412232, "step": 92935 }, { "epoch": 0.0936101551003635, "grad_norm": 11.128783391075146, "learning_rate": 4.976643416185317e-05, "loss": 1.8699, "mean_token_accuracy": 0.5448275864124298, "step": 92940 }, { "epoch": 0.09361519115346767, "grad_norm": 8.956521516257721, "learning_rate": 4.976638030725128e-05, "loss": 2.2377, "mean_token_accuracy": 0.46206897497177124, "step": 92945 }, { "epoch": 0.09362022720657184, "grad_norm": 11.081685447838057, "learning_rate": 4.976632644647373e-05, "loss": 2.0652, "mean_token_accuracy": 0.4862068951129913, "step": 92950 }, { "epoch": 0.09362526325967602, "grad_norm": 11.259181423267059, "learning_rate": 4.976627257952054e-05, "loss": 2.106, "mean_token_accuracy": 0.4896551728248596, "step": 92955 }, { "epoch": 0.09363029931278019, "grad_norm": 10.778768502039574, "learning_rate": 4.976621870639169e-05, "loss": 2.4444, "mean_token_accuracy": 0.43793103098869324, "step": 92960 }, { "epoch": 0.09363533536588436, "grad_norm": 13.798955386865313, "learning_rate": 4.9766164827087245e-05, "loss": 2.5698, "mean_token_accuracy": 0.38965516686439516, "step": 92965 }, { "epoch": 0.09364037141898854, "grad_norm": 14.723063509418964, "learning_rate": 4.976611094160719e-05, "loss": 2.3381, "mean_token_accuracy": 0.48965516686439514, "step": 92970 }, { "epoch": 0.09364540747209271, "grad_norm": 11.358259754575805, "learning_rate": 4.976605704995154e-05, "loss": 2.5215, "mean_token_accuracy": 0.4344827651977539, "step": 92975 }, { "epoch": 0.09365044352519689, "grad_norm": 10.120690628897718, "learning_rate": 4.976600315212032e-05, "loss": 2.3113, "mean_token_accuracy": 0.41379311084747317, "step": 92980 }, { "epoch": 0.09365547957830106, "grad_norm": 10.436636703961874, "learning_rate": 4.9765949248113546e-05, "loss": 2.4461, "mean_token_accuracy": 0.4344827651977539, "step": 92985 }, { "epoch": 0.09366051563140523, "grad_norm": 10.01201053586698, "learning_rate": 4.976589533793122e-05, "loss": 2.2459, "mean_token_accuracy": 0.4172413766384125, "step": 92990 }, { "epoch": 0.09366555168450941, "grad_norm": 8.786147704793626, "learning_rate": 4.976584142157337e-05, "loss": 2.4204, "mean_token_accuracy": 0.44827585220336913, "step": 92995 }, { "epoch": 0.09367058773761358, "grad_norm": 12.86482715463306, "learning_rate": 4.976578749904e-05, "loss": 2.5068, "mean_token_accuracy": 0.4068965494632721, "step": 93000 }, { "epoch": 0.09367562379071775, "grad_norm": 7.876332952257544, "learning_rate": 4.9765733570331136e-05, "loss": 1.9628, "mean_token_accuracy": 0.45862067937850953, "step": 93005 }, { "epoch": 0.09368065984382191, "grad_norm": 11.389875037933686, "learning_rate": 4.9765679635446786e-05, "loss": 2.2948, "mean_token_accuracy": 0.4034482717514038, "step": 93010 }, { "epoch": 0.09368569589692609, "grad_norm": 9.808344169641051, "learning_rate": 4.976562569438697e-05, "loss": 2.4078, "mean_token_accuracy": 0.38275861740112305, "step": 93015 }, { "epoch": 0.09369073195003026, "grad_norm": 9.674567995378498, "learning_rate": 4.9765571747151694e-05, "loss": 2.1584, "mean_token_accuracy": 0.4620689570903778, "step": 93020 }, { "epoch": 0.09369576800313444, "grad_norm": 14.020655524611346, "learning_rate": 4.9765517793740985e-05, "loss": 2.4751, "mean_token_accuracy": 0.3931034475564957, "step": 93025 }, { "epoch": 0.09370080405623861, "grad_norm": 9.253712249790578, "learning_rate": 4.976546383415484e-05, "loss": 2.5186, "mean_token_accuracy": 0.38965516686439516, "step": 93030 }, { "epoch": 0.09370584010934278, "grad_norm": 13.97677376993939, "learning_rate": 4.9765409868393296e-05, "loss": 2.7374, "mean_token_accuracy": 0.3896551728248596, "step": 93035 }, { "epoch": 0.09371087616244696, "grad_norm": 14.475171837191011, "learning_rate": 4.976535589645636e-05, "loss": 2.5171, "mean_token_accuracy": 0.47586206793785096, "step": 93040 }, { "epoch": 0.09371591221555113, "grad_norm": 12.496448517715145, "learning_rate": 4.976530191834404e-05, "loss": 2.5097, "mean_token_accuracy": 0.4206896543502808, "step": 93045 }, { "epoch": 0.0937209482686553, "grad_norm": 11.850822630948127, "learning_rate": 4.976524793405635e-05, "loss": 2.349, "mean_token_accuracy": 0.4413793087005615, "step": 93050 }, { "epoch": 0.09372598432175948, "grad_norm": 12.162001219858503, "learning_rate": 4.976519394359332e-05, "loss": 2.2964, "mean_token_accuracy": 0.43793103098869324, "step": 93055 }, { "epoch": 0.09373102037486365, "grad_norm": 10.836631901637716, "learning_rate": 4.9765139946954946e-05, "loss": 2.4334, "mean_token_accuracy": 0.42758620977401735, "step": 93060 }, { "epoch": 0.09373605642796783, "grad_norm": 11.846747313726004, "learning_rate": 4.976508594414126e-05, "loss": 2.286, "mean_token_accuracy": 0.44137930274009707, "step": 93065 }, { "epoch": 0.093741092481072, "grad_norm": 13.244826298425306, "learning_rate": 4.9765031935152265e-05, "loss": 2.4157, "mean_token_accuracy": 0.44137929677963256, "step": 93070 }, { "epoch": 0.09374612853417617, "grad_norm": 11.043719649685853, "learning_rate": 4.976497791998798e-05, "loss": 2.1403, "mean_token_accuracy": 0.48094373345375063, "step": 93075 }, { "epoch": 0.09375116458728033, "grad_norm": 11.252232788321866, "learning_rate": 4.9764923898648416e-05, "loss": 2.5802, "mean_token_accuracy": 0.4310344815254211, "step": 93080 }, { "epoch": 0.09375620064038451, "grad_norm": 11.324223444030354, "learning_rate": 4.9764869871133607e-05, "loss": 2.7732, "mean_token_accuracy": 0.41379311084747317, "step": 93085 }, { "epoch": 0.09376123669348868, "grad_norm": 9.558078622276323, "learning_rate": 4.976481583744355e-05, "loss": 2.6255, "mean_token_accuracy": 0.44827587008476255, "step": 93090 }, { "epoch": 0.09376627274659285, "grad_norm": 12.666129130192001, "learning_rate": 4.976476179757825e-05, "loss": 2.6499, "mean_token_accuracy": 0.3551724076271057, "step": 93095 }, { "epoch": 0.09377130879969703, "grad_norm": 11.282855602729889, "learning_rate": 4.976470775153775e-05, "loss": 2.7059, "mean_token_accuracy": 0.44827585816383364, "step": 93100 }, { "epoch": 0.0937763448528012, "grad_norm": 9.568974110381832, "learning_rate": 4.976465369932204e-05, "loss": 2.2253, "mean_token_accuracy": 0.44482758045196535, "step": 93105 }, { "epoch": 0.09378138090590538, "grad_norm": 9.97680478969816, "learning_rate": 4.976459964093114e-05, "loss": 2.4123, "mean_token_accuracy": 0.46551724076271056, "step": 93110 }, { "epoch": 0.09378641695900955, "grad_norm": 10.967253389825235, "learning_rate": 4.976454557636509e-05, "loss": 2.1697, "mean_token_accuracy": 0.44482759237289426, "step": 93115 }, { "epoch": 0.09379145301211372, "grad_norm": 10.546145964553508, "learning_rate": 4.976449150562387e-05, "loss": 2.2218, "mean_token_accuracy": 0.4620689630508423, "step": 93120 }, { "epoch": 0.0937964890652179, "grad_norm": 13.65094195399109, "learning_rate": 4.9764437428707514e-05, "loss": 2.8169, "mean_token_accuracy": 0.37241379022598264, "step": 93125 }, { "epoch": 0.09380152511832207, "grad_norm": 9.45009948101431, "learning_rate": 4.9764383345616033e-05, "loss": 2.5152, "mean_token_accuracy": 0.47302955389022827, "step": 93130 }, { "epoch": 0.09380656117142624, "grad_norm": 10.961186482380842, "learning_rate": 4.9764329256349446e-05, "loss": 2.6161, "mean_token_accuracy": 0.42758620381355283, "step": 93135 }, { "epoch": 0.09381159722453042, "grad_norm": 10.990956903050208, "learning_rate": 4.976427516090776e-05, "loss": 2.097, "mean_token_accuracy": 0.44568966031074525, "step": 93140 }, { "epoch": 0.09381663327763459, "grad_norm": 11.016899560724887, "learning_rate": 4.9764221059291e-05, "loss": 2.8037, "mean_token_accuracy": 0.36551724672317504, "step": 93145 }, { "epoch": 0.09382166933073875, "grad_norm": 12.59275148945663, "learning_rate": 4.976416695149916e-05, "loss": 2.727, "mean_token_accuracy": 0.3655172407627106, "step": 93150 }, { "epoch": 0.09382670538384293, "grad_norm": 13.276339818626704, "learning_rate": 4.9764112837532287e-05, "loss": 2.6629, "mean_token_accuracy": 0.39310344457626345, "step": 93155 }, { "epoch": 0.0938317414369471, "grad_norm": 10.127358391367475, "learning_rate": 4.976405871739038e-05, "loss": 2.3117, "mean_token_accuracy": 0.49655172824859617, "step": 93160 }, { "epoch": 0.09383677749005127, "grad_norm": 11.50562517117822, "learning_rate": 4.976400459107344e-05, "loss": 2.4372, "mean_token_accuracy": 0.3862069010734558, "step": 93165 }, { "epoch": 0.09384181354315545, "grad_norm": 6.682313721336983, "learning_rate": 4.97639504585815e-05, "loss": 1.6961, "mean_token_accuracy": 0.5332728326320648, "step": 93170 }, { "epoch": 0.09384684959625962, "grad_norm": 11.24456851501498, "learning_rate": 4.976389631991457e-05, "loss": 2.5845, "mean_token_accuracy": 0.39310344457626345, "step": 93175 }, { "epoch": 0.0938518856493638, "grad_norm": 10.826705861434709, "learning_rate": 4.9763842175072665e-05, "loss": 2.4385, "mean_token_accuracy": 0.3931034505367279, "step": 93180 }, { "epoch": 0.09385692170246797, "grad_norm": 9.91587807834643, "learning_rate": 4.976378802405581e-05, "loss": 2.8099, "mean_token_accuracy": 0.4310344815254211, "step": 93185 }, { "epoch": 0.09386195775557214, "grad_norm": 10.289772472303778, "learning_rate": 4.9763733866864e-05, "loss": 2.3091, "mean_token_accuracy": 0.4482758641242981, "step": 93190 }, { "epoch": 0.09386699380867632, "grad_norm": 10.441717954730507, "learning_rate": 4.976367970349726e-05, "loss": 2.511, "mean_token_accuracy": 0.4332123398780823, "step": 93195 }, { "epoch": 0.09387202986178049, "grad_norm": 10.185858471927068, "learning_rate": 4.976362553395561e-05, "loss": 2.631, "mean_token_accuracy": 0.4172413766384125, "step": 93200 }, { "epoch": 0.09387706591488466, "grad_norm": 10.638513408547466, "learning_rate": 4.976357135823906e-05, "loss": 2.4362, "mean_token_accuracy": 0.4586206912994385, "step": 93205 }, { "epoch": 0.09388210196798884, "grad_norm": 12.062711364709205, "learning_rate": 4.976351717634762e-05, "loss": 2.3548, "mean_token_accuracy": 0.441379314661026, "step": 93210 }, { "epoch": 0.09388713802109301, "grad_norm": 11.209233862842803, "learning_rate": 4.9763462988281324e-05, "loss": 2.6796, "mean_token_accuracy": 0.4103448212146759, "step": 93215 }, { "epoch": 0.09389217407419717, "grad_norm": 13.033225749328151, "learning_rate": 4.9763408794040154e-05, "loss": 2.8447, "mean_token_accuracy": 0.4068965494632721, "step": 93220 }, { "epoch": 0.09389721012730134, "grad_norm": 12.242285313587148, "learning_rate": 4.976335459362416e-05, "loss": 2.6098, "mean_token_accuracy": 0.41379310488700866, "step": 93225 }, { "epoch": 0.09390224618040552, "grad_norm": 11.672257732824011, "learning_rate": 4.976330038703333e-05, "loss": 2.477, "mean_token_accuracy": 0.4034482777118683, "step": 93230 }, { "epoch": 0.09390728223350969, "grad_norm": 12.383770062230347, "learning_rate": 4.97632461742677e-05, "loss": 2.7849, "mean_token_accuracy": 0.3241379290819168, "step": 93235 }, { "epoch": 0.09391231828661387, "grad_norm": 10.469512445202916, "learning_rate": 4.976319195532727e-05, "loss": 2.4067, "mean_token_accuracy": 0.41724138259887694, "step": 93240 }, { "epoch": 0.09391735433971804, "grad_norm": 11.620291003463986, "learning_rate": 4.9763137730212066e-05, "loss": 2.6917, "mean_token_accuracy": 0.4172413766384125, "step": 93245 }, { "epoch": 0.09392239039282221, "grad_norm": 9.764571488040007, "learning_rate": 4.9763083498922096e-05, "loss": 2.6827, "mean_token_accuracy": 0.39310344457626345, "step": 93250 }, { "epoch": 0.09392742644592639, "grad_norm": 10.76449687127288, "learning_rate": 4.976302926145738e-05, "loss": 2.3687, "mean_token_accuracy": 0.4379310250282288, "step": 93255 }, { "epoch": 0.09393246249903056, "grad_norm": 12.339009233002926, "learning_rate": 4.9762975017817924e-05, "loss": 2.3613, "mean_token_accuracy": 0.36896551251411436, "step": 93260 }, { "epoch": 0.09393749855213473, "grad_norm": 10.62470451957075, "learning_rate": 4.976292076800375e-05, "loss": 2.2172, "mean_token_accuracy": 0.46551724672317507, "step": 93265 }, { "epoch": 0.09394253460523891, "grad_norm": 11.063958874213542, "learning_rate": 4.9762866512014874e-05, "loss": 2.172, "mean_token_accuracy": 0.4275861978530884, "step": 93270 }, { "epoch": 0.09394757065834308, "grad_norm": 9.8511043703983, "learning_rate": 4.97628122498513e-05, "loss": 2.5852, "mean_token_accuracy": 0.4413793087005615, "step": 93275 }, { "epoch": 0.09395260671144726, "grad_norm": 9.944804429942453, "learning_rate": 4.976275798151306e-05, "loss": 2.5648, "mean_token_accuracy": 0.3724137872457504, "step": 93280 }, { "epoch": 0.09395764276455143, "grad_norm": 13.698295167887576, "learning_rate": 4.9762703707000165e-05, "loss": 2.6808, "mean_token_accuracy": 0.42413792610168455, "step": 93285 }, { "epoch": 0.09396267881765559, "grad_norm": 8.869193279337125, "learning_rate": 4.976264942631262e-05, "loss": 2.1228, "mean_token_accuracy": 0.4965517222881317, "step": 93290 }, { "epoch": 0.09396771487075976, "grad_norm": 10.606050482256475, "learning_rate": 4.976259513945044e-05, "loss": 2.3644, "mean_token_accuracy": 0.41034482419490814, "step": 93295 }, { "epoch": 0.09397275092386394, "grad_norm": 11.63149980598661, "learning_rate": 4.976254084641366e-05, "loss": 2.7444, "mean_token_accuracy": 0.38275861740112305, "step": 93300 }, { "epoch": 0.09397778697696811, "grad_norm": 9.549253153383603, "learning_rate": 4.9762486547202276e-05, "loss": 2.4961, "mean_token_accuracy": 0.3896551728248596, "step": 93305 }, { "epoch": 0.09398282303007228, "grad_norm": 11.174655951933664, "learning_rate": 4.97624322418163e-05, "loss": 2.2278, "mean_token_accuracy": 0.4448275864124298, "step": 93310 }, { "epoch": 0.09398785908317646, "grad_norm": 11.546262547242273, "learning_rate": 4.9762377930255764e-05, "loss": 2.174, "mean_token_accuracy": 0.4344827592372894, "step": 93315 }, { "epoch": 0.09399289513628063, "grad_norm": 10.377504964895614, "learning_rate": 4.976232361252068e-05, "loss": 2.2115, "mean_token_accuracy": 0.44482758045196535, "step": 93320 }, { "epoch": 0.0939979311893848, "grad_norm": 9.484271421616363, "learning_rate": 4.9762269288611045e-05, "loss": 2.4136, "mean_token_accuracy": 0.3758620619773865, "step": 93325 }, { "epoch": 0.09400296724248898, "grad_norm": 12.783151694832894, "learning_rate": 4.976221495852689e-05, "loss": 2.2815, "mean_token_accuracy": 0.4137930989265442, "step": 93330 }, { "epoch": 0.09400800329559315, "grad_norm": 8.123686383888591, "learning_rate": 4.9762160622268235e-05, "loss": 2.3432, "mean_token_accuracy": 0.42758620381355283, "step": 93335 }, { "epoch": 0.09401303934869733, "grad_norm": 9.006053908335527, "learning_rate": 4.976210627983507e-05, "loss": 2.8218, "mean_token_accuracy": 0.36551723778247835, "step": 93340 }, { "epoch": 0.0940180754018015, "grad_norm": 10.803364212176326, "learning_rate": 4.976205193122744e-05, "loss": 2.1425, "mean_token_accuracy": 0.4497277677059174, "step": 93345 }, { "epoch": 0.09402311145490568, "grad_norm": 12.833449529251215, "learning_rate": 4.9761997576445346e-05, "loss": 2.6397, "mean_token_accuracy": 0.3344827562570572, "step": 93350 }, { "epoch": 0.09402814750800985, "grad_norm": 10.759054762538439, "learning_rate": 4.976194321548881e-05, "loss": 2.68, "mean_token_accuracy": 0.3896551728248596, "step": 93355 }, { "epoch": 0.09403318356111401, "grad_norm": 11.392916736789296, "learning_rate": 4.9761888848357825e-05, "loss": 2.6934, "mean_token_accuracy": 0.3862069010734558, "step": 93360 }, { "epoch": 0.09403821961421818, "grad_norm": 13.355663311068163, "learning_rate": 4.976183447505243e-05, "loss": 2.5513, "mean_token_accuracy": 0.41379310488700866, "step": 93365 }, { "epoch": 0.09404325566732236, "grad_norm": 16.263050210260122, "learning_rate": 4.976178009557264e-05, "loss": 2.6799, "mean_token_accuracy": 0.39655172228813174, "step": 93370 }, { "epoch": 0.09404829172042653, "grad_norm": 10.389317954795642, "learning_rate": 4.976172570991844e-05, "loss": 2.4516, "mean_token_accuracy": 0.4413793087005615, "step": 93375 }, { "epoch": 0.0940533277735307, "grad_norm": 13.303103308223422, "learning_rate": 4.976167131808988e-05, "loss": 2.5749, "mean_token_accuracy": 0.4068965554237366, "step": 93380 }, { "epoch": 0.09405836382663488, "grad_norm": 11.193183975221022, "learning_rate": 4.9761616920086976e-05, "loss": 3.3286, "mean_token_accuracy": 0.37090138494968417, "step": 93385 }, { "epoch": 0.09406339987973905, "grad_norm": 13.051894801016754, "learning_rate": 4.9761562515909714e-05, "loss": 2.9833, "mean_token_accuracy": 0.3530550479888916, "step": 93390 }, { "epoch": 0.09406843593284323, "grad_norm": 10.442787016084559, "learning_rate": 4.976150810555813e-05, "loss": 2.3693, "mean_token_accuracy": 0.41724138259887694, "step": 93395 }, { "epoch": 0.0940734719859474, "grad_norm": 13.019957700363257, "learning_rate": 4.9761453689032225e-05, "loss": 2.3714, "mean_token_accuracy": 0.41724138855934145, "step": 93400 }, { "epoch": 0.09407850803905157, "grad_norm": 11.994705902390926, "learning_rate": 4.976139926633203e-05, "loss": 2.3616, "mean_token_accuracy": 0.4137930989265442, "step": 93405 }, { "epoch": 0.09408354409215575, "grad_norm": 12.900729814301334, "learning_rate": 4.976134483745755e-05, "loss": 2.1395, "mean_token_accuracy": 0.4620689630508423, "step": 93410 }, { "epoch": 0.09408858014525992, "grad_norm": 12.086310244999563, "learning_rate": 4.976129040240882e-05, "loss": 2.3367, "mean_token_accuracy": 0.4620689690113068, "step": 93415 }, { "epoch": 0.0940936161983641, "grad_norm": 10.76988756460759, "learning_rate": 4.976123596118581e-05, "loss": 2.559, "mean_token_accuracy": 0.39655172228813174, "step": 93420 }, { "epoch": 0.09409865225146827, "grad_norm": 11.401492980232348, "learning_rate": 4.9761181513788585e-05, "loss": 2.3397, "mean_token_accuracy": 0.42413793206214906, "step": 93425 }, { "epoch": 0.09410368830457243, "grad_norm": 24.042180130981492, "learning_rate": 4.9761127060217125e-05, "loss": 2.8713, "mean_token_accuracy": 0.4, "step": 93430 }, { "epoch": 0.0941087243576766, "grad_norm": 11.372429340350397, "learning_rate": 4.9761072600471465e-05, "loss": 2.7881, "mean_token_accuracy": 0.39655172228813174, "step": 93435 }, { "epoch": 0.09411376041078078, "grad_norm": 10.769655212987994, "learning_rate": 4.976101813455161e-05, "loss": 2.8612, "mean_token_accuracy": 0.4034482777118683, "step": 93440 }, { "epoch": 0.09411879646388495, "grad_norm": 11.378830648691853, "learning_rate": 4.976096366245758e-05, "loss": 2.2618, "mean_token_accuracy": 0.41034482717514037, "step": 93445 }, { "epoch": 0.09412383251698912, "grad_norm": 15.033486243402097, "learning_rate": 4.976090918418939e-05, "loss": 2.5002, "mean_token_accuracy": 0.4413793087005615, "step": 93450 }, { "epoch": 0.0941288685700933, "grad_norm": 9.407489659310537, "learning_rate": 4.976085469974706e-05, "loss": 2.6696, "mean_token_accuracy": 0.4413793087005615, "step": 93455 }, { "epoch": 0.09413390462319747, "grad_norm": 11.265177386642094, "learning_rate": 4.9760800209130584e-05, "loss": 2.7471, "mean_token_accuracy": 0.37241379022598264, "step": 93460 }, { "epoch": 0.09413894067630164, "grad_norm": 9.514155027222621, "learning_rate": 4.976074571234e-05, "loss": 2.1029, "mean_token_accuracy": 0.4517241418361664, "step": 93465 }, { "epoch": 0.09414397672940582, "grad_norm": 9.331723860831088, "learning_rate": 4.976069120937532e-05, "loss": 2.471, "mean_token_accuracy": 0.38620689511299133, "step": 93470 }, { "epoch": 0.09414901278250999, "grad_norm": 14.235241846256773, "learning_rate": 4.976063670023654e-05, "loss": 2.4226, "mean_token_accuracy": 0.4679975748062134, "step": 93475 }, { "epoch": 0.09415404883561417, "grad_norm": 10.398092174856519, "learning_rate": 4.97605821849237e-05, "loss": 2.3256, "mean_token_accuracy": 0.4448275864124298, "step": 93480 }, { "epoch": 0.09415908488871834, "grad_norm": 11.666527372335825, "learning_rate": 4.97605276634368e-05, "loss": 2.4412, "mean_token_accuracy": 0.45517241954803467, "step": 93485 }, { "epoch": 0.09416412094182251, "grad_norm": 9.434644363207424, "learning_rate": 4.976047313577586e-05, "loss": 2.3851, "mean_token_accuracy": 0.5137930989265442, "step": 93490 }, { "epoch": 0.09416915699492669, "grad_norm": 11.023644429272872, "learning_rate": 4.976041860194091e-05, "loss": 2.5825, "mean_token_accuracy": 0.4465819835662842, "step": 93495 }, { "epoch": 0.09417419304803085, "grad_norm": 13.583222457920636, "learning_rate": 4.976036406193193e-05, "loss": 2.5883, "mean_token_accuracy": 0.36551723480224607, "step": 93500 }, { "epoch": 0.09417922910113502, "grad_norm": 10.728433614798897, "learning_rate": 4.9760309515748966e-05, "loss": 2.6421, "mean_token_accuracy": 0.3813067078590393, "step": 93505 }, { "epoch": 0.0941842651542392, "grad_norm": 10.78453840567406, "learning_rate": 4.976025496339202e-05, "loss": 2.4792, "mean_token_accuracy": 0.37586206793785093, "step": 93510 }, { "epoch": 0.09418930120734337, "grad_norm": 12.729352341218307, "learning_rate": 4.976020040486111e-05, "loss": 2.6238, "mean_token_accuracy": 0.404839688539505, "step": 93515 }, { "epoch": 0.09419433726044754, "grad_norm": 11.797521575361689, "learning_rate": 4.976014584015624e-05, "loss": 2.3313, "mean_token_accuracy": 0.4344827592372894, "step": 93520 }, { "epoch": 0.09419937331355172, "grad_norm": 10.363134768154238, "learning_rate": 4.976009126927744e-05, "loss": 2.329, "mean_token_accuracy": 0.4971566915512085, "step": 93525 }, { "epoch": 0.09420440936665589, "grad_norm": 13.138645060910342, "learning_rate": 4.976003669222472e-05, "loss": 2.4366, "mean_token_accuracy": 0.3896551728248596, "step": 93530 }, { "epoch": 0.09420944541976006, "grad_norm": 9.762158332985264, "learning_rate": 4.975998210899811e-05, "loss": 2.5049, "mean_token_accuracy": 0.4517241299152374, "step": 93535 }, { "epoch": 0.09421448147286424, "grad_norm": 11.762140572787576, "learning_rate": 4.97599275195976e-05, "loss": 2.1855, "mean_token_accuracy": 0.44827585816383364, "step": 93540 }, { "epoch": 0.09421951752596841, "grad_norm": 15.429912509949562, "learning_rate": 4.9759872924023215e-05, "loss": 2.6731, "mean_token_accuracy": 0.4068965494632721, "step": 93545 }, { "epoch": 0.09422455357907258, "grad_norm": 12.283650127048253, "learning_rate": 4.9759818322274976e-05, "loss": 2.5697, "mean_token_accuracy": 0.4034482717514038, "step": 93550 }, { "epoch": 0.09422958963217676, "grad_norm": 9.892054657273672, "learning_rate": 4.975976371435289e-05, "loss": 2.269, "mean_token_accuracy": 0.43448275327682495, "step": 93555 }, { "epoch": 0.09423462568528093, "grad_norm": 10.682805381327249, "learning_rate": 4.9759709100256976e-05, "loss": 2.5749, "mean_token_accuracy": 0.44482758045196535, "step": 93560 }, { "epoch": 0.0942396617383851, "grad_norm": 8.79682834740559, "learning_rate": 4.975965447998725e-05, "loss": 2.6551, "mean_token_accuracy": 0.4310344815254211, "step": 93565 }, { "epoch": 0.09424469779148927, "grad_norm": 11.46873677855604, "learning_rate": 4.975959985354372e-05, "loss": 2.3811, "mean_token_accuracy": 0.43793103098869324, "step": 93570 }, { "epoch": 0.09424973384459344, "grad_norm": 9.897303091773074, "learning_rate": 4.9759545220926406e-05, "loss": 2.5478, "mean_token_accuracy": 0.4344827592372894, "step": 93575 }, { "epoch": 0.09425476989769761, "grad_norm": 9.925567240265732, "learning_rate": 4.9759490582135336e-05, "loss": 2.4771, "mean_token_accuracy": 0.45862069725990295, "step": 93580 }, { "epoch": 0.09425980595080179, "grad_norm": 11.009107424762867, "learning_rate": 4.9759435937170506e-05, "loss": 2.0894, "mean_token_accuracy": 0.4655172348022461, "step": 93585 }, { "epoch": 0.09426484200390596, "grad_norm": 11.4778292505902, "learning_rate": 4.9759381286031945e-05, "loss": 2.393, "mean_token_accuracy": 0.4517241299152374, "step": 93590 }, { "epoch": 0.09426987805701013, "grad_norm": 10.445558245673917, "learning_rate": 4.975932662871965e-05, "loss": 2.7289, "mean_token_accuracy": 0.398124623298645, "step": 93595 }, { "epoch": 0.09427491411011431, "grad_norm": 10.797943325203851, "learning_rate": 4.975927196523365e-05, "loss": 2.4743, "mean_token_accuracy": 0.4607380449771881, "step": 93600 }, { "epoch": 0.09427995016321848, "grad_norm": 10.242685818374698, "learning_rate": 4.975921729557396e-05, "loss": 2.3322, "mean_token_accuracy": 0.43793103098869324, "step": 93605 }, { "epoch": 0.09428498621632266, "grad_norm": 10.940434899695527, "learning_rate": 4.975916261974059e-05, "loss": 2.3607, "mean_token_accuracy": 0.4034482717514038, "step": 93610 }, { "epoch": 0.09429002226942683, "grad_norm": 12.077135158807842, "learning_rate": 4.975910793773357e-05, "loss": 2.6846, "mean_token_accuracy": 0.41379310488700866, "step": 93615 }, { "epoch": 0.094295058322531, "grad_norm": 12.460848345033678, "learning_rate": 4.9759053249552895e-05, "loss": 2.1797, "mean_token_accuracy": 0.43793103098869324, "step": 93620 }, { "epoch": 0.09430009437563518, "grad_norm": 9.585981960975458, "learning_rate": 4.975899855519859e-05, "loss": 2.6752, "mean_token_accuracy": 0.3586206793785095, "step": 93625 }, { "epoch": 0.09430513042873935, "grad_norm": 8.840340478336367, "learning_rate": 4.9758943854670664e-05, "loss": 2.2101, "mean_token_accuracy": 0.47931033968925474, "step": 93630 }, { "epoch": 0.09431016648184352, "grad_norm": 11.295277769578684, "learning_rate": 4.975888914796914e-05, "loss": 2.1588, "mean_token_accuracy": 0.5009852230548859, "step": 93635 }, { "epoch": 0.09431520253494768, "grad_norm": 7.934810751667857, "learning_rate": 4.975883443509404e-05, "loss": 2.254, "mean_token_accuracy": 0.4172413766384125, "step": 93640 }, { "epoch": 0.09432023858805186, "grad_norm": 10.47871861006057, "learning_rate": 4.975877971604536e-05, "loss": 2.8562, "mean_token_accuracy": 0.3896551787853241, "step": 93645 }, { "epoch": 0.09432527464115603, "grad_norm": 13.767857022316344, "learning_rate": 4.975872499082312e-05, "loss": 2.4261, "mean_token_accuracy": 0.4181034445762634, "step": 93650 }, { "epoch": 0.0943303106942602, "grad_norm": 13.075076981592666, "learning_rate": 4.975867025942735e-05, "loss": 2.2473, "mean_token_accuracy": 0.4344827592372894, "step": 93655 }, { "epoch": 0.09433534674736438, "grad_norm": 9.364591035153868, "learning_rate": 4.975861552185804e-05, "loss": 2.4141, "mean_token_accuracy": 0.4344827592372894, "step": 93660 }, { "epoch": 0.09434038280046855, "grad_norm": 10.25661809750755, "learning_rate": 4.9758560778115235e-05, "loss": 2.1552, "mean_token_accuracy": 0.4517241299152374, "step": 93665 }, { "epoch": 0.09434541885357273, "grad_norm": 9.884431864468795, "learning_rate": 4.9758506028198924e-05, "loss": 3.4007, "mean_token_accuracy": 0.34827586114406583, "step": 93670 }, { "epoch": 0.0943504549066769, "grad_norm": 9.615450543475237, "learning_rate": 4.975845127210914e-05, "loss": 2.6631, "mean_token_accuracy": 0.39655172228813174, "step": 93675 }, { "epoch": 0.09435549095978107, "grad_norm": 11.350525849564994, "learning_rate": 4.975839650984589e-05, "loss": 2.6136, "mean_token_accuracy": 0.3482758641242981, "step": 93680 }, { "epoch": 0.09436052701288525, "grad_norm": 13.450916891584987, "learning_rate": 4.9758341741409185e-05, "loss": 2.6038, "mean_token_accuracy": 0.4172413766384125, "step": 93685 }, { "epoch": 0.09436556306598942, "grad_norm": 10.736945087669675, "learning_rate": 4.975828696679906e-05, "loss": 2.401, "mean_token_accuracy": 0.4068965494632721, "step": 93690 }, { "epoch": 0.0943705991190936, "grad_norm": 11.081771108980188, "learning_rate": 4.9758232186015504e-05, "loss": 2.489, "mean_token_accuracy": 0.41034482717514037, "step": 93695 }, { "epoch": 0.09437563517219777, "grad_norm": 13.35974699866472, "learning_rate": 4.975817739905855e-05, "loss": 2.5538, "mean_token_accuracy": 0.3793103456497192, "step": 93700 }, { "epoch": 0.09438067122530194, "grad_norm": 13.859442474504705, "learning_rate": 4.97581226059282e-05, "loss": 2.4258, "mean_token_accuracy": 0.41724138855934145, "step": 93705 }, { "epoch": 0.0943857072784061, "grad_norm": 10.873035095123415, "learning_rate": 4.975806780662448e-05, "loss": 2.4042, "mean_token_accuracy": 0.44137930274009707, "step": 93710 }, { "epoch": 0.09439074333151028, "grad_norm": 10.349623797127418, "learning_rate": 4.97580130011474e-05, "loss": 2.6153, "mean_token_accuracy": 0.4034482717514038, "step": 93715 }, { "epoch": 0.09439577938461445, "grad_norm": 11.135879822000295, "learning_rate": 4.975795818949698e-05, "loss": 2.6427, "mean_token_accuracy": 0.3689655244350433, "step": 93720 }, { "epoch": 0.09440081543771862, "grad_norm": 17.953275008329015, "learning_rate": 4.975790337167324e-05, "loss": 2.9001, "mean_token_accuracy": 0.37586206793785093, "step": 93725 }, { "epoch": 0.0944058514908228, "grad_norm": 9.210224754235076, "learning_rate": 4.975784854767618e-05, "loss": 2.2413, "mean_token_accuracy": 0.42413793206214906, "step": 93730 }, { "epoch": 0.09441088754392697, "grad_norm": 9.338199779764377, "learning_rate": 4.975779371750582e-05, "loss": 2.1955, "mean_token_accuracy": 0.4620689570903778, "step": 93735 }, { "epoch": 0.09441592359703115, "grad_norm": 8.986614141636872, "learning_rate": 4.9757738881162185e-05, "loss": 2.3057, "mean_token_accuracy": 0.47241379618644713, "step": 93740 }, { "epoch": 0.09442095965013532, "grad_norm": 15.678215099298507, "learning_rate": 4.975768403864528e-05, "loss": 2.4146, "mean_token_accuracy": 0.43793103098869324, "step": 93745 }, { "epoch": 0.0944259957032395, "grad_norm": 9.96931783290619, "learning_rate": 4.975762918995512e-05, "loss": 2.4002, "mean_token_accuracy": 0.4344827651977539, "step": 93750 }, { "epoch": 0.09443103175634367, "grad_norm": 13.117964465837305, "learning_rate": 4.9757574335091724e-05, "loss": 2.5288, "mean_token_accuracy": 0.3896551728248596, "step": 93755 }, { "epoch": 0.09443606780944784, "grad_norm": 8.261185835731977, "learning_rate": 4.9757519474055106e-05, "loss": 2.1644, "mean_token_accuracy": 0.474954628944397, "step": 93760 }, { "epoch": 0.09444110386255201, "grad_norm": 10.854261072355659, "learning_rate": 4.975746460684529e-05, "loss": 2.5593, "mean_token_accuracy": 0.35862069129943847, "step": 93765 }, { "epoch": 0.09444613991565619, "grad_norm": 10.62232783710421, "learning_rate": 4.975740973346228e-05, "loss": 2.2855, "mean_token_accuracy": 0.3965517282485962, "step": 93770 }, { "epoch": 0.09445117596876036, "grad_norm": 9.071071475778446, "learning_rate": 4.975735485390609e-05, "loss": 2.5849, "mean_token_accuracy": 0.41034482717514037, "step": 93775 }, { "epoch": 0.09445621202186452, "grad_norm": 11.250621575352612, "learning_rate": 4.975729996817674e-05, "loss": 2.5456, "mean_token_accuracy": 0.4275861978530884, "step": 93780 }, { "epoch": 0.0944612480749687, "grad_norm": 10.635798755881865, "learning_rate": 4.975724507627425e-05, "loss": 2.431, "mean_token_accuracy": 0.4413793087005615, "step": 93785 }, { "epoch": 0.09446628412807287, "grad_norm": 11.068434215526537, "learning_rate": 4.975719017819863e-05, "loss": 2.6297, "mean_token_accuracy": 0.3620689630508423, "step": 93790 }, { "epoch": 0.09447132018117704, "grad_norm": 11.350134195918969, "learning_rate": 4.9757135273949883e-05, "loss": 2.5652, "mean_token_accuracy": 0.3931034505367279, "step": 93795 }, { "epoch": 0.09447635623428122, "grad_norm": 9.743414085774845, "learning_rate": 4.975708036352805e-05, "loss": 2.3428, "mean_token_accuracy": 0.4068965554237366, "step": 93800 }, { "epoch": 0.09448139228738539, "grad_norm": 11.090275212118284, "learning_rate": 4.975702544693313e-05, "loss": 2.6732, "mean_token_accuracy": 0.3655172407627106, "step": 93805 }, { "epoch": 0.09448642834048956, "grad_norm": 10.533639835381354, "learning_rate": 4.975697052416514e-05, "loss": 2.4948, "mean_token_accuracy": 0.4103448331356049, "step": 93810 }, { "epoch": 0.09449146439359374, "grad_norm": 12.407908713513581, "learning_rate": 4.975691559522409e-05, "loss": 2.7467, "mean_token_accuracy": 0.38965516686439516, "step": 93815 }, { "epoch": 0.09449650044669791, "grad_norm": 11.290234417151717, "learning_rate": 4.975686066011e-05, "loss": 2.4718, "mean_token_accuracy": 0.3965517282485962, "step": 93820 }, { "epoch": 0.09450153649980209, "grad_norm": 11.927253177402545, "learning_rate": 4.97568057188229e-05, "loss": 2.5858, "mean_token_accuracy": 0.4137930989265442, "step": 93825 }, { "epoch": 0.09450657255290626, "grad_norm": 10.068815763515476, "learning_rate": 4.9756750771362786e-05, "loss": 2.5294, "mean_token_accuracy": 0.4068965494632721, "step": 93830 }, { "epoch": 0.09451160860601043, "grad_norm": 10.385499160633943, "learning_rate": 4.975669581772968e-05, "loss": 2.4088, "mean_token_accuracy": 0.4310344815254211, "step": 93835 }, { "epoch": 0.09451664465911461, "grad_norm": 16.898756621614776, "learning_rate": 4.975664085792359e-05, "loss": 2.8586, "mean_token_accuracy": 0.3379310369491577, "step": 93840 }, { "epoch": 0.09452168071221878, "grad_norm": 9.677758414103813, "learning_rate": 4.975658589194454e-05, "loss": 2.3842, "mean_token_accuracy": 0.45517241954803467, "step": 93845 }, { "epoch": 0.09452671676532294, "grad_norm": 16.605940577719693, "learning_rate": 4.9756530919792555e-05, "loss": 2.63, "mean_token_accuracy": 0.42413793206214906, "step": 93850 }, { "epoch": 0.09453175281842711, "grad_norm": 11.359746430765545, "learning_rate": 4.975647594146762e-05, "loss": 2.9495, "mean_token_accuracy": 0.3793103456497192, "step": 93855 }, { "epoch": 0.09453678887153129, "grad_norm": 11.232638441593778, "learning_rate": 4.9756420956969785e-05, "loss": 2.2464, "mean_token_accuracy": 0.44827587008476255, "step": 93860 }, { "epoch": 0.09454182492463546, "grad_norm": 10.781007337363807, "learning_rate": 4.975636596629904e-05, "loss": 2.2326, "mean_token_accuracy": 0.4072660028934479, "step": 93865 }, { "epoch": 0.09454686097773964, "grad_norm": 10.771587444999625, "learning_rate": 4.975631096945541e-05, "loss": 2.2469, "mean_token_accuracy": 0.38275861740112305, "step": 93870 }, { "epoch": 0.09455189703084381, "grad_norm": 9.97903466176054, "learning_rate": 4.975625596643891e-05, "loss": 2.4957, "mean_token_accuracy": 0.4034482777118683, "step": 93875 }, { "epoch": 0.09455693308394798, "grad_norm": 9.667966985155022, "learning_rate": 4.975620095724955e-05, "loss": 2.5884, "mean_token_accuracy": 0.39310345351696013, "step": 93880 }, { "epoch": 0.09456196913705216, "grad_norm": 10.422221882259715, "learning_rate": 4.975614594188735e-05, "loss": 2.6904, "mean_token_accuracy": 0.42068966031074523, "step": 93885 }, { "epoch": 0.09456700519015633, "grad_norm": 10.301213538322283, "learning_rate": 4.9756090920352325e-05, "loss": 2.2933, "mean_token_accuracy": 0.40344826579093934, "step": 93890 }, { "epoch": 0.0945720412432605, "grad_norm": 15.333407073218519, "learning_rate": 4.975603589264449e-05, "loss": 2.4217, "mean_token_accuracy": 0.38771929144859313, "step": 93895 }, { "epoch": 0.09457707729636468, "grad_norm": 9.863553787669066, "learning_rate": 4.975598085876387e-05, "loss": 2.0677, "mean_token_accuracy": 0.4862069010734558, "step": 93900 }, { "epoch": 0.09458211334946885, "grad_norm": 9.029298240516116, "learning_rate": 4.975592581871047e-05, "loss": 2.5861, "mean_token_accuracy": 0.4206896543502808, "step": 93905 }, { "epoch": 0.09458714940257303, "grad_norm": 11.363515938139154, "learning_rate": 4.975587077248429e-05, "loss": 2.8235, "mean_token_accuracy": 0.38620689511299133, "step": 93910 }, { "epoch": 0.0945921854556772, "grad_norm": 9.89855368105006, "learning_rate": 4.975581572008537e-05, "loss": 2.3144, "mean_token_accuracy": 0.4551724135875702, "step": 93915 }, { "epoch": 0.09459722150878136, "grad_norm": 9.229849367909186, "learning_rate": 4.975576066151372e-05, "loss": 2.4823, "mean_token_accuracy": 0.4137930989265442, "step": 93920 }, { "epoch": 0.09460225756188553, "grad_norm": 11.301061163007358, "learning_rate": 4.975570559676935e-05, "loss": 2.7197, "mean_token_accuracy": 0.4, "step": 93925 }, { "epoch": 0.09460729361498971, "grad_norm": 9.573180422833483, "learning_rate": 4.975565052585227e-05, "loss": 2.3194, "mean_token_accuracy": 0.4620689511299133, "step": 93930 }, { "epoch": 0.09461232966809388, "grad_norm": 10.904534787616782, "learning_rate": 4.9755595448762516e-05, "loss": 2.3012, "mean_token_accuracy": 0.4379310429096222, "step": 93935 }, { "epoch": 0.09461736572119805, "grad_norm": 9.692243851405578, "learning_rate": 4.9755540365500074e-05, "loss": 2.3029, "mean_token_accuracy": 0.43103448748588563, "step": 93940 }, { "epoch": 0.09462240177430223, "grad_norm": 8.989314435235977, "learning_rate": 4.975548527606499e-05, "loss": 2.8007, "mean_token_accuracy": 0.38275861740112305, "step": 93945 }, { "epoch": 0.0946274378274064, "grad_norm": 10.058742094646837, "learning_rate": 4.975543018045725e-05, "loss": 2.8439, "mean_token_accuracy": 0.38620689511299133, "step": 93950 }, { "epoch": 0.09463247388051058, "grad_norm": 11.084764594746185, "learning_rate": 4.975537507867689e-05, "loss": 2.6334, "mean_token_accuracy": 0.3896551728248596, "step": 93955 }, { "epoch": 0.09463750993361475, "grad_norm": 13.561411008815563, "learning_rate": 4.975531997072392e-05, "loss": 2.3086, "mean_token_accuracy": 0.43103448748588563, "step": 93960 }, { "epoch": 0.09464254598671892, "grad_norm": 10.247279109235317, "learning_rate": 4.975526485659836e-05, "loss": 2.4773, "mean_token_accuracy": 0.4172413766384125, "step": 93965 }, { "epoch": 0.0946475820398231, "grad_norm": 10.552376383481556, "learning_rate": 4.975520973630021e-05, "loss": 2.3349, "mean_token_accuracy": 0.4620689630508423, "step": 93970 }, { "epoch": 0.09465261809292727, "grad_norm": 10.255334484756231, "learning_rate": 4.97551546098295e-05, "loss": 2.4085, "mean_token_accuracy": 0.41120689511299136, "step": 93975 }, { "epoch": 0.09465765414603144, "grad_norm": 9.439251456045085, "learning_rate": 4.9755099477186234e-05, "loss": 2.5489, "mean_token_accuracy": 0.38275861740112305, "step": 93980 }, { "epoch": 0.09466269019913562, "grad_norm": 14.047924728690122, "learning_rate": 4.975504433837043e-05, "loss": 2.9445, "mean_token_accuracy": 0.42758620381355283, "step": 93985 }, { "epoch": 0.09466772625223978, "grad_norm": 12.020407930655042, "learning_rate": 4.975498919338212e-05, "loss": 2.852, "mean_token_accuracy": 0.3862068891525269, "step": 93990 }, { "epoch": 0.09467276230534395, "grad_norm": 8.361322479137154, "learning_rate": 4.975493404222129e-05, "loss": 2.4528, "mean_token_accuracy": 0.403448274731636, "step": 93995 }, { "epoch": 0.09467779835844813, "grad_norm": 10.593476554822363, "learning_rate": 4.975487888488798e-05, "loss": 2.6315, "mean_token_accuracy": 0.3655172407627106, "step": 94000 }, { "epoch": 0.0946828344115523, "grad_norm": 12.378403596429438, "learning_rate": 4.975482372138219e-05, "loss": 2.3028, "mean_token_accuracy": 0.4329098641872406, "step": 94005 }, { "epoch": 0.09468787046465647, "grad_norm": 10.548593689097432, "learning_rate": 4.975476855170395e-05, "loss": 2.5973, "mean_token_accuracy": 0.36896551847457887, "step": 94010 }, { "epoch": 0.09469290651776065, "grad_norm": 14.381290414232296, "learning_rate": 4.975471337585327e-05, "loss": 2.6314, "mean_token_accuracy": 0.495220810174942, "step": 94015 }, { "epoch": 0.09469794257086482, "grad_norm": 10.499366230234484, "learning_rate": 4.975465819383015e-05, "loss": 2.8451, "mean_token_accuracy": 0.35517241060733795, "step": 94020 }, { "epoch": 0.094702978623969, "grad_norm": 8.710095164692776, "learning_rate": 4.975460300563463e-05, "loss": 1.9622, "mean_token_accuracy": 0.5048029601573945, "step": 94025 }, { "epoch": 0.09470801467707317, "grad_norm": 8.633174442755298, "learning_rate": 4.9754547811266705e-05, "loss": 2.2888, "mean_token_accuracy": 0.4000000059604645, "step": 94030 }, { "epoch": 0.09471305073017734, "grad_norm": 10.056991291146847, "learning_rate": 4.97544926107264e-05, "loss": 2.3549, "mean_token_accuracy": 0.4517241358757019, "step": 94035 }, { "epoch": 0.09471808678328152, "grad_norm": 11.101551432308566, "learning_rate": 4.9754437404013723e-05, "loss": 2.5931, "mean_token_accuracy": 0.4034482777118683, "step": 94040 }, { "epoch": 0.09472312283638569, "grad_norm": 11.9635585038568, "learning_rate": 4.9754382191128705e-05, "loss": 2.4539, "mean_token_accuracy": 0.4068965494632721, "step": 94045 }, { "epoch": 0.09472815888948986, "grad_norm": 10.428366813764141, "learning_rate": 4.9754326972071345e-05, "loss": 2.4026, "mean_token_accuracy": 0.422202056646347, "step": 94050 }, { "epoch": 0.09473319494259404, "grad_norm": 10.885062873179944, "learning_rate": 4.975427174684166e-05, "loss": 2.0915, "mean_token_accuracy": 0.4965517342090607, "step": 94055 }, { "epoch": 0.0947382309956982, "grad_norm": 11.401241691136448, "learning_rate": 4.9754216515439676e-05, "loss": 2.5789, "mean_token_accuracy": 0.40689654350280763, "step": 94060 }, { "epoch": 0.09474326704880237, "grad_norm": 11.04480346381988, "learning_rate": 4.97541612778654e-05, "loss": 2.2026, "mean_token_accuracy": 0.43938294649124143, "step": 94065 }, { "epoch": 0.09474830310190654, "grad_norm": 15.084593694797695, "learning_rate": 4.9754106034118855e-05, "loss": 2.5102, "mean_token_accuracy": 0.42758620381355283, "step": 94070 }, { "epoch": 0.09475333915501072, "grad_norm": 9.107846441030375, "learning_rate": 4.9754050784200045e-05, "loss": 2.6216, "mean_token_accuracy": 0.3551724135875702, "step": 94075 }, { "epoch": 0.09475837520811489, "grad_norm": 13.17832664332979, "learning_rate": 4.9753995528108994e-05, "loss": 2.5337, "mean_token_accuracy": 0.3758620619773865, "step": 94080 }, { "epoch": 0.09476341126121907, "grad_norm": 13.080730355864766, "learning_rate": 4.97539402658457e-05, "loss": 2.5378, "mean_token_accuracy": 0.3862069010734558, "step": 94085 }, { "epoch": 0.09476844731432324, "grad_norm": 10.075251771425732, "learning_rate": 4.975388499741022e-05, "loss": 2.2274, "mean_token_accuracy": 0.46896551847457885, "step": 94090 }, { "epoch": 0.09477348336742741, "grad_norm": 9.989801220531925, "learning_rate": 4.975382972280252e-05, "loss": 2.5694, "mean_token_accuracy": 0.4172413766384125, "step": 94095 }, { "epoch": 0.09477851942053159, "grad_norm": 10.717814255447083, "learning_rate": 4.9753774442022646e-05, "loss": 2.8049, "mean_token_accuracy": 0.4034482777118683, "step": 94100 }, { "epoch": 0.09478355547363576, "grad_norm": 8.8756890634186, "learning_rate": 4.97537191550706e-05, "loss": 2.3629, "mean_token_accuracy": 0.4517241418361664, "step": 94105 }, { "epoch": 0.09478859152673993, "grad_norm": 9.482943544631453, "learning_rate": 4.975366386194641e-05, "loss": 2.1467, "mean_token_accuracy": 0.46400484442710876, "step": 94110 }, { "epoch": 0.09479362757984411, "grad_norm": 12.526991037223246, "learning_rate": 4.975360856265008e-05, "loss": 2.1067, "mean_token_accuracy": 0.4918330252170563, "step": 94115 }, { "epoch": 0.09479866363294828, "grad_norm": 9.876211166521708, "learning_rate": 4.975355325718162e-05, "loss": 2.209, "mean_token_accuracy": 0.47465215921401976, "step": 94120 }, { "epoch": 0.09480369968605246, "grad_norm": 11.19505914460475, "learning_rate": 4.975349794554106e-05, "loss": 2.6571, "mean_token_accuracy": 0.3586206942796707, "step": 94125 }, { "epoch": 0.09480873573915662, "grad_norm": 10.898563708567476, "learning_rate": 4.975344262772841e-05, "loss": 2.919, "mean_token_accuracy": 0.39655172228813174, "step": 94130 }, { "epoch": 0.09481377179226079, "grad_norm": 11.244838518836662, "learning_rate": 4.9753387303743685e-05, "loss": 2.3881, "mean_token_accuracy": 0.3965517282485962, "step": 94135 }, { "epoch": 0.09481880784536496, "grad_norm": 11.716285137324823, "learning_rate": 4.97533319735869e-05, "loss": 2.3839, "mean_token_accuracy": 0.39310344457626345, "step": 94140 }, { "epoch": 0.09482384389846914, "grad_norm": 14.632955222398527, "learning_rate": 4.975327663725807e-05, "loss": 2.4098, "mean_token_accuracy": 0.4379310369491577, "step": 94145 }, { "epoch": 0.09482887995157331, "grad_norm": 10.83865468385621, "learning_rate": 4.9753221294757214e-05, "loss": 2.0551, "mean_token_accuracy": 0.4882032573223114, "step": 94150 }, { "epoch": 0.09483391600467748, "grad_norm": 10.374182648422599, "learning_rate": 4.975316594608434e-05, "loss": 2.6452, "mean_token_accuracy": 0.3931034505367279, "step": 94155 }, { "epoch": 0.09483895205778166, "grad_norm": 10.775848461761601, "learning_rate": 4.975311059123947e-05, "loss": 1.9772, "mean_token_accuracy": 0.5, "step": 94160 }, { "epoch": 0.09484398811088583, "grad_norm": 9.96367579724092, "learning_rate": 4.975305523022262e-05, "loss": 2.5035, "mean_token_accuracy": 0.39655172228813174, "step": 94165 }, { "epoch": 0.09484902416399, "grad_norm": 14.70227219675699, "learning_rate": 4.975299986303379e-05, "loss": 2.5891, "mean_token_accuracy": 0.38275861740112305, "step": 94170 }, { "epoch": 0.09485406021709418, "grad_norm": 12.71775505855947, "learning_rate": 4.975294448967302e-05, "loss": 2.7976, "mean_token_accuracy": 0.37931033968925476, "step": 94175 }, { "epoch": 0.09485909627019835, "grad_norm": 11.794687455909681, "learning_rate": 4.97528891101403e-05, "loss": 2.916, "mean_token_accuracy": 0.3620689630508423, "step": 94180 }, { "epoch": 0.09486413232330253, "grad_norm": 12.633535851677799, "learning_rate": 4.9752833724435675e-05, "loss": 2.5863, "mean_token_accuracy": 0.43103447556495667, "step": 94185 }, { "epoch": 0.0948691683764067, "grad_norm": 12.930173928268955, "learning_rate": 4.975277833255913e-05, "loss": 2.1001, "mean_token_accuracy": 0.47241378426551817, "step": 94190 }, { "epoch": 0.09487420442951087, "grad_norm": 11.595969660077177, "learning_rate": 4.97527229345107e-05, "loss": 2.4139, "mean_token_accuracy": 0.4172413766384125, "step": 94195 }, { "epoch": 0.09487924048261503, "grad_norm": 12.720048473859977, "learning_rate": 4.975266753029039e-05, "loss": 2.2742, "mean_token_accuracy": 0.44482758045196535, "step": 94200 }, { "epoch": 0.09488427653571921, "grad_norm": 10.32637864530848, "learning_rate": 4.975261211989822e-05, "loss": 2.1682, "mean_token_accuracy": 0.4620689630508423, "step": 94205 }, { "epoch": 0.09488931258882338, "grad_norm": 11.023335206984484, "learning_rate": 4.975255670333421e-05, "loss": 2.7367, "mean_token_accuracy": 0.37586206793785093, "step": 94210 }, { "epoch": 0.09489434864192756, "grad_norm": 11.919258846605922, "learning_rate": 4.975250128059837e-05, "loss": 2.6812, "mean_token_accuracy": 0.41034482717514037, "step": 94215 }, { "epoch": 0.09489938469503173, "grad_norm": 11.849966399148158, "learning_rate": 4.975244585169072e-05, "loss": 2.3836, "mean_token_accuracy": 0.4379310369491577, "step": 94220 }, { "epoch": 0.0949044207481359, "grad_norm": 12.702649132053262, "learning_rate": 4.975239041661126e-05, "loss": 2.8226, "mean_token_accuracy": 0.3793103456497192, "step": 94225 }, { "epoch": 0.09490945680124008, "grad_norm": 12.077951498912952, "learning_rate": 4.975233497536002e-05, "loss": 2.6892, "mean_token_accuracy": 0.4103448331356049, "step": 94230 }, { "epoch": 0.09491449285434425, "grad_norm": 11.005103946081398, "learning_rate": 4.9752279527937005e-05, "loss": 2.5906, "mean_token_accuracy": 0.36206896901130675, "step": 94235 }, { "epoch": 0.09491952890744842, "grad_norm": 10.209504940691707, "learning_rate": 4.9752224074342246e-05, "loss": 2.2826, "mean_token_accuracy": 0.42758620977401735, "step": 94240 }, { "epoch": 0.0949245649605526, "grad_norm": 12.376001238853574, "learning_rate": 4.975216861457575e-05, "loss": 2.5954, "mean_token_accuracy": 0.4344827562570572, "step": 94245 }, { "epoch": 0.09492960101365677, "grad_norm": 11.935646126591037, "learning_rate": 4.975211314863753e-05, "loss": 2.6454, "mean_token_accuracy": 0.3655172407627106, "step": 94250 }, { "epoch": 0.09493463706676095, "grad_norm": 11.61411597924925, "learning_rate": 4.9752057676527604e-05, "loss": 2.5126, "mean_token_accuracy": 0.4387931048870087, "step": 94255 }, { "epoch": 0.09493967311986512, "grad_norm": 14.295694513225758, "learning_rate": 4.975200219824599e-05, "loss": 2.8176, "mean_token_accuracy": 0.3931034505367279, "step": 94260 }, { "epoch": 0.0949447091729693, "grad_norm": 10.035437451532458, "learning_rate": 4.97519467137927e-05, "loss": 2.3864, "mean_token_accuracy": 0.38620689511299133, "step": 94265 }, { "epoch": 0.09494974522607345, "grad_norm": 11.45228448017396, "learning_rate": 4.9751891223167746e-05, "loss": 2.5046, "mean_token_accuracy": 0.42577131986618044, "step": 94270 }, { "epoch": 0.09495478127917763, "grad_norm": 12.881094630419891, "learning_rate": 4.975183572637115e-05, "loss": 2.6303, "mean_token_accuracy": 0.4517241358757019, "step": 94275 }, { "epoch": 0.0949598173322818, "grad_norm": 15.687804508943161, "learning_rate": 4.975178022340292e-05, "loss": 2.4506, "mean_token_accuracy": 0.4172413766384125, "step": 94280 }, { "epoch": 0.09496485338538597, "grad_norm": 8.539945164305356, "learning_rate": 4.975172471426307e-05, "loss": 2.5762, "mean_token_accuracy": 0.41034482717514037, "step": 94285 }, { "epoch": 0.09496988943849015, "grad_norm": 17.11511330502168, "learning_rate": 4.9751669198951636e-05, "loss": 2.0523, "mean_token_accuracy": 0.47241380214691164, "step": 94290 }, { "epoch": 0.09497492549159432, "grad_norm": 12.335162234094282, "learning_rate": 4.975161367746861e-05, "loss": 2.5183, "mean_token_accuracy": 0.4034482717514038, "step": 94295 }, { "epoch": 0.0949799615446985, "grad_norm": 11.693466600893839, "learning_rate": 4.975155814981402e-05, "loss": 2.7601, "mean_token_accuracy": 0.38802178502082824, "step": 94300 }, { "epoch": 0.09498499759780267, "grad_norm": 10.13867408139205, "learning_rate": 4.975150261598787e-05, "loss": 2.4015, "mean_token_accuracy": 0.43992740511894224, "step": 94305 }, { "epoch": 0.09499003365090684, "grad_norm": 10.663304629606444, "learning_rate": 4.9751447075990187e-05, "loss": 2.5268, "mean_token_accuracy": 0.40689654350280763, "step": 94310 }, { "epoch": 0.09499506970401102, "grad_norm": 10.048043207604094, "learning_rate": 4.975139152982099e-05, "loss": 2.6298, "mean_token_accuracy": 0.41379311084747317, "step": 94315 }, { "epoch": 0.09500010575711519, "grad_norm": 8.955446389755506, "learning_rate": 4.975133597748027e-05, "loss": 2.235, "mean_token_accuracy": 0.4413793087005615, "step": 94320 }, { "epoch": 0.09500514181021937, "grad_norm": 11.76075599888514, "learning_rate": 4.975128041896807e-05, "loss": 2.4855, "mean_token_accuracy": 0.42413793206214906, "step": 94325 }, { "epoch": 0.09501017786332354, "grad_norm": 12.276962253209081, "learning_rate": 4.975122485428439e-05, "loss": 2.3946, "mean_token_accuracy": 0.47931033968925474, "step": 94330 }, { "epoch": 0.09501521391642771, "grad_norm": 9.808878404526567, "learning_rate": 4.975116928342925e-05, "loss": 2.4667, "mean_token_accuracy": 0.43986691236495973, "step": 94335 }, { "epoch": 0.09502024996953187, "grad_norm": 10.948924324172822, "learning_rate": 4.975111370640268e-05, "loss": 2.4636, "mean_token_accuracy": 0.3931034505367279, "step": 94340 }, { "epoch": 0.09502528602263605, "grad_norm": 10.972611290763291, "learning_rate": 4.975105812320466e-05, "loss": 2.2427, "mean_token_accuracy": 0.37241379022598264, "step": 94345 }, { "epoch": 0.09503032207574022, "grad_norm": 13.597641603841403, "learning_rate": 4.9751002533835235e-05, "loss": 2.9661, "mean_token_accuracy": 0.3551724165678024, "step": 94350 }, { "epoch": 0.0950353581288444, "grad_norm": 13.740024119148027, "learning_rate": 4.9750946938294416e-05, "loss": 2.6825, "mean_token_accuracy": 0.4034482777118683, "step": 94355 }, { "epoch": 0.09504039418194857, "grad_norm": 9.617909462409292, "learning_rate": 4.97508913365822e-05, "loss": 2.298, "mean_token_accuracy": 0.4034482717514038, "step": 94360 }, { "epoch": 0.09504543023505274, "grad_norm": 11.78687467126387, "learning_rate": 4.975083572869863e-05, "loss": 2.6124, "mean_token_accuracy": 0.37586207389831544, "step": 94365 }, { "epoch": 0.09505046628815692, "grad_norm": 11.669013559961504, "learning_rate": 4.97507801146437e-05, "loss": 2.0374, "mean_token_accuracy": 0.47779794335365294, "step": 94370 }, { "epoch": 0.09505550234126109, "grad_norm": 9.853320361316026, "learning_rate": 4.975072449441743e-05, "loss": 2.2049, "mean_token_accuracy": 0.4691470146179199, "step": 94375 }, { "epoch": 0.09506053839436526, "grad_norm": 9.578687421638223, "learning_rate": 4.9750668868019846e-05, "loss": 2.4483, "mean_token_accuracy": 0.38965518176555636, "step": 94380 }, { "epoch": 0.09506557444746944, "grad_norm": 10.25236658852576, "learning_rate": 4.9750613235450954e-05, "loss": 2.149, "mean_token_accuracy": 0.4294010937213898, "step": 94385 }, { "epoch": 0.09507061050057361, "grad_norm": 13.93944134060669, "learning_rate": 4.975055759671077e-05, "loss": 3.4523, "mean_token_accuracy": 0.34137930572032926, "step": 94390 }, { "epoch": 0.09507564655367778, "grad_norm": 16.91302420894603, "learning_rate": 4.9750501951799315e-05, "loss": 2.928, "mean_token_accuracy": 0.379310342669487, "step": 94395 }, { "epoch": 0.09508068260678196, "grad_norm": 10.64670914900411, "learning_rate": 4.9750446300716595e-05, "loss": 2.6618, "mean_token_accuracy": 0.3620689630508423, "step": 94400 }, { "epoch": 0.09508571865988613, "grad_norm": 11.155151718042662, "learning_rate": 4.9750390643462636e-05, "loss": 2.6848, "mean_token_accuracy": 0.4620689690113068, "step": 94405 }, { "epoch": 0.09509075471299029, "grad_norm": 10.811400923585001, "learning_rate": 4.9750334980037445e-05, "loss": 2.5087, "mean_token_accuracy": 0.38275861740112305, "step": 94410 }, { "epoch": 0.09509579076609447, "grad_norm": 10.38125911337505, "learning_rate": 4.975027931044104e-05, "loss": 2.3799, "mean_token_accuracy": 0.4310344815254211, "step": 94415 }, { "epoch": 0.09510082681919864, "grad_norm": 10.159771520319072, "learning_rate": 4.975022363467344e-05, "loss": 2.6813, "mean_token_accuracy": 0.3517241358757019, "step": 94420 }, { "epoch": 0.09510586287230281, "grad_norm": 10.530161255303089, "learning_rate": 4.9750167952734656e-05, "loss": 2.0551, "mean_token_accuracy": 0.44827585816383364, "step": 94425 }, { "epoch": 0.09511089892540699, "grad_norm": 10.381008716554916, "learning_rate": 4.9750112264624696e-05, "loss": 2.2635, "mean_token_accuracy": 0.47931033968925474, "step": 94430 }, { "epoch": 0.09511593497851116, "grad_norm": 11.189350621478148, "learning_rate": 4.97500565703436e-05, "loss": 2.5448, "mean_token_accuracy": 0.42965604066848756, "step": 94435 }, { "epoch": 0.09512097103161533, "grad_norm": 11.349959911623296, "learning_rate": 4.975000086989136e-05, "loss": 2.3029, "mean_token_accuracy": 0.4620689690113068, "step": 94440 }, { "epoch": 0.09512600708471951, "grad_norm": 11.226907896870637, "learning_rate": 4.9749945163268e-05, "loss": 2.4935, "mean_token_accuracy": 0.42413793206214906, "step": 94445 }, { "epoch": 0.09513104313782368, "grad_norm": 13.009262458724498, "learning_rate": 4.9749889450473534e-05, "loss": 2.5195, "mean_token_accuracy": 0.4206896543502808, "step": 94450 }, { "epoch": 0.09513607919092786, "grad_norm": 10.08457291797649, "learning_rate": 4.974983373150798e-05, "loss": 2.2717, "mean_token_accuracy": 0.43448275327682495, "step": 94455 }, { "epoch": 0.09514111524403203, "grad_norm": 13.651733527016296, "learning_rate": 4.974977800637135e-05, "loss": 2.5546, "mean_token_accuracy": 0.3793103456497192, "step": 94460 }, { "epoch": 0.0951461512971362, "grad_norm": 11.14307438847547, "learning_rate": 4.974972227506366e-05, "loss": 2.5358, "mean_token_accuracy": 0.3862069010734558, "step": 94465 }, { "epoch": 0.09515118735024038, "grad_norm": 9.575953675411071, "learning_rate": 4.9749666537584926e-05, "loss": 2.2173, "mean_token_accuracy": 0.49830610752105714, "step": 94470 }, { "epoch": 0.09515622340334455, "grad_norm": 10.814108298782577, "learning_rate": 4.9749610793935165e-05, "loss": 2.3948, "mean_token_accuracy": 0.4413793087005615, "step": 94475 }, { "epoch": 0.09516125945644871, "grad_norm": 9.878307143859834, "learning_rate": 4.9749555044114397e-05, "loss": 2.8667, "mean_token_accuracy": 0.4310344696044922, "step": 94480 }, { "epoch": 0.09516629550955288, "grad_norm": 11.70583758934135, "learning_rate": 4.974949928812263e-05, "loss": 2.2695, "mean_token_accuracy": 0.43793103098869324, "step": 94485 }, { "epoch": 0.09517133156265706, "grad_norm": 14.088369576505439, "learning_rate": 4.974944352595988e-05, "loss": 2.2757, "mean_token_accuracy": 0.41724138259887694, "step": 94490 }, { "epoch": 0.09517636761576123, "grad_norm": 12.062304066988213, "learning_rate": 4.974938775762615e-05, "loss": 2.2055, "mean_token_accuracy": 0.4551724135875702, "step": 94495 }, { "epoch": 0.0951814036688654, "grad_norm": 9.722449162366482, "learning_rate": 4.974933198312149e-05, "loss": 2.9638, "mean_token_accuracy": 0.38275861740112305, "step": 94500 }, { "epoch": 0.09518643972196958, "grad_norm": 14.021284043765027, "learning_rate": 4.9749276202445885e-05, "loss": 2.3248, "mean_token_accuracy": 0.46551724076271056, "step": 94505 }, { "epoch": 0.09519147577507375, "grad_norm": 10.409375346892778, "learning_rate": 4.9749220415599354e-05, "loss": 2.3532, "mean_token_accuracy": 0.4482758641242981, "step": 94510 }, { "epoch": 0.09519651182817793, "grad_norm": 12.078261072916789, "learning_rate": 4.974916462258193e-05, "loss": 2.3442, "mean_token_accuracy": 0.4344827592372894, "step": 94515 }, { "epoch": 0.0952015478812821, "grad_norm": 10.218559721014014, "learning_rate": 4.974910882339362e-05, "loss": 2.5441, "mean_token_accuracy": 0.3896551728248596, "step": 94520 }, { "epoch": 0.09520658393438627, "grad_norm": 10.33042513440372, "learning_rate": 4.974905301803443e-05, "loss": 2.4093, "mean_token_accuracy": 0.42413793206214906, "step": 94525 }, { "epoch": 0.09521161998749045, "grad_norm": 12.233469919119408, "learning_rate": 4.974899720650438e-05, "loss": 2.3777, "mean_token_accuracy": 0.4344827592372894, "step": 94530 }, { "epoch": 0.09521665604059462, "grad_norm": 10.393902809728301, "learning_rate": 4.974894138880349e-05, "loss": 2.747, "mean_token_accuracy": 0.4034482717514038, "step": 94535 }, { "epoch": 0.0952216920936988, "grad_norm": 11.776635055997412, "learning_rate": 4.9748885564931775e-05, "loss": 2.3422, "mean_token_accuracy": 0.47586206197738645, "step": 94540 }, { "epoch": 0.09522672814680297, "grad_norm": 12.935806101970833, "learning_rate": 4.974882973488925e-05, "loss": 2.6704, "mean_token_accuracy": 0.400967937707901, "step": 94545 }, { "epoch": 0.09523176419990713, "grad_norm": 13.063717830711987, "learning_rate": 4.974877389867592e-05, "loss": 2.5925, "mean_token_accuracy": 0.3965517282485962, "step": 94550 }, { "epoch": 0.0952368002530113, "grad_norm": 11.44425006093507, "learning_rate": 4.9748718056291814e-05, "loss": 2.1588, "mean_token_accuracy": 0.4744101643562317, "step": 94555 }, { "epoch": 0.09524183630611548, "grad_norm": 10.950576231870489, "learning_rate": 4.974866220773695e-05, "loss": 2.3105, "mean_token_accuracy": 0.441379314661026, "step": 94560 }, { "epoch": 0.09524687235921965, "grad_norm": 10.952308859340937, "learning_rate": 4.974860635301133e-05, "loss": 2.5981, "mean_token_accuracy": 0.4310344815254211, "step": 94565 }, { "epoch": 0.09525190841232382, "grad_norm": 13.542165867321689, "learning_rate": 4.974855049211497e-05, "loss": 2.4144, "mean_token_accuracy": 0.3931034475564957, "step": 94570 }, { "epoch": 0.095256944465428, "grad_norm": 11.361301090665656, "learning_rate": 4.9748494625047896e-05, "loss": 2.2613, "mean_token_accuracy": 0.4655172348022461, "step": 94575 }, { "epoch": 0.09526198051853217, "grad_norm": 11.30719623355096, "learning_rate": 4.974843875181013e-05, "loss": 2.6614, "mean_token_accuracy": 0.3206896513700485, "step": 94580 }, { "epoch": 0.09526701657163635, "grad_norm": 10.762342854483771, "learning_rate": 4.974838287240167e-05, "loss": 2.301, "mean_token_accuracy": 0.44827585816383364, "step": 94585 }, { "epoch": 0.09527205262474052, "grad_norm": 12.106367180240598, "learning_rate": 4.974832698682253e-05, "loss": 2.4093, "mean_token_accuracy": 0.43448275327682495, "step": 94590 }, { "epoch": 0.09527708867784469, "grad_norm": 14.755460106901154, "learning_rate": 4.974827109507274e-05, "loss": 2.663, "mean_token_accuracy": 0.3620689630508423, "step": 94595 }, { "epoch": 0.09528212473094887, "grad_norm": 11.225503012028776, "learning_rate": 4.9748215197152306e-05, "loss": 2.3223, "mean_token_accuracy": 0.42413793206214906, "step": 94600 }, { "epoch": 0.09528716078405304, "grad_norm": 9.718317812402411, "learning_rate": 4.974815929306125e-05, "loss": 2.5357, "mean_token_accuracy": 0.42068964838981626, "step": 94605 }, { "epoch": 0.09529219683715721, "grad_norm": 9.940904359790782, "learning_rate": 4.9748103382799594e-05, "loss": 2.5037, "mean_token_accuracy": 0.4137930989265442, "step": 94610 }, { "epoch": 0.09529723289026139, "grad_norm": 12.58440386971749, "learning_rate": 4.974804746636733e-05, "loss": 2.1013, "mean_token_accuracy": 0.47755595445632937, "step": 94615 }, { "epoch": 0.09530226894336555, "grad_norm": 8.968576334231596, "learning_rate": 4.974799154376449e-05, "loss": 2.5682, "mean_token_accuracy": 0.3965517163276672, "step": 94620 }, { "epoch": 0.09530730499646972, "grad_norm": 10.656318762549384, "learning_rate": 4.9747935614991084e-05, "loss": 2.0898, "mean_token_accuracy": 0.4689655125141144, "step": 94625 }, { "epoch": 0.0953123410495739, "grad_norm": 10.163285500441466, "learning_rate": 4.9747879680047125e-05, "loss": 2.2884, "mean_token_accuracy": 0.4482758641242981, "step": 94630 }, { "epoch": 0.09531737710267807, "grad_norm": 10.008543569360336, "learning_rate": 4.9747823738932645e-05, "loss": 2.2527, "mean_token_accuracy": 0.44827585220336913, "step": 94635 }, { "epoch": 0.09532241315578224, "grad_norm": 12.16610033195966, "learning_rate": 4.974776779164764e-05, "loss": 2.6516, "mean_token_accuracy": 0.42068964838981626, "step": 94640 }, { "epoch": 0.09532744920888642, "grad_norm": 9.861103575187354, "learning_rate": 4.974771183819215e-05, "loss": 2.0716, "mean_token_accuracy": 0.46551724076271056, "step": 94645 }, { "epoch": 0.09533248526199059, "grad_norm": 10.858438253292226, "learning_rate": 4.974765587856615e-05, "loss": 2.5596, "mean_token_accuracy": 0.43103448748588563, "step": 94650 }, { "epoch": 0.09533752131509476, "grad_norm": 17.1107835076412, "learning_rate": 4.974759991276969e-05, "loss": 2.5084, "mean_token_accuracy": 0.4126436859369278, "step": 94655 }, { "epoch": 0.09534255736819894, "grad_norm": 12.10068936124448, "learning_rate": 4.974754394080278e-05, "loss": 2.5597, "mean_token_accuracy": 0.4068965494632721, "step": 94660 }, { "epoch": 0.09534759342130311, "grad_norm": 11.058706165880562, "learning_rate": 4.974748796266543e-05, "loss": 2.6365, "mean_token_accuracy": 0.35172412991523744, "step": 94665 }, { "epoch": 0.09535262947440729, "grad_norm": 14.457252502894958, "learning_rate": 4.974743197835765e-05, "loss": 2.802, "mean_token_accuracy": 0.3655172407627106, "step": 94670 }, { "epoch": 0.09535766552751146, "grad_norm": 11.224242092943397, "learning_rate": 4.974737598787947e-05, "loss": 2.2444, "mean_token_accuracy": 0.46551724076271056, "step": 94675 }, { "epoch": 0.09536270158061563, "grad_norm": 13.668233992853766, "learning_rate": 4.974731999123089e-05, "loss": 2.6124, "mean_token_accuracy": 0.37931033968925476, "step": 94680 }, { "epoch": 0.0953677376337198, "grad_norm": 11.970030337080674, "learning_rate": 4.974726398841194e-05, "loss": 2.4412, "mean_token_accuracy": 0.3931034505367279, "step": 94685 }, { "epoch": 0.09537277368682397, "grad_norm": 13.688263451171593, "learning_rate": 4.9747207979422614e-05, "loss": 2.7388, "mean_token_accuracy": 0.358620685338974, "step": 94690 }, { "epoch": 0.09537780973992814, "grad_norm": 17.151685549202128, "learning_rate": 4.9747151964262954e-05, "loss": 2.4786, "mean_token_accuracy": 0.4068965494632721, "step": 94695 }, { "epoch": 0.09538284579303231, "grad_norm": 12.165867633536022, "learning_rate": 4.9747095942932964e-05, "loss": 2.1961, "mean_token_accuracy": 0.44827585220336913, "step": 94700 }, { "epoch": 0.09538788184613649, "grad_norm": 11.380384055982072, "learning_rate": 4.974703991543265e-05, "loss": 2.6343, "mean_token_accuracy": 0.4117362380027771, "step": 94705 }, { "epoch": 0.09539291789924066, "grad_norm": 11.494369031151589, "learning_rate": 4.974698388176205e-05, "loss": 2.6934, "mean_token_accuracy": 0.4000000059604645, "step": 94710 }, { "epoch": 0.09539795395234484, "grad_norm": 9.68863462297922, "learning_rate": 4.9746927841921154e-05, "loss": 2.1479, "mean_token_accuracy": 0.4551724135875702, "step": 94715 }, { "epoch": 0.09540299000544901, "grad_norm": 14.898849018032697, "learning_rate": 4.974687179591e-05, "loss": 2.5465, "mean_token_accuracy": 0.41893526911735535, "step": 94720 }, { "epoch": 0.09540802605855318, "grad_norm": 11.673891039193846, "learning_rate": 4.974681574372858e-05, "loss": 2.4013, "mean_token_accuracy": 0.37931033968925476, "step": 94725 }, { "epoch": 0.09541306211165736, "grad_norm": 10.89689195801521, "learning_rate": 4.974675968537693e-05, "loss": 2.5527, "mean_token_accuracy": 0.4257713258266449, "step": 94730 }, { "epoch": 0.09541809816476153, "grad_norm": 9.914859209642662, "learning_rate": 4.974670362085506e-05, "loss": 2.3718, "mean_token_accuracy": 0.41379310488700866, "step": 94735 }, { "epoch": 0.0954231342178657, "grad_norm": 11.860349924209084, "learning_rate": 4.974664755016298e-05, "loss": 2.407, "mean_token_accuracy": 0.42413792610168455, "step": 94740 }, { "epoch": 0.09542817027096988, "grad_norm": 9.393524062544435, "learning_rate": 4.974659147330071e-05, "loss": 2.198, "mean_token_accuracy": 0.47586206793785096, "step": 94745 }, { "epoch": 0.09543320632407405, "grad_norm": 13.743792810744072, "learning_rate": 4.974653539026827e-05, "loss": 2.4292, "mean_token_accuracy": 0.41034482717514037, "step": 94750 }, { "epoch": 0.09543824237717823, "grad_norm": 10.654313256120993, "learning_rate": 4.974647930106566e-05, "loss": 2.196, "mean_token_accuracy": 0.441379314661026, "step": 94755 }, { "epoch": 0.09544327843028239, "grad_norm": 9.936535530794197, "learning_rate": 4.974642320569292e-05, "loss": 2.1142, "mean_token_accuracy": 0.47586206793785096, "step": 94760 }, { "epoch": 0.09544831448338656, "grad_norm": 15.085458545417078, "learning_rate": 4.9746367104150045e-05, "loss": 2.8066, "mean_token_accuracy": 0.4034482777118683, "step": 94765 }, { "epoch": 0.09545335053649073, "grad_norm": 13.074004864517716, "learning_rate": 4.974631099643705e-05, "loss": 3.0965, "mean_token_accuracy": 0.3310344785451889, "step": 94770 }, { "epoch": 0.0954583865895949, "grad_norm": 10.188909623217869, "learning_rate": 4.9746254882553974e-05, "loss": 2.4005, "mean_token_accuracy": 0.4206896543502808, "step": 94775 }, { "epoch": 0.09546342264269908, "grad_norm": 13.386580181369357, "learning_rate": 4.9746198762500797e-05, "loss": 2.5279, "mean_token_accuracy": 0.44343618154525755, "step": 94780 }, { "epoch": 0.09546845869580325, "grad_norm": 9.288974971891502, "learning_rate": 4.9746142636277566e-05, "loss": 2.2543, "mean_token_accuracy": 0.4896551728248596, "step": 94785 }, { "epoch": 0.09547349474890743, "grad_norm": 11.635673771943251, "learning_rate": 4.974608650388428e-05, "loss": 2.9918, "mean_token_accuracy": 0.38620689511299133, "step": 94790 }, { "epoch": 0.0954785308020116, "grad_norm": 9.986568505131213, "learning_rate": 4.9746030365320956e-05, "loss": 2.3621, "mean_token_accuracy": 0.3896551728248596, "step": 94795 }, { "epoch": 0.09548356685511578, "grad_norm": 12.57825710711765, "learning_rate": 4.9745974220587624e-05, "loss": 2.3384, "mean_token_accuracy": 0.4448275864124298, "step": 94800 }, { "epoch": 0.09548860290821995, "grad_norm": 8.531957527337706, "learning_rate": 4.974591806968428e-05, "loss": 2.5854, "mean_token_accuracy": 0.43103448748588563, "step": 94805 }, { "epoch": 0.09549363896132412, "grad_norm": 9.690930384485728, "learning_rate": 4.9745861912610944e-05, "loss": 2.2907, "mean_token_accuracy": 0.4931034445762634, "step": 94810 }, { "epoch": 0.0954986750144283, "grad_norm": 8.749484874079112, "learning_rate": 4.974580574936764e-05, "loss": 2.3819, "mean_token_accuracy": 0.4, "step": 94815 }, { "epoch": 0.09550371106753247, "grad_norm": 11.990962765794626, "learning_rate": 4.974574957995437e-05, "loss": 2.4787, "mean_token_accuracy": 0.4068965494632721, "step": 94820 }, { "epoch": 0.09550874712063664, "grad_norm": 11.722516132033057, "learning_rate": 4.974569340437117e-05, "loss": 2.5211, "mean_token_accuracy": 0.3965517282485962, "step": 94825 }, { "epoch": 0.0955137831737408, "grad_norm": 11.305392278878944, "learning_rate": 4.974563722261804e-05, "loss": 2.5161, "mean_token_accuracy": 0.44827585816383364, "step": 94830 }, { "epoch": 0.09551881922684498, "grad_norm": 10.522259997967701, "learning_rate": 4.9745581034695006e-05, "loss": 2.4336, "mean_token_accuracy": 0.41724138259887694, "step": 94835 }, { "epoch": 0.09552385527994915, "grad_norm": 10.525775788157246, "learning_rate": 4.974552484060207e-05, "loss": 2.5041, "mean_token_accuracy": 0.42413793206214906, "step": 94840 }, { "epoch": 0.09552889133305333, "grad_norm": 10.263121549009181, "learning_rate": 4.9745468640339255e-05, "loss": 2.0596, "mean_token_accuracy": 0.4413793087005615, "step": 94845 }, { "epoch": 0.0955339273861575, "grad_norm": 11.013966811225886, "learning_rate": 4.974541243390658e-05, "loss": 2.3204, "mean_token_accuracy": 0.43103448748588563, "step": 94850 }, { "epoch": 0.09553896343926167, "grad_norm": 12.835218383506, "learning_rate": 4.9745356221304054e-05, "loss": 2.2652, "mean_token_accuracy": 0.46896551847457885, "step": 94855 }, { "epoch": 0.09554399949236585, "grad_norm": 13.64492457682945, "learning_rate": 4.9745300002531686e-05, "loss": 2.6832, "mean_token_accuracy": 0.42413792610168455, "step": 94860 }, { "epoch": 0.09554903554547002, "grad_norm": 12.76582704381585, "learning_rate": 4.974524377758952e-05, "loss": 2.832, "mean_token_accuracy": 0.4122807025909424, "step": 94865 }, { "epoch": 0.0955540715985742, "grad_norm": 13.85082413261805, "learning_rate": 4.974518754647754e-05, "loss": 2.3891, "mean_token_accuracy": 0.44137930274009707, "step": 94870 }, { "epoch": 0.09555910765167837, "grad_norm": 14.552383073151365, "learning_rate": 4.9745131309195775e-05, "loss": 2.6957, "mean_token_accuracy": 0.38965516686439516, "step": 94875 }, { "epoch": 0.09556414370478254, "grad_norm": 10.496693327162854, "learning_rate": 4.974507506574424e-05, "loss": 3.1934, "mean_token_accuracy": 0.3068965464830399, "step": 94880 }, { "epoch": 0.09556917975788672, "grad_norm": 11.75693894470164, "learning_rate": 4.9745018816122955e-05, "loss": 2.4581, "mean_token_accuracy": 0.4186932861804962, "step": 94885 }, { "epoch": 0.09557421581099089, "grad_norm": 11.341472619917889, "learning_rate": 4.974496256033193e-05, "loss": 2.1995, "mean_token_accuracy": 0.4551724076271057, "step": 94890 }, { "epoch": 0.09557925186409506, "grad_norm": 10.700923521699153, "learning_rate": 4.974490629837117e-05, "loss": 2.5308, "mean_token_accuracy": 0.38275861740112305, "step": 94895 }, { "epoch": 0.09558428791719922, "grad_norm": 14.67259598041589, "learning_rate": 4.974485003024072e-05, "loss": 2.2058, "mean_token_accuracy": 0.4586206912994385, "step": 94900 }, { "epoch": 0.0955893239703034, "grad_norm": 12.320419502220172, "learning_rate": 4.974479375594057e-05, "loss": 2.5006, "mean_token_accuracy": 0.4034482717514038, "step": 94905 }, { "epoch": 0.09559436002340757, "grad_norm": 11.817813604898392, "learning_rate": 4.974473747547074e-05, "loss": 2.5091, "mean_token_accuracy": 0.4379310369491577, "step": 94910 }, { "epoch": 0.09559939607651174, "grad_norm": 10.025453690586517, "learning_rate": 4.974468118883126e-05, "loss": 2.8662, "mean_token_accuracy": 0.38965516686439516, "step": 94915 }, { "epoch": 0.09560443212961592, "grad_norm": 8.418689104953579, "learning_rate": 4.9744624896022125e-05, "loss": 2.4478, "mean_token_accuracy": 0.43793103098869324, "step": 94920 }, { "epoch": 0.09560946818272009, "grad_norm": 9.730269597273631, "learning_rate": 4.974456859704336e-05, "loss": 2.5161, "mean_token_accuracy": 0.4189352631568909, "step": 94925 }, { "epoch": 0.09561450423582427, "grad_norm": 11.540124881513567, "learning_rate": 4.974451229189499e-05, "loss": 2.0733, "mean_token_accuracy": 0.4551724135875702, "step": 94930 }, { "epoch": 0.09561954028892844, "grad_norm": 10.780998516075206, "learning_rate": 4.974445598057701e-05, "loss": 2.6449, "mean_token_accuracy": 0.4, "step": 94935 }, { "epoch": 0.09562457634203261, "grad_norm": 11.52226467486293, "learning_rate": 4.974439966308946e-05, "loss": 2.721, "mean_token_accuracy": 0.4068965554237366, "step": 94940 }, { "epoch": 0.09562961239513679, "grad_norm": 12.189615036989622, "learning_rate": 4.974434333943233e-05, "loss": 2.6215, "mean_token_accuracy": 0.36206896901130675, "step": 94945 }, { "epoch": 0.09563464844824096, "grad_norm": 10.24924620940812, "learning_rate": 4.9744287009605654e-05, "loss": 2.6726, "mean_token_accuracy": 0.4103448212146759, "step": 94950 }, { "epoch": 0.09563968450134513, "grad_norm": 10.647043189335427, "learning_rate": 4.9744230673609435e-05, "loss": 2.4305, "mean_token_accuracy": 0.42068966031074523, "step": 94955 }, { "epoch": 0.09564472055444931, "grad_norm": 11.17706644424301, "learning_rate": 4.974417433144371e-05, "loss": 2.4764, "mean_token_accuracy": 0.4482758641242981, "step": 94960 }, { "epoch": 0.09564975660755348, "grad_norm": 7.519048895750799, "learning_rate": 4.974411798310847e-05, "loss": 2.1688, "mean_token_accuracy": 0.44627949595451355, "step": 94965 }, { "epoch": 0.09565479266065764, "grad_norm": 10.07972849325475, "learning_rate": 4.974406162860375e-05, "loss": 2.5268, "mean_token_accuracy": 0.41034482717514037, "step": 94970 }, { "epoch": 0.09565982871376182, "grad_norm": 13.248575042152243, "learning_rate": 4.974400526792954e-05, "loss": 2.2337, "mean_token_accuracy": 0.4446460962295532, "step": 94975 }, { "epoch": 0.09566486476686599, "grad_norm": 11.356533330831423, "learning_rate": 4.974394890108588e-05, "loss": 3.4618, "mean_token_accuracy": 0.334482753276825, "step": 94980 }, { "epoch": 0.09566990081997016, "grad_norm": 10.274391509196915, "learning_rate": 4.9743892528072785e-05, "loss": 2.2477, "mean_token_accuracy": 0.4448275864124298, "step": 94985 }, { "epoch": 0.09567493687307434, "grad_norm": 9.833458321377458, "learning_rate": 4.974383614889026e-05, "loss": 2.1255, "mean_token_accuracy": 0.4517241358757019, "step": 94990 }, { "epoch": 0.09567997292617851, "grad_norm": 8.917809890262825, "learning_rate": 4.974377976353832e-05, "loss": 2.3166, "mean_token_accuracy": 0.4551724135875702, "step": 94995 }, { "epoch": 0.09568500897928268, "grad_norm": 9.232902469592608, "learning_rate": 4.9743723372016985e-05, "loss": 2.2817, "mean_token_accuracy": 0.45069569945335386, "step": 95000 }, { "epoch": 0.09569004503238686, "grad_norm": 12.135207050337424, "learning_rate": 4.9743666974326275e-05, "loss": 2.7972, "mean_token_accuracy": 0.32758620381355286, "step": 95005 }, { "epoch": 0.09569508108549103, "grad_norm": 12.261816690951363, "learning_rate": 4.97436105704662e-05, "loss": 2.1962, "mean_token_accuracy": 0.49171202778816225, "step": 95010 }, { "epoch": 0.0957001171385952, "grad_norm": 9.611368017874604, "learning_rate": 4.974355416043678e-05, "loss": 2.6051, "mean_token_accuracy": 0.4103448152542114, "step": 95015 }, { "epoch": 0.09570515319169938, "grad_norm": 10.038113931219529, "learning_rate": 4.974349774423802e-05, "loss": 2.4367, "mean_token_accuracy": 0.4103448331356049, "step": 95020 }, { "epoch": 0.09571018924480355, "grad_norm": 13.454845802283526, "learning_rate": 4.9743441321869946e-05, "loss": 2.875, "mean_token_accuracy": 0.3517241418361664, "step": 95025 }, { "epoch": 0.09571522529790773, "grad_norm": 10.093895354658807, "learning_rate": 4.974338489333256e-05, "loss": 2.9521, "mean_token_accuracy": 0.37241379022598264, "step": 95030 }, { "epoch": 0.0957202613510119, "grad_norm": 15.148435349623952, "learning_rate": 4.9743328458625906e-05, "loss": 2.5434, "mean_token_accuracy": 0.3896551728248596, "step": 95035 }, { "epoch": 0.09572529740411606, "grad_norm": 12.889347345312244, "learning_rate": 4.974327201774997e-05, "loss": 2.6411, "mean_token_accuracy": 0.3775559484958649, "step": 95040 }, { "epoch": 0.09573033345722023, "grad_norm": 9.611750448691255, "learning_rate": 4.974321557070478e-05, "loss": 2.4627, "mean_token_accuracy": 0.41941922903060913, "step": 95045 }, { "epoch": 0.09573536951032441, "grad_norm": 12.618658527608344, "learning_rate": 4.9743159117490364e-05, "loss": 2.4415, "mean_token_accuracy": 0.417241370677948, "step": 95050 }, { "epoch": 0.09574040556342858, "grad_norm": 8.913507302493644, "learning_rate": 4.974310265810671e-05, "loss": 2.3102, "mean_token_accuracy": 0.42928009629249575, "step": 95055 }, { "epoch": 0.09574544161653276, "grad_norm": 8.807254106186633, "learning_rate": 4.974304619255385e-05, "loss": 2.1104, "mean_token_accuracy": 0.4862069010734558, "step": 95060 }, { "epoch": 0.09575047766963693, "grad_norm": 10.489873744662276, "learning_rate": 4.974298972083181e-05, "loss": 2.3116, "mean_token_accuracy": 0.42068966031074523, "step": 95065 }, { "epoch": 0.0957555137227411, "grad_norm": 10.788675303453362, "learning_rate": 4.974293324294059e-05, "loss": 2.6141, "mean_token_accuracy": 0.3931034505367279, "step": 95070 }, { "epoch": 0.09576054977584528, "grad_norm": 11.692099525996737, "learning_rate": 4.97428767588802e-05, "loss": 2.5831, "mean_token_accuracy": 0.422202056646347, "step": 95075 }, { "epoch": 0.09576558582894945, "grad_norm": 10.309317769893221, "learning_rate": 4.974282026865067e-05, "loss": 2.999, "mean_token_accuracy": 0.37241379618644715, "step": 95080 }, { "epoch": 0.09577062188205362, "grad_norm": 10.026510251368254, "learning_rate": 4.974276377225201e-05, "loss": 2.4884, "mean_token_accuracy": 0.4275862157344818, "step": 95085 }, { "epoch": 0.0957756579351578, "grad_norm": 12.316708318434136, "learning_rate": 4.974270726968424e-05, "loss": 2.8857, "mean_token_accuracy": 0.3620689630508423, "step": 95090 }, { "epoch": 0.09578069398826197, "grad_norm": 10.719079459536728, "learning_rate": 4.9742650760947366e-05, "loss": 2.5759, "mean_token_accuracy": 0.41379310488700866, "step": 95095 }, { "epoch": 0.09578573004136615, "grad_norm": 10.91063171802848, "learning_rate": 4.974259424604141e-05, "loss": 2.683, "mean_token_accuracy": 0.39655172228813174, "step": 95100 }, { "epoch": 0.09579076609447032, "grad_norm": 10.728893168116445, "learning_rate": 4.9742537724966394e-05, "loss": 2.3566, "mean_token_accuracy": 0.41724138855934145, "step": 95105 }, { "epoch": 0.09579580214757448, "grad_norm": 10.377178786357568, "learning_rate": 4.9742481197722326e-05, "loss": 2.5352, "mean_token_accuracy": 0.4034482777118683, "step": 95110 }, { "epoch": 0.09580083820067865, "grad_norm": 9.739944444428108, "learning_rate": 4.9742424664309223e-05, "loss": 2.3447, "mean_token_accuracy": 0.42758620381355283, "step": 95115 }, { "epoch": 0.09580587425378283, "grad_norm": 8.591985846228438, "learning_rate": 4.97423681247271e-05, "loss": 2.1148, "mean_token_accuracy": 0.4172413766384125, "step": 95120 }, { "epoch": 0.095810910306887, "grad_norm": 15.004131485630086, "learning_rate": 4.974231157897597e-05, "loss": 2.2685, "mean_token_accuracy": 0.47586206197738645, "step": 95125 }, { "epoch": 0.09581594635999117, "grad_norm": 15.30312026025668, "learning_rate": 4.9742255027055855e-05, "loss": 3.0556, "mean_token_accuracy": 0.3620689630508423, "step": 95130 }, { "epoch": 0.09582098241309535, "grad_norm": 10.349299386271236, "learning_rate": 4.974219846896677e-05, "loss": 2.4487, "mean_token_accuracy": 0.4344827651977539, "step": 95135 }, { "epoch": 0.09582601846619952, "grad_norm": 12.246292717670443, "learning_rate": 4.974214190470872e-05, "loss": 2.5275, "mean_token_accuracy": 0.4, "step": 95140 }, { "epoch": 0.0958310545193037, "grad_norm": 12.807837955210342, "learning_rate": 4.9742085334281734e-05, "loss": 2.4405, "mean_token_accuracy": 0.4103448331356049, "step": 95145 }, { "epoch": 0.09583609057240787, "grad_norm": 10.300060522568472, "learning_rate": 4.974202875768582e-05, "loss": 2.5302, "mean_token_accuracy": 0.41034482717514037, "step": 95150 }, { "epoch": 0.09584112662551204, "grad_norm": 8.781037580734248, "learning_rate": 4.9741972174921e-05, "loss": 2.2635, "mean_token_accuracy": 0.45517241954803467, "step": 95155 }, { "epoch": 0.09584616267861622, "grad_norm": 11.171426014680025, "learning_rate": 4.974191558598729e-05, "loss": 2.3941, "mean_token_accuracy": 0.4413793087005615, "step": 95160 }, { "epoch": 0.09585119873172039, "grad_norm": 12.49779640966048, "learning_rate": 4.97418589908847e-05, "loss": 2.5187, "mean_token_accuracy": 0.42758620977401735, "step": 95165 }, { "epoch": 0.09585623478482456, "grad_norm": 10.422837510643445, "learning_rate": 4.974180238961324e-05, "loss": 2.8408, "mean_token_accuracy": 0.38451300859451293, "step": 95170 }, { "epoch": 0.09586127083792874, "grad_norm": 11.69109951096343, "learning_rate": 4.9741745782172946e-05, "loss": 2.6397, "mean_token_accuracy": 0.3827586233615875, "step": 95175 }, { "epoch": 0.0958663068910329, "grad_norm": 10.797278782926025, "learning_rate": 4.97416891685638e-05, "loss": 2.3604, "mean_token_accuracy": 0.4137930989265442, "step": 95180 }, { "epoch": 0.09587134294413707, "grad_norm": 11.172458391187575, "learning_rate": 4.9741632548785856e-05, "loss": 2.0963, "mean_token_accuracy": 0.46896551847457885, "step": 95185 }, { "epoch": 0.09587637899724125, "grad_norm": 11.851418916900489, "learning_rate": 4.97415759228391e-05, "loss": 2.2374, "mean_token_accuracy": 0.4206896543502808, "step": 95190 }, { "epoch": 0.09588141505034542, "grad_norm": 10.372008028498017, "learning_rate": 4.974151929072358e-05, "loss": 2.4251, "mean_token_accuracy": 0.4448275864124298, "step": 95195 }, { "epoch": 0.0958864511034496, "grad_norm": 11.24761765345112, "learning_rate": 4.974146265243927e-05, "loss": 2.1414, "mean_token_accuracy": 0.4310344934463501, "step": 95200 }, { "epoch": 0.09589148715655377, "grad_norm": 10.434198333460122, "learning_rate": 4.9741406007986214e-05, "loss": 2.5401, "mean_token_accuracy": 0.44827585220336913, "step": 95205 }, { "epoch": 0.09589652320965794, "grad_norm": 9.156831173850202, "learning_rate": 4.9741349357364425e-05, "loss": 2.2876, "mean_token_accuracy": 0.4482758641242981, "step": 95210 }, { "epoch": 0.09590155926276211, "grad_norm": 14.967983911508021, "learning_rate": 4.974129270057391e-05, "loss": 2.6626, "mean_token_accuracy": 0.42758620977401735, "step": 95215 }, { "epoch": 0.09590659531586629, "grad_norm": 9.19180242349692, "learning_rate": 4.9741236037614696e-05, "loss": 2.5643, "mean_token_accuracy": 0.4620689570903778, "step": 95220 }, { "epoch": 0.09591163136897046, "grad_norm": 9.82127039237697, "learning_rate": 4.974117936848679e-05, "loss": 2.3989, "mean_token_accuracy": 0.4675136089324951, "step": 95225 }, { "epoch": 0.09591666742207464, "grad_norm": 11.429674489842016, "learning_rate": 4.9741122693190206e-05, "loss": 2.289, "mean_token_accuracy": 0.4344827651977539, "step": 95230 }, { "epoch": 0.09592170347517881, "grad_norm": 12.757624100458566, "learning_rate": 4.9741066011724966e-05, "loss": 2.4595, "mean_token_accuracy": 0.3896551728248596, "step": 95235 }, { "epoch": 0.09592673952828298, "grad_norm": 9.974750157081148, "learning_rate": 4.9741009324091084e-05, "loss": 2.9408, "mean_token_accuracy": 0.37931033968925476, "step": 95240 }, { "epoch": 0.09593177558138716, "grad_norm": 12.369568369633587, "learning_rate": 4.974095263028857e-05, "loss": 2.1176, "mean_token_accuracy": 0.48148819208145144, "step": 95245 }, { "epoch": 0.09593681163449132, "grad_norm": 10.921786368388444, "learning_rate": 4.974089593031746e-05, "loss": 2.4931, "mean_token_accuracy": 0.41034482717514037, "step": 95250 }, { "epoch": 0.09594184768759549, "grad_norm": 8.733011968045858, "learning_rate": 4.9740839224177735e-05, "loss": 2.9242, "mean_token_accuracy": 0.39655172228813174, "step": 95255 }, { "epoch": 0.09594688374069966, "grad_norm": 11.757400447120189, "learning_rate": 4.974078251186944e-05, "loss": 2.3107, "mean_token_accuracy": 0.47241379618644713, "step": 95260 }, { "epoch": 0.09595191979380384, "grad_norm": 10.710397849462495, "learning_rate": 4.9740725793392585e-05, "loss": 2.5068, "mean_token_accuracy": 0.4034482777118683, "step": 95265 }, { "epoch": 0.09595695584690801, "grad_norm": 10.007541540913595, "learning_rate": 4.974066906874718e-05, "loss": 2.5806, "mean_token_accuracy": 0.4068965494632721, "step": 95270 }, { "epoch": 0.09596199190001219, "grad_norm": 10.735108749819753, "learning_rate": 4.9740612337933236e-05, "loss": 2.2604, "mean_token_accuracy": 0.42413793206214906, "step": 95275 }, { "epoch": 0.09596702795311636, "grad_norm": 11.52115412808419, "learning_rate": 4.9740555600950773e-05, "loss": 2.13, "mean_token_accuracy": 0.4620689570903778, "step": 95280 }, { "epoch": 0.09597206400622053, "grad_norm": 11.671862935639766, "learning_rate": 4.974049885779982e-05, "loss": 2.2819, "mean_token_accuracy": 0.4206896543502808, "step": 95285 }, { "epoch": 0.09597710005932471, "grad_norm": 9.441640752481726, "learning_rate": 4.9740442108480374e-05, "loss": 2.2665, "mean_token_accuracy": 0.4344827592372894, "step": 95290 }, { "epoch": 0.09598213611242888, "grad_norm": 11.249877090710932, "learning_rate": 4.9740385352992464e-05, "loss": 2.6736, "mean_token_accuracy": 0.43448275327682495, "step": 95295 }, { "epoch": 0.09598717216553306, "grad_norm": 22.27137243971899, "learning_rate": 4.9740328591336095e-05, "loss": 2.9338, "mean_token_accuracy": 0.388021782040596, "step": 95300 }, { "epoch": 0.09599220821863723, "grad_norm": 11.408239725400664, "learning_rate": 4.9740271823511286e-05, "loss": 2.541, "mean_token_accuracy": 0.4, "step": 95305 }, { "epoch": 0.0959972442717414, "grad_norm": 12.492169711289046, "learning_rate": 4.9740215049518064e-05, "loss": 2.5706, "mean_token_accuracy": 0.3862069010734558, "step": 95310 }, { "epoch": 0.09600228032484556, "grad_norm": 14.138862123100731, "learning_rate": 4.974015826935643e-05, "loss": 2.8088, "mean_token_accuracy": 0.43297035098075864, "step": 95315 }, { "epoch": 0.09600731637794974, "grad_norm": 13.239929582280176, "learning_rate": 4.9740101483026404e-05, "loss": 2.5245, "mean_token_accuracy": 0.4186932921409607, "step": 95320 }, { "epoch": 0.09601235243105391, "grad_norm": 12.575069965290933, "learning_rate": 4.9740044690528e-05, "loss": 2.9031, "mean_token_accuracy": 0.3827586233615875, "step": 95325 }, { "epoch": 0.09601738848415808, "grad_norm": 14.018476110013284, "learning_rate": 4.973998789186124e-05, "loss": 2.9294, "mean_token_accuracy": 0.34482758939266206, "step": 95330 }, { "epoch": 0.09602242453726226, "grad_norm": 9.18123459734771, "learning_rate": 4.973993108702614e-05, "loss": 2.3597, "mean_token_accuracy": 0.4554144024848938, "step": 95335 }, { "epoch": 0.09602746059036643, "grad_norm": 12.76576450220605, "learning_rate": 4.9739874276022705e-05, "loss": 3.0349, "mean_token_accuracy": 0.4034482717514038, "step": 95340 }, { "epoch": 0.0960324966434706, "grad_norm": 10.05518841573313, "learning_rate": 4.973981745885096e-05, "loss": 2.7251, "mean_token_accuracy": 0.36896551251411436, "step": 95345 }, { "epoch": 0.09603753269657478, "grad_norm": 8.828798661659626, "learning_rate": 4.973976063551092e-05, "loss": 2.3836, "mean_token_accuracy": 0.4482758641242981, "step": 95350 }, { "epoch": 0.09604256874967895, "grad_norm": 12.4454148324595, "learning_rate": 4.97397038060026e-05, "loss": 2.3836, "mean_token_accuracy": 0.4103448331356049, "step": 95355 }, { "epoch": 0.09604760480278313, "grad_norm": 11.204530418282397, "learning_rate": 4.973964697032602e-05, "loss": 2.5204, "mean_token_accuracy": 0.39310343861579894, "step": 95360 }, { "epoch": 0.0960526408558873, "grad_norm": 12.60579714319122, "learning_rate": 4.973959012848118e-05, "loss": 2.575, "mean_token_accuracy": 0.37586206793785093, "step": 95365 }, { "epoch": 0.09605767690899147, "grad_norm": 12.338052911550129, "learning_rate": 4.973953328046812e-05, "loss": 2.6658, "mean_token_accuracy": 0.45716878175735476, "step": 95370 }, { "epoch": 0.09606271296209565, "grad_norm": 10.431963632177885, "learning_rate": 4.9739476426286835e-05, "loss": 2.4253, "mean_token_accuracy": 0.41034482717514037, "step": 95375 }, { "epoch": 0.09606774901519982, "grad_norm": 11.214174061238747, "learning_rate": 4.9739419565937344e-05, "loss": 2.6459, "mean_token_accuracy": 0.39655172228813174, "step": 95380 }, { "epoch": 0.09607278506830398, "grad_norm": 11.798086063985945, "learning_rate": 4.973936269941967e-05, "loss": 2.4664, "mean_token_accuracy": 0.4344827592372894, "step": 95385 }, { "epoch": 0.09607782112140816, "grad_norm": 10.639804585164638, "learning_rate": 4.973930582673383e-05, "loss": 2.5283, "mean_token_accuracy": 0.41724138259887694, "step": 95390 }, { "epoch": 0.09608285717451233, "grad_norm": 14.569916951859103, "learning_rate": 4.973924894787983e-05, "loss": 2.5843, "mean_token_accuracy": 0.3724137842655182, "step": 95395 }, { "epoch": 0.0960878932276165, "grad_norm": 9.548282917476612, "learning_rate": 4.9739192062857696e-05, "loss": 2.8062, "mean_token_accuracy": 0.41379310488700866, "step": 95400 }, { "epoch": 0.09609292928072068, "grad_norm": 10.008246028920373, "learning_rate": 4.973913517166744e-05, "loss": 3.067, "mean_token_accuracy": 0.3275862097740173, "step": 95405 }, { "epoch": 0.09609796533382485, "grad_norm": 11.757367358864384, "learning_rate": 4.973907827430908e-05, "loss": 2.3479, "mean_token_accuracy": 0.43103448748588563, "step": 95410 }, { "epoch": 0.09610300138692902, "grad_norm": 10.19495218585726, "learning_rate": 4.973902137078261e-05, "loss": 2.4408, "mean_token_accuracy": 0.45722927451133727, "step": 95415 }, { "epoch": 0.0961080374400332, "grad_norm": 11.777703318046838, "learning_rate": 4.9738964461088085e-05, "loss": 2.6761, "mean_token_accuracy": 0.4103448212146759, "step": 95420 }, { "epoch": 0.09611307349313737, "grad_norm": 10.078515922786325, "learning_rate": 4.973890754522549e-05, "loss": 2.437, "mean_token_accuracy": 0.42413793206214906, "step": 95425 }, { "epoch": 0.09611810954624155, "grad_norm": 10.97734125006126, "learning_rate": 4.973885062319486e-05, "loss": 2.5886, "mean_token_accuracy": 0.42758620381355283, "step": 95430 }, { "epoch": 0.09612314559934572, "grad_norm": 11.485868225425484, "learning_rate": 4.973879369499619e-05, "loss": 2.1637, "mean_token_accuracy": 0.43448275327682495, "step": 95435 }, { "epoch": 0.09612818165244989, "grad_norm": 9.698961065775471, "learning_rate": 4.973873676062951e-05, "loss": 2.5072, "mean_token_accuracy": 0.39655172228813174, "step": 95440 }, { "epoch": 0.09613321770555407, "grad_norm": 13.108996826312467, "learning_rate": 4.973867982009484e-05, "loss": 2.6176, "mean_token_accuracy": 0.45172414779663084, "step": 95445 }, { "epoch": 0.09613825375865824, "grad_norm": 10.183431395851537, "learning_rate": 4.9738622873392184e-05, "loss": 2.5043, "mean_token_accuracy": 0.4103448212146759, "step": 95450 }, { "epoch": 0.0961432898117624, "grad_norm": 10.58621637377267, "learning_rate": 4.973856592052156e-05, "loss": 2.3518, "mean_token_accuracy": 0.4310344815254211, "step": 95455 }, { "epoch": 0.09614832586486657, "grad_norm": 12.104231081275431, "learning_rate": 4.9738508961482996e-05, "loss": 2.7263, "mean_token_accuracy": 0.3655172407627106, "step": 95460 }, { "epoch": 0.09615336191797075, "grad_norm": 10.152698554513846, "learning_rate": 4.973845199627649e-05, "loss": 2.6048, "mean_token_accuracy": 0.39655172228813174, "step": 95465 }, { "epoch": 0.09615839797107492, "grad_norm": 10.672249214311165, "learning_rate": 4.9738395024902075e-05, "loss": 2.6455, "mean_token_accuracy": 0.40344826579093934, "step": 95470 }, { "epoch": 0.0961634340241791, "grad_norm": 12.720320339710737, "learning_rate": 4.973833804735975e-05, "loss": 2.4842, "mean_token_accuracy": 0.4620689570903778, "step": 95475 }, { "epoch": 0.09616847007728327, "grad_norm": 10.368635716914332, "learning_rate": 4.973828106364955e-05, "loss": 2.4516, "mean_token_accuracy": 0.44482759237289426, "step": 95480 }, { "epoch": 0.09617350613038744, "grad_norm": 11.884396607829155, "learning_rate": 4.9738224073771464e-05, "loss": 2.5678, "mean_token_accuracy": 0.4206896543502808, "step": 95485 }, { "epoch": 0.09617854218349162, "grad_norm": 9.162142600567304, "learning_rate": 4.973816707772553e-05, "loss": 2.3943, "mean_token_accuracy": 0.4068965494632721, "step": 95490 }, { "epoch": 0.09618357823659579, "grad_norm": 14.383877509317129, "learning_rate": 4.9738110075511766e-05, "loss": 2.9026, "mean_token_accuracy": 0.37586206793785093, "step": 95495 }, { "epoch": 0.09618861428969996, "grad_norm": 11.902541659823498, "learning_rate": 4.973805306713017e-05, "loss": 2.1208, "mean_token_accuracy": 0.4620689511299133, "step": 95500 }, { "epoch": 0.09619365034280414, "grad_norm": 11.93334597153728, "learning_rate": 4.973799605258077e-05, "loss": 2.8371, "mean_token_accuracy": 0.3551724135875702, "step": 95505 }, { "epoch": 0.09619868639590831, "grad_norm": 10.739982971163023, "learning_rate": 4.9737939031863574e-05, "loss": 2.2036, "mean_token_accuracy": 0.46896552443504336, "step": 95510 }, { "epoch": 0.09620372244901249, "grad_norm": 11.949905095042597, "learning_rate": 4.973788200497861e-05, "loss": 2.1036, "mean_token_accuracy": 0.47586206793785096, "step": 95515 }, { "epoch": 0.09620875850211666, "grad_norm": 11.64971327379536, "learning_rate": 4.9737824971925884e-05, "loss": 2.4053, "mean_token_accuracy": 0.4137930989265442, "step": 95520 }, { "epoch": 0.09621379455522082, "grad_norm": 13.77358130243912, "learning_rate": 4.973776793270542e-05, "loss": 3.0702, "mean_token_accuracy": 0.3655172437429428, "step": 95525 }, { "epoch": 0.09621883060832499, "grad_norm": 11.467515980572148, "learning_rate": 4.9737710887317215e-05, "loss": 2.6149, "mean_token_accuracy": 0.42413792610168455, "step": 95530 }, { "epoch": 0.09622386666142917, "grad_norm": 9.394389486470782, "learning_rate": 4.9737653835761305e-05, "loss": 2.2787, "mean_token_accuracy": 0.4413793087005615, "step": 95535 }, { "epoch": 0.09622890271453334, "grad_norm": 10.547651787129592, "learning_rate": 4.97375967780377e-05, "loss": 2.5225, "mean_token_accuracy": 0.3843315154314041, "step": 95540 }, { "epoch": 0.09623393876763751, "grad_norm": 10.731070862890196, "learning_rate": 4.9737539714146417e-05, "loss": 2.1733, "mean_token_accuracy": 0.4448275864124298, "step": 95545 }, { "epoch": 0.09623897482074169, "grad_norm": 14.664328380143658, "learning_rate": 4.9737482644087465e-05, "loss": 2.5798, "mean_token_accuracy": 0.4310344815254211, "step": 95550 }, { "epoch": 0.09624401087384586, "grad_norm": 8.598998287700047, "learning_rate": 4.973742556786086e-05, "loss": 2.276, "mean_token_accuracy": 0.4413793087005615, "step": 95555 }, { "epoch": 0.09624904692695004, "grad_norm": 11.565497628327153, "learning_rate": 4.973736848546663e-05, "loss": 2.5683, "mean_token_accuracy": 0.37241379022598264, "step": 95560 }, { "epoch": 0.09625408298005421, "grad_norm": 11.571824559239893, "learning_rate": 4.973731139690478e-05, "loss": 2.4504, "mean_token_accuracy": 0.4413793087005615, "step": 95565 }, { "epoch": 0.09625911903315838, "grad_norm": 10.717194962127268, "learning_rate": 4.973725430217532e-05, "loss": 2.5162, "mean_token_accuracy": 0.44827585816383364, "step": 95570 }, { "epoch": 0.09626415508626256, "grad_norm": 10.89973909651337, "learning_rate": 4.973719720127829e-05, "loss": 2.6385, "mean_token_accuracy": 0.41379310488700866, "step": 95575 }, { "epoch": 0.09626919113936673, "grad_norm": 11.39647182504023, "learning_rate": 4.973714009421368e-05, "loss": 2.4942, "mean_token_accuracy": 0.35862068831920624, "step": 95580 }, { "epoch": 0.0962742271924709, "grad_norm": 14.428872892700461, "learning_rate": 4.9737082980981524e-05, "loss": 2.7628, "mean_token_accuracy": 0.37931033968925476, "step": 95585 }, { "epoch": 0.09627926324557508, "grad_norm": 11.699979598475725, "learning_rate": 4.9737025861581826e-05, "loss": 2.8966, "mean_token_accuracy": 0.3689655244350433, "step": 95590 }, { "epoch": 0.09628429929867924, "grad_norm": 9.642883272713432, "learning_rate": 4.97369687360146e-05, "loss": 2.4313, "mean_token_accuracy": 0.4379310369491577, "step": 95595 }, { "epoch": 0.09628933535178341, "grad_norm": 14.97896320994989, "learning_rate": 4.9736911604279876e-05, "loss": 2.4291, "mean_token_accuracy": 0.4228675127029419, "step": 95600 }, { "epoch": 0.09629437140488759, "grad_norm": 12.513926607230237, "learning_rate": 4.973685446637766e-05, "loss": 2.6394, "mean_token_accuracy": 0.4, "step": 95605 }, { "epoch": 0.09629940745799176, "grad_norm": 10.965204580777522, "learning_rate": 4.973679732230797e-05, "loss": 2.0242, "mean_token_accuracy": 0.48620688915252686, "step": 95610 }, { "epoch": 0.09630444351109593, "grad_norm": 10.843799467313156, "learning_rate": 4.973674017207082e-05, "loss": 2.2967, "mean_token_accuracy": 0.4344827592372894, "step": 95615 }, { "epoch": 0.0963094795642001, "grad_norm": 15.558143047880607, "learning_rate": 4.973668301566622e-05, "loss": 2.6606, "mean_token_accuracy": 0.4206896543502808, "step": 95620 }, { "epoch": 0.09631451561730428, "grad_norm": 12.068201771821075, "learning_rate": 4.9736625853094196e-05, "loss": 2.4344, "mean_token_accuracy": 0.44827585816383364, "step": 95625 }, { "epoch": 0.09631955167040845, "grad_norm": 12.988432751145448, "learning_rate": 4.9736568684354774e-05, "loss": 2.7037, "mean_token_accuracy": 0.43986691236495973, "step": 95630 }, { "epoch": 0.09632458772351263, "grad_norm": 11.380903415375702, "learning_rate": 4.973651150944795e-05, "loss": 2.424, "mean_token_accuracy": 0.44289171099662783, "step": 95635 }, { "epoch": 0.0963296237766168, "grad_norm": 9.907444102014804, "learning_rate": 4.973645432837374e-05, "loss": 2.2341, "mean_token_accuracy": 0.417241370677948, "step": 95640 }, { "epoch": 0.09633465982972098, "grad_norm": 10.458270135841472, "learning_rate": 4.9736397141132174e-05, "loss": 2.4337, "mean_token_accuracy": 0.41379310488700866, "step": 95645 }, { "epoch": 0.09633969588282515, "grad_norm": 14.72088143261794, "learning_rate": 4.973633994772326e-05, "loss": 2.4482, "mean_token_accuracy": 0.44482759237289426, "step": 95650 }, { "epoch": 0.09634473193592932, "grad_norm": 11.508555078679926, "learning_rate": 4.9736282748147005e-05, "loss": 2.5893, "mean_token_accuracy": 0.3896551728248596, "step": 95655 }, { "epoch": 0.0963497679890335, "grad_norm": 9.201790434392777, "learning_rate": 4.9736225542403445e-05, "loss": 2.1809, "mean_token_accuracy": 0.5103448271751404, "step": 95660 }, { "epoch": 0.09635480404213766, "grad_norm": 9.31917742079431, "learning_rate": 4.973616833049258e-05, "loss": 2.0619, "mean_token_accuracy": 0.5011083722114563, "step": 95665 }, { "epoch": 0.09635984009524183, "grad_norm": 9.183284527513235, "learning_rate": 4.973611111241442e-05, "loss": 2.249, "mean_token_accuracy": 0.4965517222881317, "step": 95670 }, { "epoch": 0.096364876148346, "grad_norm": 10.232376520690101, "learning_rate": 4.973605388816901e-05, "loss": 2.9215, "mean_token_accuracy": 0.3758620709180832, "step": 95675 }, { "epoch": 0.09636991220145018, "grad_norm": 10.986216334999819, "learning_rate": 4.973599665775634e-05, "loss": 2.5622, "mean_token_accuracy": 0.43793103098869324, "step": 95680 }, { "epoch": 0.09637494825455435, "grad_norm": 11.558583340787235, "learning_rate": 4.973593942117644e-05, "loss": 2.724, "mean_token_accuracy": 0.3827586114406586, "step": 95685 }, { "epoch": 0.09637998430765853, "grad_norm": 12.18118432090526, "learning_rate": 4.9735882178429296e-05, "loss": 2.1419, "mean_token_accuracy": 0.48620688915252686, "step": 95690 }, { "epoch": 0.0963850203607627, "grad_norm": 9.48633808403315, "learning_rate": 4.9735824929514974e-05, "loss": 2.3335, "mean_token_accuracy": 0.4419950693845749, "step": 95695 }, { "epoch": 0.09639005641386687, "grad_norm": 10.16351059166901, "learning_rate": 4.9735767674433446e-05, "loss": 2.5048, "mean_token_accuracy": 0.3896551728248596, "step": 95700 }, { "epoch": 0.09639509246697105, "grad_norm": 11.45007847585355, "learning_rate": 4.973571041318476e-05, "loss": 2.9145, "mean_token_accuracy": 0.38275861740112305, "step": 95705 }, { "epoch": 0.09640012852007522, "grad_norm": 9.794638265298522, "learning_rate": 4.9735653145768905e-05, "loss": 2.6946, "mean_token_accuracy": 0.4188747763633728, "step": 95710 }, { "epoch": 0.0964051645731794, "grad_norm": 10.383656504047751, "learning_rate": 4.973559587218591e-05, "loss": 2.1924, "mean_token_accuracy": 0.42758620977401735, "step": 95715 }, { "epoch": 0.09641020062628357, "grad_norm": 9.797923282699946, "learning_rate": 4.9735538592435784e-05, "loss": 2.5675, "mean_token_accuracy": 0.4517241358757019, "step": 95720 }, { "epoch": 0.09641523667938774, "grad_norm": 10.1815607039788, "learning_rate": 4.9735481306518564e-05, "loss": 2.206, "mean_token_accuracy": 0.4931034445762634, "step": 95725 }, { "epoch": 0.09642027273249192, "grad_norm": 9.209017687332727, "learning_rate": 4.973542401443424e-05, "loss": 2.2278, "mean_token_accuracy": 0.45862067937850953, "step": 95730 }, { "epoch": 0.09642530878559608, "grad_norm": 14.296437625385964, "learning_rate": 4.973536671618285e-05, "loss": 2.7697, "mean_token_accuracy": 0.41379310488700866, "step": 95735 }, { "epoch": 0.09643034483870025, "grad_norm": 9.98482931261745, "learning_rate": 4.973530941176438e-05, "loss": 2.1104, "mean_token_accuracy": 0.46400484442710876, "step": 95740 }, { "epoch": 0.09643538089180442, "grad_norm": 10.12526689256265, "learning_rate": 4.973525210117887e-05, "loss": 2.2251, "mean_token_accuracy": 0.45862069725990295, "step": 95745 }, { "epoch": 0.0964404169449086, "grad_norm": 9.129096993210004, "learning_rate": 4.973519478442633e-05, "loss": 2.2087, "mean_token_accuracy": 0.4413793087005615, "step": 95750 }, { "epoch": 0.09644545299801277, "grad_norm": 9.65077524578711, "learning_rate": 4.973513746150678e-05, "loss": 2.2998, "mean_token_accuracy": 0.4517241448163986, "step": 95755 }, { "epoch": 0.09645048905111694, "grad_norm": 8.982296758206601, "learning_rate": 4.9735080132420235e-05, "loss": 2.2156, "mean_token_accuracy": 0.45662431716918944, "step": 95760 }, { "epoch": 0.09645552510422112, "grad_norm": 13.448091338844382, "learning_rate": 4.97350227971667e-05, "loss": 2.3376, "mean_token_accuracy": 0.413793095946312, "step": 95765 }, { "epoch": 0.09646056115732529, "grad_norm": 11.772039005188871, "learning_rate": 4.9734965455746194e-05, "loss": 2.707, "mean_token_accuracy": 0.4137931078672409, "step": 95770 }, { "epoch": 0.09646559721042947, "grad_norm": 11.879850749863365, "learning_rate": 4.9734908108158754e-05, "loss": 2.6102, "mean_token_accuracy": 0.42915910482406616, "step": 95775 }, { "epoch": 0.09647063326353364, "grad_norm": 11.657083250369277, "learning_rate": 4.973485075440436e-05, "loss": 2.5562, "mean_token_accuracy": 0.42413793206214906, "step": 95780 }, { "epoch": 0.09647566931663781, "grad_norm": 14.492282877325094, "learning_rate": 4.973479339448306e-05, "loss": 2.4502, "mean_token_accuracy": 0.3758620619773865, "step": 95785 }, { "epoch": 0.09648070536974199, "grad_norm": 13.781841487340314, "learning_rate": 4.973473602839486e-05, "loss": 2.3901, "mean_token_accuracy": 0.44827585816383364, "step": 95790 }, { "epoch": 0.09648574142284616, "grad_norm": 14.441629090690327, "learning_rate": 4.9734678656139764e-05, "loss": 2.7771, "mean_token_accuracy": 0.39655172228813174, "step": 95795 }, { "epoch": 0.09649077747595033, "grad_norm": 11.487883457846998, "learning_rate": 4.9734621277717804e-05, "loss": 2.5891, "mean_token_accuracy": 0.3741681814193726, "step": 95800 }, { "epoch": 0.0964958135290545, "grad_norm": 11.14689210303026, "learning_rate": 4.973456389312898e-05, "loss": 2.5727, "mean_token_accuracy": 0.43793103098869324, "step": 95805 }, { "epoch": 0.09650084958215867, "grad_norm": 11.58444777450018, "learning_rate": 4.973450650237332e-05, "loss": 2.0923, "mean_token_accuracy": 0.46896552443504336, "step": 95810 }, { "epoch": 0.09650588563526284, "grad_norm": 11.424052190992473, "learning_rate": 4.9734449105450835e-05, "loss": 2.3583, "mean_token_accuracy": 0.4, "step": 95815 }, { "epoch": 0.09651092168836702, "grad_norm": 18.93291269783059, "learning_rate": 4.973439170236155e-05, "loss": 2.5933, "mean_token_accuracy": 0.4448275864124298, "step": 95820 }, { "epoch": 0.09651595774147119, "grad_norm": 10.57508454012662, "learning_rate": 4.973433429310547e-05, "loss": 2.5739, "mean_token_accuracy": 0.4034482717514038, "step": 95825 }, { "epoch": 0.09652099379457536, "grad_norm": 11.219250899516567, "learning_rate": 4.9734276877682614e-05, "loss": 2.713, "mean_token_accuracy": 0.4275861978530884, "step": 95830 }, { "epoch": 0.09652602984767954, "grad_norm": 11.672250145695347, "learning_rate": 4.9734219456092996e-05, "loss": 2.0933, "mean_token_accuracy": 0.5143375635147095, "step": 95835 }, { "epoch": 0.09653106590078371, "grad_norm": 11.980270805883723, "learning_rate": 4.973416202833663e-05, "loss": 2.6869, "mean_token_accuracy": 0.3862069010734558, "step": 95840 }, { "epoch": 0.09653610195388788, "grad_norm": 11.282549418647301, "learning_rate": 4.973410459441355e-05, "loss": 2.9144, "mean_token_accuracy": 0.38275861740112305, "step": 95845 }, { "epoch": 0.09654113800699206, "grad_norm": 11.047125343444737, "learning_rate": 4.973404715432375e-05, "loss": 2.4748, "mean_token_accuracy": 0.3999999940395355, "step": 95850 }, { "epoch": 0.09654617406009623, "grad_norm": 9.04157310466515, "learning_rate": 4.973398970806725e-05, "loss": 2.4612, "mean_token_accuracy": 0.4413793087005615, "step": 95855 }, { "epoch": 0.0965512101132004, "grad_norm": 11.034733325377017, "learning_rate": 4.973393225564408e-05, "loss": 2.508, "mean_token_accuracy": 0.4068965494632721, "step": 95860 }, { "epoch": 0.09655624616630458, "grad_norm": 10.233317929177732, "learning_rate": 4.973387479705423e-05, "loss": 2.5123, "mean_token_accuracy": 0.37241379618644715, "step": 95865 }, { "epoch": 0.09656128221940875, "grad_norm": 9.961053688192642, "learning_rate": 4.9733817332297755e-05, "loss": 2.5144, "mean_token_accuracy": 0.4068965554237366, "step": 95870 }, { "epoch": 0.09656631827251291, "grad_norm": 11.024613728240325, "learning_rate": 4.9733759861374634e-05, "loss": 2.3592, "mean_token_accuracy": 0.39655172228813174, "step": 95875 }, { "epoch": 0.09657135432561709, "grad_norm": 11.254095753287107, "learning_rate": 4.9733702384284896e-05, "loss": 2.3619, "mean_token_accuracy": 0.43448275327682495, "step": 95880 }, { "epoch": 0.09657639037872126, "grad_norm": 11.915891445690459, "learning_rate": 4.973364490102856e-05, "loss": 2.9767, "mean_token_accuracy": 0.3344827637076378, "step": 95885 }, { "epoch": 0.09658142643182543, "grad_norm": 10.806872996625806, "learning_rate": 4.973358741160564e-05, "loss": 2.2556, "mean_token_accuracy": 0.458620685338974, "step": 95890 }, { "epoch": 0.09658646248492961, "grad_norm": 17.377844458778835, "learning_rate": 4.9733529916016144e-05, "loss": 2.7035, "mean_token_accuracy": 0.38275861740112305, "step": 95895 }, { "epoch": 0.09659149853803378, "grad_norm": 10.607088381926005, "learning_rate": 4.973347241426011e-05, "loss": 2.3611, "mean_token_accuracy": 0.41724138259887694, "step": 95900 }, { "epoch": 0.09659653459113796, "grad_norm": 11.18473749044348, "learning_rate": 4.973341490633753e-05, "loss": 2.2676, "mean_token_accuracy": 0.42413793206214906, "step": 95905 }, { "epoch": 0.09660157064424213, "grad_norm": 10.419121767678288, "learning_rate": 4.973335739224843e-05, "loss": 2.223, "mean_token_accuracy": 0.5, "step": 95910 }, { "epoch": 0.0966066066973463, "grad_norm": 11.239529198177836, "learning_rate": 4.973329987199283e-05, "loss": 2.4312, "mean_token_accuracy": 0.4034482777118683, "step": 95915 }, { "epoch": 0.09661164275045048, "grad_norm": 11.441966118051846, "learning_rate": 4.9733242345570735e-05, "loss": 2.6674, "mean_token_accuracy": 0.4294010818004608, "step": 95920 }, { "epoch": 0.09661667880355465, "grad_norm": 10.23847271530617, "learning_rate": 4.973318481298217e-05, "loss": 2.4201, "mean_token_accuracy": 0.4379310369491577, "step": 95925 }, { "epoch": 0.09662171485665882, "grad_norm": 13.944979632815208, "learning_rate": 4.973312727422715e-05, "loss": 2.7637, "mean_token_accuracy": 0.38965516686439516, "step": 95930 }, { "epoch": 0.096626750909763, "grad_norm": 10.196543502345914, "learning_rate": 4.973306972930569e-05, "loss": 2.608, "mean_token_accuracy": 0.42413792610168455, "step": 95935 }, { "epoch": 0.09663178696286717, "grad_norm": 11.796788012221578, "learning_rate": 4.9733012178217805e-05, "loss": 2.2963, "mean_token_accuracy": 0.4068965494632721, "step": 95940 }, { "epoch": 0.09663682301597133, "grad_norm": 11.832794770506567, "learning_rate": 4.97329546209635e-05, "loss": 2.577, "mean_token_accuracy": 0.42413792610168455, "step": 95945 }, { "epoch": 0.0966418590690755, "grad_norm": 13.215137265567424, "learning_rate": 4.9732897057542815e-05, "loss": 2.8343, "mean_token_accuracy": 0.36896551847457887, "step": 95950 }, { "epoch": 0.09664689512217968, "grad_norm": 10.893704891276444, "learning_rate": 4.973283948795575e-05, "loss": 2.3234, "mean_token_accuracy": 0.43448275327682495, "step": 95955 }, { "epoch": 0.09665193117528385, "grad_norm": 9.129939451688788, "learning_rate": 4.973278191220233e-05, "loss": 2.5442, "mean_token_accuracy": 0.43103448748588563, "step": 95960 }, { "epoch": 0.09665696722838803, "grad_norm": 10.722429686171576, "learning_rate": 4.973272433028255e-05, "loss": 2.3845, "mean_token_accuracy": 0.4640048384666443, "step": 95965 }, { "epoch": 0.0966620032814922, "grad_norm": 11.493399771299801, "learning_rate": 4.973266674219645e-05, "loss": 2.5402, "mean_token_accuracy": 0.38965516686439516, "step": 95970 }, { "epoch": 0.09666703933459637, "grad_norm": 13.013174249036021, "learning_rate": 4.9732609147944045e-05, "loss": 2.3215, "mean_token_accuracy": 0.42068966031074523, "step": 95975 }, { "epoch": 0.09667207538770055, "grad_norm": 9.343154297195118, "learning_rate": 4.973255154752533e-05, "loss": 2.5795, "mean_token_accuracy": 0.3862069010734558, "step": 95980 }, { "epoch": 0.09667711144080472, "grad_norm": 12.62790854592106, "learning_rate": 4.973249394094033e-05, "loss": 2.9361, "mean_token_accuracy": 0.4019358724355698, "step": 95985 }, { "epoch": 0.0966821474939089, "grad_norm": 11.411243323414508, "learning_rate": 4.973243632818908e-05, "loss": 2.641, "mean_token_accuracy": 0.4379310369491577, "step": 95990 }, { "epoch": 0.09668718354701307, "grad_norm": 11.685013754155161, "learning_rate": 4.973237870927157e-05, "loss": 2.0594, "mean_token_accuracy": 0.458620685338974, "step": 95995 }, { "epoch": 0.09669221960011724, "grad_norm": 13.533572355695014, "learning_rate": 4.9732321084187833e-05, "loss": 2.2359, "mean_token_accuracy": 0.4724137902259827, "step": 96000 }, { "epoch": 0.09669725565322142, "grad_norm": 11.062699204201715, "learning_rate": 4.973226345293788e-05, "loss": 2.6765, "mean_token_accuracy": 0.45359950661659243, "step": 96005 }, { "epoch": 0.09670229170632559, "grad_norm": 11.373954333487799, "learning_rate": 4.973220581552172e-05, "loss": 2.0574, "mean_token_accuracy": 0.5068965435028077, "step": 96010 }, { "epoch": 0.09670732775942975, "grad_norm": 11.626099731117069, "learning_rate": 4.9732148171939376e-05, "loss": 2.2313, "mean_token_accuracy": 0.4896551609039307, "step": 96015 }, { "epoch": 0.09671236381253392, "grad_norm": 10.646406791711593, "learning_rate": 4.973209052219086e-05, "loss": 2.5111, "mean_token_accuracy": 0.3896551787853241, "step": 96020 }, { "epoch": 0.0967173998656381, "grad_norm": 8.975575421539746, "learning_rate": 4.973203286627619e-05, "loss": 1.9766, "mean_token_accuracy": 0.5000000059604645, "step": 96025 }, { "epoch": 0.09672243591874227, "grad_norm": 9.905961868142816, "learning_rate": 4.9731975204195394e-05, "loss": 2.8307, "mean_token_accuracy": 0.3724137842655182, "step": 96030 }, { "epoch": 0.09672747197184645, "grad_norm": 11.311171945919167, "learning_rate": 4.973191753594847e-05, "loss": 2.6204, "mean_token_accuracy": 0.4, "step": 96035 }, { "epoch": 0.09673250802495062, "grad_norm": 12.751102596909488, "learning_rate": 4.973185986153544e-05, "loss": 2.836, "mean_token_accuracy": 0.3862069010734558, "step": 96040 }, { "epoch": 0.0967375440780548, "grad_norm": 12.346532435988118, "learning_rate": 4.973180218095632e-05, "loss": 2.1248, "mean_token_accuracy": 0.482758617401123, "step": 96045 }, { "epoch": 0.09674258013115897, "grad_norm": 11.055937058947467, "learning_rate": 4.973174449421113e-05, "loss": 2.4476, "mean_token_accuracy": 0.4219600737094879, "step": 96050 }, { "epoch": 0.09674761618426314, "grad_norm": 8.153207989374925, "learning_rate": 4.973168680129988e-05, "loss": 2.4609, "mean_token_accuracy": 0.4084089457988739, "step": 96055 }, { "epoch": 0.09675265223736731, "grad_norm": 11.459184625177668, "learning_rate": 4.9731629102222595e-05, "loss": 2.3485, "mean_token_accuracy": 0.47931033968925474, "step": 96060 }, { "epoch": 0.09675768829047149, "grad_norm": 13.04516176813894, "learning_rate": 4.973157139697928e-05, "loss": 2.516, "mean_token_accuracy": 0.39999999701976774, "step": 96065 }, { "epoch": 0.09676272434357566, "grad_norm": 9.924561542112423, "learning_rate": 4.973151368556995e-05, "loss": 2.4021, "mean_token_accuracy": 0.41724138259887694, "step": 96070 }, { "epoch": 0.09676776039667984, "grad_norm": 9.565373161095515, "learning_rate": 4.973145596799463e-05, "loss": 1.9065, "mean_token_accuracy": 0.5, "step": 96075 }, { "epoch": 0.09677279644978401, "grad_norm": 12.17088369811793, "learning_rate": 4.973139824425334e-05, "loss": 2.6999, "mean_token_accuracy": 0.3827586144208908, "step": 96080 }, { "epoch": 0.09677783250288817, "grad_norm": 10.965535509259638, "learning_rate": 4.973134051434608e-05, "loss": 2.3523, "mean_token_accuracy": 0.4620689630508423, "step": 96085 }, { "epoch": 0.09678286855599234, "grad_norm": 14.448583333824653, "learning_rate": 4.973128277827288e-05, "loss": 2.6201, "mean_token_accuracy": 0.34482758641242983, "step": 96090 }, { "epoch": 0.09678790460909652, "grad_norm": 10.404628082769083, "learning_rate": 4.973122503603375e-05, "loss": 2.277, "mean_token_accuracy": 0.4329098641872406, "step": 96095 }, { "epoch": 0.09679294066220069, "grad_norm": 11.013236642376073, "learning_rate": 4.973116728762871e-05, "loss": 2.1249, "mean_token_accuracy": 0.46400484442710876, "step": 96100 }, { "epoch": 0.09679797671530486, "grad_norm": 14.88639302637622, "learning_rate": 4.973110953305776e-05, "loss": 2.6015, "mean_token_accuracy": 0.3931034505367279, "step": 96105 }, { "epoch": 0.09680301276840904, "grad_norm": 10.536896474497441, "learning_rate": 4.973105177232094e-05, "loss": 2.5943, "mean_token_accuracy": 0.42068964838981626, "step": 96110 }, { "epoch": 0.09680804882151321, "grad_norm": 10.111898946629537, "learning_rate": 4.973099400541825e-05, "loss": 2.2529, "mean_token_accuracy": 0.4793103516101837, "step": 96115 }, { "epoch": 0.09681308487461739, "grad_norm": 9.374293697206927, "learning_rate": 4.9730936232349717e-05, "loss": 2.2736, "mean_token_accuracy": 0.48620688915252686, "step": 96120 }, { "epoch": 0.09681812092772156, "grad_norm": 11.173764937985394, "learning_rate": 4.973087845311535e-05, "loss": 2.4755, "mean_token_accuracy": 0.3655172407627106, "step": 96125 }, { "epoch": 0.09682315698082573, "grad_norm": 11.239552971529916, "learning_rate": 4.9730820667715154e-05, "loss": 2.0815, "mean_token_accuracy": 0.5039409041404724, "step": 96130 }, { "epoch": 0.09682819303392991, "grad_norm": 16.270708762434673, "learning_rate": 4.9730762876149174e-05, "loss": 2.4635, "mean_token_accuracy": 0.39655172228813174, "step": 96135 }, { "epoch": 0.09683322908703408, "grad_norm": 11.269419700698975, "learning_rate": 4.97307050784174e-05, "loss": 2.3694, "mean_token_accuracy": 0.46025408506393434, "step": 96140 }, { "epoch": 0.09683826514013825, "grad_norm": 8.612185721874907, "learning_rate": 4.973064727451985e-05, "loss": 2.2357, "mean_token_accuracy": 0.37241379618644715, "step": 96145 }, { "epoch": 0.09684330119324243, "grad_norm": 12.009912764954073, "learning_rate": 4.9730589464456555e-05, "loss": 2.1394, "mean_token_accuracy": 0.48275862336158754, "step": 96150 }, { "epoch": 0.09684833724634659, "grad_norm": 12.295191839639305, "learning_rate": 4.973053164822753e-05, "loss": 2.4012, "mean_token_accuracy": 0.45045371651649474, "step": 96155 }, { "epoch": 0.09685337329945076, "grad_norm": 14.089965048754415, "learning_rate": 4.973047382583278e-05, "loss": 2.8956, "mean_token_accuracy": 0.3482758581638336, "step": 96160 }, { "epoch": 0.09685840935255494, "grad_norm": 11.40317014489123, "learning_rate": 4.973041599727232e-05, "loss": 2.3978, "mean_token_accuracy": 0.44482758045196535, "step": 96165 }, { "epoch": 0.09686344540565911, "grad_norm": 13.420203127753577, "learning_rate": 4.973035816254618e-05, "loss": 2.5303, "mean_token_accuracy": 0.4, "step": 96170 }, { "epoch": 0.09686848145876328, "grad_norm": 12.319196145455095, "learning_rate": 4.973030032165435e-05, "loss": 2.5451, "mean_token_accuracy": 0.3862068891525269, "step": 96175 }, { "epoch": 0.09687351751186746, "grad_norm": 12.732813582792284, "learning_rate": 4.973024247459688e-05, "loss": 2.278, "mean_token_accuracy": 0.42413793206214906, "step": 96180 }, { "epoch": 0.09687855356497163, "grad_norm": 13.621883033214095, "learning_rate": 4.973018462137376e-05, "loss": 3.0151, "mean_token_accuracy": 0.3780399262905121, "step": 96185 }, { "epoch": 0.0968835896180758, "grad_norm": 11.150237884305117, "learning_rate": 4.9730126761985016e-05, "loss": 2.4589, "mean_token_accuracy": 0.4172413766384125, "step": 96190 }, { "epoch": 0.09688862567117998, "grad_norm": 12.855475477307328, "learning_rate": 4.973006889643067e-05, "loss": 2.228, "mean_token_accuracy": 0.4742891788482666, "step": 96195 }, { "epoch": 0.09689366172428415, "grad_norm": 11.25511450098093, "learning_rate": 4.9730011024710726e-05, "loss": 2.5857, "mean_token_accuracy": 0.3862069010734558, "step": 96200 }, { "epoch": 0.09689869777738833, "grad_norm": 13.083329506488003, "learning_rate": 4.972995314682522e-05, "loss": 2.5352, "mean_token_accuracy": 0.4103448331356049, "step": 96205 }, { "epoch": 0.0969037338304925, "grad_norm": 11.601643945186618, "learning_rate": 4.972989526277413e-05, "loss": 2.5609, "mean_token_accuracy": 0.46551724076271056, "step": 96210 }, { "epoch": 0.09690876988359667, "grad_norm": 10.942019147451784, "learning_rate": 4.972983737255751e-05, "loss": 2.4678, "mean_token_accuracy": 0.3827586233615875, "step": 96215 }, { "epoch": 0.09691380593670085, "grad_norm": 11.612622010701468, "learning_rate": 4.9729779476175355e-05, "loss": 2.2853, "mean_token_accuracy": 0.4620689570903778, "step": 96220 }, { "epoch": 0.09691884198980501, "grad_norm": 11.05772072843664, "learning_rate": 4.972972157362769e-05, "loss": 2.3635, "mean_token_accuracy": 0.39655172228813174, "step": 96225 }, { "epoch": 0.09692387804290918, "grad_norm": 11.399108174838723, "learning_rate": 4.9729663664914526e-05, "loss": 2.5442, "mean_token_accuracy": 0.4034482717514038, "step": 96230 }, { "epoch": 0.09692891409601335, "grad_norm": 9.76014568351947, "learning_rate": 4.9729605750035883e-05, "loss": 2.531, "mean_token_accuracy": 0.42413792610168455, "step": 96235 }, { "epoch": 0.09693395014911753, "grad_norm": 11.222964816437702, "learning_rate": 4.972954782899178e-05, "loss": 2.9663, "mean_token_accuracy": 0.36551724672317504, "step": 96240 }, { "epoch": 0.0969389862022217, "grad_norm": 14.06239042786985, "learning_rate": 4.972948990178223e-05, "loss": 2.6517, "mean_token_accuracy": 0.3862069010734558, "step": 96245 }, { "epoch": 0.09694402225532588, "grad_norm": 9.206205384956768, "learning_rate": 4.972943196840725e-05, "loss": 2.345, "mean_token_accuracy": 0.38620689511299133, "step": 96250 }, { "epoch": 0.09694905830843005, "grad_norm": 10.518785193823504, "learning_rate": 4.972937402886684e-05, "loss": 2.7558, "mean_token_accuracy": 0.3827586233615875, "step": 96255 }, { "epoch": 0.09695409436153422, "grad_norm": 10.70244045104713, "learning_rate": 4.972931608316104e-05, "loss": 2.6989, "mean_token_accuracy": 0.36896551549434664, "step": 96260 }, { "epoch": 0.0969591304146384, "grad_norm": 12.173620876534342, "learning_rate": 4.972925813128986e-05, "loss": 2.2743, "mean_token_accuracy": 0.47586206197738645, "step": 96265 }, { "epoch": 0.09696416646774257, "grad_norm": 12.750506759064681, "learning_rate": 4.972920017325331e-05, "loss": 2.3576, "mean_token_accuracy": 0.42068966031074523, "step": 96270 }, { "epoch": 0.09696920252084675, "grad_norm": 14.50099229685208, "learning_rate": 4.9729142209051404e-05, "loss": 2.0598, "mean_token_accuracy": 0.4448275864124298, "step": 96275 }, { "epoch": 0.09697423857395092, "grad_norm": 12.28813207536173, "learning_rate": 4.972908423868417e-05, "loss": 2.7888, "mean_token_accuracy": 0.3517241358757019, "step": 96280 }, { "epoch": 0.09697927462705509, "grad_norm": 10.2264249771214, "learning_rate": 4.9729026262151604e-05, "loss": 2.8685, "mean_token_accuracy": 0.34137930274009703, "step": 96285 }, { "epoch": 0.09698431068015927, "grad_norm": 9.66089230367393, "learning_rate": 4.9728968279453744e-05, "loss": 2.2912, "mean_token_accuracy": 0.4517241358757019, "step": 96290 }, { "epoch": 0.09698934673326343, "grad_norm": 10.161013127273788, "learning_rate": 4.9728910290590595e-05, "loss": 2.3912, "mean_token_accuracy": 0.43103448748588563, "step": 96295 }, { "epoch": 0.0969943827863676, "grad_norm": 10.836896898606948, "learning_rate": 4.972885229556217e-05, "loss": 2.4086, "mean_token_accuracy": 0.42068964838981626, "step": 96300 }, { "epoch": 0.09699941883947177, "grad_norm": 9.121540904836728, "learning_rate": 4.97287942943685e-05, "loss": 2.454, "mean_token_accuracy": 0.4241379380226135, "step": 96305 }, { "epoch": 0.09700445489257595, "grad_norm": 10.68270404516779, "learning_rate": 4.9728736287009595e-05, "loss": 2.4654, "mean_token_accuracy": 0.43103448748588563, "step": 96310 }, { "epoch": 0.09700949094568012, "grad_norm": 12.206640846089213, "learning_rate": 4.9728678273485455e-05, "loss": 2.4648, "mean_token_accuracy": 0.4517241418361664, "step": 96315 }, { "epoch": 0.0970145269987843, "grad_norm": 11.266393821578305, "learning_rate": 4.972862025379611e-05, "loss": 1.9887, "mean_token_accuracy": 0.482758617401123, "step": 96320 }, { "epoch": 0.09701956305188847, "grad_norm": 11.670018432279093, "learning_rate": 4.972856222794157e-05, "loss": 2.6934, "mean_token_accuracy": 0.38620689511299133, "step": 96325 }, { "epoch": 0.09702459910499264, "grad_norm": 15.708158982980189, "learning_rate": 4.972850419592186e-05, "loss": 2.723, "mean_token_accuracy": 0.38965516686439516, "step": 96330 }, { "epoch": 0.09702963515809682, "grad_norm": 11.90522545723829, "learning_rate": 4.9728446157737e-05, "loss": 2.4806, "mean_token_accuracy": 0.4379310369491577, "step": 96335 }, { "epoch": 0.09703467121120099, "grad_norm": 10.722682593397936, "learning_rate": 4.9728388113386985e-05, "loss": 2.3176, "mean_token_accuracy": 0.44640048742294314, "step": 96340 }, { "epoch": 0.09703970726430516, "grad_norm": 10.28194351545637, "learning_rate": 4.9728330062871845e-05, "loss": 2.1514, "mean_token_accuracy": 0.4689655125141144, "step": 96345 }, { "epoch": 0.09704474331740934, "grad_norm": 13.302127974048107, "learning_rate": 4.97282720061916e-05, "loss": 2.4013, "mean_token_accuracy": 0.42413793206214906, "step": 96350 }, { "epoch": 0.09704977937051351, "grad_norm": 14.791178161344368, "learning_rate": 4.9728213943346257e-05, "loss": 2.7785, "mean_token_accuracy": 0.3772534757852554, "step": 96355 }, { "epoch": 0.09705481542361769, "grad_norm": 9.59324219540868, "learning_rate": 4.9728155874335834e-05, "loss": 2.443, "mean_token_accuracy": 0.4293406009674072, "step": 96360 }, { "epoch": 0.09705985147672185, "grad_norm": 11.679455275355581, "learning_rate": 4.972809779916036e-05, "loss": 2.6782, "mean_token_accuracy": 0.4034482717514038, "step": 96365 }, { "epoch": 0.09706488752982602, "grad_norm": 13.765571351258297, "learning_rate": 4.972803971781983e-05, "loss": 2.7268, "mean_token_accuracy": 0.35862069129943847, "step": 96370 }, { "epoch": 0.09706992358293019, "grad_norm": 14.508634534154936, "learning_rate": 4.972798163031427e-05, "loss": 2.857, "mean_token_accuracy": 0.3793103456497192, "step": 96375 }, { "epoch": 0.09707495963603437, "grad_norm": 11.500947054818651, "learning_rate": 4.97279235366437e-05, "loss": 2.3497, "mean_token_accuracy": 0.4862069010734558, "step": 96380 }, { "epoch": 0.09707999568913854, "grad_norm": 10.221659800141756, "learning_rate": 4.9727865436808135e-05, "loss": 2.2958, "mean_token_accuracy": 0.4103448331356049, "step": 96385 }, { "epoch": 0.09708503174224271, "grad_norm": 11.803839976263612, "learning_rate": 4.9727807330807594e-05, "loss": 2.4989, "mean_token_accuracy": 0.4275861978530884, "step": 96390 }, { "epoch": 0.09709006779534689, "grad_norm": 13.160267529879707, "learning_rate": 4.972774921864207e-05, "loss": 2.7066, "mean_token_accuracy": 0.3896551728248596, "step": 96395 }, { "epoch": 0.09709510384845106, "grad_norm": 15.134069777010525, "learning_rate": 4.9727691100311615e-05, "loss": 2.8563, "mean_token_accuracy": 0.42413793206214906, "step": 96400 }, { "epoch": 0.09710013990155524, "grad_norm": 9.776494339148266, "learning_rate": 4.972763297581621e-05, "loss": 2.3772, "mean_token_accuracy": 0.44482758045196535, "step": 96405 }, { "epoch": 0.09710517595465941, "grad_norm": 12.878171520094753, "learning_rate": 4.97275748451559e-05, "loss": 2.6478, "mean_token_accuracy": 0.41034482717514037, "step": 96410 }, { "epoch": 0.09711021200776358, "grad_norm": 10.362518908496101, "learning_rate": 4.9727516708330684e-05, "loss": 2.102, "mean_token_accuracy": 0.45172414779663084, "step": 96415 }, { "epoch": 0.09711524806086776, "grad_norm": 8.79744196470157, "learning_rate": 4.972745856534059e-05, "loss": 2.4121, "mean_token_accuracy": 0.42758620381355283, "step": 96420 }, { "epoch": 0.09712028411397193, "grad_norm": 11.285929014660118, "learning_rate": 4.9727400416185615e-05, "loss": 2.8295, "mean_token_accuracy": 0.37586206793785093, "step": 96425 }, { "epoch": 0.0971253201670761, "grad_norm": 14.245513012876458, "learning_rate": 4.972734226086579e-05, "loss": 2.6871, "mean_token_accuracy": 0.42758620381355283, "step": 96430 }, { "epoch": 0.09713035622018026, "grad_norm": 11.89543996490038, "learning_rate": 4.972728409938114e-05, "loss": 2.247, "mean_token_accuracy": 0.47931033968925474, "step": 96435 }, { "epoch": 0.09713539227328444, "grad_norm": 12.290008674368332, "learning_rate": 4.972722593173166e-05, "loss": 2.3052, "mean_token_accuracy": 0.44137930274009707, "step": 96440 }, { "epoch": 0.09714042832638861, "grad_norm": 9.45734294689451, "learning_rate": 4.972716775791738e-05, "loss": 2.2678, "mean_token_accuracy": 0.43793103098869324, "step": 96445 }, { "epoch": 0.09714546437949279, "grad_norm": 13.050424096547166, "learning_rate": 4.972710957793832e-05, "loss": 2.0249, "mean_token_accuracy": 0.4689655065536499, "step": 96450 }, { "epoch": 0.09715050043259696, "grad_norm": 11.33976963118481, "learning_rate": 4.972705139179447e-05, "loss": 2.1287, "mean_token_accuracy": 0.44827585816383364, "step": 96455 }, { "epoch": 0.09715553648570113, "grad_norm": 9.66695533952064, "learning_rate": 4.9726993199485876e-05, "loss": 2.6678, "mean_token_accuracy": 0.37241379618644715, "step": 96460 }, { "epoch": 0.0971605725388053, "grad_norm": 10.76286878115784, "learning_rate": 4.9726935001012546e-05, "loss": 2.4642, "mean_token_accuracy": 0.42758620381355283, "step": 96465 }, { "epoch": 0.09716560859190948, "grad_norm": 11.671631007756726, "learning_rate": 4.972687679637448e-05, "loss": 2.311, "mean_token_accuracy": 0.43103447556495667, "step": 96470 }, { "epoch": 0.09717064464501365, "grad_norm": 10.934151094439425, "learning_rate": 4.9726818585571714e-05, "loss": 2.37, "mean_token_accuracy": 0.4530550479888916, "step": 96475 }, { "epoch": 0.09717568069811783, "grad_norm": 11.345214959531479, "learning_rate": 4.972676036860426e-05, "loss": 2.7723, "mean_token_accuracy": 0.36896551251411436, "step": 96480 }, { "epoch": 0.097180716751222, "grad_norm": 22.268393400185147, "learning_rate": 4.972670214547213e-05, "loss": 2.4091, "mean_token_accuracy": 0.41034482717514037, "step": 96485 }, { "epoch": 0.09718575280432618, "grad_norm": 11.723755830109697, "learning_rate": 4.9726643916175336e-05, "loss": 2.4575, "mean_token_accuracy": 0.43278886675834655, "step": 96490 }, { "epoch": 0.09719078885743035, "grad_norm": 11.063633820950281, "learning_rate": 4.9726585680713906e-05, "loss": 2.6602, "mean_token_accuracy": 0.3896551787853241, "step": 96495 }, { "epoch": 0.09719582491053452, "grad_norm": 10.265460409837928, "learning_rate": 4.972652743908784e-05, "loss": 2.4788, "mean_token_accuracy": 0.4034482717514038, "step": 96500 }, { "epoch": 0.09720086096363868, "grad_norm": 10.596362514804271, "learning_rate": 4.972646919129717e-05, "loss": 2.3332, "mean_token_accuracy": 0.4413793206214905, "step": 96505 }, { "epoch": 0.09720589701674286, "grad_norm": 10.68046442908292, "learning_rate": 4.9726410937341906e-05, "loss": 2.3237, "mean_token_accuracy": 0.36896551251411436, "step": 96510 }, { "epoch": 0.09721093306984703, "grad_norm": 11.37897603361246, "learning_rate": 4.972635267722206e-05, "loss": 2.2783, "mean_token_accuracy": 0.41379310488700866, "step": 96515 }, { "epoch": 0.0972159691229512, "grad_norm": 13.388013996660984, "learning_rate": 4.972629441093766e-05, "loss": 2.7003, "mean_token_accuracy": 0.3947973370552063, "step": 96520 }, { "epoch": 0.09722100517605538, "grad_norm": 9.976030042400652, "learning_rate": 4.9726236138488705e-05, "loss": 2.5135, "mean_token_accuracy": 0.42758620977401735, "step": 96525 }, { "epoch": 0.09722604122915955, "grad_norm": 10.32330164844996, "learning_rate": 4.9726177859875236e-05, "loss": 2.4576, "mean_token_accuracy": 0.4103448331356049, "step": 96530 }, { "epoch": 0.09723107728226373, "grad_norm": 10.579976905442924, "learning_rate": 4.972611957509724e-05, "loss": 2.4994, "mean_token_accuracy": 0.4206896543502808, "step": 96535 }, { "epoch": 0.0972361133353679, "grad_norm": 12.948064003966051, "learning_rate": 4.9726061284154744e-05, "loss": 2.4175, "mean_token_accuracy": 0.4190562665462494, "step": 96540 }, { "epoch": 0.09724114938847207, "grad_norm": 9.934205874096246, "learning_rate": 4.9726002987047775e-05, "loss": 2.524, "mean_token_accuracy": 0.4034482777118683, "step": 96545 }, { "epoch": 0.09724618544157625, "grad_norm": 15.147394639964748, "learning_rate": 4.9725944683776344e-05, "loss": 2.9924, "mean_token_accuracy": 0.341379314661026, "step": 96550 }, { "epoch": 0.09725122149468042, "grad_norm": 10.6263312476722, "learning_rate": 4.972588637434045e-05, "loss": 2.4374, "mean_token_accuracy": 0.4206896543502808, "step": 96555 }, { "epoch": 0.0972562575477846, "grad_norm": 12.236231670175371, "learning_rate": 4.972582805874014e-05, "loss": 2.6418, "mean_token_accuracy": 0.4310344815254211, "step": 96560 }, { "epoch": 0.09726129360088877, "grad_norm": 9.383906429932566, "learning_rate": 4.972576973697541e-05, "loss": 2.1375, "mean_token_accuracy": 0.4413793087005615, "step": 96565 }, { "epoch": 0.09726632965399294, "grad_norm": 8.455281460553742, "learning_rate": 4.972571140904627e-05, "loss": 2.3818, "mean_token_accuracy": 0.3896551698446274, "step": 96570 }, { "epoch": 0.0972713657070971, "grad_norm": 11.134645701505692, "learning_rate": 4.9725653074952755e-05, "loss": 2.6941, "mean_token_accuracy": 0.4448275864124298, "step": 96575 }, { "epoch": 0.09727640176020128, "grad_norm": 11.831679969379401, "learning_rate": 4.972559473469487e-05, "loss": 2.3259, "mean_token_accuracy": 0.46551724076271056, "step": 96580 }, { "epoch": 0.09728143781330545, "grad_norm": 11.97817942967243, "learning_rate": 4.9725536388272634e-05, "loss": 2.6208, "mean_token_accuracy": 0.36896551549434664, "step": 96585 }, { "epoch": 0.09728647386640962, "grad_norm": 10.981562391008152, "learning_rate": 4.9725478035686064e-05, "loss": 2.0739, "mean_token_accuracy": 0.4979064166545868, "step": 96590 }, { "epoch": 0.0972915099195138, "grad_norm": 9.143473257145102, "learning_rate": 4.9725419676935174e-05, "loss": 2.538, "mean_token_accuracy": 0.42413793206214906, "step": 96595 }, { "epoch": 0.09729654597261797, "grad_norm": 9.73518849612509, "learning_rate": 4.972536131201997e-05, "loss": 2.406, "mean_token_accuracy": 0.38275861740112305, "step": 96600 }, { "epoch": 0.09730158202572214, "grad_norm": 9.378363356724913, "learning_rate": 4.97253029409405e-05, "loss": 2.4046, "mean_token_accuracy": 0.3758620619773865, "step": 96605 }, { "epoch": 0.09730661807882632, "grad_norm": 15.153794753572866, "learning_rate": 4.9725244563696743e-05, "loss": 3.1952, "mean_token_accuracy": 0.36896551251411436, "step": 96610 }, { "epoch": 0.09731165413193049, "grad_norm": 12.534745261124794, "learning_rate": 4.972518618028874e-05, "loss": 2.9086, "mean_token_accuracy": 0.37586206793785093, "step": 96615 }, { "epoch": 0.09731669018503467, "grad_norm": 10.439136848106703, "learning_rate": 4.9725127790716495e-05, "loss": 2.4867, "mean_token_accuracy": 0.42928009629249575, "step": 96620 }, { "epoch": 0.09732172623813884, "grad_norm": 11.952303776150343, "learning_rate": 4.972506939498002e-05, "loss": 2.4577, "mean_token_accuracy": 0.43793103098869324, "step": 96625 }, { "epoch": 0.09732676229124301, "grad_norm": 8.528502611793806, "learning_rate": 4.9725010993079354e-05, "loss": 2.09, "mean_token_accuracy": 0.4931034564971924, "step": 96630 }, { "epoch": 0.09733179834434719, "grad_norm": 11.0862047676407, "learning_rate": 4.972495258501449e-05, "loss": 2.3867, "mean_token_accuracy": 0.39310344457626345, "step": 96635 }, { "epoch": 0.09733683439745136, "grad_norm": 12.67013460086597, "learning_rate": 4.972489417078545e-05, "loss": 2.9974, "mean_token_accuracy": 0.4034482717514038, "step": 96640 }, { "epoch": 0.09734187045055552, "grad_norm": 11.490876939259309, "learning_rate": 4.972483575039226e-05, "loss": 2.244, "mean_token_accuracy": 0.4344827592372894, "step": 96645 }, { "epoch": 0.0973469065036597, "grad_norm": 22.726410779267297, "learning_rate": 4.972477732383492e-05, "loss": 2.4759, "mean_token_accuracy": 0.3999999940395355, "step": 96650 }, { "epoch": 0.09735194255676387, "grad_norm": 10.32493373933577, "learning_rate": 4.972471889111346e-05, "loss": 2.606, "mean_token_accuracy": 0.4172413766384125, "step": 96655 }, { "epoch": 0.09735697860986804, "grad_norm": 14.034009480366368, "learning_rate": 4.97246604522279e-05, "loss": 2.5075, "mean_token_accuracy": 0.3896551787853241, "step": 96660 }, { "epoch": 0.09736201466297222, "grad_norm": 9.69913131713579, "learning_rate": 4.972460200717824e-05, "loss": 2.3392, "mean_token_accuracy": 0.44827585220336913, "step": 96665 }, { "epoch": 0.09736705071607639, "grad_norm": 10.68681207746539, "learning_rate": 4.97245435559645e-05, "loss": 2.2344, "mean_token_accuracy": 0.39310344457626345, "step": 96670 }, { "epoch": 0.09737208676918056, "grad_norm": 9.419592364122364, "learning_rate": 4.972448509858671e-05, "loss": 2.3428, "mean_token_accuracy": 0.4034482777118683, "step": 96675 }, { "epoch": 0.09737712282228474, "grad_norm": 10.311463980498708, "learning_rate": 4.9724426635044865e-05, "loss": 2.3887, "mean_token_accuracy": 0.4551724076271057, "step": 96680 }, { "epoch": 0.09738215887538891, "grad_norm": 11.602580995050053, "learning_rate": 4.9724368165338995e-05, "loss": 2.2862, "mean_token_accuracy": 0.47241379618644713, "step": 96685 }, { "epoch": 0.09738719492849308, "grad_norm": 10.192490845618508, "learning_rate": 4.972430968946911e-05, "loss": 2.0668, "mean_token_accuracy": 0.47931034564971925, "step": 96690 }, { "epoch": 0.09739223098159726, "grad_norm": 8.355083860858045, "learning_rate": 4.972425120743524e-05, "loss": 2.0145, "mean_token_accuracy": 0.47931034564971925, "step": 96695 }, { "epoch": 0.09739726703470143, "grad_norm": 10.979154835600486, "learning_rate": 4.9724192719237385e-05, "loss": 2.2679, "mean_token_accuracy": 0.45716878175735476, "step": 96700 }, { "epoch": 0.0974023030878056, "grad_norm": 10.966468621282301, "learning_rate": 4.9724134224875565e-05, "loss": 2.3768, "mean_token_accuracy": 0.44482758045196535, "step": 96705 }, { "epoch": 0.09740733914090978, "grad_norm": 10.51274154958129, "learning_rate": 4.9724075724349805e-05, "loss": 2.7362, "mean_token_accuracy": 0.379310342669487, "step": 96710 }, { "epoch": 0.09741237519401394, "grad_norm": 11.749460811620443, "learning_rate": 4.972401721766011e-05, "loss": 2.1498, "mean_token_accuracy": 0.42413792610168455, "step": 96715 }, { "epoch": 0.09741741124711811, "grad_norm": 11.455565578309374, "learning_rate": 4.9723958704806505e-05, "loss": 2.5497, "mean_token_accuracy": 0.4586206912994385, "step": 96720 }, { "epoch": 0.09742244730022229, "grad_norm": 14.03513215479129, "learning_rate": 4.9723900185789e-05, "loss": 2.4985, "mean_token_accuracy": 0.4379310369491577, "step": 96725 }, { "epoch": 0.09742748335332646, "grad_norm": 12.871879266978096, "learning_rate": 4.9723841660607615e-05, "loss": 2.3151, "mean_token_accuracy": 0.42758620977401735, "step": 96730 }, { "epoch": 0.09743251940643063, "grad_norm": 9.878963807440284, "learning_rate": 4.972378312926236e-05, "loss": 2.2238, "mean_token_accuracy": 0.4413793087005615, "step": 96735 }, { "epoch": 0.09743755545953481, "grad_norm": 10.909500353747827, "learning_rate": 4.9723724591753255e-05, "loss": 2.6082, "mean_token_accuracy": 0.36896551847457887, "step": 96740 }, { "epoch": 0.09744259151263898, "grad_norm": 12.746491718297278, "learning_rate": 4.972366604808033e-05, "loss": 2.7272, "mean_token_accuracy": 0.4310344815254211, "step": 96745 }, { "epoch": 0.09744762756574316, "grad_norm": 10.632690916188793, "learning_rate": 4.9723607498243575e-05, "loss": 2.5296, "mean_token_accuracy": 0.4310344815254211, "step": 96750 }, { "epoch": 0.09745266361884733, "grad_norm": 10.125773622232813, "learning_rate": 4.972354894224302e-05, "loss": 2.3954, "mean_token_accuracy": 0.38965516090393065, "step": 96755 }, { "epoch": 0.0974576996719515, "grad_norm": 9.02353078138823, "learning_rate": 4.9723490380078685e-05, "loss": 2.4236, "mean_token_accuracy": 0.41724138259887694, "step": 96760 }, { "epoch": 0.09746273572505568, "grad_norm": 10.689419546854017, "learning_rate": 4.972343181175058e-05, "loss": 2.4708, "mean_token_accuracy": 0.36896551549434664, "step": 96765 }, { "epoch": 0.09746777177815985, "grad_norm": 10.41181986035487, "learning_rate": 4.9723373237258725e-05, "loss": 2.4957, "mean_token_accuracy": 0.3931034505367279, "step": 96770 }, { "epoch": 0.09747280783126402, "grad_norm": 11.467168512770447, "learning_rate": 4.972331465660313e-05, "loss": 2.375, "mean_token_accuracy": 0.4517241358757019, "step": 96775 }, { "epoch": 0.0974778438843682, "grad_norm": 9.281237339857533, "learning_rate": 4.9723256069783824e-05, "loss": 2.4694, "mean_token_accuracy": 0.441379314661026, "step": 96780 }, { "epoch": 0.09748287993747236, "grad_norm": 9.638861439223946, "learning_rate": 4.9723197476800806e-05, "loss": 2.1548, "mean_token_accuracy": 0.4344827592372894, "step": 96785 }, { "epoch": 0.09748791599057653, "grad_norm": 10.722541410704167, "learning_rate": 4.972313887765411e-05, "loss": 2.809, "mean_token_accuracy": 0.31379311084747313, "step": 96790 }, { "epoch": 0.0974929520436807, "grad_norm": 10.504543482092245, "learning_rate": 4.9723080272343736e-05, "loss": 2.3093, "mean_token_accuracy": 0.441379314661026, "step": 96795 }, { "epoch": 0.09749798809678488, "grad_norm": 12.40963604534285, "learning_rate": 4.972302166086971e-05, "loss": 2.7733, "mean_token_accuracy": 0.33448276221752166, "step": 96800 }, { "epoch": 0.09750302414988905, "grad_norm": 17.54052414336084, "learning_rate": 4.972296304323205e-05, "loss": 2.7422, "mean_token_accuracy": 0.4257713258266449, "step": 96805 }, { "epoch": 0.09750806020299323, "grad_norm": 8.67166551567929, "learning_rate": 4.972290441943077e-05, "loss": 2.2464, "mean_token_accuracy": 0.43103447556495667, "step": 96810 }, { "epoch": 0.0975130962560974, "grad_norm": 10.283620969702627, "learning_rate": 4.972284578946588e-05, "loss": 2.5574, "mean_token_accuracy": 0.42758620977401735, "step": 96815 }, { "epoch": 0.09751813230920157, "grad_norm": 10.516645771920953, "learning_rate": 4.97227871533374e-05, "loss": 2.414, "mean_token_accuracy": 0.3689655244350433, "step": 96820 }, { "epoch": 0.09752316836230575, "grad_norm": 10.089120465956569, "learning_rate": 4.972272851104535e-05, "loss": 2.2177, "mean_token_accuracy": 0.4379310369491577, "step": 96825 }, { "epoch": 0.09752820441540992, "grad_norm": 10.628548014782055, "learning_rate": 4.972266986258974e-05, "loss": 2.3846, "mean_token_accuracy": 0.441379314661026, "step": 96830 }, { "epoch": 0.0975332404685141, "grad_norm": 11.092546364207022, "learning_rate": 4.972261120797059e-05, "loss": 2.4652, "mean_token_accuracy": 0.3896551728248596, "step": 96835 }, { "epoch": 0.09753827652161827, "grad_norm": 11.414157122837512, "learning_rate": 4.972255254718791e-05, "loss": 2.7347, "mean_token_accuracy": 0.3896551638841629, "step": 96840 }, { "epoch": 0.09754331257472244, "grad_norm": 12.616799986478412, "learning_rate": 4.972249388024173e-05, "loss": 2.4133, "mean_token_accuracy": 0.3620689630508423, "step": 96845 }, { "epoch": 0.09754834862782662, "grad_norm": 18.457025654276315, "learning_rate": 4.972243520713206e-05, "loss": 2.775, "mean_token_accuracy": 0.36896551847457887, "step": 96850 }, { "epoch": 0.09755338468093078, "grad_norm": 10.25863709002003, "learning_rate": 4.9722376527858916e-05, "loss": 2.4029, "mean_token_accuracy": 0.4517241418361664, "step": 96855 }, { "epoch": 0.09755842073403495, "grad_norm": 9.884468111241974, "learning_rate": 4.972231784242231e-05, "loss": 2.1127, "mean_token_accuracy": 0.49655171632766726, "step": 96860 }, { "epoch": 0.09756345678713912, "grad_norm": 14.342379478344315, "learning_rate": 4.972225915082226e-05, "loss": 2.5208, "mean_token_accuracy": 0.4034482717514038, "step": 96865 }, { "epoch": 0.0975684928402433, "grad_norm": 10.39999822511253, "learning_rate": 4.9722200453058784e-05, "loss": 2.4626, "mean_token_accuracy": 0.4586206912994385, "step": 96870 }, { "epoch": 0.09757352889334747, "grad_norm": 10.518821936936972, "learning_rate": 4.97221417491319e-05, "loss": 2.4562, "mean_token_accuracy": 0.41379311084747317, "step": 96875 }, { "epoch": 0.09757856494645165, "grad_norm": 12.604331784907307, "learning_rate": 4.9722083039041615e-05, "loss": 2.6805, "mean_token_accuracy": 0.44319419264793397, "step": 96880 }, { "epoch": 0.09758360099955582, "grad_norm": 11.225025639171049, "learning_rate": 4.972202432278796e-05, "loss": 2.7914, "mean_token_accuracy": 0.38620689511299133, "step": 96885 }, { "epoch": 0.09758863705266, "grad_norm": 10.483903283155431, "learning_rate": 4.972196560037094e-05, "loss": 2.2835, "mean_token_accuracy": 0.4517241358757019, "step": 96890 }, { "epoch": 0.09759367310576417, "grad_norm": 10.141427078451763, "learning_rate": 4.9721906871790574e-05, "loss": 2.3951, "mean_token_accuracy": 0.39310344457626345, "step": 96895 }, { "epoch": 0.09759870915886834, "grad_norm": 13.686144988127884, "learning_rate": 4.972184813704688e-05, "loss": 2.3838, "mean_token_accuracy": 0.41379310488700866, "step": 96900 }, { "epoch": 0.09760374521197251, "grad_norm": 12.859264222888662, "learning_rate": 4.972178939613988e-05, "loss": 2.058, "mean_token_accuracy": 0.512583178281784, "step": 96905 }, { "epoch": 0.09760878126507669, "grad_norm": 9.388715490160703, "learning_rate": 4.9721730649069574e-05, "loss": 2.498, "mean_token_accuracy": 0.44482759237289426, "step": 96910 }, { "epoch": 0.09761381731818086, "grad_norm": 10.146798647166731, "learning_rate": 4.972167189583599e-05, "loss": 2.6846, "mean_token_accuracy": 0.4034482717514038, "step": 96915 }, { "epoch": 0.09761885337128504, "grad_norm": 9.6972940625141, "learning_rate": 4.972161313643915e-05, "loss": 2.5748, "mean_token_accuracy": 0.41724138259887694, "step": 96920 }, { "epoch": 0.0976238894243892, "grad_norm": 13.675951276330927, "learning_rate": 4.972155437087905e-05, "loss": 2.3274, "mean_token_accuracy": 0.42413793206214906, "step": 96925 }, { "epoch": 0.09762892547749337, "grad_norm": 12.747957802078194, "learning_rate": 4.972149559915573e-05, "loss": 2.3109, "mean_token_accuracy": 0.47931034564971925, "step": 96930 }, { "epoch": 0.09763396153059754, "grad_norm": 14.625429362218068, "learning_rate": 4.972143682126919e-05, "loss": 2.513, "mean_token_accuracy": 0.3999999940395355, "step": 96935 }, { "epoch": 0.09763899758370172, "grad_norm": 10.324819747378985, "learning_rate": 4.972137803721946e-05, "loss": 2.2237, "mean_token_accuracy": 0.4689655065536499, "step": 96940 }, { "epoch": 0.09764403363680589, "grad_norm": 10.039963294724497, "learning_rate": 4.9721319247006544e-05, "loss": 2.5182, "mean_token_accuracy": 0.3965517163276672, "step": 96945 }, { "epoch": 0.09764906968991006, "grad_norm": 10.16367309737379, "learning_rate": 4.972126045063046e-05, "loss": 2.5677, "mean_token_accuracy": 0.4172413766384125, "step": 96950 }, { "epoch": 0.09765410574301424, "grad_norm": 10.475601876917041, "learning_rate": 4.972120164809122e-05, "loss": 2.6817, "mean_token_accuracy": 0.41034482717514037, "step": 96955 }, { "epoch": 0.09765914179611841, "grad_norm": 12.355243013818377, "learning_rate": 4.972114283938886e-05, "loss": 2.4014, "mean_token_accuracy": 0.39310344457626345, "step": 96960 }, { "epoch": 0.09766417784922259, "grad_norm": 15.27779979410524, "learning_rate": 4.9721084024523376e-05, "loss": 2.5096, "mean_token_accuracy": 0.3655172407627106, "step": 96965 }, { "epoch": 0.09766921390232676, "grad_norm": 9.78506719719141, "learning_rate": 4.972102520349479e-05, "loss": 2.5003, "mean_token_accuracy": 0.4399878978729248, "step": 96970 }, { "epoch": 0.09767424995543093, "grad_norm": 10.300117061269772, "learning_rate": 4.9720966376303115e-05, "loss": 2.7006, "mean_token_accuracy": 0.4379310369491577, "step": 96975 }, { "epoch": 0.09767928600853511, "grad_norm": 9.5872906212588, "learning_rate": 4.972090754294838e-05, "loss": 2.0816, "mean_token_accuracy": 0.4517241358757019, "step": 96980 }, { "epoch": 0.09768432206163928, "grad_norm": 11.323234929765944, "learning_rate": 4.972084870343059e-05, "loss": 2.53, "mean_token_accuracy": 0.4379310369491577, "step": 96985 }, { "epoch": 0.09768935811474345, "grad_norm": 9.668635615328112, "learning_rate": 4.9720789857749774e-05, "loss": 2.4561, "mean_token_accuracy": 0.3896551728248596, "step": 96990 }, { "epoch": 0.09769439416784761, "grad_norm": 14.27955008293976, "learning_rate": 4.9720731005905925e-05, "loss": 2.1514, "mean_token_accuracy": 0.493103438615799, "step": 96995 }, { "epoch": 0.09769943022095179, "grad_norm": 10.349679125934957, "learning_rate": 4.972067214789908e-05, "loss": 3.3402, "mean_token_accuracy": 0.29999999403953553, "step": 97000 }, { "epoch": 0.09770446627405596, "grad_norm": 15.34680864772722, "learning_rate": 4.9720613283729246e-05, "loss": 2.6464, "mean_token_accuracy": 0.37931033968925476, "step": 97005 }, { "epoch": 0.09770950232716014, "grad_norm": 9.737995876588291, "learning_rate": 4.972055441339645e-05, "loss": 3.0523, "mean_token_accuracy": 0.3724137932062149, "step": 97010 }, { "epoch": 0.09771453838026431, "grad_norm": 12.078511110337947, "learning_rate": 4.972049553690069e-05, "loss": 2.744, "mean_token_accuracy": 0.38620689511299133, "step": 97015 }, { "epoch": 0.09771957443336848, "grad_norm": 10.952544947212989, "learning_rate": 4.9720436654242005e-05, "loss": 2.4528, "mean_token_accuracy": 0.3724137842655182, "step": 97020 }, { "epoch": 0.09772461048647266, "grad_norm": 10.289349562810282, "learning_rate": 4.9720377765420384e-05, "loss": 2.9472, "mean_token_accuracy": 0.3827586233615875, "step": 97025 }, { "epoch": 0.09772964653957683, "grad_norm": 11.303116774976345, "learning_rate": 4.972031887043587e-05, "loss": 2.3916, "mean_token_accuracy": 0.46206897497177124, "step": 97030 }, { "epoch": 0.097734682592681, "grad_norm": 11.914021258129052, "learning_rate": 4.9720259969288465e-05, "loss": 2.7275, "mean_token_accuracy": 0.3827586233615875, "step": 97035 }, { "epoch": 0.09773971864578518, "grad_norm": 10.290119889460751, "learning_rate": 4.972020106197818e-05, "loss": 2.5011, "mean_token_accuracy": 0.4241379201412201, "step": 97040 }, { "epoch": 0.09774475469888935, "grad_norm": 9.94221672020278, "learning_rate": 4.9720142148505055e-05, "loss": 2.5097, "mean_token_accuracy": 0.44482758045196535, "step": 97045 }, { "epoch": 0.09774979075199353, "grad_norm": 10.99525162725582, "learning_rate": 4.972008322886908e-05, "loss": 2.6319, "mean_token_accuracy": 0.4403508722782135, "step": 97050 }, { "epoch": 0.0977548268050977, "grad_norm": 9.632273198130806, "learning_rate": 4.972002430307028e-05, "loss": 2.4982, "mean_token_accuracy": 0.417241370677948, "step": 97055 }, { "epoch": 0.09775986285820187, "grad_norm": 11.81225659262148, "learning_rate": 4.971996537110868e-05, "loss": 2.7566, "mean_token_accuracy": 0.4, "step": 97060 }, { "epoch": 0.09776489891130603, "grad_norm": 10.29227096519714, "learning_rate": 4.9719906432984284e-05, "loss": 2.6421, "mean_token_accuracy": 0.39655172228813174, "step": 97065 }, { "epoch": 0.09776993496441021, "grad_norm": 9.038997782769467, "learning_rate": 4.9719847488697116e-05, "loss": 2.4468, "mean_token_accuracy": 0.40689654350280763, "step": 97070 }, { "epoch": 0.09777497101751438, "grad_norm": 9.218105843716975, "learning_rate": 4.971978853824719e-05, "loss": 2.7998, "mean_token_accuracy": 0.4137930989265442, "step": 97075 }, { "epoch": 0.09778000707061855, "grad_norm": 10.579686076686146, "learning_rate": 4.971972958163453e-05, "loss": 2.9357, "mean_token_accuracy": 0.3551724195480347, "step": 97080 }, { "epoch": 0.09778504312372273, "grad_norm": 12.681114501875788, "learning_rate": 4.971967061885913e-05, "loss": 2.715, "mean_token_accuracy": 0.37586206793785093, "step": 97085 }, { "epoch": 0.0977900791768269, "grad_norm": 12.44891396194757, "learning_rate": 4.9719611649921036e-05, "loss": 2.5625, "mean_token_accuracy": 0.4534785211086273, "step": 97090 }, { "epoch": 0.09779511522993108, "grad_norm": 9.09394664856038, "learning_rate": 4.971955267482024e-05, "loss": 2.5012, "mean_token_accuracy": 0.41379311084747317, "step": 97095 }, { "epoch": 0.09780015128303525, "grad_norm": 10.683238073930516, "learning_rate": 4.9719493693556776e-05, "loss": 2.7314, "mean_token_accuracy": 0.37931033968925476, "step": 97100 }, { "epoch": 0.09780518733613942, "grad_norm": 11.115355685730496, "learning_rate": 4.971943470613065e-05, "loss": 3.0163, "mean_token_accuracy": 0.4034482717514038, "step": 97105 }, { "epoch": 0.0978102233892436, "grad_norm": 12.03270088867243, "learning_rate": 4.971937571254188e-05, "loss": 3.09, "mean_token_accuracy": 0.34482758641242983, "step": 97110 }, { "epoch": 0.09781525944234777, "grad_norm": 12.062632611404117, "learning_rate": 4.9719316712790484e-05, "loss": 2.2458, "mean_token_accuracy": 0.4344827651977539, "step": 97115 }, { "epoch": 0.09782029549545194, "grad_norm": 10.650345045574621, "learning_rate": 4.971925770687648e-05, "loss": 1.9558, "mean_token_accuracy": 0.4896551787853241, "step": 97120 }, { "epoch": 0.09782533154855612, "grad_norm": 18.457186575756406, "learning_rate": 4.9719198694799874e-05, "loss": 2.7712, "mean_token_accuracy": 0.3758620619773865, "step": 97125 }, { "epoch": 0.09783036760166029, "grad_norm": 13.788095787988311, "learning_rate": 4.97191396765607e-05, "loss": 2.62, "mean_token_accuracy": 0.43448275327682495, "step": 97130 }, { "epoch": 0.09783540365476445, "grad_norm": 9.512838211464976, "learning_rate": 4.971908065215896e-05, "loss": 2.5173, "mean_token_accuracy": 0.42413793206214906, "step": 97135 }, { "epoch": 0.09784043970786863, "grad_norm": 8.025492767362715, "learning_rate": 4.971902162159467e-05, "loss": 2.2136, "mean_token_accuracy": 0.42758620977401735, "step": 97140 }, { "epoch": 0.0978454757609728, "grad_norm": 11.875854353894487, "learning_rate": 4.971896258486785e-05, "loss": 2.3736, "mean_token_accuracy": 0.4310344696044922, "step": 97145 }, { "epoch": 0.09785051181407697, "grad_norm": 10.544674018491566, "learning_rate": 4.971890354197853e-05, "loss": 2.0914, "mean_token_accuracy": 0.49999999403953554, "step": 97150 }, { "epoch": 0.09785554786718115, "grad_norm": 11.22064793589582, "learning_rate": 4.971884449292671e-05, "loss": 2.4418, "mean_token_accuracy": 0.4206896543502808, "step": 97155 }, { "epoch": 0.09786058392028532, "grad_norm": 10.645219980388783, "learning_rate": 4.971878543771241e-05, "loss": 2.7939, "mean_token_accuracy": 0.39655171930789945, "step": 97160 }, { "epoch": 0.0978656199733895, "grad_norm": 11.228176947380152, "learning_rate": 4.971872637633565e-05, "loss": 2.0494, "mean_token_accuracy": 0.47586206793785096, "step": 97165 }, { "epoch": 0.09787065602649367, "grad_norm": 14.192479031075733, "learning_rate": 4.971866730879644e-05, "loss": 2.4823, "mean_token_accuracy": 0.45517240166664125, "step": 97170 }, { "epoch": 0.09787569207959784, "grad_norm": 11.141254372866738, "learning_rate": 4.971860823509479e-05, "loss": 2.6461, "mean_token_accuracy": 0.4137930989265442, "step": 97175 }, { "epoch": 0.09788072813270202, "grad_norm": 11.076276741959363, "learning_rate": 4.971854915523074e-05, "loss": 2.4316, "mean_token_accuracy": 0.42758620381355283, "step": 97180 }, { "epoch": 0.09788576418580619, "grad_norm": 10.0024741676314, "learning_rate": 4.971849006920429e-05, "loss": 2.2462, "mean_token_accuracy": 0.4225045382976532, "step": 97185 }, { "epoch": 0.09789080023891036, "grad_norm": 12.174778838548878, "learning_rate": 4.971843097701545e-05, "loss": 2.7311, "mean_token_accuracy": 0.4034482777118683, "step": 97190 }, { "epoch": 0.09789583629201454, "grad_norm": 11.093035923026877, "learning_rate": 4.9718371878664257e-05, "loss": 2.5509, "mean_token_accuracy": 0.41379311084747317, "step": 97195 }, { "epoch": 0.09790087234511871, "grad_norm": 11.39140168290409, "learning_rate": 4.97183127741507e-05, "loss": 2.6019, "mean_token_accuracy": 0.3862069010734558, "step": 97200 }, { "epoch": 0.09790590839822287, "grad_norm": 12.984332168372463, "learning_rate": 4.971825366347483e-05, "loss": 2.4617, "mean_token_accuracy": 0.41379310190677643, "step": 97205 }, { "epoch": 0.09791094445132704, "grad_norm": 9.62460337736579, "learning_rate": 4.9718194546636634e-05, "loss": 2.5987, "mean_token_accuracy": 0.40443349480628965, "step": 97210 }, { "epoch": 0.09791598050443122, "grad_norm": 10.49651796765576, "learning_rate": 4.971813542363614e-05, "loss": 2.8555, "mean_token_accuracy": 0.3620689630508423, "step": 97215 }, { "epoch": 0.09792101655753539, "grad_norm": 11.223380933308345, "learning_rate": 4.9718076294473364e-05, "loss": 2.3865, "mean_token_accuracy": 0.4448275864124298, "step": 97220 }, { "epoch": 0.09792605261063957, "grad_norm": 12.550344772911117, "learning_rate": 4.971801715914833e-05, "loss": 2.6305, "mean_token_accuracy": 0.37586206793785093, "step": 97225 }, { "epoch": 0.09793108866374374, "grad_norm": 9.534220346404886, "learning_rate": 4.971795801766103e-05, "loss": 2.137, "mean_token_accuracy": 0.43103447556495667, "step": 97230 }, { "epoch": 0.09793612471684791, "grad_norm": 21.661130683323204, "learning_rate": 4.971789887001151e-05, "loss": 2.6367, "mean_token_accuracy": 0.4275861978530884, "step": 97235 }, { "epoch": 0.09794116076995209, "grad_norm": 12.45245511823149, "learning_rate": 4.9717839716199764e-05, "loss": 2.2642, "mean_token_accuracy": 0.44482759237289426, "step": 97240 }, { "epoch": 0.09794619682305626, "grad_norm": 8.64125924468296, "learning_rate": 4.971778055622582e-05, "loss": 2.4256, "mean_token_accuracy": 0.41379310488700866, "step": 97245 }, { "epoch": 0.09795123287616044, "grad_norm": 8.089768182130541, "learning_rate": 4.971772139008969e-05, "loss": 2.1212, "mean_token_accuracy": 0.4465214729309082, "step": 97250 }, { "epoch": 0.09795626892926461, "grad_norm": 9.954138781133398, "learning_rate": 4.97176622177914e-05, "loss": 2.3069, "mean_token_accuracy": 0.4172413766384125, "step": 97255 }, { "epoch": 0.09796130498236878, "grad_norm": 10.604904212375914, "learning_rate": 4.9717603039330946e-05, "loss": 2.4347, "mean_token_accuracy": 0.4448275864124298, "step": 97260 }, { "epoch": 0.09796634103547296, "grad_norm": 9.939429487131802, "learning_rate": 4.9717543854708364e-05, "loss": 2.5792, "mean_token_accuracy": 0.4034482717514038, "step": 97265 }, { "epoch": 0.09797137708857713, "grad_norm": 12.327897802146444, "learning_rate": 4.971748466392367e-05, "loss": 2.4984, "mean_token_accuracy": 0.37241379022598264, "step": 97270 }, { "epoch": 0.09797641314168129, "grad_norm": 9.70918229342822, "learning_rate": 4.971742546697687e-05, "loss": 2.6968, "mean_token_accuracy": 0.33793102502822875, "step": 97275 }, { "epoch": 0.09798144919478546, "grad_norm": 9.603534685573297, "learning_rate": 4.9717366263867974e-05, "loss": 2.5284, "mean_token_accuracy": 0.4068965494632721, "step": 97280 }, { "epoch": 0.09798648524788964, "grad_norm": 9.98280413246755, "learning_rate": 4.971730705459702e-05, "loss": 2.4625, "mean_token_accuracy": 0.36551723480224607, "step": 97285 }, { "epoch": 0.09799152130099381, "grad_norm": 8.73835032248004, "learning_rate": 4.971724783916401e-05, "loss": 3.0467, "mean_token_accuracy": 0.358620685338974, "step": 97290 }, { "epoch": 0.09799655735409799, "grad_norm": 13.120886250657922, "learning_rate": 4.971718861756896e-05, "loss": 2.3393, "mean_token_accuracy": 0.4413793087005615, "step": 97295 }, { "epoch": 0.09800159340720216, "grad_norm": 11.246169075642408, "learning_rate": 4.971712938981189e-05, "loss": 2.0537, "mean_token_accuracy": 0.4637628495693207, "step": 97300 }, { "epoch": 0.09800662946030633, "grad_norm": 13.524247366323623, "learning_rate": 4.971707015589281e-05, "loss": 2.621, "mean_token_accuracy": 0.40852994918823243, "step": 97305 }, { "epoch": 0.0980116655134105, "grad_norm": 13.292531467620794, "learning_rate": 4.9717010915811764e-05, "loss": 2.2512, "mean_token_accuracy": 0.4501512348651886, "step": 97310 }, { "epoch": 0.09801670156651468, "grad_norm": 10.845938151751648, "learning_rate": 4.971695166956873e-05, "loss": 2.6533, "mean_token_accuracy": 0.4075123190879822, "step": 97315 }, { "epoch": 0.09802173761961885, "grad_norm": 10.804970780779586, "learning_rate": 4.971689241716374e-05, "loss": 2.2309, "mean_token_accuracy": 0.4482758641242981, "step": 97320 }, { "epoch": 0.09802677367272303, "grad_norm": 11.232538178954089, "learning_rate": 4.9716833158596824e-05, "loss": 2.463, "mean_token_accuracy": 0.40865094065666197, "step": 97325 }, { "epoch": 0.0980318097258272, "grad_norm": 10.280132166881835, "learning_rate": 4.971677389386798e-05, "loss": 2.4881, "mean_token_accuracy": 0.47084088921546935, "step": 97330 }, { "epoch": 0.09803684577893138, "grad_norm": 11.784304700738083, "learning_rate": 4.971671462297723e-05, "loss": 2.139, "mean_token_accuracy": 0.5041871964931488, "step": 97335 }, { "epoch": 0.09804188183203555, "grad_norm": 11.541599706620644, "learning_rate": 4.971665534592459e-05, "loss": 2.3401, "mean_token_accuracy": 0.4068965494632721, "step": 97340 }, { "epoch": 0.09804691788513971, "grad_norm": 10.862798458183697, "learning_rate": 4.971659606271008e-05, "loss": 2.3288, "mean_token_accuracy": 0.458620685338974, "step": 97345 }, { "epoch": 0.09805195393824388, "grad_norm": 12.807057917961442, "learning_rate": 4.971653677333371e-05, "loss": 2.6562, "mean_token_accuracy": 0.37931033968925476, "step": 97350 }, { "epoch": 0.09805698999134806, "grad_norm": 10.899880027749033, "learning_rate": 4.971647747779551e-05, "loss": 2.4409, "mean_token_accuracy": 0.4034482717514038, "step": 97355 }, { "epoch": 0.09806202604445223, "grad_norm": 12.727422271199172, "learning_rate": 4.971641817609548e-05, "loss": 2.4622, "mean_token_accuracy": 0.43103448748588563, "step": 97360 }, { "epoch": 0.0980670620975564, "grad_norm": 13.234311009544532, "learning_rate": 4.971635886823365e-05, "loss": 2.6209, "mean_token_accuracy": 0.3862069010734558, "step": 97365 }, { "epoch": 0.09807209815066058, "grad_norm": 10.673717316435907, "learning_rate": 4.971629955421002e-05, "loss": 2.103, "mean_token_accuracy": 0.5068965494632721, "step": 97370 }, { "epoch": 0.09807713420376475, "grad_norm": 13.643854912138634, "learning_rate": 4.9716240234024625e-05, "loss": 2.7575, "mean_token_accuracy": 0.39310344457626345, "step": 97375 }, { "epoch": 0.09808217025686893, "grad_norm": 15.536629801504441, "learning_rate": 4.9716180907677466e-05, "loss": 3.0589, "mean_token_accuracy": 0.3724137872457504, "step": 97380 }, { "epoch": 0.0980872063099731, "grad_norm": 11.069006068684299, "learning_rate": 4.9716121575168576e-05, "loss": 2.5988, "mean_token_accuracy": 0.4463399827480316, "step": 97385 }, { "epoch": 0.09809224236307727, "grad_norm": 11.328459349448453, "learning_rate": 4.9716062236497954e-05, "loss": 2.3668, "mean_token_accuracy": 0.42068964838981626, "step": 97390 }, { "epoch": 0.09809727841618145, "grad_norm": 9.307879567752911, "learning_rate": 4.971600289166562e-05, "loss": 2.234, "mean_token_accuracy": 0.4655172526836395, "step": 97395 }, { "epoch": 0.09810231446928562, "grad_norm": 9.623135758282569, "learning_rate": 4.971594354067161e-05, "loss": 2.3823, "mean_token_accuracy": 0.4551724135875702, "step": 97400 }, { "epoch": 0.0981073505223898, "grad_norm": 11.00608412046635, "learning_rate": 4.971588418351591e-05, "loss": 2.936, "mean_token_accuracy": 0.37586206793785093, "step": 97405 }, { "epoch": 0.09811238657549397, "grad_norm": 14.724998573745744, "learning_rate": 4.971582482019856e-05, "loss": 2.3686, "mean_token_accuracy": 0.4448275864124298, "step": 97410 }, { "epoch": 0.09811742262859813, "grad_norm": 10.423728585907053, "learning_rate": 4.971576545071957e-05, "loss": 2.2985, "mean_token_accuracy": 0.458620685338974, "step": 97415 }, { "epoch": 0.0981224586817023, "grad_norm": 10.165969535174682, "learning_rate": 4.9715706075078946e-05, "loss": 2.4443, "mean_token_accuracy": 0.37586206793785093, "step": 97420 }, { "epoch": 0.09812749473480648, "grad_norm": 11.168144384515777, "learning_rate": 4.9715646693276716e-05, "loss": 2.6236, "mean_token_accuracy": 0.4000000059604645, "step": 97425 }, { "epoch": 0.09813253078791065, "grad_norm": 10.717334549610356, "learning_rate": 4.97155873053129e-05, "loss": 2.9845, "mean_token_accuracy": 0.3983666092157364, "step": 97430 }, { "epoch": 0.09813756684101482, "grad_norm": 10.073634886322745, "learning_rate": 4.9715527911187496e-05, "loss": 2.3298, "mean_token_accuracy": 0.46055657267570493, "step": 97435 }, { "epoch": 0.098142602894119, "grad_norm": 10.609131957458919, "learning_rate": 4.971546851090054e-05, "loss": 2.3797, "mean_token_accuracy": 0.42413793206214906, "step": 97440 }, { "epoch": 0.09814763894722317, "grad_norm": 9.962656117989539, "learning_rate": 4.9715409104452044e-05, "loss": 2.4454, "mean_token_accuracy": 0.4090139091014862, "step": 97445 }, { "epoch": 0.09815267500032734, "grad_norm": 10.262654879251988, "learning_rate": 4.971534969184202e-05, "loss": 2.3542, "mean_token_accuracy": 0.4467029690742493, "step": 97450 }, { "epoch": 0.09815771105343152, "grad_norm": 9.60327381626616, "learning_rate": 4.971529027307049e-05, "loss": 2.6289, "mean_token_accuracy": 0.4034482717514038, "step": 97455 }, { "epoch": 0.09816274710653569, "grad_norm": 11.189314917002674, "learning_rate": 4.971523084813745e-05, "loss": 2.2961, "mean_token_accuracy": 0.4517241418361664, "step": 97460 }, { "epoch": 0.09816778315963987, "grad_norm": 15.70734421804332, "learning_rate": 4.971517141704294e-05, "loss": 2.4001, "mean_token_accuracy": 0.3862068921327591, "step": 97465 }, { "epoch": 0.09817281921274404, "grad_norm": 9.87305913079176, "learning_rate": 4.9715111979786976e-05, "loss": 2.3866, "mean_token_accuracy": 0.4172413766384125, "step": 97470 }, { "epoch": 0.09817785526584821, "grad_norm": 12.197450815255955, "learning_rate": 4.971505253636956e-05, "loss": 2.1722, "mean_token_accuracy": 0.47586206793785096, "step": 97475 }, { "epoch": 0.09818289131895239, "grad_norm": 8.488843296150286, "learning_rate": 4.9714993086790724e-05, "loss": 1.9413, "mean_token_accuracy": 0.5275862038135528, "step": 97480 }, { "epoch": 0.09818792737205655, "grad_norm": 14.485895980656561, "learning_rate": 4.9714933631050467e-05, "loss": 2.8323, "mean_token_accuracy": 0.37241379022598264, "step": 97485 }, { "epoch": 0.09819296342516072, "grad_norm": 12.255518271692631, "learning_rate": 4.971487416914882e-05, "loss": 2.8109, "mean_token_accuracy": 0.3965517282485962, "step": 97490 }, { "epoch": 0.0981979994782649, "grad_norm": 11.982830983494642, "learning_rate": 4.97148147010858e-05, "loss": 2.5581, "mean_token_accuracy": 0.4344827592372894, "step": 97495 }, { "epoch": 0.09820303553136907, "grad_norm": 12.521465508428536, "learning_rate": 4.9714755226861414e-05, "loss": 2.2261, "mean_token_accuracy": 0.4159709572792053, "step": 97500 }, { "epoch": 0.09820807158447324, "grad_norm": 15.04539223274665, "learning_rate": 4.971469574647569e-05, "loss": 2.7264, "mean_token_accuracy": 0.38620689511299133, "step": 97505 }, { "epoch": 0.09821310763757742, "grad_norm": 11.668143506291376, "learning_rate": 4.971463625992863e-05, "loss": 2.7448, "mean_token_accuracy": 0.3724137932062149, "step": 97510 }, { "epoch": 0.09821814369068159, "grad_norm": 9.032773450275338, "learning_rate": 4.971457676722025e-05, "loss": 2.5133, "mean_token_accuracy": 0.42068964838981626, "step": 97515 }, { "epoch": 0.09822317974378576, "grad_norm": 11.333041564792559, "learning_rate": 4.971451726835059e-05, "loss": 2.1615, "mean_token_accuracy": 0.48411330580711365, "step": 97520 }, { "epoch": 0.09822821579688994, "grad_norm": 20.78318693750358, "learning_rate": 4.9714457763319647e-05, "loss": 2.8879, "mean_token_accuracy": 0.3482758581638336, "step": 97525 }, { "epoch": 0.09823325184999411, "grad_norm": 10.053705962477762, "learning_rate": 4.971439825212743e-05, "loss": 2.6089, "mean_token_accuracy": 0.42068964838981626, "step": 97530 }, { "epoch": 0.09823828790309828, "grad_norm": 9.960216416667782, "learning_rate": 4.971433873477398e-05, "loss": 2.3145, "mean_token_accuracy": 0.3827586233615875, "step": 97535 }, { "epoch": 0.09824332395620246, "grad_norm": 10.789542657871245, "learning_rate": 4.971427921125929e-05, "loss": 2.1866, "mean_token_accuracy": 0.44337567687034607, "step": 97540 }, { "epoch": 0.09824836000930663, "grad_norm": 13.201018749330451, "learning_rate": 4.971421968158339e-05, "loss": 2.6757, "mean_token_accuracy": 0.3655172407627106, "step": 97545 }, { "epoch": 0.0982533960624108, "grad_norm": 11.724652705249134, "learning_rate": 4.97141601457463e-05, "loss": 2.943, "mean_token_accuracy": 0.32068966031074525, "step": 97550 }, { "epoch": 0.09825843211551497, "grad_norm": 11.606750931432911, "learning_rate": 4.971410060374802e-05, "loss": 2.4199, "mean_token_accuracy": 0.4206896543502808, "step": 97555 }, { "epoch": 0.09826346816861914, "grad_norm": 11.937365107114122, "learning_rate": 4.971404105558858e-05, "loss": 2.7247, "mean_token_accuracy": 0.3551724135875702, "step": 97560 }, { "epoch": 0.09826850422172331, "grad_norm": 11.715178106022424, "learning_rate": 4.9713981501268e-05, "loss": 2.6865, "mean_token_accuracy": 0.3827586114406586, "step": 97565 }, { "epoch": 0.09827354027482749, "grad_norm": 15.544238260013042, "learning_rate": 4.971392194078628e-05, "loss": 2.5179, "mean_token_accuracy": 0.3896551728248596, "step": 97570 }, { "epoch": 0.09827857632793166, "grad_norm": 11.724707568759483, "learning_rate": 4.9713862374143446e-05, "loss": 2.6605, "mean_token_accuracy": 0.37368420958518983, "step": 97575 }, { "epoch": 0.09828361238103583, "grad_norm": 11.083256266009073, "learning_rate": 4.971380280133952e-05, "loss": 2.6656, "mean_token_accuracy": 0.4, "step": 97580 }, { "epoch": 0.09828864843414001, "grad_norm": 15.486292743747654, "learning_rate": 4.971374322237451e-05, "loss": 2.3749, "mean_token_accuracy": 0.41724138259887694, "step": 97585 }, { "epoch": 0.09829368448724418, "grad_norm": 12.017135034034935, "learning_rate": 4.971368363724844e-05, "loss": 2.7109, "mean_token_accuracy": 0.36896551847457887, "step": 97590 }, { "epoch": 0.09829872054034836, "grad_norm": 9.506074659347666, "learning_rate": 4.971362404596131e-05, "loss": 2.525, "mean_token_accuracy": 0.43448275327682495, "step": 97595 }, { "epoch": 0.09830375659345253, "grad_norm": 9.813523271529426, "learning_rate": 4.971356444851316e-05, "loss": 2.3511, "mean_token_accuracy": 0.4034482717514038, "step": 97600 }, { "epoch": 0.0983087926465567, "grad_norm": 10.183859763990748, "learning_rate": 4.971350484490399e-05, "loss": 2.117, "mean_token_accuracy": 0.4448275864124298, "step": 97605 }, { "epoch": 0.09831382869966088, "grad_norm": 10.741336989471424, "learning_rate": 4.971344523513382e-05, "loss": 2.8036, "mean_token_accuracy": 0.3379310339689255, "step": 97610 }, { "epoch": 0.09831886475276505, "grad_norm": 10.665047545224013, "learning_rate": 4.971338561920268e-05, "loss": 2.4216, "mean_token_accuracy": 0.4344827592372894, "step": 97615 }, { "epoch": 0.09832390080586922, "grad_norm": 10.710897658826648, "learning_rate": 4.9713325997110555e-05, "loss": 1.9869, "mean_token_accuracy": 0.4568663060665131, "step": 97620 }, { "epoch": 0.09832893685897338, "grad_norm": 10.963266200000158, "learning_rate": 4.9713266368857496e-05, "loss": 2.4876, "mean_token_accuracy": 0.38275861740112305, "step": 97625 }, { "epoch": 0.09833397291207756, "grad_norm": 11.854665488687933, "learning_rate": 4.9713206734443505e-05, "loss": 2.3924, "mean_token_accuracy": 0.4137930989265442, "step": 97630 }, { "epoch": 0.09833900896518173, "grad_norm": 11.901892481175224, "learning_rate": 4.971314709386859e-05, "loss": 2.1592, "mean_token_accuracy": 0.4482758641242981, "step": 97635 }, { "epoch": 0.0983440450182859, "grad_norm": 11.7973940481483, "learning_rate": 4.971308744713278e-05, "loss": 2.4243, "mean_token_accuracy": 0.3827586233615875, "step": 97640 }, { "epoch": 0.09834908107139008, "grad_norm": 11.551590428875171, "learning_rate": 4.9713027794236086e-05, "loss": 2.5728, "mean_token_accuracy": 0.382758629322052, "step": 97645 }, { "epoch": 0.09835411712449425, "grad_norm": 10.817075350852532, "learning_rate": 4.9712968135178526e-05, "loss": 2.8453, "mean_token_accuracy": 0.3999999940395355, "step": 97650 }, { "epoch": 0.09835915317759843, "grad_norm": 9.90386646038119, "learning_rate": 4.9712908469960115e-05, "loss": 1.9628, "mean_token_accuracy": 0.5275862157344818, "step": 97655 }, { "epoch": 0.0983641892307026, "grad_norm": 15.244139527243627, "learning_rate": 4.9712848798580875e-05, "loss": 2.3377, "mean_token_accuracy": 0.46727163195610044, "step": 97660 }, { "epoch": 0.09836922528380677, "grad_norm": 10.379719878210476, "learning_rate": 4.971278912104082e-05, "loss": 2.438, "mean_token_accuracy": 0.3896551728248596, "step": 97665 }, { "epoch": 0.09837426133691095, "grad_norm": 12.691440791058897, "learning_rate": 4.971272943733996e-05, "loss": 2.1843, "mean_token_accuracy": 0.4503932178020477, "step": 97670 }, { "epoch": 0.09837929739001512, "grad_norm": 10.156655678547976, "learning_rate": 4.971266974747831e-05, "loss": 2.43, "mean_token_accuracy": 0.4137930989265442, "step": 97675 }, { "epoch": 0.0983843334431193, "grad_norm": 11.188542692657098, "learning_rate": 4.9712610051455906e-05, "loss": 2.2728, "mean_token_accuracy": 0.45716878175735476, "step": 97680 }, { "epoch": 0.09838936949622347, "grad_norm": 10.18426456430586, "learning_rate": 4.971255034927275e-05, "loss": 2.5288, "mean_token_accuracy": 0.44482758045196535, "step": 97685 }, { "epoch": 0.09839440554932764, "grad_norm": 9.682478064778387, "learning_rate": 4.971249064092886e-05, "loss": 2.4859, "mean_token_accuracy": 0.38275861740112305, "step": 97690 }, { "epoch": 0.0983994416024318, "grad_norm": 20.967570590919298, "learning_rate": 4.9712430926424245e-05, "loss": 2.5718, "mean_token_accuracy": 0.4448275864124298, "step": 97695 }, { "epoch": 0.09840447765553598, "grad_norm": 11.277447443484027, "learning_rate": 4.9712371205758935e-05, "loss": 2.61, "mean_token_accuracy": 0.44137929677963256, "step": 97700 }, { "epoch": 0.09840951370864015, "grad_norm": 13.159311013004782, "learning_rate": 4.971231147893294e-05, "loss": 2.5431, "mean_token_accuracy": 0.4034482777118683, "step": 97705 }, { "epoch": 0.09841454976174432, "grad_norm": 10.810680149560186, "learning_rate": 4.971225174594628e-05, "loss": 2.3309, "mean_token_accuracy": 0.4620689690113068, "step": 97710 }, { "epoch": 0.0984195858148485, "grad_norm": 12.993853314255958, "learning_rate": 4.971219200679897e-05, "loss": 2.5806, "mean_token_accuracy": 0.3724137842655182, "step": 97715 }, { "epoch": 0.09842462186795267, "grad_norm": 12.271317359396697, "learning_rate": 4.9712132261491026e-05, "loss": 2.4679, "mean_token_accuracy": 0.4379310429096222, "step": 97720 }, { "epoch": 0.09842965792105685, "grad_norm": 10.786675561259617, "learning_rate": 4.971207251002246e-05, "loss": 2.3974, "mean_token_accuracy": 0.4275862157344818, "step": 97725 }, { "epoch": 0.09843469397416102, "grad_norm": 12.00821831828006, "learning_rate": 4.9712012752393286e-05, "loss": 2.3894, "mean_token_accuracy": 0.42758620381355283, "step": 97730 }, { "epoch": 0.0984397300272652, "grad_norm": 13.918423615708935, "learning_rate": 4.971195298860354e-05, "loss": 2.7863, "mean_token_accuracy": 0.38620689511299133, "step": 97735 }, { "epoch": 0.09844476608036937, "grad_norm": 11.310016190182642, "learning_rate": 4.971189321865322e-05, "loss": 2.3451, "mean_token_accuracy": 0.3965517282485962, "step": 97740 }, { "epoch": 0.09844980213347354, "grad_norm": 15.278417521748223, "learning_rate": 4.971183344254235e-05, "loss": 2.7006, "mean_token_accuracy": 0.3655172407627106, "step": 97745 }, { "epoch": 0.09845483818657771, "grad_norm": 11.850355035572427, "learning_rate": 4.9711773660270944e-05, "loss": 2.5265, "mean_token_accuracy": 0.41379311084747317, "step": 97750 }, { "epoch": 0.09845987423968189, "grad_norm": 10.381352584053383, "learning_rate": 4.971171387183902e-05, "loss": 2.3247, "mean_token_accuracy": 0.4137930989265442, "step": 97755 }, { "epoch": 0.09846491029278606, "grad_norm": 11.94405345979582, "learning_rate": 4.97116540772466e-05, "loss": 2.5648, "mean_token_accuracy": 0.4310344815254211, "step": 97760 }, { "epoch": 0.09846994634589022, "grad_norm": 15.581402902773647, "learning_rate": 4.9711594276493696e-05, "loss": 2.5292, "mean_token_accuracy": 0.4206896543502808, "step": 97765 }, { "epoch": 0.0984749823989944, "grad_norm": 10.066017039142059, "learning_rate": 4.9711534469580314e-05, "loss": 2.191, "mean_token_accuracy": 0.42413792610168455, "step": 97770 }, { "epoch": 0.09848001845209857, "grad_norm": 11.954761015831895, "learning_rate": 4.971147465650648e-05, "loss": 2.4863, "mean_token_accuracy": 0.4034482777118683, "step": 97775 }, { "epoch": 0.09848505450520274, "grad_norm": 9.946762291167051, "learning_rate": 4.971141483727221e-05, "loss": 2.3726, "mean_token_accuracy": 0.45517241954803467, "step": 97780 }, { "epoch": 0.09849009055830692, "grad_norm": 13.102692373701913, "learning_rate": 4.971135501187753e-05, "loss": 2.0279, "mean_token_accuracy": 0.5310344696044922, "step": 97785 }, { "epoch": 0.09849512661141109, "grad_norm": 11.037640847425841, "learning_rate": 4.9711295180322445e-05, "loss": 2.4661, "mean_token_accuracy": 0.4068965554237366, "step": 97790 }, { "epoch": 0.09850016266451526, "grad_norm": 11.567240324795689, "learning_rate": 4.9711235342606973e-05, "loss": 2.5298, "mean_token_accuracy": 0.3999999940395355, "step": 97795 }, { "epoch": 0.09850519871761944, "grad_norm": 10.827189769815602, "learning_rate": 4.971117549873113e-05, "loss": 2.6369, "mean_token_accuracy": 0.3862069010734558, "step": 97800 }, { "epoch": 0.09851023477072361, "grad_norm": 13.12286126856711, "learning_rate": 4.971111564869494e-05, "loss": 2.3658, "mean_token_accuracy": 0.43448275327682495, "step": 97805 }, { "epoch": 0.09851527082382779, "grad_norm": 9.393014220587276, "learning_rate": 4.971105579249841e-05, "loss": 2.5006, "mean_token_accuracy": 0.4185722887516022, "step": 97810 }, { "epoch": 0.09852030687693196, "grad_norm": 10.319542079087064, "learning_rate": 4.9710995930141555e-05, "loss": 2.296, "mean_token_accuracy": 0.41379310488700866, "step": 97815 }, { "epoch": 0.09852534293003613, "grad_norm": 10.375331788137311, "learning_rate": 4.97109360616244e-05, "loss": 1.9967, "mean_token_accuracy": 0.5084089457988739, "step": 97820 }, { "epoch": 0.09853037898314031, "grad_norm": 12.003457196367076, "learning_rate": 4.971087618694697e-05, "loss": 2.629, "mean_token_accuracy": 0.4068965554237366, "step": 97825 }, { "epoch": 0.09853541503624448, "grad_norm": 9.031175134666386, "learning_rate": 4.9710816306109266e-05, "loss": 2.7957, "mean_token_accuracy": 0.33793103098869326, "step": 97830 }, { "epoch": 0.09854045108934864, "grad_norm": 10.478666577288436, "learning_rate": 4.9710756419111304e-05, "loss": 2.4722, "mean_token_accuracy": 0.4172413766384125, "step": 97835 }, { "epoch": 0.09854548714245281, "grad_norm": 8.791131633410815, "learning_rate": 4.97106965259531e-05, "loss": 2.1674, "mean_token_accuracy": 0.4517241418361664, "step": 97840 }, { "epoch": 0.09855052319555699, "grad_norm": 25.177652745530988, "learning_rate": 4.9710636626634686e-05, "loss": 2.9007, "mean_token_accuracy": 0.38620689511299133, "step": 97845 }, { "epoch": 0.09855555924866116, "grad_norm": 15.418782117170883, "learning_rate": 4.971057672115607e-05, "loss": 2.5872, "mean_token_accuracy": 0.4206896543502808, "step": 97850 }, { "epoch": 0.09856059530176534, "grad_norm": 11.393417580420463, "learning_rate": 4.971051680951727e-05, "loss": 2.8021, "mean_token_accuracy": 0.39655172228813174, "step": 97855 }, { "epoch": 0.09856563135486951, "grad_norm": 9.521671743746897, "learning_rate": 4.9710456891718297e-05, "loss": 2.3299, "mean_token_accuracy": 0.4413793087005615, "step": 97860 }, { "epoch": 0.09857066740797368, "grad_norm": 10.66119952614436, "learning_rate": 4.971039696775917e-05, "loss": 2.3629, "mean_token_accuracy": 0.4206896543502808, "step": 97865 }, { "epoch": 0.09857570346107786, "grad_norm": 10.29864309607302, "learning_rate": 4.9710337037639907e-05, "loss": 2.4084, "mean_token_accuracy": 0.39310344457626345, "step": 97870 }, { "epoch": 0.09858073951418203, "grad_norm": 10.867251487601596, "learning_rate": 4.9710277101360524e-05, "loss": 2.5367, "mean_token_accuracy": 0.3999999940395355, "step": 97875 }, { "epoch": 0.0985857755672862, "grad_norm": 9.017821802684573, "learning_rate": 4.971021715892104e-05, "loss": 2.3609, "mean_token_accuracy": 0.4275862127542496, "step": 97880 }, { "epoch": 0.09859081162039038, "grad_norm": 13.5371312437017, "learning_rate": 4.9710157210321465e-05, "loss": 2.7683, "mean_token_accuracy": 0.4000000059604645, "step": 97885 }, { "epoch": 0.09859584767349455, "grad_norm": 12.831430711166034, "learning_rate": 4.971009725556183e-05, "loss": 2.2862, "mean_token_accuracy": 0.4344827592372894, "step": 97890 }, { "epoch": 0.09860088372659873, "grad_norm": 9.899938036503844, "learning_rate": 4.971003729464212e-05, "loss": 2.1744, "mean_token_accuracy": 0.45862067937850953, "step": 97895 }, { "epoch": 0.0986059197797029, "grad_norm": 10.303391171010055, "learning_rate": 4.970997732756239e-05, "loss": 2.8414, "mean_token_accuracy": 0.3620689570903778, "step": 97900 }, { "epoch": 0.09861095583280706, "grad_norm": 10.196640367325134, "learning_rate": 4.970991735432264e-05, "loss": 2.5658, "mean_token_accuracy": 0.46896551847457885, "step": 97905 }, { "epoch": 0.09861599188591123, "grad_norm": 10.427601331647349, "learning_rate": 4.9709857374922884e-05, "loss": 2.598, "mean_token_accuracy": 0.42413793206214906, "step": 97910 }, { "epoch": 0.09862102793901541, "grad_norm": 14.224260412360465, "learning_rate": 4.970979738936314e-05, "loss": 2.602, "mean_token_accuracy": 0.40689654350280763, "step": 97915 }, { "epoch": 0.09862606399211958, "grad_norm": 11.581395200303872, "learning_rate": 4.970973739764342e-05, "loss": 2.3103, "mean_token_accuracy": 0.38965516686439516, "step": 97920 }, { "epoch": 0.09863110004522375, "grad_norm": 8.87365657375527, "learning_rate": 4.970967739976375e-05, "loss": 2.4235, "mean_token_accuracy": 0.41034482717514037, "step": 97925 }, { "epoch": 0.09863613609832793, "grad_norm": 11.007757752120106, "learning_rate": 4.970961739572416e-05, "loss": 2.3578, "mean_token_accuracy": 0.4172413766384125, "step": 97930 }, { "epoch": 0.0986411721514321, "grad_norm": 12.790968680546733, "learning_rate": 4.970955738552463e-05, "loss": 2.6098, "mean_token_accuracy": 0.3999999940395355, "step": 97935 }, { "epoch": 0.09864620820453628, "grad_norm": 13.963999583814843, "learning_rate": 4.97094973691652e-05, "loss": 2.2513, "mean_token_accuracy": 0.4689655125141144, "step": 97940 }, { "epoch": 0.09865124425764045, "grad_norm": 11.49703928048607, "learning_rate": 4.9709437346645886e-05, "loss": 2.4784, "mean_token_accuracy": 0.4310344934463501, "step": 97945 }, { "epoch": 0.09865628031074462, "grad_norm": 13.004024432610764, "learning_rate": 4.97093773179667e-05, "loss": 2.6603, "mean_token_accuracy": 0.37586206793785093, "step": 97950 }, { "epoch": 0.0986613163638488, "grad_norm": 10.453636593894931, "learning_rate": 4.970931728312766e-05, "loss": 2.2709, "mean_token_accuracy": 0.4, "step": 97955 }, { "epoch": 0.09866635241695297, "grad_norm": 11.673354702653139, "learning_rate": 4.970925724212879e-05, "loss": 2.6566, "mean_token_accuracy": 0.3862068891525269, "step": 97960 }, { "epoch": 0.09867138847005714, "grad_norm": 14.311675181997712, "learning_rate": 4.970919719497009e-05, "loss": 2.5013, "mean_token_accuracy": 0.3931034475564957, "step": 97965 }, { "epoch": 0.09867642452316132, "grad_norm": 9.253007233229098, "learning_rate": 4.97091371416516e-05, "loss": 2.076, "mean_token_accuracy": 0.4862068951129913, "step": 97970 }, { "epoch": 0.09868146057626548, "grad_norm": 10.990643773768477, "learning_rate": 4.9709077082173314e-05, "loss": 2.6848, "mean_token_accuracy": 0.44827585816383364, "step": 97975 }, { "epoch": 0.09868649662936965, "grad_norm": 9.911742042713716, "learning_rate": 4.970901701653525e-05, "loss": 2.8209, "mean_token_accuracy": 0.39655171930789945, "step": 97980 }, { "epoch": 0.09869153268247383, "grad_norm": 10.11859020133486, "learning_rate": 4.970895694473744e-05, "loss": 2.3041, "mean_token_accuracy": 0.41724138259887694, "step": 97985 }, { "epoch": 0.098696568735578, "grad_norm": 11.270023606115382, "learning_rate": 4.970889686677989e-05, "loss": 2.159, "mean_token_accuracy": 0.4620689570903778, "step": 97990 }, { "epoch": 0.09870160478868217, "grad_norm": 13.942223548536774, "learning_rate": 4.9708836782662624e-05, "loss": 2.9282, "mean_token_accuracy": 0.38275861740112305, "step": 97995 }, { "epoch": 0.09870664084178635, "grad_norm": 11.859938131770873, "learning_rate": 4.970877669238565e-05, "loss": 2.6706, "mean_token_accuracy": 0.44137930274009707, "step": 98000 }, { "epoch": 0.09871167689489052, "grad_norm": 16.507703314031456, "learning_rate": 4.970871659594899e-05, "loss": 3.0387, "mean_token_accuracy": 0.3793103456497192, "step": 98005 }, { "epoch": 0.0987167129479947, "grad_norm": 11.24955043281426, "learning_rate": 4.970865649335266e-05, "loss": 2.6179, "mean_token_accuracy": 0.42758620977401735, "step": 98010 }, { "epoch": 0.09872174900109887, "grad_norm": 14.666138617772406, "learning_rate": 4.9708596384596675e-05, "loss": 2.8386, "mean_token_accuracy": 0.3393829435110092, "step": 98015 }, { "epoch": 0.09872678505420304, "grad_norm": 11.724975261497976, "learning_rate": 4.970853626968106e-05, "loss": 2.5236, "mean_token_accuracy": 0.39310344457626345, "step": 98020 }, { "epoch": 0.09873182110730722, "grad_norm": 17.32955881392607, "learning_rate": 4.970847614860582e-05, "loss": 2.8313, "mean_token_accuracy": 0.41379310190677643, "step": 98025 }, { "epoch": 0.09873685716041139, "grad_norm": 12.770506308880238, "learning_rate": 4.970841602137097e-05, "loss": 2.7397, "mean_token_accuracy": 0.4034482777118683, "step": 98030 }, { "epoch": 0.09874189321351556, "grad_norm": 10.954857814938002, "learning_rate": 4.9708355887976545e-05, "loss": 2.4899, "mean_token_accuracy": 0.39310344457626345, "step": 98035 }, { "epoch": 0.09874692926661974, "grad_norm": 11.409355004776438, "learning_rate": 4.9708295748422536e-05, "loss": 2.3361, "mean_token_accuracy": 0.41379310488700866, "step": 98040 }, { "epoch": 0.0987519653197239, "grad_norm": 12.596639555162035, "learning_rate": 4.9708235602708986e-05, "loss": 3.2598, "mean_token_accuracy": 0.3551724016666412, "step": 98045 }, { "epoch": 0.09875700137282807, "grad_norm": 8.990840990409456, "learning_rate": 4.970817545083589e-05, "loss": 2.5519, "mean_token_accuracy": 0.41379310488700866, "step": 98050 }, { "epoch": 0.09876203742593224, "grad_norm": 19.859006178409228, "learning_rate": 4.970811529280328e-05, "loss": 2.4828, "mean_token_accuracy": 0.44912281036376955, "step": 98055 }, { "epoch": 0.09876707347903642, "grad_norm": 9.138786415558275, "learning_rate": 4.970805512861116e-05, "loss": 2.3064, "mean_token_accuracy": 0.4241379380226135, "step": 98060 }, { "epoch": 0.09877210953214059, "grad_norm": 12.432783570971484, "learning_rate": 4.9707994958259564e-05, "loss": 2.8446, "mean_token_accuracy": 0.41379310488700866, "step": 98065 }, { "epoch": 0.09877714558524477, "grad_norm": 10.164596343937557, "learning_rate": 4.970793478174849e-05, "loss": 2.3907, "mean_token_accuracy": 0.4379310369491577, "step": 98070 }, { "epoch": 0.09878218163834894, "grad_norm": 11.089378520015082, "learning_rate": 4.9707874599077965e-05, "loss": 2.4744, "mean_token_accuracy": 0.43448275327682495, "step": 98075 }, { "epoch": 0.09878721769145311, "grad_norm": 11.312346405715486, "learning_rate": 4.970781441024799e-05, "loss": 2.597, "mean_token_accuracy": 0.44827585816383364, "step": 98080 }, { "epoch": 0.09879225374455729, "grad_norm": 12.592623145362236, "learning_rate": 4.97077542152586e-05, "loss": 2.7911, "mean_token_accuracy": 0.36896551847457887, "step": 98085 }, { "epoch": 0.09879728979766146, "grad_norm": 11.01910612485212, "learning_rate": 4.970769401410982e-05, "loss": 2.4545, "mean_token_accuracy": 0.4379310429096222, "step": 98090 }, { "epoch": 0.09880232585076563, "grad_norm": 10.050163366299188, "learning_rate": 4.970763380680165e-05, "loss": 2.3706, "mean_token_accuracy": 0.42758620381355283, "step": 98095 }, { "epoch": 0.09880736190386981, "grad_norm": 15.22832408515365, "learning_rate": 4.970757359333411e-05, "loss": 2.5037, "mean_token_accuracy": 0.43793103098869324, "step": 98100 }, { "epoch": 0.09881239795697398, "grad_norm": 9.79357039731364, "learning_rate": 4.97075133737072e-05, "loss": 2.1665, "mean_token_accuracy": 0.48106473684310913, "step": 98105 }, { "epoch": 0.09881743401007816, "grad_norm": 15.360350619818183, "learning_rate": 4.970745314792097e-05, "loss": 2.6582, "mean_token_accuracy": 0.42413792610168455, "step": 98110 }, { "epoch": 0.09882247006318232, "grad_norm": 10.53078678268063, "learning_rate": 4.9707392915975414e-05, "loss": 2.4484, "mean_token_accuracy": 0.42413793206214906, "step": 98115 }, { "epoch": 0.09882750611628649, "grad_norm": 13.59320778984477, "learning_rate": 4.970733267787056e-05, "loss": 2.385, "mean_token_accuracy": 0.41379310488700866, "step": 98120 }, { "epoch": 0.09883254216939066, "grad_norm": 11.517848992756017, "learning_rate": 4.970727243360642e-05, "loss": 2.8107, "mean_token_accuracy": 0.40895341634750365, "step": 98125 }, { "epoch": 0.09883757822249484, "grad_norm": 9.744572804387897, "learning_rate": 4.970721218318301e-05, "loss": 2.6388, "mean_token_accuracy": 0.40689654350280763, "step": 98130 }, { "epoch": 0.09884261427559901, "grad_norm": 11.930975430787674, "learning_rate": 4.970715192660033e-05, "loss": 2.5446, "mean_token_accuracy": 0.4103448212146759, "step": 98135 }, { "epoch": 0.09884765032870318, "grad_norm": 9.386461575119368, "learning_rate": 4.970709166385843e-05, "loss": 2.5837, "mean_token_accuracy": 0.43793103098869324, "step": 98140 }, { "epoch": 0.09885268638180736, "grad_norm": 11.013691281426624, "learning_rate": 4.97070313949573e-05, "loss": 2.3803, "mean_token_accuracy": 0.4172413647174835, "step": 98145 }, { "epoch": 0.09885772243491153, "grad_norm": 12.339083280322672, "learning_rate": 4.970697111989698e-05, "loss": 2.7812, "mean_token_accuracy": 0.3827586203813553, "step": 98150 }, { "epoch": 0.0988627584880157, "grad_norm": 11.330990404560511, "learning_rate": 4.9706910838677466e-05, "loss": 2.6536, "mean_token_accuracy": 0.37931033968925476, "step": 98155 }, { "epoch": 0.09886779454111988, "grad_norm": 9.888103145798377, "learning_rate": 4.9706850551298787e-05, "loss": 2.4927, "mean_token_accuracy": 0.3896551728248596, "step": 98160 }, { "epoch": 0.09887283059422405, "grad_norm": 10.522646339948139, "learning_rate": 4.970679025776096e-05, "loss": 2.4188, "mean_token_accuracy": 0.4363581299781799, "step": 98165 }, { "epoch": 0.09887786664732823, "grad_norm": 8.939621682336476, "learning_rate": 4.970672995806398e-05, "loss": 2.3047, "mean_token_accuracy": 0.4689655125141144, "step": 98170 }, { "epoch": 0.0988829027004324, "grad_norm": 11.403875569504207, "learning_rate": 4.9706669652207887e-05, "loss": 2.3731, "mean_token_accuracy": 0.41379310488700866, "step": 98175 }, { "epoch": 0.09888793875353658, "grad_norm": 10.907963880037839, "learning_rate": 4.97066093401927e-05, "loss": 2.7625, "mean_token_accuracy": 0.4643073260784149, "step": 98180 }, { "epoch": 0.09889297480664073, "grad_norm": 10.387843828286515, "learning_rate": 4.9706549022018415e-05, "loss": 2.5832, "mean_token_accuracy": 0.4310344815254211, "step": 98185 }, { "epoch": 0.09889801085974491, "grad_norm": 11.351449294036794, "learning_rate": 4.970648869768506e-05, "loss": 2.462, "mean_token_accuracy": 0.3896551728248596, "step": 98190 }, { "epoch": 0.09890304691284908, "grad_norm": 11.59278996563145, "learning_rate": 4.9706428367192664e-05, "loss": 2.5998, "mean_token_accuracy": 0.35172414481639863, "step": 98195 }, { "epoch": 0.09890808296595326, "grad_norm": 10.314298294382988, "learning_rate": 4.970636803054123e-05, "loss": 2.5392, "mean_token_accuracy": 0.3827586114406586, "step": 98200 }, { "epoch": 0.09891311901905743, "grad_norm": 9.053597490023474, "learning_rate": 4.970630768773077e-05, "loss": 1.9558, "mean_token_accuracy": 0.5087719321250915, "step": 98205 }, { "epoch": 0.0989181550721616, "grad_norm": 14.524493455425674, "learning_rate": 4.970624733876132e-05, "loss": 2.4377, "mean_token_accuracy": 0.4034482777118683, "step": 98210 }, { "epoch": 0.09892319112526578, "grad_norm": 9.162183929573207, "learning_rate": 4.970618698363288e-05, "loss": 2.4739, "mean_token_accuracy": 0.42068964838981626, "step": 98215 }, { "epoch": 0.09892822717836995, "grad_norm": 9.973446366697797, "learning_rate": 4.9706126622345463e-05, "loss": 2.4182, "mean_token_accuracy": 0.46206897497177124, "step": 98220 }, { "epoch": 0.09893326323147413, "grad_norm": 9.174188685983182, "learning_rate": 4.97060662548991e-05, "loss": 2.3595, "mean_token_accuracy": 0.43793103098869324, "step": 98225 }, { "epoch": 0.0989382992845783, "grad_norm": 10.347456211126978, "learning_rate": 4.97060058812938e-05, "loss": 2.2713, "mean_token_accuracy": 0.420689657330513, "step": 98230 }, { "epoch": 0.09894333533768247, "grad_norm": 15.330849747475884, "learning_rate": 4.9705945501529575e-05, "loss": 2.5948, "mean_token_accuracy": 0.4, "step": 98235 }, { "epoch": 0.09894837139078665, "grad_norm": 11.123840969212207, "learning_rate": 4.970588511560645e-05, "loss": 2.3632, "mean_token_accuracy": 0.48784029483795166, "step": 98240 }, { "epoch": 0.09895340744389082, "grad_norm": 10.05710093112814, "learning_rate": 4.970582472352445e-05, "loss": 2.4511, "mean_token_accuracy": 0.4310344815254211, "step": 98245 }, { "epoch": 0.098958443496995, "grad_norm": 12.27344087661297, "learning_rate": 4.9705764325283585e-05, "loss": 2.3802, "mean_token_accuracy": 0.46551724076271056, "step": 98250 }, { "epoch": 0.09896347955009915, "grad_norm": 12.304931155812197, "learning_rate": 4.970570392088385e-05, "loss": 2.3704, "mean_token_accuracy": 0.4379310369491577, "step": 98255 }, { "epoch": 0.09896851560320333, "grad_norm": 14.218439567165651, "learning_rate": 4.9705643510325294e-05, "loss": 2.7131, "mean_token_accuracy": 0.3999999940395355, "step": 98260 }, { "epoch": 0.0989735516563075, "grad_norm": 13.361735695200595, "learning_rate": 4.9705583093607916e-05, "loss": 2.3466, "mean_token_accuracy": 0.4931034505367279, "step": 98265 }, { "epoch": 0.09897858770941168, "grad_norm": 11.242552604930378, "learning_rate": 4.9705522670731734e-05, "loss": 2.2203, "mean_token_accuracy": 0.458620685338974, "step": 98270 }, { "epoch": 0.09898362376251585, "grad_norm": 10.9032269436806, "learning_rate": 4.970546224169677e-05, "loss": 2.2533, "mean_token_accuracy": 0.4, "step": 98275 }, { "epoch": 0.09898865981562002, "grad_norm": 9.387443345083136, "learning_rate": 4.970540180650304e-05, "loss": 2.3786, "mean_token_accuracy": 0.3896551728248596, "step": 98280 }, { "epoch": 0.0989936958687242, "grad_norm": 9.763117654702796, "learning_rate": 4.9705341365150556e-05, "loss": 2.4596, "mean_token_accuracy": 0.43284936547279357, "step": 98285 }, { "epoch": 0.09899873192182837, "grad_norm": 10.49590051133532, "learning_rate": 4.970528091763934e-05, "loss": 2.1148, "mean_token_accuracy": 0.47931034564971925, "step": 98290 }, { "epoch": 0.09900376797493254, "grad_norm": 10.496333006310019, "learning_rate": 4.970522046396941e-05, "loss": 2.1725, "mean_token_accuracy": 0.44827587008476255, "step": 98295 }, { "epoch": 0.09900880402803672, "grad_norm": 9.364964119123947, "learning_rate": 4.9705160004140775e-05, "loss": 2.4457, "mean_token_accuracy": 0.45517241954803467, "step": 98300 }, { "epoch": 0.09901384008114089, "grad_norm": 10.62952092212202, "learning_rate": 4.9705099538153455e-05, "loss": 2.5531, "mean_token_accuracy": 0.3862069010734558, "step": 98305 }, { "epoch": 0.09901887613424507, "grad_norm": 11.544604815851127, "learning_rate": 4.970503906600747e-05, "loss": 2.5351, "mean_token_accuracy": 0.39310344457626345, "step": 98310 }, { "epoch": 0.09902391218734924, "grad_norm": 7.919368350375854, "learning_rate": 4.970497858770283e-05, "loss": 2.303, "mean_token_accuracy": 0.43448275327682495, "step": 98315 }, { "epoch": 0.09902894824045341, "grad_norm": 12.790521448469477, "learning_rate": 4.970491810323956e-05, "loss": 2.6252, "mean_token_accuracy": 0.3827586233615875, "step": 98320 }, { "epoch": 0.09903398429355757, "grad_norm": 10.20745370132174, "learning_rate": 4.970485761261767e-05, "loss": 2.3155, "mean_token_accuracy": 0.43448275327682495, "step": 98325 }, { "epoch": 0.09903902034666175, "grad_norm": 13.440856080388821, "learning_rate": 4.970479711583719e-05, "loss": 2.4851, "mean_token_accuracy": 0.3896551728248596, "step": 98330 }, { "epoch": 0.09904405639976592, "grad_norm": 11.491372423211995, "learning_rate": 4.970473661289811e-05, "loss": 2.3748, "mean_token_accuracy": 0.4068965554237366, "step": 98335 }, { "epoch": 0.0990490924528701, "grad_norm": 9.348850771782654, "learning_rate": 4.970467610380048e-05, "loss": 2.2375, "mean_token_accuracy": 0.4517241418361664, "step": 98340 }, { "epoch": 0.09905412850597427, "grad_norm": 11.857151642450845, "learning_rate": 4.970461558854429e-05, "loss": 2.6386, "mean_token_accuracy": 0.4034482777118683, "step": 98345 }, { "epoch": 0.09905916455907844, "grad_norm": 8.29150731066686, "learning_rate": 4.970455506712957e-05, "loss": 2.3929, "mean_token_accuracy": 0.41954023241996763, "step": 98350 }, { "epoch": 0.09906420061218262, "grad_norm": 9.823777042874413, "learning_rate": 4.970449453955634e-05, "loss": 2.1681, "mean_token_accuracy": 0.44482759237289426, "step": 98355 }, { "epoch": 0.09906923666528679, "grad_norm": 8.476104390782211, "learning_rate": 4.97044340058246e-05, "loss": 1.9027, "mean_token_accuracy": 0.5241379261016845, "step": 98360 }, { "epoch": 0.09907427271839096, "grad_norm": 13.772664323795611, "learning_rate": 4.970437346593438e-05, "loss": 2.767, "mean_token_accuracy": 0.3241379290819168, "step": 98365 }, { "epoch": 0.09907930877149514, "grad_norm": 10.752918980724479, "learning_rate": 4.970431291988569e-05, "loss": 2.544, "mean_token_accuracy": 0.4068965494632721, "step": 98370 }, { "epoch": 0.09908434482459931, "grad_norm": 10.844680386209507, "learning_rate": 4.9704252367678564e-05, "loss": 2.4945, "mean_token_accuracy": 0.40689654350280763, "step": 98375 }, { "epoch": 0.09908938087770348, "grad_norm": 10.56764299324792, "learning_rate": 4.9704191809313e-05, "loss": 2.5656, "mean_token_accuracy": 0.358620685338974, "step": 98380 }, { "epoch": 0.09909441693080766, "grad_norm": 9.967996859520062, "learning_rate": 4.970413124478902e-05, "loss": 2.5736, "mean_token_accuracy": 0.4, "step": 98385 }, { "epoch": 0.09909945298391183, "grad_norm": 10.648253726024706, "learning_rate": 4.970407067410664e-05, "loss": 2.6266, "mean_token_accuracy": 0.4137930989265442, "step": 98390 }, { "epoch": 0.09910448903701599, "grad_norm": 10.567841801304708, "learning_rate": 4.9704010097265875e-05, "loss": 2.7533, "mean_token_accuracy": 0.38620689809322356, "step": 98395 }, { "epoch": 0.09910952509012017, "grad_norm": 11.095193333315816, "learning_rate": 4.970394951426675e-05, "loss": 2.3154, "mean_token_accuracy": 0.4413793087005615, "step": 98400 }, { "epoch": 0.09911456114322434, "grad_norm": 12.353686485128089, "learning_rate": 4.970388892510928e-05, "loss": 2.5274, "mean_token_accuracy": 0.40840895771980285, "step": 98405 }, { "epoch": 0.09911959719632851, "grad_norm": 9.309233916156364, "learning_rate": 4.970382832979347e-05, "loss": 2.5977, "mean_token_accuracy": 0.4413793087005615, "step": 98410 }, { "epoch": 0.09912463324943269, "grad_norm": 11.282905550993819, "learning_rate": 4.970376772831935e-05, "loss": 2.3098, "mean_token_accuracy": 0.44827587008476255, "step": 98415 }, { "epoch": 0.09912966930253686, "grad_norm": 10.321442875593423, "learning_rate": 4.970370712068693e-05, "loss": 2.09, "mean_token_accuracy": 0.4724137902259827, "step": 98420 }, { "epoch": 0.09913470535564103, "grad_norm": 9.718331729705111, "learning_rate": 4.970364650689624e-05, "loss": 2.4611, "mean_token_accuracy": 0.3862068891525269, "step": 98425 }, { "epoch": 0.09913974140874521, "grad_norm": 14.211198740007548, "learning_rate": 4.9703585886947276e-05, "loss": 2.9202, "mean_token_accuracy": 0.3965517282485962, "step": 98430 }, { "epoch": 0.09914477746184938, "grad_norm": 10.057099281748014, "learning_rate": 4.970352526084005e-05, "loss": 2.5545, "mean_token_accuracy": 0.3896551787853241, "step": 98435 }, { "epoch": 0.09914981351495356, "grad_norm": 10.89137713310155, "learning_rate": 4.9703464628574616e-05, "loss": 2.7316, "mean_token_accuracy": 0.3999999940395355, "step": 98440 }, { "epoch": 0.09915484956805773, "grad_norm": 9.354170961846618, "learning_rate": 4.970340399015096e-05, "loss": 2.6871, "mean_token_accuracy": 0.37241379022598264, "step": 98445 }, { "epoch": 0.0991598856211619, "grad_norm": 9.514754293177226, "learning_rate": 4.970334334556911e-05, "loss": 2.0579, "mean_token_accuracy": 0.4551724135875702, "step": 98450 }, { "epoch": 0.09916492167426608, "grad_norm": 10.115899460275848, "learning_rate": 4.970328269482908e-05, "loss": 2.6012, "mean_token_accuracy": 0.4275862157344818, "step": 98455 }, { "epoch": 0.09916995772737025, "grad_norm": 12.03177884342683, "learning_rate": 4.970322203793088e-05, "loss": 2.4224, "mean_token_accuracy": 0.42758620381355283, "step": 98460 }, { "epoch": 0.09917499378047441, "grad_norm": 9.718838390403258, "learning_rate": 4.970316137487454e-05, "loss": 2.109, "mean_token_accuracy": 0.4724137902259827, "step": 98465 }, { "epoch": 0.09918002983357858, "grad_norm": 7.7876160230351985, "learning_rate": 4.9703100705660074e-05, "loss": 2.815, "mean_token_accuracy": 0.4379310369491577, "step": 98470 }, { "epoch": 0.09918506588668276, "grad_norm": 12.051391339796783, "learning_rate": 4.9703040030287484e-05, "loss": 2.841, "mean_token_accuracy": 0.3275862097740173, "step": 98475 }, { "epoch": 0.09919010193978693, "grad_norm": 9.741272709894394, "learning_rate": 4.970297934875681e-05, "loss": 2.4143, "mean_token_accuracy": 0.39836661219596864, "step": 98480 }, { "epoch": 0.0991951379928911, "grad_norm": 11.84357379264377, "learning_rate": 4.970291866106804e-05, "loss": 2.3261, "mean_token_accuracy": 0.44137930274009707, "step": 98485 }, { "epoch": 0.09920017404599528, "grad_norm": 12.384175045590043, "learning_rate": 4.970285796722123e-05, "loss": 2.3696, "mean_token_accuracy": 0.4068965494632721, "step": 98490 }, { "epoch": 0.09920521009909945, "grad_norm": 11.285704162737877, "learning_rate": 4.970279726721636e-05, "loss": 2.4975, "mean_token_accuracy": 0.4068965554237366, "step": 98495 }, { "epoch": 0.09921024615220363, "grad_norm": 10.984177641200109, "learning_rate": 4.970273656105346e-05, "loss": 3.292, "mean_token_accuracy": 0.3620689660310745, "step": 98500 }, { "epoch": 0.0992152822053078, "grad_norm": 10.499486834300223, "learning_rate": 4.9702675848732555e-05, "loss": 2.7665, "mean_token_accuracy": 0.4159104585647583, "step": 98505 }, { "epoch": 0.09922031825841197, "grad_norm": 11.926402628258648, "learning_rate": 4.970261513025365e-05, "loss": 2.7236, "mean_token_accuracy": 0.39310344457626345, "step": 98510 }, { "epoch": 0.09922535431151615, "grad_norm": 13.497427441574592, "learning_rate": 4.970255440561677e-05, "loss": 2.678, "mean_token_accuracy": 0.4068965494632721, "step": 98515 }, { "epoch": 0.09923039036462032, "grad_norm": 10.192882297896023, "learning_rate": 4.9702493674821935e-05, "loss": 2.3648, "mean_token_accuracy": 0.42758620977401735, "step": 98520 }, { "epoch": 0.0992354264177245, "grad_norm": 14.420117220228178, "learning_rate": 4.970243293786914e-05, "loss": 2.8603, "mean_token_accuracy": 0.3965517282485962, "step": 98525 }, { "epoch": 0.09924046247082867, "grad_norm": 9.781530864769534, "learning_rate": 4.970237219475843e-05, "loss": 2.9612, "mean_token_accuracy": 0.3827586233615875, "step": 98530 }, { "epoch": 0.09924549852393283, "grad_norm": 10.016367391422348, "learning_rate": 4.97023114454898e-05, "loss": 2.2179, "mean_token_accuracy": 0.4103448212146759, "step": 98535 }, { "epoch": 0.099250534577037, "grad_norm": 12.911669335938276, "learning_rate": 4.9702250690063285e-05, "loss": 2.4651, "mean_token_accuracy": 0.3862069010734558, "step": 98540 }, { "epoch": 0.09925557063014118, "grad_norm": 11.094196147909896, "learning_rate": 4.970218992847889e-05, "loss": 2.2788, "mean_token_accuracy": 0.4482758641242981, "step": 98545 }, { "epoch": 0.09926060668324535, "grad_norm": 10.743605574524937, "learning_rate": 4.970212916073663e-05, "loss": 2.724, "mean_token_accuracy": 0.37586205899715425, "step": 98550 }, { "epoch": 0.09926564273634952, "grad_norm": 8.44018118026029, "learning_rate": 4.970206838683653e-05, "loss": 2.2113, "mean_token_accuracy": 0.4689655125141144, "step": 98555 }, { "epoch": 0.0992706787894537, "grad_norm": 12.436463688893426, "learning_rate": 4.97020076067786e-05, "loss": 2.3063, "mean_token_accuracy": 0.42413792610168455, "step": 98560 }, { "epoch": 0.09927571484255787, "grad_norm": 11.250473691738149, "learning_rate": 4.9701946820562875e-05, "loss": 2.2968, "mean_token_accuracy": 0.47931033968925474, "step": 98565 }, { "epoch": 0.09928075089566205, "grad_norm": 13.219721867856176, "learning_rate": 4.970188602818934e-05, "loss": 2.7902, "mean_token_accuracy": 0.34827586710453035, "step": 98570 }, { "epoch": 0.09928578694876622, "grad_norm": 10.866393075138312, "learning_rate": 4.9701825229658036e-05, "loss": 2.0123, "mean_token_accuracy": 0.5137931048870087, "step": 98575 }, { "epoch": 0.09929082300187039, "grad_norm": 11.065729230389186, "learning_rate": 4.970176442496898e-05, "loss": 2.3753, "mean_token_accuracy": 0.4344827592372894, "step": 98580 }, { "epoch": 0.09929585905497457, "grad_norm": 11.691243329008165, "learning_rate": 4.970170361412217e-05, "loss": 2.5292, "mean_token_accuracy": 0.4103448212146759, "step": 98585 }, { "epoch": 0.09930089510807874, "grad_norm": 9.518201721618828, "learning_rate": 4.9701642797117635e-05, "loss": 2.637, "mean_token_accuracy": 0.3620689630508423, "step": 98590 }, { "epoch": 0.09930593116118291, "grad_norm": 11.566564809877233, "learning_rate": 4.97015819739554e-05, "loss": 2.6406, "mean_token_accuracy": 0.37241379022598264, "step": 98595 }, { "epoch": 0.09931096721428709, "grad_norm": 12.10777567611062, "learning_rate": 4.9701521144635466e-05, "loss": 2.3623, "mean_token_accuracy": 0.4206896543502808, "step": 98600 }, { "epoch": 0.09931600326739125, "grad_norm": 10.330323316124542, "learning_rate": 4.970146030915786e-05, "loss": 2.0646, "mean_token_accuracy": 0.47586206197738645, "step": 98605 }, { "epoch": 0.09932103932049542, "grad_norm": 11.049019002518808, "learning_rate": 4.97013994675226e-05, "loss": 2.5011, "mean_token_accuracy": 0.4172413766384125, "step": 98610 }, { "epoch": 0.0993260753735996, "grad_norm": 10.953651094109697, "learning_rate": 4.97013386197297e-05, "loss": 2.5149, "mean_token_accuracy": 0.4551724076271057, "step": 98615 }, { "epoch": 0.09933111142670377, "grad_norm": 10.5106908839289, "learning_rate": 4.970127776577917e-05, "loss": 2.5463, "mean_token_accuracy": 0.43793103098869324, "step": 98620 }, { "epoch": 0.09933614747980794, "grad_norm": 9.949187433789927, "learning_rate": 4.9701216905671034e-05, "loss": 2.5424, "mean_token_accuracy": 0.3931034505367279, "step": 98625 }, { "epoch": 0.09934118353291212, "grad_norm": 10.674849677636555, "learning_rate": 4.970115603940531e-05, "loss": 2.4712, "mean_token_accuracy": 0.41379310488700866, "step": 98630 }, { "epoch": 0.09934621958601629, "grad_norm": 10.822906127121689, "learning_rate": 4.970109516698201e-05, "loss": 2.3629, "mean_token_accuracy": 0.5, "step": 98635 }, { "epoch": 0.09935125563912046, "grad_norm": 9.107292054737528, "learning_rate": 4.970103428840115e-05, "loss": 2.7903, "mean_token_accuracy": 0.34482758343219755, "step": 98640 }, { "epoch": 0.09935629169222464, "grad_norm": 12.27485698258569, "learning_rate": 4.970097340366276e-05, "loss": 2.4764, "mean_token_accuracy": 0.37586207389831544, "step": 98645 }, { "epoch": 0.09936132774532881, "grad_norm": 14.601480566352635, "learning_rate": 4.9700912512766834e-05, "loss": 2.9752, "mean_token_accuracy": 0.40532365441322327, "step": 98650 }, { "epoch": 0.09936636379843299, "grad_norm": 12.298621456984765, "learning_rate": 4.970085161571341e-05, "loss": 2.5744, "mean_token_accuracy": 0.39310344457626345, "step": 98655 }, { "epoch": 0.09937139985153716, "grad_norm": 12.642852450677413, "learning_rate": 4.97007907125025e-05, "loss": 2.7501, "mean_token_accuracy": 0.4156079888343811, "step": 98660 }, { "epoch": 0.09937643590464133, "grad_norm": 10.632012229562127, "learning_rate": 4.970072980313411e-05, "loss": 2.668, "mean_token_accuracy": 0.3931034505367279, "step": 98665 }, { "epoch": 0.0993814719577455, "grad_norm": 10.964231559771315, "learning_rate": 4.970066888760827e-05, "loss": 2.9092, "mean_token_accuracy": 0.3931034505367279, "step": 98670 }, { "epoch": 0.09938650801084967, "grad_norm": 9.934552541215675, "learning_rate": 4.9700607965924995e-05, "loss": 2.7586, "mean_token_accuracy": 0.37586206793785093, "step": 98675 }, { "epoch": 0.09939154406395384, "grad_norm": 10.661615742983955, "learning_rate": 4.9700547038084294e-05, "loss": 2.4344, "mean_token_accuracy": 0.4379310369491577, "step": 98680 }, { "epoch": 0.09939658011705801, "grad_norm": 10.721069147172011, "learning_rate": 4.970048610408619e-05, "loss": 2.3045, "mean_token_accuracy": 0.4551724135875702, "step": 98685 }, { "epoch": 0.09940161617016219, "grad_norm": 10.82883695641102, "learning_rate": 4.9700425163930695e-05, "loss": 2.4394, "mean_token_accuracy": 0.39655172228813174, "step": 98690 }, { "epoch": 0.09940665222326636, "grad_norm": 12.265849840191644, "learning_rate": 4.9700364217617825e-05, "loss": 2.4691, "mean_token_accuracy": 0.42068964838981626, "step": 98695 }, { "epoch": 0.09941168827637054, "grad_norm": 11.248099816361371, "learning_rate": 4.9700303265147615e-05, "loss": 2.3333, "mean_token_accuracy": 0.44827585816383364, "step": 98700 }, { "epoch": 0.09941672432947471, "grad_norm": 12.595841418237173, "learning_rate": 4.970024230652006e-05, "loss": 2.5097, "mean_token_accuracy": 0.4137930989265442, "step": 98705 }, { "epoch": 0.09942176038257888, "grad_norm": 11.077531968957656, "learning_rate": 4.970018134173519e-05, "loss": 2.4496, "mean_token_accuracy": 0.4034482777118683, "step": 98710 }, { "epoch": 0.09942679643568306, "grad_norm": 9.681898887377212, "learning_rate": 4.9700120370793006e-05, "loss": 2.2035, "mean_token_accuracy": 0.4517241358757019, "step": 98715 }, { "epoch": 0.09943183248878723, "grad_norm": 10.118208067177969, "learning_rate": 4.970005939369355e-05, "loss": 2.5644, "mean_token_accuracy": 0.4206896543502808, "step": 98720 }, { "epoch": 0.0994368685418914, "grad_norm": 11.467393157413877, "learning_rate": 4.969999841043682e-05, "loss": 2.5986, "mean_token_accuracy": 0.4103448331356049, "step": 98725 }, { "epoch": 0.09944190459499558, "grad_norm": 10.59435410237419, "learning_rate": 4.9699937421022825e-05, "loss": 2.1589, "mean_token_accuracy": 0.4379310369491577, "step": 98730 }, { "epoch": 0.09944694064809975, "grad_norm": 11.176840101673928, "learning_rate": 4.969987642545161e-05, "loss": 2.5739, "mean_token_accuracy": 0.3999999940395355, "step": 98735 }, { "epoch": 0.09945197670120393, "grad_norm": 12.273939670179729, "learning_rate": 4.9699815423723166e-05, "loss": 2.67, "mean_token_accuracy": 0.36896551251411436, "step": 98740 }, { "epoch": 0.09945701275430809, "grad_norm": 10.198438594014455, "learning_rate": 4.969975441583753e-05, "loss": 2.7478, "mean_token_accuracy": 0.39310344457626345, "step": 98745 }, { "epoch": 0.09946204880741226, "grad_norm": 11.766708095950554, "learning_rate": 4.96996934017947e-05, "loss": 2.3438, "mean_token_accuracy": 0.4430732011795044, "step": 98750 }, { "epoch": 0.09946708486051643, "grad_norm": 15.5827839047036, "learning_rate": 4.969963238159471e-05, "loss": 2.8499, "mean_token_accuracy": 0.39310344457626345, "step": 98755 }, { "epoch": 0.0994721209136206, "grad_norm": 11.220868741327068, "learning_rate": 4.969957135523756e-05, "loss": 2.4575, "mean_token_accuracy": 0.3999999940395355, "step": 98760 }, { "epoch": 0.09947715696672478, "grad_norm": 10.630225877909318, "learning_rate": 4.969951032272329e-05, "loss": 2.2291, "mean_token_accuracy": 0.4103448331356049, "step": 98765 }, { "epoch": 0.09948219301982895, "grad_norm": 10.800337562717154, "learning_rate": 4.969944928405189e-05, "loss": 2.1977, "mean_token_accuracy": 0.4344827651977539, "step": 98770 }, { "epoch": 0.09948722907293313, "grad_norm": 10.357729836771824, "learning_rate": 4.9699388239223396e-05, "loss": 2.2495, "mean_token_accuracy": 0.4535995125770569, "step": 98775 }, { "epoch": 0.0994922651260373, "grad_norm": 11.569148634465867, "learning_rate": 4.969932718823781e-05, "loss": 2.4024, "mean_token_accuracy": 0.4034482777118683, "step": 98780 }, { "epoch": 0.09949730117914148, "grad_norm": 10.0707680592349, "learning_rate": 4.969926613109517e-05, "loss": 2.2049, "mean_token_accuracy": 0.437931028008461, "step": 98785 }, { "epoch": 0.09950233723224565, "grad_norm": 8.64780139545705, "learning_rate": 4.969920506779548e-05, "loss": 2.3762, "mean_token_accuracy": 0.4560344755649567, "step": 98790 }, { "epoch": 0.09950737328534982, "grad_norm": 13.284293459716656, "learning_rate": 4.969914399833875e-05, "loss": 2.6859, "mean_token_accuracy": 0.358620685338974, "step": 98795 }, { "epoch": 0.099512409338454, "grad_norm": 11.627583222495725, "learning_rate": 4.969908292272501e-05, "loss": 2.5095, "mean_token_accuracy": 0.436539626121521, "step": 98800 }, { "epoch": 0.09951744539155817, "grad_norm": 11.520644698152573, "learning_rate": 4.969902184095428e-05, "loss": 2.2787, "mean_token_accuracy": 0.45517241954803467, "step": 98805 }, { "epoch": 0.09952248144466234, "grad_norm": 10.412683323961023, "learning_rate": 4.9698960753026544e-05, "loss": 2.4295, "mean_token_accuracy": 0.3931034505367279, "step": 98810 }, { "epoch": 0.0995275174977665, "grad_norm": 10.654100452719224, "learning_rate": 4.9698899658941865e-05, "loss": 2.1985, "mean_token_accuracy": 0.4379310429096222, "step": 98815 }, { "epoch": 0.09953255355087068, "grad_norm": 10.042910275665275, "learning_rate": 4.9698838558700225e-05, "loss": 2.4013, "mean_token_accuracy": 0.4413793087005615, "step": 98820 }, { "epoch": 0.09953758960397485, "grad_norm": 12.211300078064497, "learning_rate": 4.9698777452301665e-05, "loss": 2.9837, "mean_token_accuracy": 0.3551724076271057, "step": 98825 }, { "epoch": 0.09954262565707903, "grad_norm": 11.414060841156932, "learning_rate": 4.9698716339746185e-05, "loss": 2.8429, "mean_token_accuracy": 0.3551724165678024, "step": 98830 }, { "epoch": 0.0995476617101832, "grad_norm": 9.296741925534977, "learning_rate": 4.969865522103381e-05, "loss": 2.473, "mean_token_accuracy": 0.4413793087005615, "step": 98835 }, { "epoch": 0.09955269776328737, "grad_norm": 10.075177711016169, "learning_rate": 4.969859409616456e-05, "loss": 2.2966, "mean_token_accuracy": 0.4482758641242981, "step": 98840 }, { "epoch": 0.09955773381639155, "grad_norm": 10.44830127364626, "learning_rate": 4.969853296513844e-05, "loss": 2.6144, "mean_token_accuracy": 0.4068965554237366, "step": 98845 }, { "epoch": 0.09956276986949572, "grad_norm": 18.1209026497701, "learning_rate": 4.969847182795548e-05, "loss": 2.4151, "mean_token_accuracy": 0.4482758641242981, "step": 98850 }, { "epoch": 0.0995678059225999, "grad_norm": 13.347163491758097, "learning_rate": 4.9698410684615687e-05, "loss": 2.5989, "mean_token_accuracy": 0.39655172228813174, "step": 98855 }, { "epoch": 0.09957284197570407, "grad_norm": 10.474832666221184, "learning_rate": 4.969834953511908e-05, "loss": 2.4402, "mean_token_accuracy": 0.39655172228813174, "step": 98860 }, { "epoch": 0.09957787802880824, "grad_norm": 11.970620745948679, "learning_rate": 4.969828837946568e-05, "loss": 2.438, "mean_token_accuracy": 0.4068965494632721, "step": 98865 }, { "epoch": 0.09958291408191242, "grad_norm": 11.337109254039637, "learning_rate": 4.96982272176555e-05, "loss": 2.2941, "mean_token_accuracy": 0.4620689690113068, "step": 98870 }, { "epoch": 0.09958795013501659, "grad_norm": 10.791998404263058, "learning_rate": 4.9698166049688564e-05, "loss": 2.4983, "mean_token_accuracy": 0.4206896543502808, "step": 98875 }, { "epoch": 0.09959298618812076, "grad_norm": 8.127306335718513, "learning_rate": 4.9698104875564875e-05, "loss": 2.1692, "mean_token_accuracy": 0.4413793087005615, "step": 98880 }, { "epoch": 0.09959802224122492, "grad_norm": 10.160648262308854, "learning_rate": 4.9698043695284476e-05, "loss": 2.7352, "mean_token_accuracy": 0.41034482717514037, "step": 98885 }, { "epoch": 0.0996030582943291, "grad_norm": 9.432822650485459, "learning_rate": 4.969798250884735e-05, "loss": 2.0272, "mean_token_accuracy": 0.4965517222881317, "step": 98890 }, { "epoch": 0.09960809434743327, "grad_norm": 12.200758439472196, "learning_rate": 4.969792131625353e-05, "loss": 2.8701, "mean_token_accuracy": 0.3362976461648941, "step": 98895 }, { "epoch": 0.09961313040053744, "grad_norm": 8.700086054407246, "learning_rate": 4.969786011750305e-05, "loss": 2.2461, "mean_token_accuracy": 0.4658197224140167, "step": 98900 }, { "epoch": 0.09961816645364162, "grad_norm": 11.710281529437616, "learning_rate": 4.969779891259589e-05, "loss": 2.7498, "mean_token_accuracy": 0.36896551847457887, "step": 98905 }, { "epoch": 0.09962320250674579, "grad_norm": 18.58801103588291, "learning_rate": 4.9697737701532106e-05, "loss": 2.5482, "mean_token_accuracy": 0.3931034505367279, "step": 98910 }, { "epoch": 0.09962823855984997, "grad_norm": 10.457774100455532, "learning_rate": 4.9697676484311685e-05, "loss": 2.2683, "mean_token_accuracy": 0.4206896543502808, "step": 98915 }, { "epoch": 0.09963327461295414, "grad_norm": 10.021663273790406, "learning_rate": 4.9697615260934663e-05, "loss": 2.3746, "mean_token_accuracy": 0.44827587008476255, "step": 98920 }, { "epoch": 0.09963831066605831, "grad_norm": 9.49497681748817, "learning_rate": 4.969755403140105e-05, "loss": 2.0187, "mean_token_accuracy": 0.4965517222881317, "step": 98925 }, { "epoch": 0.09964334671916249, "grad_norm": 12.515823715786759, "learning_rate": 4.9697492795710856e-05, "loss": 2.545, "mean_token_accuracy": 0.37586206793785093, "step": 98930 }, { "epoch": 0.09964838277226666, "grad_norm": 11.375614846554585, "learning_rate": 4.9697431553864105e-05, "loss": 2.6744, "mean_token_accuracy": 0.3944948613643646, "step": 98935 }, { "epoch": 0.09965341882537083, "grad_norm": 10.139094090256808, "learning_rate": 4.969737030586081e-05, "loss": 2.1591, "mean_token_accuracy": 0.4448275864124298, "step": 98940 }, { "epoch": 0.09965845487847501, "grad_norm": 10.883906244462077, "learning_rate": 4.969730905170101e-05, "loss": 2.7957, "mean_token_accuracy": 0.3999999940395355, "step": 98945 }, { "epoch": 0.09966349093157917, "grad_norm": 10.373030960042904, "learning_rate": 4.969724779138469e-05, "loss": 2.1725, "mean_token_accuracy": 0.4172413796186447, "step": 98950 }, { "epoch": 0.09966852698468334, "grad_norm": 10.48641410286295, "learning_rate": 4.969718652491188e-05, "loss": 2.3644, "mean_token_accuracy": 0.458620685338974, "step": 98955 }, { "epoch": 0.09967356303778752, "grad_norm": 12.134083299054232, "learning_rate": 4.969712525228259e-05, "loss": 2.4977, "mean_token_accuracy": 0.44482758045196535, "step": 98960 }, { "epoch": 0.09967859909089169, "grad_norm": 11.871116464171761, "learning_rate": 4.969706397349686e-05, "loss": 2.6052, "mean_token_accuracy": 0.4119177281856537, "step": 98965 }, { "epoch": 0.09968363514399586, "grad_norm": 11.380105868826336, "learning_rate": 4.969700268855468e-05, "loss": 2.3494, "mean_token_accuracy": 0.43793103098869324, "step": 98970 }, { "epoch": 0.09968867119710004, "grad_norm": 9.727769067454384, "learning_rate": 4.969694139745609e-05, "loss": 2.3021, "mean_token_accuracy": 0.4275861978530884, "step": 98975 }, { "epoch": 0.09969370725020421, "grad_norm": 13.115409471336744, "learning_rate": 4.9696880100201084e-05, "loss": 2.7524, "mean_token_accuracy": 0.4172413766384125, "step": 98980 }, { "epoch": 0.09969874330330838, "grad_norm": 9.869143397132234, "learning_rate": 4.96968187967897e-05, "loss": 2.3224, "mean_token_accuracy": 0.41379310488700866, "step": 98985 }, { "epoch": 0.09970377935641256, "grad_norm": 12.666317483410737, "learning_rate": 4.9696757487221934e-05, "loss": 2.5547, "mean_token_accuracy": 0.4310344815254211, "step": 98990 }, { "epoch": 0.09970881540951673, "grad_norm": 14.222433735127392, "learning_rate": 4.969669617149783e-05, "loss": 2.4814, "mean_token_accuracy": 0.42413792610168455, "step": 98995 }, { "epoch": 0.0997138514626209, "grad_norm": 11.10356258419414, "learning_rate": 4.9696634849617374e-05, "loss": 2.4463, "mean_token_accuracy": 0.4, "step": 99000 }, { "epoch": 0.09971888751572508, "grad_norm": 9.593355543892944, "learning_rate": 4.969657352158061e-05, "loss": 2.6309, "mean_token_accuracy": 0.40689654350280763, "step": 99005 }, { "epoch": 0.09972392356882925, "grad_norm": 10.782492613959077, "learning_rate": 4.9696512187387534e-05, "loss": 2.5121, "mean_token_accuracy": 0.37586206793785093, "step": 99010 }, { "epoch": 0.09972895962193343, "grad_norm": 10.535287164253207, "learning_rate": 4.969645084703818e-05, "loss": 2.214, "mean_token_accuracy": 0.4517241418361664, "step": 99015 }, { "epoch": 0.09973399567503759, "grad_norm": 9.258756670458862, "learning_rate": 4.9696389500532555e-05, "loss": 2.2542, "mean_token_accuracy": 0.3931034505367279, "step": 99020 }, { "epoch": 0.09973903172814176, "grad_norm": 12.422026695781394, "learning_rate": 4.969632814787068e-05, "loss": 2.8272, "mean_token_accuracy": 0.3655172407627106, "step": 99025 }, { "epoch": 0.09974406778124593, "grad_norm": 10.561823931093555, "learning_rate": 4.969626678905257e-05, "loss": 2.5561, "mean_token_accuracy": 0.4103448331356049, "step": 99030 }, { "epoch": 0.09974910383435011, "grad_norm": 8.580780370886467, "learning_rate": 4.969620542407824e-05, "loss": 2.6034, "mean_token_accuracy": 0.38965516686439516, "step": 99035 }, { "epoch": 0.09975413988745428, "grad_norm": 10.58446338342712, "learning_rate": 4.9696144052947705e-05, "loss": 2.1629, "mean_token_accuracy": 0.40689654350280763, "step": 99040 }, { "epoch": 0.09975917594055846, "grad_norm": 15.544189544015795, "learning_rate": 4.9696082675661e-05, "loss": 2.5914, "mean_token_accuracy": 0.4401088833808899, "step": 99045 }, { "epoch": 0.09976421199366263, "grad_norm": 10.989792785526587, "learning_rate": 4.969602129221812e-05, "loss": 2.4828, "mean_token_accuracy": 0.39830611646175385, "step": 99050 }, { "epoch": 0.0997692480467668, "grad_norm": 10.672310492075967, "learning_rate": 4.969595990261909e-05, "loss": 2.7451, "mean_token_accuracy": 0.3862069010734558, "step": 99055 }, { "epoch": 0.09977428409987098, "grad_norm": 11.742391124157088, "learning_rate": 4.9695898506863934e-05, "loss": 2.9543, "mean_token_accuracy": 0.3931034505367279, "step": 99060 }, { "epoch": 0.09977932015297515, "grad_norm": 12.094649350930446, "learning_rate": 4.969583710495266e-05, "loss": 2.6948, "mean_token_accuracy": 0.39655172228813174, "step": 99065 }, { "epoch": 0.09978435620607932, "grad_norm": 12.232624536325778, "learning_rate": 4.969577569688529e-05, "loss": 2.452, "mean_token_accuracy": 0.4137930929660797, "step": 99070 }, { "epoch": 0.0997893922591835, "grad_norm": 10.933116284129689, "learning_rate": 4.969571428266183e-05, "loss": 2.6466, "mean_token_accuracy": 0.36551723778247835, "step": 99075 }, { "epoch": 0.09979442831228767, "grad_norm": 7.915417517343722, "learning_rate": 4.969565286228231e-05, "loss": 2.4086, "mean_token_accuracy": 0.4620689570903778, "step": 99080 }, { "epoch": 0.09979946436539185, "grad_norm": 10.46099070616217, "learning_rate": 4.969559143574675e-05, "loss": 2.1768, "mean_token_accuracy": 0.4517241299152374, "step": 99085 }, { "epoch": 0.099804500418496, "grad_norm": 13.41693857158804, "learning_rate": 4.969553000305516e-05, "loss": 2.8919, "mean_token_accuracy": 0.32758620381355286, "step": 99090 }, { "epoch": 0.09980953647160018, "grad_norm": 12.503043460188392, "learning_rate": 4.9695468564207546e-05, "loss": 2.6278, "mean_token_accuracy": 0.3862068891525269, "step": 99095 }, { "epoch": 0.09981457252470435, "grad_norm": 12.459121011245854, "learning_rate": 4.969540711920394e-05, "loss": 2.5036, "mean_token_accuracy": 0.42220205068588257, "step": 99100 }, { "epoch": 0.09981960857780853, "grad_norm": 13.892588678278305, "learning_rate": 4.969534566804436e-05, "loss": 2.0729, "mean_token_accuracy": 0.4655172348022461, "step": 99105 }, { "epoch": 0.0998246446309127, "grad_norm": 12.195619404815158, "learning_rate": 4.969528421072881e-05, "loss": 2.4114, "mean_token_accuracy": 0.3482758581638336, "step": 99110 }, { "epoch": 0.09982968068401687, "grad_norm": 12.23996210337998, "learning_rate": 4.9695222747257316e-05, "loss": 2.5238, "mean_token_accuracy": 0.43448275327682495, "step": 99115 }, { "epoch": 0.09983471673712105, "grad_norm": 10.417373315229511, "learning_rate": 4.969516127762991e-05, "loss": 2.872, "mean_token_accuracy": 0.3655172407627106, "step": 99120 }, { "epoch": 0.09983975279022522, "grad_norm": 11.708786220345328, "learning_rate": 4.969509980184657e-05, "loss": 2.5609, "mean_token_accuracy": 0.42758620381355283, "step": 99125 }, { "epoch": 0.0998447888433294, "grad_norm": 14.886931717542952, "learning_rate": 4.969503831990735e-05, "loss": 2.4945, "mean_token_accuracy": 0.41034482717514037, "step": 99130 }, { "epoch": 0.09984982489643357, "grad_norm": 11.219473864698278, "learning_rate": 4.969497683181225e-05, "loss": 2.6717, "mean_token_accuracy": 0.39655172228813174, "step": 99135 }, { "epoch": 0.09985486094953774, "grad_norm": 14.21772172548677, "learning_rate": 4.969491533756129e-05, "loss": 2.6834, "mean_token_accuracy": 0.3620689630508423, "step": 99140 }, { "epoch": 0.09985989700264192, "grad_norm": 11.36825899213656, "learning_rate": 4.9694853837154485e-05, "loss": 2.564, "mean_token_accuracy": 0.41379310488700866, "step": 99145 }, { "epoch": 0.09986493305574609, "grad_norm": 10.392873837240568, "learning_rate": 4.969479233059186e-05, "loss": 2.433, "mean_token_accuracy": 0.4, "step": 99150 }, { "epoch": 0.09986996910885027, "grad_norm": 12.594430542438904, "learning_rate": 4.9694730817873424e-05, "loss": 2.6588, "mean_token_accuracy": 0.3551724076271057, "step": 99155 }, { "epoch": 0.09987500516195442, "grad_norm": 11.355689336299601, "learning_rate": 4.9694669298999194e-05, "loss": 2.3361, "mean_token_accuracy": 0.3981851160526276, "step": 99160 }, { "epoch": 0.0998800412150586, "grad_norm": 8.462548672991622, "learning_rate": 4.9694607773969196e-05, "loss": 2.2673, "mean_token_accuracy": 0.4310344815254211, "step": 99165 }, { "epoch": 0.09988507726816277, "grad_norm": 11.166572742782451, "learning_rate": 4.9694546242783443e-05, "loss": 2.2497, "mean_token_accuracy": 0.436539626121521, "step": 99170 }, { "epoch": 0.09989011332126695, "grad_norm": 13.443203151636299, "learning_rate": 4.9694484705441944e-05, "loss": 2.4326, "mean_token_accuracy": 0.44482759237289426, "step": 99175 }, { "epoch": 0.09989514937437112, "grad_norm": 11.619874225175279, "learning_rate": 4.9694423161944716e-05, "loss": 2.3325, "mean_token_accuracy": 0.4344827592372894, "step": 99180 }, { "epoch": 0.0999001854274753, "grad_norm": 9.716694246877593, "learning_rate": 4.969436161229178e-05, "loss": 2.0969, "mean_token_accuracy": 0.47931033968925474, "step": 99185 }, { "epoch": 0.09990522148057947, "grad_norm": 9.276625419870708, "learning_rate": 4.9694300056483175e-05, "loss": 2.3838, "mean_token_accuracy": 0.42601330280303956, "step": 99190 }, { "epoch": 0.09991025753368364, "grad_norm": 10.682665418150888, "learning_rate": 4.969423849451888e-05, "loss": 2.4852, "mean_token_accuracy": 0.4448275983333588, "step": 99195 }, { "epoch": 0.09991529358678782, "grad_norm": 11.8784768354687, "learning_rate": 4.969417692639894e-05, "loss": 2.6568, "mean_token_accuracy": 0.4, "step": 99200 }, { "epoch": 0.09992032963989199, "grad_norm": 10.728917227615135, "learning_rate": 4.969411535212336e-05, "loss": 2.4141, "mean_token_accuracy": 0.41379310488700866, "step": 99205 }, { "epoch": 0.09992536569299616, "grad_norm": 10.011199717063104, "learning_rate": 4.969405377169216e-05, "loss": 2.2496, "mean_token_accuracy": 0.4344827592372894, "step": 99210 }, { "epoch": 0.09993040174610034, "grad_norm": 9.797106060114677, "learning_rate": 4.969399218510535e-05, "loss": 2.2509, "mean_token_accuracy": 0.48415002822875974, "step": 99215 }, { "epoch": 0.09993543779920451, "grad_norm": 10.213380519425609, "learning_rate": 4.969393059236296e-05, "loss": 2.2472, "mean_token_accuracy": 0.47241380214691164, "step": 99220 }, { "epoch": 0.09994047385230868, "grad_norm": 10.298990486925344, "learning_rate": 4.9693868993464996e-05, "loss": 2.5746, "mean_token_accuracy": 0.41379310488700866, "step": 99225 }, { "epoch": 0.09994550990541284, "grad_norm": 10.810960491940095, "learning_rate": 4.969380738841148e-05, "loss": 2.4035, "mean_token_accuracy": 0.4103448331356049, "step": 99230 }, { "epoch": 0.09995054595851702, "grad_norm": 10.777615275321356, "learning_rate": 4.969374577720243e-05, "loss": 2.759, "mean_token_accuracy": 0.39310343861579894, "step": 99235 }, { "epoch": 0.09995558201162119, "grad_norm": 10.221440041771125, "learning_rate": 4.969368415983786e-05, "loss": 2.1677, "mean_token_accuracy": 0.4689655125141144, "step": 99240 }, { "epoch": 0.09996061806472537, "grad_norm": 16.346927451575883, "learning_rate": 4.96936225363178e-05, "loss": 2.7581, "mean_token_accuracy": 0.4068965554237366, "step": 99245 }, { "epoch": 0.09996565411782954, "grad_norm": 14.641648106751774, "learning_rate": 4.9693560906642246e-05, "loss": 2.6693, "mean_token_accuracy": 0.3827586233615875, "step": 99250 }, { "epoch": 0.09997069017093371, "grad_norm": 11.980772030032853, "learning_rate": 4.9693499270811224e-05, "loss": 2.5957, "mean_token_accuracy": 0.3793103456497192, "step": 99255 }, { "epoch": 0.09997572622403789, "grad_norm": 10.21698409410971, "learning_rate": 4.969343762882475e-05, "loss": 2.3181, "mean_token_accuracy": 0.4620689630508423, "step": 99260 }, { "epoch": 0.09998076227714206, "grad_norm": 13.716931006099163, "learning_rate": 4.9693375980682846e-05, "loss": 2.6615, "mean_token_accuracy": 0.40139141082763674, "step": 99265 }, { "epoch": 0.09998579833024623, "grad_norm": 12.44162635338195, "learning_rate": 4.969331432638553e-05, "loss": 2.4805, "mean_token_accuracy": 0.4034482717514038, "step": 99270 }, { "epoch": 0.09999083438335041, "grad_norm": 9.823905401134374, "learning_rate": 4.969325266593281e-05, "loss": 2.0817, "mean_token_accuracy": 0.4862069010734558, "step": 99275 }, { "epoch": 0.09999587043645458, "grad_norm": 15.011565730512642, "learning_rate": 4.969319099932472e-05, "loss": 2.6554, "mean_token_accuracy": 0.3655172407627106, "step": 99280 }, { "epoch": 0.10000090648955876, "grad_norm": 12.407264718586683, "learning_rate": 4.9693129326561254e-05, "loss": 2.2656, "mean_token_accuracy": 0.4758620738983154, "step": 99285 }, { "epoch": 0.10000594254266293, "grad_norm": 10.960894213379845, "learning_rate": 4.969306764764245e-05, "loss": 2.7494, "mean_token_accuracy": 0.44482758045196535, "step": 99290 }, { "epoch": 0.1000109785957671, "grad_norm": 10.37501664405137, "learning_rate": 4.969300596256831e-05, "loss": 2.4118, "mean_token_accuracy": 0.41379310488700866, "step": 99295 }, { "epoch": 0.10001601464887126, "grad_norm": 13.114081328634484, "learning_rate": 4.969294427133885e-05, "loss": 2.2604, "mean_token_accuracy": 0.4344827592372894, "step": 99300 }, { "epoch": 0.10002105070197544, "grad_norm": 10.529368929388264, "learning_rate": 4.96928825739541e-05, "loss": 2.2526, "mean_token_accuracy": 0.4310344815254211, "step": 99305 }, { "epoch": 0.10002608675507961, "grad_norm": 10.295379670241816, "learning_rate": 4.969282087041408e-05, "loss": 2.4375, "mean_token_accuracy": 0.41034482717514037, "step": 99310 }, { "epoch": 0.10003112280818378, "grad_norm": 10.464335594527368, "learning_rate": 4.96927591607188e-05, "loss": 2.4012, "mean_token_accuracy": 0.4517241418361664, "step": 99315 }, { "epoch": 0.10003615886128796, "grad_norm": 11.732558757512262, "learning_rate": 4.969269744486826e-05, "loss": 2.4388, "mean_token_accuracy": 0.42758620381355283, "step": 99320 }, { "epoch": 0.10004119491439213, "grad_norm": 8.823232934067352, "learning_rate": 4.969263572286249e-05, "loss": 2.3381, "mean_token_accuracy": 0.4781004309654236, "step": 99325 }, { "epoch": 0.1000462309674963, "grad_norm": 11.20387134333757, "learning_rate": 4.9692573994701526e-05, "loss": 2.4972, "mean_token_accuracy": 0.4172413766384125, "step": 99330 }, { "epoch": 0.10005126702060048, "grad_norm": 12.471100555508237, "learning_rate": 4.969251226038537e-05, "loss": 2.5894, "mean_token_accuracy": 0.4137930989265442, "step": 99335 }, { "epoch": 0.10005630307370465, "grad_norm": 11.70143422867852, "learning_rate": 4.9692450519914016e-05, "loss": 2.4349, "mean_token_accuracy": 0.42758620381355283, "step": 99340 }, { "epoch": 0.10006133912680883, "grad_norm": 11.763850201073302, "learning_rate": 4.969238877328752e-05, "loss": 2.5607, "mean_token_accuracy": 0.3793103456497192, "step": 99345 }, { "epoch": 0.100066375179913, "grad_norm": 9.956199229471707, "learning_rate": 4.969232702050588e-05, "loss": 2.1737, "mean_token_accuracy": 0.43793103098869324, "step": 99350 }, { "epoch": 0.10007141123301717, "grad_norm": 11.909644105979483, "learning_rate": 4.969226526156912e-05, "loss": 2.5138, "mean_token_accuracy": 0.4103448331356049, "step": 99355 }, { "epoch": 0.10007644728612135, "grad_norm": 10.416608854649859, "learning_rate": 4.969220349647724e-05, "loss": 3.2346, "mean_token_accuracy": 0.38669951260089874, "step": 99360 }, { "epoch": 0.10008148333922552, "grad_norm": 11.33488239024833, "learning_rate": 4.969214172523027e-05, "loss": 2.2209, "mean_token_accuracy": 0.482758629322052, "step": 99365 }, { "epoch": 0.10008651939232968, "grad_norm": 24.836189889071715, "learning_rate": 4.969207994782824e-05, "loss": 2.5347, "mean_token_accuracy": 0.41724138259887694, "step": 99370 }, { "epoch": 0.10009155544543386, "grad_norm": 14.293277722426808, "learning_rate": 4.969201816427115e-05, "loss": 2.7279, "mean_token_accuracy": 0.4068965494632721, "step": 99375 }, { "epoch": 0.10009659149853803, "grad_norm": 9.531946835537436, "learning_rate": 4.969195637455901e-05, "loss": 2.488, "mean_token_accuracy": 0.4517241358757019, "step": 99380 }, { "epoch": 0.1001016275516422, "grad_norm": 12.535947009106794, "learning_rate": 4.969189457869186e-05, "loss": 2.6647, "mean_token_accuracy": 0.36896551549434664, "step": 99385 }, { "epoch": 0.10010666360474638, "grad_norm": 10.522443380350985, "learning_rate": 4.96918327766697e-05, "loss": 2.4499, "mean_token_accuracy": 0.40157289505004884, "step": 99390 }, { "epoch": 0.10011169965785055, "grad_norm": 14.99927374364409, "learning_rate": 4.9691770968492554e-05, "loss": 2.6767, "mean_token_accuracy": 0.4, "step": 99395 }, { "epoch": 0.10011673571095472, "grad_norm": 9.714847932169736, "learning_rate": 4.9691709154160446e-05, "loss": 2.5908, "mean_token_accuracy": 0.41724138259887694, "step": 99400 }, { "epoch": 0.1001217717640589, "grad_norm": 10.828901569134635, "learning_rate": 4.969164733367337e-05, "loss": 2.3157, "mean_token_accuracy": 0.4517241418361664, "step": 99405 }, { "epoch": 0.10012680781716307, "grad_norm": 14.692788611917804, "learning_rate": 4.9691585507031364e-05, "loss": 2.644, "mean_token_accuracy": 0.37586207389831544, "step": 99410 }, { "epoch": 0.10013184387026725, "grad_norm": 9.609449728062376, "learning_rate": 4.969152367423444e-05, "loss": 2.3803, "mean_token_accuracy": 0.3793103456497192, "step": 99415 }, { "epoch": 0.10013687992337142, "grad_norm": 11.084672178078666, "learning_rate": 4.969146183528262e-05, "loss": 3.0063, "mean_token_accuracy": 0.31724137663841245, "step": 99420 }, { "epoch": 0.10014191597647559, "grad_norm": 9.833188228212133, "learning_rate": 4.96913999901759e-05, "loss": 2.4141, "mean_token_accuracy": 0.39655172228813174, "step": 99425 }, { "epoch": 0.10014695202957977, "grad_norm": 10.536684350389349, "learning_rate": 4.969133813891432e-05, "loss": 2.3567, "mean_token_accuracy": 0.4568663060665131, "step": 99430 }, { "epoch": 0.10015198808268394, "grad_norm": 10.12832787088772, "learning_rate": 4.9691276281497886e-05, "loss": 2.012, "mean_token_accuracy": 0.4862068951129913, "step": 99435 }, { "epoch": 0.1001570241357881, "grad_norm": 9.492752696885939, "learning_rate": 4.9691214417926627e-05, "loss": 2.5507, "mean_token_accuracy": 0.4241379380226135, "step": 99440 }, { "epoch": 0.10016206018889227, "grad_norm": 9.385173944734856, "learning_rate": 4.969115254820055e-05, "loss": 2.1055, "mean_token_accuracy": 0.4517241358757019, "step": 99445 }, { "epoch": 0.10016709624199645, "grad_norm": 11.413520320317394, "learning_rate": 4.9691090672319665e-05, "loss": 2.5107, "mean_token_accuracy": 0.4241379201412201, "step": 99450 }, { "epoch": 0.10017213229510062, "grad_norm": 11.141921549739099, "learning_rate": 4.969102879028401e-05, "loss": 2.3106, "mean_token_accuracy": 0.42758620381355283, "step": 99455 }, { "epoch": 0.1001771683482048, "grad_norm": 10.42985438905199, "learning_rate": 4.969096690209358e-05, "loss": 2.4024, "mean_token_accuracy": 0.36896551847457887, "step": 99460 }, { "epoch": 0.10018220440130897, "grad_norm": 9.851768781429033, "learning_rate": 4.969090500774841e-05, "loss": 2.1708, "mean_token_accuracy": 0.4724137902259827, "step": 99465 }, { "epoch": 0.10018724045441314, "grad_norm": 12.710849824936062, "learning_rate": 4.9690843107248506e-05, "loss": 2.4126, "mean_token_accuracy": 0.4068965494632721, "step": 99470 }, { "epoch": 0.10019227650751732, "grad_norm": 11.26834253617972, "learning_rate": 4.969078120059389e-05, "loss": 2.5826, "mean_token_accuracy": 0.41893526911735535, "step": 99475 }, { "epoch": 0.10019731256062149, "grad_norm": 10.91557208926437, "learning_rate": 4.969071928778458e-05, "loss": 2.5404, "mean_token_accuracy": 0.38965516686439516, "step": 99480 }, { "epoch": 0.10020234861372566, "grad_norm": 10.298171689786855, "learning_rate": 4.9690657368820586e-05, "loss": 2.2453, "mean_token_accuracy": 0.441379314661026, "step": 99485 }, { "epoch": 0.10020738466682984, "grad_norm": 9.876065869837042, "learning_rate": 4.9690595443701934e-05, "loss": 2.6354, "mean_token_accuracy": 0.3896551728248596, "step": 99490 }, { "epoch": 0.10021242071993401, "grad_norm": 11.965323352654787, "learning_rate": 4.9690533512428646e-05, "loss": 2.7009, "mean_token_accuracy": 0.4034482777118683, "step": 99495 }, { "epoch": 0.10021745677303819, "grad_norm": 10.491704515193655, "learning_rate": 4.969047157500072e-05, "loss": 2.4364, "mean_token_accuracy": 0.4535390198230743, "step": 99500 }, { "epoch": 0.10022249282614236, "grad_norm": 12.164310889053505, "learning_rate": 4.969040963141818e-05, "loss": 2.3295, "mean_token_accuracy": 0.4344827592372894, "step": 99505 }, { "epoch": 0.10022752887924652, "grad_norm": 9.39579182548424, "learning_rate": 4.969034768168105e-05, "loss": 2.138, "mean_token_accuracy": 0.4551724135875702, "step": 99510 }, { "epoch": 0.10023256493235069, "grad_norm": 9.574914082172173, "learning_rate": 4.969028572578936e-05, "loss": 2.4374, "mean_token_accuracy": 0.42413793206214906, "step": 99515 }, { "epoch": 0.10023760098545487, "grad_norm": 9.946400805329093, "learning_rate": 4.96902237637431e-05, "loss": 2.4621, "mean_token_accuracy": 0.43793103098869324, "step": 99520 }, { "epoch": 0.10024263703855904, "grad_norm": 11.502755861119628, "learning_rate": 4.9690161795542296e-05, "loss": 2.6756, "mean_token_accuracy": 0.4103448301553726, "step": 99525 }, { "epoch": 0.10024767309166321, "grad_norm": 10.894927413446235, "learning_rate": 4.969009982118696e-05, "loss": 2.5029, "mean_token_accuracy": 0.4586206912994385, "step": 99530 }, { "epoch": 0.10025270914476739, "grad_norm": 11.71608452219711, "learning_rate": 4.969003784067713e-05, "loss": 3.0159, "mean_token_accuracy": 0.36551724672317504, "step": 99535 }, { "epoch": 0.10025774519787156, "grad_norm": 13.482145955317348, "learning_rate": 4.968997585401281e-05, "loss": 3.3523, "mean_token_accuracy": 0.33793103098869326, "step": 99540 }, { "epoch": 0.10026278125097574, "grad_norm": 11.384578393157817, "learning_rate": 4.968991386119401e-05, "loss": 2.6681, "mean_token_accuracy": 0.4068965494632721, "step": 99545 }, { "epoch": 0.10026781730407991, "grad_norm": 15.57004416562115, "learning_rate": 4.968985186222076e-05, "loss": 2.7367, "mean_token_accuracy": 0.4551724076271057, "step": 99550 }, { "epoch": 0.10027285335718408, "grad_norm": 11.220674992885856, "learning_rate": 4.968978985709307e-05, "loss": 2.6111, "mean_token_accuracy": 0.4034482777118683, "step": 99555 }, { "epoch": 0.10027788941028826, "grad_norm": 9.803040998907047, "learning_rate": 4.9689727845810963e-05, "loss": 2.6092, "mean_token_accuracy": 0.38620689511299133, "step": 99560 }, { "epoch": 0.10028292546339243, "grad_norm": 9.819617832136297, "learning_rate": 4.968966582837445e-05, "loss": 2.1829, "mean_token_accuracy": 0.4482758641242981, "step": 99565 }, { "epoch": 0.1002879615164966, "grad_norm": 9.916826461897802, "learning_rate": 4.9689603804783555e-05, "loss": 2.556, "mean_token_accuracy": 0.4068965494632721, "step": 99570 }, { "epoch": 0.10029299756960078, "grad_norm": 12.731052894230308, "learning_rate": 4.9689541775038275e-05, "loss": 2.7417, "mean_token_accuracy": 0.3569267988204956, "step": 99575 }, { "epoch": 0.10029803362270494, "grad_norm": 8.770093561316267, "learning_rate": 4.968947973913866e-05, "loss": 2.4464, "mean_token_accuracy": 0.3999999940395355, "step": 99580 }, { "epoch": 0.10030306967580911, "grad_norm": 9.81834980751313, "learning_rate": 4.968941769708471e-05, "loss": 2.3344, "mean_token_accuracy": 0.4310344815254211, "step": 99585 }, { "epoch": 0.10030810572891329, "grad_norm": 11.10464612153755, "learning_rate": 4.968935564887642e-05, "loss": 2.5198, "mean_token_accuracy": 0.38620689511299133, "step": 99590 }, { "epoch": 0.10031314178201746, "grad_norm": 13.626264747798231, "learning_rate": 4.9689293594513845e-05, "loss": 2.442, "mean_token_accuracy": 0.43448275327682495, "step": 99595 }, { "epoch": 0.10031817783512163, "grad_norm": 10.122868081030846, "learning_rate": 4.968923153399699e-05, "loss": 2.2156, "mean_token_accuracy": 0.45862069725990295, "step": 99600 }, { "epoch": 0.1003232138882258, "grad_norm": 12.359637982835508, "learning_rate": 4.9689169467325865e-05, "loss": 2.7726, "mean_token_accuracy": 0.4, "step": 99605 }, { "epoch": 0.10032824994132998, "grad_norm": 9.404707539610003, "learning_rate": 4.968910739450049e-05, "loss": 2.7852, "mean_token_accuracy": 0.39655172228813174, "step": 99610 }, { "epoch": 0.10033328599443415, "grad_norm": 12.151819712117094, "learning_rate": 4.9689045315520885e-05, "loss": 2.4407, "mean_token_accuracy": 0.39655172228813174, "step": 99615 }, { "epoch": 0.10033832204753833, "grad_norm": 11.971543134397399, "learning_rate": 4.9688983230387065e-05, "loss": 2.2037, "mean_token_accuracy": 0.4413793087005615, "step": 99620 }, { "epoch": 0.1003433581006425, "grad_norm": 13.329918985317477, "learning_rate": 4.968892113909905e-05, "loss": 2.4947, "mean_token_accuracy": 0.38965516686439516, "step": 99625 }, { "epoch": 0.10034839415374668, "grad_norm": 11.99320884643853, "learning_rate": 4.968885904165685e-05, "loss": 2.959, "mean_token_accuracy": 0.3724137842655182, "step": 99630 }, { "epoch": 0.10035343020685085, "grad_norm": 12.994686342589713, "learning_rate": 4.968879693806049e-05, "loss": 2.1923, "mean_token_accuracy": 0.4620689690113068, "step": 99635 }, { "epoch": 0.10035846625995502, "grad_norm": 10.495589776917498, "learning_rate": 4.9688734828309986e-05, "loss": 2.5349, "mean_token_accuracy": 0.39800362586975097, "step": 99640 }, { "epoch": 0.1003635023130592, "grad_norm": 11.950593110949752, "learning_rate": 4.968867271240535e-05, "loss": 2.3871, "mean_token_accuracy": 0.4034482717514038, "step": 99645 }, { "epoch": 0.10036853836616336, "grad_norm": 11.154201821138878, "learning_rate": 4.968861059034661e-05, "loss": 2.4759, "mean_token_accuracy": 0.4379310369491577, "step": 99650 }, { "epoch": 0.10037357441926753, "grad_norm": 10.790384052673996, "learning_rate": 4.9688548462133764e-05, "loss": 2.283, "mean_token_accuracy": 0.43103448748588563, "step": 99655 }, { "epoch": 0.1003786104723717, "grad_norm": 10.31027914186145, "learning_rate": 4.9688486327766845e-05, "loss": 2.6157, "mean_token_accuracy": 0.4034482777118683, "step": 99660 }, { "epoch": 0.10038364652547588, "grad_norm": 9.813072005090763, "learning_rate": 4.968842418724587e-05, "loss": 2.27, "mean_token_accuracy": 0.4433151841163635, "step": 99665 }, { "epoch": 0.10038868257858005, "grad_norm": 11.604168131239176, "learning_rate": 4.968836204057086e-05, "loss": 2.4624, "mean_token_accuracy": 0.4137930989265442, "step": 99670 }, { "epoch": 0.10039371863168423, "grad_norm": 11.808789992271457, "learning_rate": 4.968829988774182e-05, "loss": 2.3979, "mean_token_accuracy": 0.38620689511299133, "step": 99675 }, { "epoch": 0.1003987546847884, "grad_norm": 14.508113203447868, "learning_rate": 4.9688237728758766e-05, "loss": 2.7195, "mean_token_accuracy": 0.417241370677948, "step": 99680 }, { "epoch": 0.10040379073789257, "grad_norm": 11.766272592324986, "learning_rate": 4.968817556362172e-05, "loss": 2.4508, "mean_token_accuracy": 0.4068965494632721, "step": 99685 }, { "epoch": 0.10040882679099675, "grad_norm": 12.924217135426119, "learning_rate": 4.968811339233071e-05, "loss": 2.7458, "mean_token_accuracy": 0.3896551728248596, "step": 99690 }, { "epoch": 0.10041386284410092, "grad_norm": 10.638004901303898, "learning_rate": 4.968805121488574e-05, "loss": 2.4527, "mean_token_accuracy": 0.38620689511299133, "step": 99695 }, { "epoch": 0.1004188988972051, "grad_norm": 12.042164473902988, "learning_rate": 4.968798903128683e-05, "loss": 2.5183, "mean_token_accuracy": 0.4034482717514038, "step": 99700 }, { "epoch": 0.10042393495030927, "grad_norm": 11.666529497640347, "learning_rate": 4.9687926841533996e-05, "loss": 2.5216, "mean_token_accuracy": 0.42413793206214906, "step": 99705 }, { "epoch": 0.10042897100341344, "grad_norm": 19.53091632343761, "learning_rate": 4.9687864645627266e-05, "loss": 3.138, "mean_token_accuracy": 0.37041741609573364, "step": 99710 }, { "epoch": 0.10043400705651762, "grad_norm": 12.591490133034377, "learning_rate": 4.9687802443566646e-05, "loss": 2.4824, "mean_token_accuracy": 0.4413793087005615, "step": 99715 }, { "epoch": 0.10043904310962178, "grad_norm": 11.03972189063813, "learning_rate": 4.968774023535215e-05, "loss": 2.4438, "mean_token_accuracy": 0.4034482777118683, "step": 99720 }, { "epoch": 0.10044407916272595, "grad_norm": 10.744026486909533, "learning_rate": 4.9687678020983807e-05, "loss": 2.2027, "mean_token_accuracy": 0.4413793087005615, "step": 99725 }, { "epoch": 0.10044911521583012, "grad_norm": 10.450375845265931, "learning_rate": 4.968761580046162e-05, "loss": 2.4219, "mean_token_accuracy": 0.4310344815254211, "step": 99730 }, { "epoch": 0.1004541512689343, "grad_norm": 9.600689216265954, "learning_rate": 4.9687553573785634e-05, "loss": 2.5709, "mean_token_accuracy": 0.3827586233615875, "step": 99735 }, { "epoch": 0.10045918732203847, "grad_norm": 9.493808053644932, "learning_rate": 4.968749134095583e-05, "loss": 2.4296, "mean_token_accuracy": 0.417241370677948, "step": 99740 }, { "epoch": 0.10046422337514264, "grad_norm": 9.40650296306212, "learning_rate": 4.968742910197225e-05, "loss": 2.4762, "mean_token_accuracy": 0.3999999940395355, "step": 99745 }, { "epoch": 0.10046925942824682, "grad_norm": 11.921619767493363, "learning_rate": 4.96873668568349e-05, "loss": 2.5422, "mean_token_accuracy": 0.4000000059604645, "step": 99750 }, { "epoch": 0.10047429548135099, "grad_norm": 11.228487027882364, "learning_rate": 4.9687304605543805e-05, "loss": 2.5264, "mean_token_accuracy": 0.36896551251411436, "step": 99755 }, { "epoch": 0.10047933153445517, "grad_norm": 13.549066193087164, "learning_rate": 4.968724234809898e-05, "loss": 2.4953, "mean_token_accuracy": 0.4172413766384125, "step": 99760 }, { "epoch": 0.10048436758755934, "grad_norm": 14.667298255463983, "learning_rate": 4.968718008450043e-05, "loss": 2.5031, "mean_token_accuracy": 0.40344828367233276, "step": 99765 }, { "epoch": 0.10048940364066351, "grad_norm": 11.223880716589564, "learning_rate": 4.968711781474819e-05, "loss": 2.1226, "mean_token_accuracy": 0.47779794335365294, "step": 99770 }, { "epoch": 0.10049443969376769, "grad_norm": 10.90098848918577, "learning_rate": 4.968705553884228e-05, "loss": 2.7124, "mean_token_accuracy": 0.37931033968925476, "step": 99775 }, { "epoch": 0.10049947574687186, "grad_norm": 15.607037543667179, "learning_rate": 4.968699325678269e-05, "loss": 2.5757, "mean_token_accuracy": 0.3793103456497192, "step": 99780 }, { "epoch": 0.10050451179997603, "grad_norm": 11.877607297371828, "learning_rate": 4.968693096856946e-05, "loss": 2.2703, "mean_token_accuracy": 0.4482758641242981, "step": 99785 }, { "epoch": 0.1005095478530802, "grad_norm": 12.152789741228506, "learning_rate": 4.96868686742026e-05, "loss": 2.1303, "mean_token_accuracy": 0.43641863465309144, "step": 99790 }, { "epoch": 0.10051458390618437, "grad_norm": 11.587328287609994, "learning_rate": 4.968680637368214e-05, "loss": 2.2481, "mean_token_accuracy": 0.458620685338974, "step": 99795 }, { "epoch": 0.10051961995928854, "grad_norm": 10.96572351664888, "learning_rate": 4.968674406700808e-05, "loss": 2.6676, "mean_token_accuracy": 0.3586206793785095, "step": 99800 }, { "epoch": 0.10052465601239272, "grad_norm": 12.12271356309071, "learning_rate": 4.9686681754180445e-05, "loss": 2.4421, "mean_token_accuracy": 0.42758620381355283, "step": 99805 }, { "epoch": 0.10052969206549689, "grad_norm": 10.255389175651684, "learning_rate": 4.968661943519925e-05, "loss": 2.589, "mean_token_accuracy": 0.39655172228813174, "step": 99810 }, { "epoch": 0.10053472811860106, "grad_norm": 11.82423286494721, "learning_rate": 4.96865571100645e-05, "loss": 2.5318, "mean_token_accuracy": 0.42413793206214906, "step": 99815 }, { "epoch": 0.10053976417170524, "grad_norm": 13.730040867990613, "learning_rate": 4.9686494778776244e-05, "loss": 2.8052, "mean_token_accuracy": 0.36206896901130675, "step": 99820 }, { "epoch": 0.10054480022480941, "grad_norm": 11.466585503826266, "learning_rate": 4.9686432441334476e-05, "loss": 2.5078, "mean_token_accuracy": 0.37931033968925476, "step": 99825 }, { "epoch": 0.10054983627791358, "grad_norm": 9.896257939510402, "learning_rate": 4.9686370097739224e-05, "loss": 2.0378, "mean_token_accuracy": 0.4586206912994385, "step": 99830 }, { "epoch": 0.10055487233101776, "grad_norm": 9.672838679742702, "learning_rate": 4.968630774799049e-05, "loss": 2.2577, "mean_token_accuracy": 0.48066502809524536, "step": 99835 }, { "epoch": 0.10055990838412193, "grad_norm": 9.77640990019493, "learning_rate": 4.968624539208831e-05, "loss": 2.1352, "mean_token_accuracy": 0.441379314661026, "step": 99840 }, { "epoch": 0.1005649444372261, "grad_norm": 10.013875542908634, "learning_rate": 4.968618303003268e-05, "loss": 2.4218, "mean_token_accuracy": 0.43448275327682495, "step": 99845 }, { "epoch": 0.10056998049033028, "grad_norm": 14.67226933283389, "learning_rate": 4.968612066182363e-05, "loss": 2.4974, "mean_token_accuracy": 0.41379311084747317, "step": 99850 }, { "epoch": 0.10057501654343445, "grad_norm": 10.004953253497721, "learning_rate": 4.968605828746118e-05, "loss": 2.4775, "mean_token_accuracy": 0.38275861740112305, "step": 99855 }, { "epoch": 0.10058005259653861, "grad_norm": 11.01578635158835, "learning_rate": 4.968599590694535e-05, "loss": 2.4849, "mean_token_accuracy": 0.45414398312568666, "step": 99860 }, { "epoch": 0.10058508864964279, "grad_norm": 12.710777933578147, "learning_rate": 4.9685933520276146e-05, "loss": 2.4547, "mean_token_accuracy": 0.45517241954803467, "step": 99865 }, { "epoch": 0.10059012470274696, "grad_norm": 11.058875638341222, "learning_rate": 4.968587112745359e-05, "loss": 2.3082, "mean_token_accuracy": 0.4379310250282288, "step": 99870 }, { "epoch": 0.10059516075585113, "grad_norm": 10.604547412645783, "learning_rate": 4.96858087284777e-05, "loss": 2.3271, "mean_token_accuracy": 0.44482758045196535, "step": 99875 }, { "epoch": 0.10060019680895531, "grad_norm": 12.084023253056268, "learning_rate": 4.9685746323348495e-05, "loss": 2.501, "mean_token_accuracy": 0.46551724076271056, "step": 99880 }, { "epoch": 0.10060523286205948, "grad_norm": 10.524335022869112, "learning_rate": 4.9685683912065993e-05, "loss": 2.5486, "mean_token_accuracy": 0.3862069010734558, "step": 99885 }, { "epoch": 0.10061026891516366, "grad_norm": 11.87503645700627, "learning_rate": 4.968562149463021e-05, "loss": 2.2213, "mean_token_accuracy": 0.3655172407627106, "step": 99890 }, { "epoch": 0.10061530496826783, "grad_norm": 11.09523063974377, "learning_rate": 4.968555907104115e-05, "loss": 2.564, "mean_token_accuracy": 0.41875378489494325, "step": 99895 }, { "epoch": 0.100620341021372, "grad_norm": 9.610502738219719, "learning_rate": 4.968549664129885e-05, "loss": 2.0579, "mean_token_accuracy": 0.44827585816383364, "step": 99900 }, { "epoch": 0.10062537707447618, "grad_norm": 10.838737457552972, "learning_rate": 4.9685434205403325e-05, "loss": 2.6702, "mean_token_accuracy": 0.37931033968925476, "step": 99905 }, { "epoch": 0.10063041312758035, "grad_norm": 8.925940694187503, "learning_rate": 4.968537176335458e-05, "loss": 2.0763, "mean_token_accuracy": 0.4931034445762634, "step": 99910 }, { "epoch": 0.10063544918068452, "grad_norm": 12.325617892704834, "learning_rate": 4.968530931515264e-05, "loss": 2.2455, "mean_token_accuracy": 0.458620685338974, "step": 99915 }, { "epoch": 0.1006404852337887, "grad_norm": 11.107211811371346, "learning_rate": 4.9685246860797526e-05, "loss": 2.4526, "mean_token_accuracy": 0.39310344457626345, "step": 99920 }, { "epoch": 0.10064552128689287, "grad_norm": 13.494162232248229, "learning_rate": 4.968518440028925e-05, "loss": 2.4281, "mean_token_accuracy": 0.42758620977401735, "step": 99925 }, { "epoch": 0.10065055733999703, "grad_norm": 9.92818904636053, "learning_rate": 4.968512193362783e-05, "loss": 2.2123, "mean_token_accuracy": 0.4551724135875702, "step": 99930 }, { "epoch": 0.1006555933931012, "grad_norm": 12.240299036477683, "learning_rate": 4.968505946081328e-05, "loss": 3.2352, "mean_token_accuracy": 0.3724137872457504, "step": 99935 }, { "epoch": 0.10066062944620538, "grad_norm": 10.961753719087756, "learning_rate": 4.968499698184562e-05, "loss": 2.2681, "mean_token_accuracy": 0.44482758045196535, "step": 99940 }, { "epoch": 0.10066566549930955, "grad_norm": 9.387616884964391, "learning_rate": 4.968493449672487e-05, "loss": 2.3161, "mean_token_accuracy": 0.4620689630508423, "step": 99945 }, { "epoch": 0.10067070155241373, "grad_norm": 11.19079445322811, "learning_rate": 4.968487200545105e-05, "loss": 2.1545, "mean_token_accuracy": 0.44482759237289426, "step": 99950 }, { "epoch": 0.1006757376055179, "grad_norm": 13.649963912407397, "learning_rate": 4.968480950802417e-05, "loss": 2.308, "mean_token_accuracy": 0.4206896543502808, "step": 99955 }, { "epoch": 0.10068077365862207, "grad_norm": 10.701360880147938, "learning_rate": 4.968474700444426e-05, "loss": 2.5466, "mean_token_accuracy": 0.42068964838981626, "step": 99960 }, { "epoch": 0.10068580971172625, "grad_norm": 12.399985579194999, "learning_rate": 4.968468449471131e-05, "loss": 2.5668, "mean_token_accuracy": 0.40344828367233276, "step": 99965 }, { "epoch": 0.10069084576483042, "grad_norm": 7.864421568825692, "learning_rate": 4.9684621978825364e-05, "loss": 2.3094, "mean_token_accuracy": 0.4922564923763275, "step": 99970 }, { "epoch": 0.1006958818179346, "grad_norm": 10.6402865824142, "learning_rate": 4.9684559456786433e-05, "loss": 2.2359, "mean_token_accuracy": 0.42413793206214906, "step": 99975 }, { "epoch": 0.10070091787103877, "grad_norm": 11.851145854922413, "learning_rate": 4.9684496928594526e-05, "loss": 3.0235, "mean_token_accuracy": 0.3758620619773865, "step": 99980 }, { "epoch": 0.10070595392414294, "grad_norm": 10.656117607810696, "learning_rate": 4.968443439424967e-05, "loss": 2.2565, "mean_token_accuracy": 0.44827587008476255, "step": 99985 }, { "epoch": 0.10071098997724712, "grad_norm": 9.484087717898566, "learning_rate": 4.968437185375188e-05, "loss": 2.1594, "mean_token_accuracy": 0.44827587008476255, "step": 99990 }, { "epoch": 0.10071602603035129, "grad_norm": 10.17788425555078, "learning_rate": 4.968430930710116e-05, "loss": 2.151, "mean_token_accuracy": 0.44295220971107485, "step": 99995 }, { "epoch": 0.10072106208345545, "grad_norm": 13.964300808080232, "learning_rate": 4.968424675429755e-05, "loss": 2.5475, "mean_token_accuracy": 0.41724138259887694, "step": 100000 }, { "epoch": 0.10072609813655962, "grad_norm": 10.406477758421381, "learning_rate": 4.9684184195341055e-05, "loss": 2.5639, "mean_token_accuracy": 0.39655172228813174, "step": 100005 }, { "epoch": 0.1007311341896638, "grad_norm": 11.061127377980746, "learning_rate": 4.968412163023169e-05, "loss": 2.9076, "mean_token_accuracy": 0.36206896007061007, "step": 100010 }, { "epoch": 0.10073617024276797, "grad_norm": 10.234807691009491, "learning_rate": 4.968405905896949e-05, "loss": 2.5931, "mean_token_accuracy": 0.3620689630508423, "step": 100015 }, { "epoch": 0.10074120629587215, "grad_norm": 11.774505581506515, "learning_rate": 4.968399648155444e-05, "loss": 2.6812, "mean_token_accuracy": 0.4482758641242981, "step": 100020 }, { "epoch": 0.10074624234897632, "grad_norm": 10.105905203219091, "learning_rate": 4.9683933897986584e-05, "loss": 2.3463, "mean_token_accuracy": 0.4482758641242981, "step": 100025 }, { "epoch": 0.1007512784020805, "grad_norm": 10.250057408208953, "learning_rate": 4.968387130826593e-05, "loss": 2.4349, "mean_token_accuracy": 0.4206896543502808, "step": 100030 }, { "epoch": 0.10075631445518467, "grad_norm": 10.274751795656123, "learning_rate": 4.96838087123925e-05, "loss": 2.5913, "mean_token_accuracy": 0.4034482717514038, "step": 100035 }, { "epoch": 0.10076135050828884, "grad_norm": 9.079882315925548, "learning_rate": 4.9683746110366305e-05, "loss": 2.3455, "mean_token_accuracy": 0.42413792610168455, "step": 100040 }, { "epoch": 0.10076638656139301, "grad_norm": 8.863773684582196, "learning_rate": 4.968368350218736e-05, "loss": 2.2763, "mean_token_accuracy": 0.4310344815254211, "step": 100045 }, { "epoch": 0.10077142261449719, "grad_norm": 15.44091740263888, "learning_rate": 4.9683620887855695e-05, "loss": 2.7504, "mean_token_accuracy": 0.43103447556495667, "step": 100050 }, { "epoch": 0.10077645866760136, "grad_norm": 10.386754074726456, "learning_rate": 4.9683558267371315e-05, "loss": 2.5437, "mean_token_accuracy": 0.3862069010734558, "step": 100055 }, { "epoch": 0.10078149472070554, "grad_norm": 9.582472153935983, "learning_rate": 4.968349564073425e-05, "loss": 2.5318, "mean_token_accuracy": 0.3896551728248596, "step": 100060 }, { "epoch": 0.10078653077380971, "grad_norm": 20.223415310854257, "learning_rate": 4.9683433007944494e-05, "loss": 2.445, "mean_token_accuracy": 0.4551724135875702, "step": 100065 }, { "epoch": 0.10079156682691387, "grad_norm": 10.4790334374899, "learning_rate": 4.9683370369002094e-05, "loss": 2.5041, "mean_token_accuracy": 0.4137930989265442, "step": 100070 }, { "epoch": 0.10079660288001804, "grad_norm": 13.358522219394409, "learning_rate": 4.968330772390705e-05, "loss": 2.6975, "mean_token_accuracy": 0.4379310369491577, "step": 100075 }, { "epoch": 0.10080163893312222, "grad_norm": 8.916087774069464, "learning_rate": 4.968324507265938e-05, "loss": 2.5099, "mean_token_accuracy": 0.3999999940395355, "step": 100080 }, { "epoch": 0.10080667498622639, "grad_norm": 13.451547611396204, "learning_rate": 4.968318241525911e-05, "loss": 2.764, "mean_token_accuracy": 0.374047189950943, "step": 100085 }, { "epoch": 0.10081171103933056, "grad_norm": 11.282079887463281, "learning_rate": 4.968311975170625e-05, "loss": 2.3409, "mean_token_accuracy": 0.4103448212146759, "step": 100090 }, { "epoch": 0.10081674709243474, "grad_norm": 14.940816422487298, "learning_rate": 4.968305708200082e-05, "loss": 2.9524, "mean_token_accuracy": 0.4172413766384125, "step": 100095 }, { "epoch": 0.10082178314553891, "grad_norm": 8.59788008892987, "learning_rate": 4.9682994406142834e-05, "loss": 2.6272, "mean_token_accuracy": 0.3655172437429428, "step": 100100 }, { "epoch": 0.10082681919864309, "grad_norm": 11.442598672615754, "learning_rate": 4.968293172413231e-05, "loss": 2.2398, "mean_token_accuracy": 0.4361766457557678, "step": 100105 }, { "epoch": 0.10083185525174726, "grad_norm": 9.911722112547768, "learning_rate": 4.9682869035969275e-05, "loss": 2.141, "mean_token_accuracy": 0.43103448748588563, "step": 100110 }, { "epoch": 0.10083689130485143, "grad_norm": 8.123422649995424, "learning_rate": 4.968280634165373e-05, "loss": 2.4386, "mean_token_accuracy": 0.42758620977401735, "step": 100115 }, { "epoch": 0.10084192735795561, "grad_norm": 11.607680492207328, "learning_rate": 4.96827436411857e-05, "loss": 2.4313, "mean_token_accuracy": 0.4344827592372894, "step": 100120 }, { "epoch": 0.10084696341105978, "grad_norm": 9.282297042714827, "learning_rate": 4.968268093456521e-05, "loss": 2.343, "mean_token_accuracy": 0.41724138259887694, "step": 100125 }, { "epoch": 0.10085199946416396, "grad_norm": 12.30599082470372, "learning_rate": 4.9682618221792264e-05, "loss": 2.8554, "mean_token_accuracy": 0.4034482777118683, "step": 100130 }, { "epoch": 0.10085703551726813, "grad_norm": 9.39146818546685, "learning_rate": 4.9682555502866894e-05, "loss": 2.1506, "mean_token_accuracy": 0.4571687877178192, "step": 100135 }, { "epoch": 0.10086207157037229, "grad_norm": 11.621297727704723, "learning_rate": 4.9682492777789104e-05, "loss": 2.5482, "mean_token_accuracy": 0.4000000059604645, "step": 100140 }, { "epoch": 0.10086710762347646, "grad_norm": 12.358960974943166, "learning_rate": 4.968243004655892e-05, "loss": 2.1358, "mean_token_accuracy": 0.4662561595439911, "step": 100145 }, { "epoch": 0.10087214367658064, "grad_norm": 8.638664694802257, "learning_rate": 4.968236730917635e-05, "loss": 2.5744, "mean_token_accuracy": 0.41034482717514037, "step": 100150 }, { "epoch": 0.10087717972968481, "grad_norm": 11.940657813740733, "learning_rate": 4.968230456564143e-05, "loss": 2.2187, "mean_token_accuracy": 0.4517241358757019, "step": 100155 }, { "epoch": 0.10088221578278898, "grad_norm": 10.222565744893558, "learning_rate": 4.968224181595415e-05, "loss": 2.2774, "mean_token_accuracy": 0.4103448331356049, "step": 100160 }, { "epoch": 0.10088725183589316, "grad_norm": 7.995514602194898, "learning_rate": 4.968217906011455e-05, "loss": 2.4379, "mean_token_accuracy": 0.413793095946312, "step": 100165 }, { "epoch": 0.10089228788899733, "grad_norm": 8.805412710913629, "learning_rate": 4.968211629812264e-05, "loss": 2.4089, "mean_token_accuracy": 0.4310344815254211, "step": 100170 }, { "epoch": 0.1008973239421015, "grad_norm": 12.420044509657634, "learning_rate": 4.968205352997844e-05, "loss": 2.8002, "mean_token_accuracy": 0.3896551728248596, "step": 100175 }, { "epoch": 0.10090235999520568, "grad_norm": 10.742302311862534, "learning_rate": 4.968199075568196e-05, "loss": 2.4007, "mean_token_accuracy": 0.4034482717514038, "step": 100180 }, { "epoch": 0.10090739604830985, "grad_norm": 11.166003380484904, "learning_rate": 4.9681927975233224e-05, "loss": 2.3804, "mean_token_accuracy": 0.4379310369491577, "step": 100185 }, { "epoch": 0.10091243210141403, "grad_norm": 13.253857309968755, "learning_rate": 4.9681865188632246e-05, "loss": 2.4114, "mean_token_accuracy": 0.44827585816383364, "step": 100190 }, { "epoch": 0.1009174681545182, "grad_norm": 10.745851594174658, "learning_rate": 4.968180239587904e-05, "loss": 2.2756, "mean_token_accuracy": 0.44137930274009707, "step": 100195 }, { "epoch": 0.10092250420762237, "grad_norm": 11.31536930998323, "learning_rate": 4.968173959697364e-05, "loss": 2.6507, "mean_token_accuracy": 0.34482758641242983, "step": 100200 }, { "epoch": 0.10092754026072655, "grad_norm": 10.364119101792767, "learning_rate": 4.968167679191604e-05, "loss": 2.0542, "mean_token_accuracy": 0.48275862336158754, "step": 100205 }, { "epoch": 0.10093257631383071, "grad_norm": 8.571516056707093, "learning_rate": 4.968161398070628e-05, "loss": 2.4534, "mean_token_accuracy": 0.42758620381355283, "step": 100210 }, { "epoch": 0.10093761236693488, "grad_norm": 10.967180509034472, "learning_rate": 4.9681551163344356e-05, "loss": 2.3709, "mean_token_accuracy": 0.4379310250282288, "step": 100215 }, { "epoch": 0.10094264842003906, "grad_norm": 13.01961120023097, "learning_rate": 4.9681488339830305e-05, "loss": 2.4307, "mean_token_accuracy": 0.37241379022598264, "step": 100220 }, { "epoch": 0.10094768447314323, "grad_norm": 9.787951741702825, "learning_rate": 4.968142551016414e-05, "loss": 2.4716, "mean_token_accuracy": 0.44137930274009707, "step": 100225 }, { "epoch": 0.1009527205262474, "grad_norm": 9.100453432121714, "learning_rate": 4.968136267434586e-05, "loss": 2.2706, "mean_token_accuracy": 0.3965517282485962, "step": 100230 }, { "epoch": 0.10095775657935158, "grad_norm": 10.301386374859703, "learning_rate": 4.968129983237551e-05, "loss": 2.955, "mean_token_accuracy": 0.38741681575775144, "step": 100235 }, { "epoch": 0.10096279263245575, "grad_norm": 12.0619974465707, "learning_rate": 4.9681236984253084e-05, "loss": 2.6295, "mean_token_accuracy": 0.38620689511299133, "step": 100240 }, { "epoch": 0.10096782868555992, "grad_norm": 13.241075970773057, "learning_rate": 4.9681174129978605e-05, "loss": 2.3542, "mean_token_accuracy": 0.4482758641242981, "step": 100245 }, { "epoch": 0.1009728647386641, "grad_norm": 16.476216355509845, "learning_rate": 4.9681111269552106e-05, "loss": 2.5856, "mean_token_accuracy": 0.3965517282485962, "step": 100250 }, { "epoch": 0.10097790079176827, "grad_norm": 13.063424961048968, "learning_rate": 4.9681048402973586e-05, "loss": 2.3574, "mean_token_accuracy": 0.4206896543502808, "step": 100255 }, { "epoch": 0.10098293684487245, "grad_norm": 12.336889264703254, "learning_rate": 4.9680985530243065e-05, "loss": 2.4201, "mean_token_accuracy": 0.4344827592372894, "step": 100260 }, { "epoch": 0.10098797289797662, "grad_norm": 10.759507840486759, "learning_rate": 4.968092265136058e-05, "loss": 2.8432, "mean_token_accuracy": 0.42068964838981626, "step": 100265 }, { "epoch": 0.10099300895108079, "grad_norm": 10.167995723622077, "learning_rate": 4.968085976632612e-05, "loss": 2.603, "mean_token_accuracy": 0.37241379618644715, "step": 100270 }, { "epoch": 0.10099804500418497, "grad_norm": 9.738047831612196, "learning_rate": 4.9680796875139726e-05, "loss": 2.5162, "mean_token_accuracy": 0.42413792610168455, "step": 100275 }, { "epoch": 0.10100308105728913, "grad_norm": 9.87581590934551, "learning_rate": 4.9680733977801394e-05, "loss": 2.0519, "mean_token_accuracy": 0.49491833448410033, "step": 100280 }, { "epoch": 0.1010081171103933, "grad_norm": 10.410149019998846, "learning_rate": 4.9680671074311164e-05, "loss": 2.5467, "mean_token_accuracy": 0.4344827651977539, "step": 100285 }, { "epoch": 0.10101315316349747, "grad_norm": 10.511848361828227, "learning_rate": 4.9680608164669034e-05, "loss": 2.7376, "mean_token_accuracy": 0.37586206793785093, "step": 100290 }, { "epoch": 0.10101818921660165, "grad_norm": 9.819977353487074, "learning_rate": 4.9680545248875034e-05, "loss": 2.3325, "mean_token_accuracy": 0.4533575356006622, "step": 100295 }, { "epoch": 0.10102322526970582, "grad_norm": 8.741889901348351, "learning_rate": 4.9680482326929175e-05, "loss": 2.4936, "mean_token_accuracy": 0.39655172228813174, "step": 100300 }, { "epoch": 0.10102826132281, "grad_norm": 11.951088650761767, "learning_rate": 4.968041939883148e-05, "loss": 2.4235, "mean_token_accuracy": 0.43103448748588563, "step": 100305 }, { "epoch": 0.10103329737591417, "grad_norm": 14.52052776631206, "learning_rate": 4.968035646458196e-05, "loss": 2.6713, "mean_token_accuracy": 0.4413793087005615, "step": 100310 }, { "epoch": 0.10103833342901834, "grad_norm": 13.753990768981327, "learning_rate": 4.9680293524180634e-05, "loss": 2.7418, "mean_token_accuracy": 0.3896551728248596, "step": 100315 }, { "epoch": 0.10104336948212252, "grad_norm": 10.235011059977122, "learning_rate": 4.968023057762752e-05, "loss": 2.6075, "mean_token_accuracy": 0.4413793057203293, "step": 100320 }, { "epoch": 0.10104840553522669, "grad_norm": 13.613477175207537, "learning_rate": 4.9680167624922635e-05, "loss": 2.5422, "mean_token_accuracy": 0.4344827651977539, "step": 100325 }, { "epoch": 0.10105344158833086, "grad_norm": 12.012970960158299, "learning_rate": 4.9680104666066e-05, "loss": 2.6302, "mean_token_accuracy": 0.41209921836853025, "step": 100330 }, { "epoch": 0.10105847764143504, "grad_norm": 18.74923309969989, "learning_rate": 4.968004170105763e-05, "loss": 2.8035, "mean_token_accuracy": 0.43448275327682495, "step": 100335 }, { "epoch": 0.10106351369453921, "grad_norm": 10.949841146207635, "learning_rate": 4.9679978729897546e-05, "loss": 2.43, "mean_token_accuracy": 0.46896551847457885, "step": 100340 }, { "epoch": 0.10106854974764339, "grad_norm": 13.998837005957975, "learning_rate": 4.967991575258576e-05, "loss": 2.4437, "mean_token_accuracy": 0.42758620381355283, "step": 100345 }, { "epoch": 0.10107358580074755, "grad_norm": 10.720579643827577, "learning_rate": 4.9679852769122295e-05, "loss": 2.4507, "mean_token_accuracy": 0.42413793206214906, "step": 100350 }, { "epoch": 0.10107862185385172, "grad_norm": 12.431466446024018, "learning_rate": 4.9679789779507164e-05, "loss": 2.4548, "mean_token_accuracy": 0.4310344815254211, "step": 100355 }, { "epoch": 0.10108365790695589, "grad_norm": 11.922578286341688, "learning_rate": 4.967972678374038e-05, "loss": 2.3269, "mean_token_accuracy": 0.45027223229408264, "step": 100360 }, { "epoch": 0.10108869396006007, "grad_norm": 10.550090600499436, "learning_rate": 4.967966378182197e-05, "loss": 2.3792, "mean_token_accuracy": 0.4758620738983154, "step": 100365 }, { "epoch": 0.10109373001316424, "grad_norm": 11.321406819026564, "learning_rate": 4.9679600773751944e-05, "loss": 2.5415, "mean_token_accuracy": 0.4000000059604645, "step": 100370 }, { "epoch": 0.10109876606626841, "grad_norm": 10.798501424265243, "learning_rate": 4.967953775953032e-05, "loss": 2.141, "mean_token_accuracy": 0.475862056016922, "step": 100375 }, { "epoch": 0.10110380211937259, "grad_norm": 16.0658566618976, "learning_rate": 4.9679474739157135e-05, "loss": 3.0247, "mean_token_accuracy": 0.37356321811676024, "step": 100380 }, { "epoch": 0.10110883817247676, "grad_norm": 9.856195793224789, "learning_rate": 4.9679411712632374e-05, "loss": 2.6095, "mean_token_accuracy": 0.4, "step": 100385 }, { "epoch": 0.10111387422558094, "grad_norm": 10.437836199492825, "learning_rate": 4.967934867995608e-05, "loss": 1.9687, "mean_token_accuracy": 0.48275861144065857, "step": 100390 }, { "epoch": 0.10111891027868511, "grad_norm": 12.685438296265172, "learning_rate": 4.967928564112826e-05, "loss": 2.3768, "mean_token_accuracy": 0.4379310369491577, "step": 100395 }, { "epoch": 0.10112394633178928, "grad_norm": 8.778129048227438, "learning_rate": 4.967922259614893e-05, "loss": 2.2661, "mean_token_accuracy": 0.417241370677948, "step": 100400 }, { "epoch": 0.10112898238489346, "grad_norm": 9.846191139180466, "learning_rate": 4.96791595450181e-05, "loss": 2.1253, "mean_token_accuracy": 0.4862069010734558, "step": 100405 }, { "epoch": 0.10113401843799763, "grad_norm": 9.883444789041006, "learning_rate": 4.967909648773581e-05, "loss": 2.3876, "mean_token_accuracy": 0.41379310488700866, "step": 100410 }, { "epoch": 0.1011390544911018, "grad_norm": 10.051223812162036, "learning_rate": 4.967903342430206e-05, "loss": 2.3866, "mean_token_accuracy": 0.4551724076271057, "step": 100415 }, { "epoch": 0.10114409054420596, "grad_norm": 9.677470178966669, "learning_rate": 4.9678970354716874e-05, "loss": 2.1052, "mean_token_accuracy": 0.4482758641242981, "step": 100420 }, { "epoch": 0.10114912659731014, "grad_norm": 10.700655072553364, "learning_rate": 4.9678907278980264e-05, "loss": 2.5944, "mean_token_accuracy": 0.3517241358757019, "step": 100425 }, { "epoch": 0.10115416265041431, "grad_norm": 9.705787756175926, "learning_rate": 4.9678844197092264e-05, "loss": 2.27, "mean_token_accuracy": 0.44827585816383364, "step": 100430 }, { "epoch": 0.10115919870351849, "grad_norm": 10.999017398217122, "learning_rate": 4.967878110905286e-05, "loss": 1.9737, "mean_token_accuracy": 0.45517241954803467, "step": 100435 }, { "epoch": 0.10116423475662266, "grad_norm": 9.504696337997215, "learning_rate": 4.96787180148621e-05, "loss": 2.347, "mean_token_accuracy": 0.44827587008476255, "step": 100440 }, { "epoch": 0.10116927080972683, "grad_norm": 10.79471256984635, "learning_rate": 4.967865491451998e-05, "loss": 2.5696, "mean_token_accuracy": 0.41379310488700866, "step": 100445 }, { "epoch": 0.101174306862831, "grad_norm": 10.96726961200012, "learning_rate": 4.967859180802655e-05, "loss": 2.5477, "mean_token_accuracy": 0.4551724076271057, "step": 100450 }, { "epoch": 0.10117934291593518, "grad_norm": 10.266500837428914, "learning_rate": 4.967852869538179e-05, "loss": 2.5733, "mean_token_accuracy": 0.4297035694122314, "step": 100455 }, { "epoch": 0.10118437896903935, "grad_norm": 11.931957093791128, "learning_rate": 4.967846557658573e-05, "loss": 2.4416, "mean_token_accuracy": 0.4517241358757019, "step": 100460 }, { "epoch": 0.10118941502214353, "grad_norm": 9.7194479637551, "learning_rate": 4.967840245163838e-05, "loss": 2.5615, "mean_token_accuracy": 0.4379310369491577, "step": 100465 }, { "epoch": 0.1011944510752477, "grad_norm": 10.65114543868417, "learning_rate": 4.9678339320539787e-05, "loss": 2.201, "mean_token_accuracy": 0.44646098613739016, "step": 100470 }, { "epoch": 0.10119948712835188, "grad_norm": 10.802781174996692, "learning_rate": 4.967827618328994e-05, "loss": 2.6149, "mean_token_accuracy": 0.4034482777118683, "step": 100475 }, { "epoch": 0.10120452318145605, "grad_norm": 9.412327463837844, "learning_rate": 4.967821303988887e-05, "loss": 2.6717, "mean_token_accuracy": 0.4034482777118683, "step": 100480 }, { "epoch": 0.10120955923456022, "grad_norm": 9.839997620110958, "learning_rate": 4.967814989033659e-05, "loss": 2.6339, "mean_token_accuracy": 0.37931033968925476, "step": 100485 }, { "epoch": 0.10121459528766438, "grad_norm": 11.101060261073052, "learning_rate": 4.967808673463311e-05, "loss": 2.9298, "mean_token_accuracy": 0.36896551847457887, "step": 100490 }, { "epoch": 0.10121963134076856, "grad_norm": 14.898977836783757, "learning_rate": 4.967802357277846e-05, "loss": 2.1909, "mean_token_accuracy": 0.4517241418361664, "step": 100495 }, { "epoch": 0.10122466739387273, "grad_norm": 12.645860913668214, "learning_rate": 4.967796040477265e-05, "loss": 2.5378, "mean_token_accuracy": 0.43103448748588563, "step": 100500 }, { "epoch": 0.1012297034469769, "grad_norm": 10.053678393340512, "learning_rate": 4.96778972306157e-05, "loss": 2.5634, "mean_token_accuracy": 0.41724138259887694, "step": 100505 }, { "epoch": 0.10123473950008108, "grad_norm": 11.905846307496736, "learning_rate": 4.9677834050307625e-05, "loss": 2.5123, "mean_token_accuracy": 0.3931034505367279, "step": 100510 }, { "epoch": 0.10123977555318525, "grad_norm": 9.63733451074268, "learning_rate": 4.967777086384845e-05, "loss": 2.2134, "mean_token_accuracy": 0.4896551728248596, "step": 100515 }, { "epoch": 0.10124481160628943, "grad_norm": 7.5063983459607195, "learning_rate": 4.9677707671238184e-05, "loss": 2.3781, "mean_token_accuracy": 0.4551724135875702, "step": 100520 }, { "epoch": 0.1012498476593936, "grad_norm": 13.147814643541263, "learning_rate": 4.9677644472476856e-05, "loss": 2.609, "mean_token_accuracy": 0.3620689660310745, "step": 100525 }, { "epoch": 0.10125488371249777, "grad_norm": 11.932190157041866, "learning_rate": 4.967758126756447e-05, "loss": 2.6359, "mean_token_accuracy": 0.3517241358757019, "step": 100530 }, { "epoch": 0.10125991976560195, "grad_norm": 8.848120743952473, "learning_rate": 4.967751805650105e-05, "loss": 2.1615, "mean_token_accuracy": 0.4206896543502808, "step": 100535 }, { "epoch": 0.10126495581870612, "grad_norm": 9.619795016903415, "learning_rate": 4.967745483928661e-05, "loss": 2.3464, "mean_token_accuracy": 0.41034482717514037, "step": 100540 }, { "epoch": 0.1012699918718103, "grad_norm": 11.583524889919026, "learning_rate": 4.967739161592117e-05, "loss": 2.5088, "mean_token_accuracy": 0.4241379380226135, "step": 100545 }, { "epoch": 0.10127502792491447, "grad_norm": 11.274429970944206, "learning_rate": 4.967732838640475e-05, "loss": 2.5741, "mean_token_accuracy": 0.42413793206214906, "step": 100550 }, { "epoch": 0.10128006397801864, "grad_norm": 11.051592635038917, "learning_rate": 4.9677265150737364e-05, "loss": 2.5721, "mean_token_accuracy": 0.40344828367233276, "step": 100555 }, { "epoch": 0.1012851000311228, "grad_norm": 7.866862193770247, "learning_rate": 4.967720190891903e-05, "loss": 2.2177, "mean_token_accuracy": 0.4137930989265442, "step": 100560 }, { "epoch": 0.10129013608422698, "grad_norm": 10.415649450687617, "learning_rate": 4.9677138660949766e-05, "loss": 2.459, "mean_token_accuracy": 0.38275861740112305, "step": 100565 }, { "epoch": 0.10129517213733115, "grad_norm": 9.706591525887859, "learning_rate": 4.967707540682959e-05, "loss": 2.7448, "mean_token_accuracy": 0.4172413766384125, "step": 100570 }, { "epoch": 0.10130020819043532, "grad_norm": 11.475594714070683, "learning_rate": 4.9677012146558525e-05, "loss": 2.4134, "mean_token_accuracy": 0.4551724135875702, "step": 100575 }, { "epoch": 0.1013052442435395, "grad_norm": 10.117291897754056, "learning_rate": 4.967694888013658e-05, "loss": 2.2558, "mean_token_accuracy": 0.4586206912994385, "step": 100580 }, { "epoch": 0.10131028029664367, "grad_norm": 10.232207089300864, "learning_rate": 4.9676885607563774e-05, "loss": 2.4348, "mean_token_accuracy": 0.39491833448410035, "step": 100585 }, { "epoch": 0.10131531634974784, "grad_norm": 13.304504899899978, "learning_rate": 4.9676822328840124e-05, "loss": 2.6212, "mean_token_accuracy": 0.441379314661026, "step": 100590 }, { "epoch": 0.10132035240285202, "grad_norm": 10.647563763639933, "learning_rate": 4.967675904396565e-05, "loss": 2.5306, "mean_token_accuracy": 0.4068965554237366, "step": 100595 }, { "epoch": 0.10132538845595619, "grad_norm": 8.916272338926895, "learning_rate": 4.967669575294037e-05, "loss": 2.1849, "mean_token_accuracy": 0.43793103098869324, "step": 100600 }, { "epoch": 0.10133042450906037, "grad_norm": 13.34168583103646, "learning_rate": 4.967663245576431e-05, "loss": 2.4019, "mean_token_accuracy": 0.4137930929660797, "step": 100605 }, { "epoch": 0.10133546056216454, "grad_norm": 10.695399836123494, "learning_rate": 4.967656915243747e-05, "loss": 2.6286, "mean_token_accuracy": 0.4034482777118683, "step": 100610 }, { "epoch": 0.10134049661526871, "grad_norm": 10.361462200560322, "learning_rate": 4.967650584295987e-05, "loss": 3.1051, "mean_token_accuracy": 0.3896551787853241, "step": 100615 }, { "epoch": 0.10134553266837289, "grad_norm": 13.047216953553455, "learning_rate": 4.9676442527331535e-05, "loss": 3.0016, "mean_token_accuracy": 0.34137930572032926, "step": 100620 }, { "epoch": 0.10135056872147706, "grad_norm": 14.073145559868317, "learning_rate": 4.967637920555249e-05, "loss": 3.214, "mean_token_accuracy": 0.37586206793785093, "step": 100625 }, { "epoch": 0.10135560477458122, "grad_norm": 11.647210947356317, "learning_rate": 4.9676315877622734e-05, "loss": 2.7346, "mean_token_accuracy": 0.39655172228813174, "step": 100630 }, { "epoch": 0.1013606408276854, "grad_norm": 11.732739210304482, "learning_rate": 4.967625254354231e-05, "loss": 2.4959, "mean_token_accuracy": 0.4034482717514038, "step": 100635 }, { "epoch": 0.10136567688078957, "grad_norm": 10.287999036247278, "learning_rate": 4.9676189203311205e-05, "loss": 2.5217, "mean_token_accuracy": 0.4034482717514038, "step": 100640 }, { "epoch": 0.10137071293389374, "grad_norm": 9.031237145703985, "learning_rate": 4.967612585692945e-05, "loss": 2.4471, "mean_token_accuracy": 0.3896551728248596, "step": 100645 }, { "epoch": 0.10137574898699792, "grad_norm": 10.405057172254269, "learning_rate": 4.967606250439707e-05, "loss": 2.4304, "mean_token_accuracy": 0.42758620381355283, "step": 100650 }, { "epoch": 0.10138078504010209, "grad_norm": 12.205493953052658, "learning_rate": 4.9675999145714076e-05, "loss": 2.4243, "mean_token_accuracy": 0.44827587008476255, "step": 100655 }, { "epoch": 0.10138582109320626, "grad_norm": 11.341097577105035, "learning_rate": 4.9675935780880486e-05, "loss": 2.6573, "mean_token_accuracy": 0.38620689511299133, "step": 100660 }, { "epoch": 0.10139085714631044, "grad_norm": 12.258841071436759, "learning_rate": 4.967587240989632e-05, "loss": 2.4832, "mean_token_accuracy": 0.4310344815254211, "step": 100665 }, { "epoch": 0.10139589319941461, "grad_norm": 9.706364631053155, "learning_rate": 4.967580903276158e-05, "loss": 2.305, "mean_token_accuracy": 0.4258923172950745, "step": 100670 }, { "epoch": 0.10140092925251878, "grad_norm": 8.691059065651524, "learning_rate": 4.967574564947631e-05, "loss": 2.2603, "mean_token_accuracy": 0.4517241358757019, "step": 100675 }, { "epoch": 0.10140596530562296, "grad_norm": 12.034777173390824, "learning_rate": 4.9675682260040504e-05, "loss": 2.2166, "mean_token_accuracy": 0.4379310369491577, "step": 100680 }, { "epoch": 0.10141100135872713, "grad_norm": 11.295859927668664, "learning_rate": 4.96756188644542e-05, "loss": 2.3683, "mean_token_accuracy": 0.38275861740112305, "step": 100685 }, { "epoch": 0.1014160374118313, "grad_norm": 9.446924969284714, "learning_rate": 4.96755554627174e-05, "loss": 2.335, "mean_token_accuracy": 0.43448275327682495, "step": 100690 }, { "epoch": 0.10142107346493548, "grad_norm": 10.192895583578453, "learning_rate": 4.967549205483013e-05, "loss": 2.3662, "mean_token_accuracy": 0.48965516686439514, "step": 100695 }, { "epoch": 0.10142610951803964, "grad_norm": 8.974214570885653, "learning_rate": 4.9675428640792404e-05, "loss": 2.2922, "mean_token_accuracy": 0.43793103098869324, "step": 100700 }, { "epoch": 0.10143114557114381, "grad_norm": 10.097710525785212, "learning_rate": 4.9675365220604235e-05, "loss": 2.5003, "mean_token_accuracy": 0.3931034505367279, "step": 100705 }, { "epoch": 0.10143618162424799, "grad_norm": 7.561590843733474, "learning_rate": 4.967530179426565e-05, "loss": 2.0413, "mean_token_accuracy": 0.48965516686439514, "step": 100710 }, { "epoch": 0.10144121767735216, "grad_norm": 10.136202970608602, "learning_rate": 4.967523836177666e-05, "loss": 2.0748, "mean_token_accuracy": 0.47828190922737124, "step": 100715 }, { "epoch": 0.10144625373045633, "grad_norm": 9.65393062526334, "learning_rate": 4.967517492313728e-05, "loss": 2.1347, "mean_token_accuracy": 0.47352216839790345, "step": 100720 }, { "epoch": 0.10145128978356051, "grad_norm": 9.738260090124431, "learning_rate": 4.9675111478347544e-05, "loss": 2.3118, "mean_token_accuracy": 0.44137930274009707, "step": 100725 }, { "epoch": 0.10145632583666468, "grad_norm": 11.33915215153483, "learning_rate": 4.967504802740746e-05, "loss": 2.9397, "mean_token_accuracy": 0.36551723480224607, "step": 100730 }, { "epoch": 0.10146136188976886, "grad_norm": 11.395081026282972, "learning_rate": 4.967498457031703e-05, "loss": 2.4194, "mean_token_accuracy": 0.3827586233615875, "step": 100735 }, { "epoch": 0.10146639794287303, "grad_norm": 13.892290641292924, "learning_rate": 4.96749211070763e-05, "loss": 2.5356, "mean_token_accuracy": 0.4056866317987442, "step": 100740 }, { "epoch": 0.1014714339959772, "grad_norm": 9.946780658418962, "learning_rate": 4.967485763768526e-05, "loss": 2.3801, "mean_token_accuracy": 0.41034482717514037, "step": 100745 }, { "epoch": 0.10147647004908138, "grad_norm": 10.646127651361248, "learning_rate": 4.967479416214395e-05, "loss": 2.6984, "mean_token_accuracy": 0.441379314661026, "step": 100750 }, { "epoch": 0.10148150610218555, "grad_norm": 13.995196633381271, "learning_rate": 4.9674730680452374e-05, "loss": 2.2535, "mean_token_accuracy": 0.4655172348022461, "step": 100755 }, { "epoch": 0.10148654215528972, "grad_norm": 9.811352491622047, "learning_rate": 4.967466719261055e-05, "loss": 2.237, "mean_token_accuracy": 0.46551724076271056, "step": 100760 }, { "epoch": 0.1014915782083939, "grad_norm": 9.840966881428052, "learning_rate": 4.9674603698618506e-05, "loss": 2.701, "mean_token_accuracy": 0.38965516686439516, "step": 100765 }, { "epoch": 0.10149661426149806, "grad_norm": 9.967733088992885, "learning_rate": 4.967454019847624e-05, "loss": 2.4588, "mean_token_accuracy": 0.42413793206214906, "step": 100770 }, { "epoch": 0.10150165031460223, "grad_norm": 10.20505073137083, "learning_rate": 4.96744766921838e-05, "loss": 2.0905, "mean_token_accuracy": 0.48965516686439514, "step": 100775 }, { "epoch": 0.1015066863677064, "grad_norm": 24.730972254919475, "learning_rate": 4.967441317974118e-05, "loss": 3.1655, "mean_token_accuracy": 0.3592861443758011, "step": 100780 }, { "epoch": 0.10151172242081058, "grad_norm": 11.614018424718402, "learning_rate": 4.96743496611484e-05, "loss": 2.2111, "mean_token_accuracy": 0.4379310369491577, "step": 100785 }, { "epoch": 0.10151675847391475, "grad_norm": 10.487049522959877, "learning_rate": 4.967428613640548e-05, "loss": 2.4044, "mean_token_accuracy": 0.4034482777118683, "step": 100790 }, { "epoch": 0.10152179452701893, "grad_norm": 14.413487159466131, "learning_rate": 4.967422260551245e-05, "loss": 2.8713, "mean_token_accuracy": 0.3793103516101837, "step": 100795 }, { "epoch": 0.1015268305801231, "grad_norm": 8.448813906671417, "learning_rate": 4.967415906846931e-05, "loss": 2.3608, "mean_token_accuracy": 0.42758620977401735, "step": 100800 }, { "epoch": 0.10153186663322727, "grad_norm": 9.551361366257247, "learning_rate": 4.9674095525276084e-05, "loss": 2.0918, "mean_token_accuracy": 0.4620689690113068, "step": 100805 }, { "epoch": 0.10153690268633145, "grad_norm": 14.849401617582771, "learning_rate": 4.967403197593279e-05, "loss": 2.5876, "mean_token_accuracy": 0.40689656138420105, "step": 100810 }, { "epoch": 0.10154193873943562, "grad_norm": 12.544426863429585, "learning_rate": 4.9673968420439445e-05, "loss": 3.3401, "mean_token_accuracy": 0.28965516984462736, "step": 100815 }, { "epoch": 0.1015469747925398, "grad_norm": 10.325039203033084, "learning_rate": 4.967390485879607e-05, "loss": 2.2345, "mean_token_accuracy": 0.4344827651977539, "step": 100820 }, { "epoch": 0.10155201084564397, "grad_norm": 13.817679325790849, "learning_rate": 4.967384129100267e-05, "loss": 2.5564, "mean_token_accuracy": 0.3965517282485962, "step": 100825 }, { "epoch": 0.10155704689874814, "grad_norm": 12.571800779946168, "learning_rate": 4.967377771705928e-05, "loss": 2.1879, "mean_token_accuracy": 0.4379310369491577, "step": 100830 }, { "epoch": 0.10156208295185232, "grad_norm": 11.72902053037505, "learning_rate": 4.967371413696591e-05, "loss": 2.2935, "mean_token_accuracy": 0.46551724076271056, "step": 100835 }, { "epoch": 0.10156711900495648, "grad_norm": 11.235483601668271, "learning_rate": 4.9673650550722576e-05, "loss": 2.6906, "mean_token_accuracy": 0.3827586203813553, "step": 100840 }, { "epoch": 0.10157215505806065, "grad_norm": 9.032253035829793, "learning_rate": 4.96735869583293e-05, "loss": 2.1702, "mean_token_accuracy": 0.49655172824859617, "step": 100845 }, { "epoch": 0.10157719111116482, "grad_norm": 11.4376212796248, "learning_rate": 4.9673523359786095e-05, "loss": 2.6579, "mean_token_accuracy": 0.4103448212146759, "step": 100850 }, { "epoch": 0.101582227164269, "grad_norm": 9.701295805624754, "learning_rate": 4.967345975509298e-05, "loss": 2.3822, "mean_token_accuracy": 0.41379310488700866, "step": 100855 }, { "epoch": 0.10158726321737317, "grad_norm": 9.855944809224175, "learning_rate": 4.9673396144249975e-05, "loss": 2.5044, "mean_token_accuracy": 0.4068965554237366, "step": 100860 }, { "epoch": 0.10159229927047735, "grad_norm": 8.448833720127668, "learning_rate": 4.9673332527257097e-05, "loss": 2.2792, "mean_token_accuracy": 0.443254691362381, "step": 100865 }, { "epoch": 0.10159733532358152, "grad_norm": 10.491740421603138, "learning_rate": 4.967326890411436e-05, "loss": 2.5856, "mean_token_accuracy": 0.4068965554237366, "step": 100870 }, { "epoch": 0.1016023713766857, "grad_norm": 11.755491951521089, "learning_rate": 4.967320527482179e-05, "loss": 2.703, "mean_token_accuracy": 0.43189655542373656, "step": 100875 }, { "epoch": 0.10160740742978987, "grad_norm": 9.62708744506585, "learning_rate": 4.967314163937939e-05, "loss": 2.2155, "mean_token_accuracy": 0.4482758641242981, "step": 100880 }, { "epoch": 0.10161244348289404, "grad_norm": 9.153020145138159, "learning_rate": 4.967307799778719e-05, "loss": 2.0903, "mean_token_accuracy": 0.458620685338974, "step": 100885 }, { "epoch": 0.10161747953599821, "grad_norm": 12.388236459140934, "learning_rate": 4.96730143500452e-05, "loss": 2.3104, "mean_token_accuracy": 0.4206896543502808, "step": 100890 }, { "epoch": 0.10162251558910239, "grad_norm": 9.773596403077843, "learning_rate": 4.967295069615345e-05, "loss": 2.9281, "mean_token_accuracy": 0.4034482777118683, "step": 100895 }, { "epoch": 0.10162755164220656, "grad_norm": 10.15066229161137, "learning_rate": 4.967288703611194e-05, "loss": 2.1057, "mean_token_accuracy": 0.47241378426551817, "step": 100900 }, { "epoch": 0.10163258769531074, "grad_norm": 10.52561256831747, "learning_rate": 4.967282336992071e-05, "loss": 2.0338, "mean_token_accuracy": 0.4655172348022461, "step": 100905 }, { "epoch": 0.1016376237484149, "grad_norm": 10.369343709503157, "learning_rate": 4.9672759697579755e-05, "loss": 2.2386, "mean_token_accuracy": 0.4137930989265442, "step": 100910 }, { "epoch": 0.10164265980151907, "grad_norm": 12.069768056683298, "learning_rate": 4.9672696019089106e-05, "loss": 2.2408, "mean_token_accuracy": 0.46382335424423216, "step": 100915 }, { "epoch": 0.10164769585462324, "grad_norm": 10.545574118346028, "learning_rate": 4.9672632334448774e-05, "loss": 2.0615, "mean_token_accuracy": 0.4517241358757019, "step": 100920 }, { "epoch": 0.10165273190772742, "grad_norm": 15.455818397902737, "learning_rate": 4.967256864365879e-05, "loss": 2.646, "mean_token_accuracy": 0.3758620619773865, "step": 100925 }, { "epoch": 0.10165776796083159, "grad_norm": 11.98615636227642, "learning_rate": 4.967250494671914e-05, "loss": 2.7912, "mean_token_accuracy": 0.37241379022598264, "step": 100930 }, { "epoch": 0.10166280401393576, "grad_norm": 12.380631434297218, "learning_rate": 4.967244124362987e-05, "loss": 2.5413, "mean_token_accuracy": 0.4103448331356049, "step": 100935 }, { "epoch": 0.10166784006703994, "grad_norm": 7.887126237438663, "learning_rate": 4.9672377534391e-05, "loss": 2.059, "mean_token_accuracy": 0.4816697001457214, "step": 100940 }, { "epoch": 0.10167287612014411, "grad_norm": 12.36479513938091, "learning_rate": 4.967231381900254e-05, "loss": 2.3425, "mean_token_accuracy": 0.42280701398849485, "step": 100945 }, { "epoch": 0.10167791217324829, "grad_norm": 11.606908700053987, "learning_rate": 4.96722500974645e-05, "loss": 2.822, "mean_token_accuracy": 0.3724137932062149, "step": 100950 }, { "epoch": 0.10168294822635246, "grad_norm": 10.50018012618524, "learning_rate": 4.96721863697769e-05, "loss": 2.1854, "mean_token_accuracy": 0.4954023063182831, "step": 100955 }, { "epoch": 0.10168798427945663, "grad_norm": 15.256208905980785, "learning_rate": 4.967212263593977e-05, "loss": 2.9387, "mean_token_accuracy": 0.35172412991523744, "step": 100960 }, { "epoch": 0.10169302033256081, "grad_norm": 10.382403812413267, "learning_rate": 4.967205889595312e-05, "loss": 2.8376, "mean_token_accuracy": 0.3551724165678024, "step": 100965 }, { "epoch": 0.10169805638566498, "grad_norm": 13.240454229917177, "learning_rate": 4.9671995149816955e-05, "loss": 2.4774, "mean_token_accuracy": 0.3965517282485962, "step": 100970 }, { "epoch": 0.10170309243876915, "grad_norm": 11.567877897320187, "learning_rate": 4.967193139753131e-05, "loss": 2.5932, "mean_token_accuracy": 0.38620689511299133, "step": 100975 }, { "epoch": 0.10170812849187331, "grad_norm": 11.109818837255643, "learning_rate": 4.9671867639096195e-05, "loss": 2.6215, "mean_token_accuracy": 0.3344827651977539, "step": 100980 }, { "epoch": 0.10171316454497749, "grad_norm": 10.404065805924267, "learning_rate": 4.9671803874511634e-05, "loss": 2.3149, "mean_token_accuracy": 0.4459770143032074, "step": 100985 }, { "epoch": 0.10171820059808166, "grad_norm": 11.232610393977048, "learning_rate": 4.967174010377764e-05, "loss": 1.9963, "mean_token_accuracy": 0.4931034505367279, "step": 100990 }, { "epoch": 0.10172323665118584, "grad_norm": 10.925460007176074, "learning_rate": 4.9671676326894235e-05, "loss": 2.4733, "mean_token_accuracy": 0.38620689511299133, "step": 100995 }, { "epoch": 0.10172827270429001, "grad_norm": 15.780413641323877, "learning_rate": 4.967161254386143e-05, "loss": 2.4543, "mean_token_accuracy": 0.3999999940395355, "step": 101000 }, { "epoch": 0.10173330875739418, "grad_norm": 13.611269464329586, "learning_rate": 4.967154875467924e-05, "loss": 2.2035, "mean_token_accuracy": 0.4862069010734558, "step": 101005 }, { "epoch": 0.10173834481049836, "grad_norm": 10.404716092969895, "learning_rate": 4.9671484959347697e-05, "loss": 2.2346, "mean_token_accuracy": 0.42413793206214906, "step": 101010 }, { "epoch": 0.10174338086360253, "grad_norm": 10.994114377956851, "learning_rate": 4.96714211578668e-05, "loss": 2.5468, "mean_token_accuracy": 0.43103448748588563, "step": 101015 }, { "epoch": 0.1017484169167067, "grad_norm": 16.276048439422148, "learning_rate": 4.9671357350236586e-05, "loss": 2.8474, "mean_token_accuracy": 0.3551724135875702, "step": 101020 }, { "epoch": 0.10175345296981088, "grad_norm": 10.77156092862102, "learning_rate": 4.9671293536457056e-05, "loss": 2.3863, "mean_token_accuracy": 0.441379314661026, "step": 101025 }, { "epoch": 0.10175848902291505, "grad_norm": 11.03370029899864, "learning_rate": 4.967122971652824e-05, "loss": 2.326, "mean_token_accuracy": 0.4034482717514038, "step": 101030 }, { "epoch": 0.10176352507601923, "grad_norm": 10.156911610184817, "learning_rate": 4.9671165890450146e-05, "loss": 2.3359, "mean_token_accuracy": 0.42758620977401735, "step": 101035 }, { "epoch": 0.1017685611291234, "grad_norm": 9.197896108400101, "learning_rate": 4.967110205822279e-05, "loss": 2.5787, "mean_token_accuracy": 0.3896551728248596, "step": 101040 }, { "epoch": 0.10177359718222757, "grad_norm": 8.528213144170406, "learning_rate": 4.967103821984621e-05, "loss": 2.774, "mean_token_accuracy": 0.38965516686439516, "step": 101045 }, { "epoch": 0.10177863323533173, "grad_norm": 10.679897902721834, "learning_rate": 4.967097437532041e-05, "loss": 2.3483, "mean_token_accuracy": 0.4310344815254211, "step": 101050 }, { "epoch": 0.10178366928843591, "grad_norm": 10.041133067649916, "learning_rate": 4.9670910524645404e-05, "loss": 2.52, "mean_token_accuracy": 0.41034482717514037, "step": 101055 }, { "epoch": 0.10178870534154008, "grad_norm": 9.994290211120356, "learning_rate": 4.967084666782121e-05, "loss": 2.1802, "mean_token_accuracy": 0.4344827592372894, "step": 101060 }, { "epoch": 0.10179374139464425, "grad_norm": 14.431225562379563, "learning_rate": 4.967078280484786e-05, "loss": 2.2004, "mean_token_accuracy": 0.4620689570903778, "step": 101065 }, { "epoch": 0.10179877744774843, "grad_norm": 10.966927455419945, "learning_rate": 4.967071893572535e-05, "loss": 2.1619, "mean_token_accuracy": 0.4448275864124298, "step": 101070 }, { "epoch": 0.1018038135008526, "grad_norm": 10.369541192564483, "learning_rate": 4.9670655060453706e-05, "loss": 2.3112, "mean_token_accuracy": 0.4413793087005615, "step": 101075 }, { "epoch": 0.10180884955395678, "grad_norm": 11.061034657124882, "learning_rate": 4.967059117903296e-05, "loss": 2.0565, "mean_token_accuracy": 0.4569267988204956, "step": 101080 }, { "epoch": 0.10181388560706095, "grad_norm": 10.587146592194774, "learning_rate": 4.96705272914631e-05, "loss": 2.4311, "mean_token_accuracy": 0.3965517163276672, "step": 101085 }, { "epoch": 0.10181892166016512, "grad_norm": 12.934031000544302, "learning_rate": 4.967046339774418e-05, "loss": 2.3297, "mean_token_accuracy": 0.4551724135875702, "step": 101090 }, { "epoch": 0.1018239577132693, "grad_norm": 11.918688664572638, "learning_rate": 4.967039949787619e-05, "loss": 2.3961, "mean_token_accuracy": 0.42068964838981626, "step": 101095 }, { "epoch": 0.10182899376637347, "grad_norm": 11.264872050838905, "learning_rate": 4.967033559185915e-05, "loss": 2.0866, "mean_token_accuracy": 0.5068965435028077, "step": 101100 }, { "epoch": 0.10183402981947764, "grad_norm": 10.289047563676268, "learning_rate": 4.9670271679693096e-05, "loss": 2.4932, "mean_token_accuracy": 0.4137930989265442, "step": 101105 }, { "epoch": 0.10183906587258182, "grad_norm": 9.837015291585804, "learning_rate": 4.967020776137803e-05, "loss": 2.408, "mean_token_accuracy": 0.4206896543502808, "step": 101110 }, { "epoch": 0.10184410192568599, "grad_norm": 9.853966386804649, "learning_rate": 4.967014383691398e-05, "loss": 2.5363, "mean_token_accuracy": 0.4154869973659515, "step": 101115 }, { "epoch": 0.10184913797879015, "grad_norm": 10.295761620117622, "learning_rate": 4.967007990630095e-05, "loss": 2.3673, "mean_token_accuracy": 0.441379314661026, "step": 101120 }, { "epoch": 0.10185417403189433, "grad_norm": 10.058262142379462, "learning_rate": 4.967001596953897e-05, "loss": 2.2536, "mean_token_accuracy": 0.4758620738983154, "step": 101125 }, { "epoch": 0.1018592100849985, "grad_norm": 11.019428454806961, "learning_rate": 4.966995202662805e-05, "loss": 2.7066, "mean_token_accuracy": 0.39655172228813174, "step": 101130 }, { "epoch": 0.10186424613810267, "grad_norm": 10.24806970428062, "learning_rate": 4.966988807756822e-05, "loss": 2.5154, "mean_token_accuracy": 0.4361161530017853, "step": 101135 }, { "epoch": 0.10186928219120685, "grad_norm": 12.156107699066538, "learning_rate": 4.9669824122359475e-05, "loss": 2.6683, "mean_token_accuracy": 0.4172413766384125, "step": 101140 }, { "epoch": 0.10187431824431102, "grad_norm": 7.389442963778349, "learning_rate": 4.9669760161001855e-05, "loss": 2.2613, "mean_token_accuracy": 0.4655172348022461, "step": 101145 }, { "epoch": 0.1018793542974152, "grad_norm": 11.426376677211143, "learning_rate": 4.9669696193495365e-05, "loss": 2.3932, "mean_token_accuracy": 0.4360556542873383, "step": 101150 }, { "epoch": 0.10188439035051937, "grad_norm": 10.553610489473256, "learning_rate": 4.966963221984003e-05, "loss": 2.5471, "mean_token_accuracy": 0.42413793206214906, "step": 101155 }, { "epoch": 0.10188942640362354, "grad_norm": 12.337645816576272, "learning_rate": 4.966956824003587e-05, "loss": 2.5839, "mean_token_accuracy": 0.4482758641242981, "step": 101160 }, { "epoch": 0.10189446245672772, "grad_norm": 46.4937495785292, "learning_rate": 4.9669504254082895e-05, "loss": 2.8224, "mean_token_accuracy": 0.4068965494632721, "step": 101165 }, { "epoch": 0.10189949850983189, "grad_norm": 10.495432188970655, "learning_rate": 4.9669440261981116e-05, "loss": 2.0261, "mean_token_accuracy": 0.46896552443504336, "step": 101170 }, { "epoch": 0.10190453456293606, "grad_norm": 10.607509788634863, "learning_rate": 4.966937626373057e-05, "loss": 2.152, "mean_token_accuracy": 0.47489412426948546, "step": 101175 }, { "epoch": 0.10190957061604024, "grad_norm": 9.70772759813328, "learning_rate": 4.9669312259331255e-05, "loss": 2.4355, "mean_token_accuracy": 0.4413793087005615, "step": 101180 }, { "epoch": 0.10191460666914441, "grad_norm": 14.423752812546205, "learning_rate": 4.966924824878321e-05, "loss": 2.3348, "mean_token_accuracy": 0.432667875289917, "step": 101185 }, { "epoch": 0.10191964272224857, "grad_norm": 7.629462158312488, "learning_rate": 4.966918423208644e-05, "loss": 2.7109, "mean_token_accuracy": 0.4093768924474716, "step": 101190 }, { "epoch": 0.10192467877535275, "grad_norm": 16.061545168444237, "learning_rate": 4.966912020924095e-05, "loss": 2.7195, "mean_token_accuracy": 0.41379310488700866, "step": 101195 }, { "epoch": 0.10192971482845692, "grad_norm": 10.133584347045788, "learning_rate": 4.966905618024679e-05, "loss": 1.9811, "mean_token_accuracy": 0.4931034505367279, "step": 101200 }, { "epoch": 0.10193475088156109, "grad_norm": 10.662707216546274, "learning_rate": 4.966899214510395e-05, "loss": 2.2405, "mean_token_accuracy": 0.42758620977401735, "step": 101205 }, { "epoch": 0.10193978693466527, "grad_norm": 12.168320769300239, "learning_rate": 4.966892810381246e-05, "loss": 2.5598, "mean_token_accuracy": 0.3965517163276672, "step": 101210 }, { "epoch": 0.10194482298776944, "grad_norm": 11.716755528533875, "learning_rate": 4.9668864056372336e-05, "loss": 2.5719, "mean_token_accuracy": 0.42413792610168455, "step": 101215 }, { "epoch": 0.10194985904087361, "grad_norm": 10.817554107088853, "learning_rate": 4.966880000278359e-05, "loss": 2.6313, "mean_token_accuracy": 0.41379311084747317, "step": 101220 }, { "epoch": 0.10195489509397779, "grad_norm": 10.013534855585563, "learning_rate": 4.966873594304625e-05, "loss": 2.6696, "mean_token_accuracy": 0.3896551787853241, "step": 101225 }, { "epoch": 0.10195993114708196, "grad_norm": 13.034660912578813, "learning_rate": 4.9668671877160325e-05, "loss": 2.8, "mean_token_accuracy": 0.4137930989265442, "step": 101230 }, { "epoch": 0.10196496720018614, "grad_norm": 10.858294969111173, "learning_rate": 4.9668607805125836e-05, "loss": 3.1224, "mean_token_accuracy": 0.3068965435028076, "step": 101235 }, { "epoch": 0.10197000325329031, "grad_norm": 10.073687090211894, "learning_rate": 4.966854372694281e-05, "loss": 2.3815, "mean_token_accuracy": 0.4103448331356049, "step": 101240 }, { "epoch": 0.10197503930639448, "grad_norm": 11.19879222105325, "learning_rate": 4.966847964261124e-05, "loss": 2.4538, "mean_token_accuracy": 0.41379310488700866, "step": 101245 }, { "epoch": 0.10198007535949866, "grad_norm": 13.179217835003104, "learning_rate": 4.966841555213117e-05, "loss": 2.9678, "mean_token_accuracy": 0.42068965137004855, "step": 101250 }, { "epoch": 0.10198511141260283, "grad_norm": 10.705079436034646, "learning_rate": 4.96683514555026e-05, "loss": 2.8048, "mean_token_accuracy": 0.3413793116807938, "step": 101255 }, { "epoch": 0.10199014746570699, "grad_norm": 9.618498109629163, "learning_rate": 4.9668287352725564e-05, "loss": 2.4731, "mean_token_accuracy": 0.43980641961097716, "step": 101260 }, { "epoch": 0.10199518351881116, "grad_norm": 10.437419519007902, "learning_rate": 4.966822324380006e-05, "loss": 2.2757, "mean_token_accuracy": 0.44827587008476255, "step": 101265 }, { "epoch": 0.10200021957191534, "grad_norm": 10.677167076755813, "learning_rate": 4.9668159128726126e-05, "loss": 2.528, "mean_token_accuracy": 0.39655172228813174, "step": 101270 }, { "epoch": 0.10200525562501951, "grad_norm": 9.050546650387743, "learning_rate": 4.966809500750377e-05, "loss": 2.2949, "mean_token_accuracy": 0.43793103098869324, "step": 101275 }, { "epoch": 0.10201029167812369, "grad_norm": 11.680354552411675, "learning_rate": 4.9668030880133004e-05, "loss": 2.4028, "mean_token_accuracy": 0.4413793087005615, "step": 101280 }, { "epoch": 0.10201532773122786, "grad_norm": 10.308090073136523, "learning_rate": 4.966796674661385e-05, "loss": 2.6891, "mean_token_accuracy": 0.3931034505367279, "step": 101285 }, { "epoch": 0.10202036378433203, "grad_norm": 11.34737519212835, "learning_rate": 4.966790260694634e-05, "loss": 2.4255, "mean_token_accuracy": 0.41034482717514037, "step": 101290 }, { "epoch": 0.1020253998374362, "grad_norm": 10.515134928771163, "learning_rate": 4.966783846113047e-05, "loss": 2.4041, "mean_token_accuracy": 0.42413792610168455, "step": 101295 }, { "epoch": 0.10203043589054038, "grad_norm": 12.135507006126897, "learning_rate": 4.9667774309166275e-05, "loss": 2.5766, "mean_token_accuracy": 0.4206896543502808, "step": 101300 }, { "epoch": 0.10203547194364455, "grad_norm": 12.743260954649752, "learning_rate": 4.966771015105376e-05, "loss": 2.2759, "mean_token_accuracy": 0.43448275327682495, "step": 101305 }, { "epoch": 0.10204050799674873, "grad_norm": 9.864209041939874, "learning_rate": 4.966764598679295e-05, "loss": 2.9815, "mean_token_accuracy": 0.4, "step": 101310 }, { "epoch": 0.1020455440498529, "grad_norm": 10.034103883878783, "learning_rate": 4.966758181638386e-05, "loss": 2.4889, "mean_token_accuracy": 0.41379310488700866, "step": 101315 }, { "epoch": 0.10205058010295708, "grad_norm": 11.913055040461813, "learning_rate": 4.96675176398265e-05, "loss": 2.4489, "mean_token_accuracy": 0.4418719232082367, "step": 101320 }, { "epoch": 0.10205561615606125, "grad_norm": 11.15737206432806, "learning_rate": 4.9667453457120896e-05, "loss": 2.4279, "mean_token_accuracy": 0.441379314661026, "step": 101325 }, { "epoch": 0.10206065220916541, "grad_norm": 11.326625425440515, "learning_rate": 4.966738926826708e-05, "loss": 2.834, "mean_token_accuracy": 0.341379314661026, "step": 101330 }, { "epoch": 0.10206568826226958, "grad_norm": 15.879808073577067, "learning_rate": 4.9667325073265056e-05, "loss": 2.1314, "mean_token_accuracy": 0.46551724076271056, "step": 101335 }, { "epoch": 0.10207072431537376, "grad_norm": 13.106315756663285, "learning_rate": 4.966726087211483e-05, "loss": 3.0851, "mean_token_accuracy": 0.3448275804519653, "step": 101340 }, { "epoch": 0.10207576036847793, "grad_norm": 12.24190433288864, "learning_rate": 4.966719666481644e-05, "loss": 2.2918, "mean_token_accuracy": 0.44827585816383364, "step": 101345 }, { "epoch": 0.1020807964215821, "grad_norm": 13.199450937300666, "learning_rate": 4.966713245136989e-05, "loss": 2.3746, "mean_token_accuracy": 0.43103448748588563, "step": 101350 }, { "epoch": 0.10208583247468628, "grad_norm": 11.067275679997097, "learning_rate": 4.966706823177521e-05, "loss": 2.3859, "mean_token_accuracy": 0.44827585816383364, "step": 101355 }, { "epoch": 0.10209086852779045, "grad_norm": 9.583815671735127, "learning_rate": 4.9667004006032394e-05, "loss": 2.3475, "mean_token_accuracy": 0.441379314661026, "step": 101360 }, { "epoch": 0.10209590458089463, "grad_norm": 12.581979630930846, "learning_rate": 4.96669397741415e-05, "loss": 2.4919, "mean_token_accuracy": 0.4379310429096222, "step": 101365 }, { "epoch": 0.1021009406339988, "grad_norm": 11.512231942293987, "learning_rate": 4.96668755361025e-05, "loss": 2.1641, "mean_token_accuracy": 0.48620688915252686, "step": 101370 }, { "epoch": 0.10210597668710297, "grad_norm": 10.443468656106361, "learning_rate": 4.966681129191545e-05, "loss": 2.2771, "mean_token_accuracy": 0.4586206912994385, "step": 101375 }, { "epoch": 0.10211101274020715, "grad_norm": 10.357880106524657, "learning_rate": 4.9666747041580344e-05, "loss": 2.5084, "mean_token_accuracy": 0.4172413766384125, "step": 101380 }, { "epoch": 0.10211604879331132, "grad_norm": 9.30325090885326, "learning_rate": 4.966668278509722e-05, "loss": 2.6331, "mean_token_accuracy": 0.4344827592372894, "step": 101385 }, { "epoch": 0.1021210848464155, "grad_norm": 11.013394465380355, "learning_rate": 4.9666618522466075e-05, "loss": 2.4513, "mean_token_accuracy": 0.43103448748588563, "step": 101390 }, { "epoch": 0.10212612089951967, "grad_norm": 8.862121802280159, "learning_rate": 4.9666554253686934e-05, "loss": 2.2252, "mean_token_accuracy": 0.4551724135875702, "step": 101395 }, { "epoch": 0.10213115695262383, "grad_norm": 10.178767160610892, "learning_rate": 4.966648997875982e-05, "loss": 2.0012, "mean_token_accuracy": 0.5145320236682892, "step": 101400 }, { "epoch": 0.102136193005728, "grad_norm": 10.046952452210524, "learning_rate": 4.9666425697684746e-05, "loss": 2.2008, "mean_token_accuracy": 0.4517241358757019, "step": 101405 }, { "epoch": 0.10214122905883218, "grad_norm": 11.009588532572721, "learning_rate": 4.9666361410461734e-05, "loss": 2.4843, "mean_token_accuracy": 0.404718691110611, "step": 101410 }, { "epoch": 0.10214626511193635, "grad_norm": 13.606734697560686, "learning_rate": 4.966629711709079e-05, "loss": 2.4338, "mean_token_accuracy": 0.4034482717514038, "step": 101415 }, { "epoch": 0.10215130116504052, "grad_norm": 11.479368690083962, "learning_rate": 4.966623281757196e-05, "loss": 2.6805, "mean_token_accuracy": 0.38965516686439516, "step": 101420 }, { "epoch": 0.1021563372181447, "grad_norm": 10.971211155835643, "learning_rate": 4.966616851190522e-05, "loss": 2.8029, "mean_token_accuracy": 0.33793103098869326, "step": 101425 }, { "epoch": 0.10216137327124887, "grad_norm": 10.165815502805659, "learning_rate": 4.9666104200090615e-05, "loss": 2.6893, "mean_token_accuracy": 0.37586206793785093, "step": 101430 }, { "epoch": 0.10216640932435304, "grad_norm": 10.671299134544501, "learning_rate": 4.966603988212817e-05, "loss": 3.1933, "mean_token_accuracy": 0.35862069129943847, "step": 101435 }, { "epoch": 0.10217144537745722, "grad_norm": 11.150517995312756, "learning_rate": 4.9665975558017886e-05, "loss": 2.2069, "mean_token_accuracy": 0.4586206912994385, "step": 101440 }, { "epoch": 0.10217648143056139, "grad_norm": 8.719926713472622, "learning_rate": 4.966591122775978e-05, "loss": 2.2478, "mean_token_accuracy": 0.3931034505367279, "step": 101445 }, { "epoch": 0.10218151748366557, "grad_norm": 10.55459519302372, "learning_rate": 4.966584689135388e-05, "loss": 2.3912, "mean_token_accuracy": 0.40689654350280763, "step": 101450 }, { "epoch": 0.10218655353676974, "grad_norm": 12.966120515592092, "learning_rate": 4.966578254880021e-05, "loss": 2.1867, "mean_token_accuracy": 0.4586206912994385, "step": 101455 }, { "epoch": 0.10219158958987391, "grad_norm": 15.400451831114731, "learning_rate": 4.966571820009876e-05, "loss": 2.7093, "mean_token_accuracy": 0.3586206793785095, "step": 101460 }, { "epoch": 0.10219662564297809, "grad_norm": 10.905507874059488, "learning_rate": 4.966565384524957e-05, "loss": 2.3838, "mean_token_accuracy": 0.4931034445762634, "step": 101465 }, { "epoch": 0.10220166169608225, "grad_norm": 8.854349295183948, "learning_rate": 4.966558948425266e-05, "loss": 2.4196, "mean_token_accuracy": 0.4413793087005615, "step": 101470 }, { "epoch": 0.10220669774918642, "grad_norm": 14.25834374638957, "learning_rate": 4.9665525117108033e-05, "loss": 2.5468, "mean_token_accuracy": 0.42262552976608275, "step": 101475 }, { "epoch": 0.1022117338022906, "grad_norm": 10.199296782329531, "learning_rate": 4.9665460743815726e-05, "loss": 2.3312, "mean_token_accuracy": 0.4620689630508423, "step": 101480 }, { "epoch": 0.10221676985539477, "grad_norm": 9.50060042877304, "learning_rate": 4.9665396364375735e-05, "loss": 2.4487, "mean_token_accuracy": 0.3517241358757019, "step": 101485 }, { "epoch": 0.10222180590849894, "grad_norm": 11.434091340228274, "learning_rate": 4.9665331978788094e-05, "loss": 2.653, "mean_token_accuracy": 0.38620689511299133, "step": 101490 }, { "epoch": 0.10222684196160312, "grad_norm": 11.805984698003433, "learning_rate": 4.966526758705281e-05, "loss": 2.8318, "mean_token_accuracy": 0.34827586114406583, "step": 101495 }, { "epoch": 0.10223187801470729, "grad_norm": 10.085346775076433, "learning_rate": 4.9665203189169916e-05, "loss": 2.5287, "mean_token_accuracy": 0.39310344457626345, "step": 101500 }, { "epoch": 0.10223691406781146, "grad_norm": 11.17373884407409, "learning_rate": 4.966513878513941e-05, "loss": 2.3601, "mean_token_accuracy": 0.4068965554237366, "step": 101505 }, { "epoch": 0.10224195012091564, "grad_norm": 12.515360738995351, "learning_rate": 4.966507437496132e-05, "loss": 2.3276, "mean_token_accuracy": 0.41161524653434756, "step": 101510 }, { "epoch": 0.10224698617401981, "grad_norm": 9.474818718156941, "learning_rate": 4.9665009958635675e-05, "loss": 2.5922, "mean_token_accuracy": 0.40877193212509155, "step": 101515 }, { "epoch": 0.10225202222712398, "grad_norm": 12.475417642394666, "learning_rate": 4.966494553616248e-05, "loss": 2.4751, "mean_token_accuracy": 0.46896551847457885, "step": 101520 }, { "epoch": 0.10225705828022816, "grad_norm": 11.589868268843718, "learning_rate": 4.966488110754175e-05, "loss": 2.5717, "mean_token_accuracy": 0.39310344457626345, "step": 101525 }, { "epoch": 0.10226209433333233, "grad_norm": 11.472717688386036, "learning_rate": 4.9664816672773506e-05, "loss": 2.0792, "mean_token_accuracy": 0.4689655125141144, "step": 101530 }, { "epoch": 0.1022671303864365, "grad_norm": 10.237386807476732, "learning_rate": 4.966475223185777e-05, "loss": 2.5498, "mean_token_accuracy": 0.40344828367233276, "step": 101535 }, { "epoch": 0.10227216643954067, "grad_norm": 11.862567961789917, "learning_rate": 4.9664687784794555e-05, "loss": 2.3874, "mean_token_accuracy": 0.41379310488700866, "step": 101540 }, { "epoch": 0.10227720249264484, "grad_norm": 15.204933789485153, "learning_rate": 4.9664623331583876e-05, "loss": 2.0477, "mean_token_accuracy": 0.46394434571266174, "step": 101545 }, { "epoch": 0.10228223854574901, "grad_norm": 8.825379391394835, "learning_rate": 4.966455887222577e-05, "loss": 2.4553, "mean_token_accuracy": 0.40000000298023225, "step": 101550 }, { "epoch": 0.10228727459885319, "grad_norm": 11.297179166233379, "learning_rate": 4.966449440672023e-05, "loss": 2.472, "mean_token_accuracy": 0.41724138259887694, "step": 101555 }, { "epoch": 0.10229231065195736, "grad_norm": 12.19411521625494, "learning_rate": 4.966442993506729e-05, "loss": 2.7066, "mean_token_accuracy": 0.37931033968925476, "step": 101560 }, { "epoch": 0.10229734670506153, "grad_norm": 11.33800025449437, "learning_rate": 4.966436545726696e-05, "loss": 2.7136, "mean_token_accuracy": 0.4103448331356049, "step": 101565 }, { "epoch": 0.10230238275816571, "grad_norm": 10.279873596176971, "learning_rate": 4.966430097331926e-05, "loss": 2.1518, "mean_token_accuracy": 0.47241379618644713, "step": 101570 }, { "epoch": 0.10230741881126988, "grad_norm": 11.673115134047578, "learning_rate": 4.9664236483224215e-05, "loss": 2.5192, "mean_token_accuracy": 0.44827585816383364, "step": 101575 }, { "epoch": 0.10231245486437406, "grad_norm": 10.944195663423022, "learning_rate": 4.966417198698183e-05, "loss": 2.3235, "mean_token_accuracy": 0.42068966031074523, "step": 101580 }, { "epoch": 0.10231749091747823, "grad_norm": 10.889442432948343, "learning_rate": 4.966410748459213e-05, "loss": 2.6649, "mean_token_accuracy": 0.38620689511299133, "step": 101585 }, { "epoch": 0.1023225269705824, "grad_norm": 10.49645800974054, "learning_rate": 4.966404297605514e-05, "loss": 2.9, "mean_token_accuracy": 0.41724138259887694, "step": 101590 }, { "epoch": 0.10232756302368658, "grad_norm": 14.745876133236068, "learning_rate": 4.9663978461370854e-05, "loss": 2.4916, "mean_token_accuracy": 0.4011494219303131, "step": 101595 }, { "epoch": 0.10233259907679075, "grad_norm": 9.206702414430856, "learning_rate": 4.966391394053931e-05, "loss": 2.6706, "mean_token_accuracy": 0.39310344457626345, "step": 101600 }, { "epoch": 0.10233763512989492, "grad_norm": 10.575541480541212, "learning_rate": 4.9663849413560534e-05, "loss": 2.3481, "mean_token_accuracy": 0.44827585816383364, "step": 101605 }, { "epoch": 0.10234267118299908, "grad_norm": 11.455111239494304, "learning_rate": 4.966378488043452e-05, "loss": 2.2668, "mean_token_accuracy": 0.46206896007061005, "step": 101610 }, { "epoch": 0.10234770723610326, "grad_norm": 10.099946547297618, "learning_rate": 4.96637203411613e-05, "loss": 2.3926, "mean_token_accuracy": 0.42758620977401735, "step": 101615 }, { "epoch": 0.10235274328920743, "grad_norm": 10.492093904601548, "learning_rate": 4.9663655795740895e-05, "loss": 2.3901, "mean_token_accuracy": 0.45862067937850953, "step": 101620 }, { "epoch": 0.1023577793423116, "grad_norm": 10.087200804485107, "learning_rate": 4.966359124417331e-05, "loss": 2.1933, "mean_token_accuracy": 0.4379310369491577, "step": 101625 }, { "epoch": 0.10236281539541578, "grad_norm": 11.9923446371806, "learning_rate": 4.9663526686458575e-05, "loss": 2.1371, "mean_token_accuracy": 0.4689655125141144, "step": 101630 }, { "epoch": 0.10236785144851995, "grad_norm": 18.498692301792317, "learning_rate": 4.96634621225967e-05, "loss": 2.557, "mean_token_accuracy": 0.379310342669487, "step": 101635 }, { "epoch": 0.10237288750162413, "grad_norm": 11.751099973737002, "learning_rate": 4.9663397552587715e-05, "loss": 2.3503, "mean_token_accuracy": 0.4275862157344818, "step": 101640 }, { "epoch": 0.1023779235547283, "grad_norm": 9.677773184802712, "learning_rate": 4.966333297643161e-05, "loss": 2.5097, "mean_token_accuracy": 0.41724138259887694, "step": 101645 }, { "epoch": 0.10238295960783247, "grad_norm": 11.912300075149359, "learning_rate": 4.966326839412844e-05, "loss": 2.4573, "mean_token_accuracy": 0.4, "step": 101650 }, { "epoch": 0.10238799566093665, "grad_norm": 10.07874219320994, "learning_rate": 4.96632038056782e-05, "loss": 2.4578, "mean_token_accuracy": 0.4225045323371887, "step": 101655 }, { "epoch": 0.10239303171404082, "grad_norm": 10.641358592510718, "learning_rate": 4.966313921108091e-05, "loss": 2.4991, "mean_token_accuracy": 0.4310344815254211, "step": 101660 }, { "epoch": 0.102398067767145, "grad_norm": 11.003416098229291, "learning_rate": 4.966307461033659e-05, "loss": 2.5566, "mean_token_accuracy": 0.4172413766384125, "step": 101665 }, { "epoch": 0.10240310382024917, "grad_norm": 12.221158778065156, "learning_rate": 4.9663010003445265e-05, "loss": 2.3277, "mean_token_accuracy": 0.4413793087005615, "step": 101670 }, { "epoch": 0.10240813987335334, "grad_norm": 15.358813467993949, "learning_rate": 4.9662945390406936e-05, "loss": 2.9863, "mean_token_accuracy": 0.3529340624809265, "step": 101675 }, { "epoch": 0.1024131759264575, "grad_norm": 10.354403483886225, "learning_rate": 4.966288077122164e-05, "loss": 2.4462, "mean_token_accuracy": 0.4034482717514038, "step": 101680 }, { "epoch": 0.10241821197956168, "grad_norm": 15.36691114722628, "learning_rate": 4.966281614588938e-05, "loss": 2.6192, "mean_token_accuracy": 0.4517241299152374, "step": 101685 }, { "epoch": 0.10242324803266585, "grad_norm": 11.575804384082033, "learning_rate": 4.966275151441019e-05, "loss": 2.4587, "mean_token_accuracy": 0.4206896543502808, "step": 101690 }, { "epoch": 0.10242828408577002, "grad_norm": 8.582409374530396, "learning_rate": 4.966268687678407e-05, "loss": 2.1808, "mean_token_accuracy": 0.5034482777118683, "step": 101695 }, { "epoch": 0.1024333201388742, "grad_norm": 11.188238898730267, "learning_rate": 4.9662622233011054e-05, "loss": 2.7465, "mean_token_accuracy": 0.39310344457626345, "step": 101700 }, { "epoch": 0.10243835619197837, "grad_norm": 12.498810735958163, "learning_rate": 4.966255758309115e-05, "loss": 2.4635, "mean_token_accuracy": 0.4034482717514038, "step": 101705 }, { "epoch": 0.10244339224508255, "grad_norm": 11.42433198736133, "learning_rate": 4.966249292702436e-05, "loss": 2.3962, "mean_token_accuracy": 0.41379311084747317, "step": 101710 }, { "epoch": 0.10244842829818672, "grad_norm": 11.08192715069187, "learning_rate": 4.966242826481074e-05, "loss": 2.6651, "mean_token_accuracy": 0.3911675751209259, "step": 101715 }, { "epoch": 0.1024534643512909, "grad_norm": 11.88239101660398, "learning_rate": 4.9662363596450284e-05, "loss": 2.4092, "mean_token_accuracy": 0.4517241358757019, "step": 101720 }, { "epoch": 0.10245850040439507, "grad_norm": 11.594731320782449, "learning_rate": 4.966229892194301e-05, "loss": 2.1298, "mean_token_accuracy": 0.4379310369491577, "step": 101725 }, { "epoch": 0.10246353645749924, "grad_norm": 9.138191817084461, "learning_rate": 4.966223424128894e-05, "loss": 2.6246, "mean_token_accuracy": 0.401935875415802, "step": 101730 }, { "epoch": 0.10246857251060341, "grad_norm": 11.059292015228092, "learning_rate": 4.96621695544881e-05, "loss": 2.6863, "mean_token_accuracy": 0.37241379022598264, "step": 101735 }, { "epoch": 0.10247360856370759, "grad_norm": 8.519398414335633, "learning_rate": 4.966210486154049e-05, "loss": 2.2333, "mean_token_accuracy": 0.48965516686439514, "step": 101740 }, { "epoch": 0.10247864461681176, "grad_norm": 9.061208460313637, "learning_rate": 4.966204016244614e-05, "loss": 2.4709, "mean_token_accuracy": 0.44827585816383364, "step": 101745 }, { "epoch": 0.10248368066991592, "grad_norm": 9.70373673018463, "learning_rate": 4.9661975457205066e-05, "loss": 2.2141, "mean_token_accuracy": 0.4551724135875702, "step": 101750 }, { "epoch": 0.1024887167230201, "grad_norm": 7.944684320234915, "learning_rate": 4.966191074581729e-05, "loss": 2.1249, "mean_token_accuracy": 0.5034482717514038, "step": 101755 }, { "epoch": 0.10249375277612427, "grad_norm": 9.808142779752208, "learning_rate": 4.966184602828282e-05, "loss": 2.1705, "mean_token_accuracy": 0.4758620738983154, "step": 101760 }, { "epoch": 0.10249878882922844, "grad_norm": 12.081405100812802, "learning_rate": 4.9661781304601675e-05, "loss": 2.6282, "mean_token_accuracy": 0.3862069010734558, "step": 101765 }, { "epoch": 0.10250382488233262, "grad_norm": 15.460521455990698, "learning_rate": 4.966171657477389e-05, "loss": 2.5727, "mean_token_accuracy": 0.4034482777118683, "step": 101770 }, { "epoch": 0.10250886093543679, "grad_norm": 11.007469662805402, "learning_rate": 4.9661651838799453e-05, "loss": 2.5487, "mean_token_accuracy": 0.4034482777118683, "step": 101775 }, { "epoch": 0.10251389698854096, "grad_norm": 11.458407422322827, "learning_rate": 4.966158709667841e-05, "loss": 2.3742, "mean_token_accuracy": 0.441379314661026, "step": 101780 }, { "epoch": 0.10251893304164514, "grad_norm": 10.868045266672093, "learning_rate": 4.966152234841078e-05, "loss": 2.8839, "mean_token_accuracy": 0.3896551728248596, "step": 101785 }, { "epoch": 0.10252396909474931, "grad_norm": 10.56471236106937, "learning_rate": 4.9661457593996546e-05, "loss": 2.3693, "mean_token_accuracy": 0.38965516686439516, "step": 101790 }, { "epoch": 0.10252900514785349, "grad_norm": 14.95486484000657, "learning_rate": 4.966139283343576e-05, "loss": 2.7499, "mean_token_accuracy": 0.3931034475564957, "step": 101795 }, { "epoch": 0.10253404120095766, "grad_norm": 10.271916892603645, "learning_rate": 4.966132806672843e-05, "loss": 2.3544, "mean_token_accuracy": 0.48965518474578856, "step": 101800 }, { "epoch": 0.10253907725406183, "grad_norm": 12.500649904038099, "learning_rate": 4.9661263293874575e-05, "loss": 2.3332, "mean_token_accuracy": 0.4413793087005615, "step": 101805 }, { "epoch": 0.10254411330716601, "grad_norm": 10.326865659565252, "learning_rate": 4.9661198514874194e-05, "loss": 2.3606, "mean_token_accuracy": 0.4241379380226135, "step": 101810 }, { "epoch": 0.10254914936027018, "grad_norm": 9.822274304239823, "learning_rate": 4.966113372972734e-05, "loss": 2.1969, "mean_token_accuracy": 0.48517847061157227, "step": 101815 }, { "epoch": 0.10255418541337434, "grad_norm": 14.33095604089978, "learning_rate": 4.966106893843401e-05, "loss": 2.794, "mean_token_accuracy": 0.38275861740112305, "step": 101820 }, { "epoch": 0.10255922146647851, "grad_norm": 10.75770010849827, "learning_rate": 4.9661004140994227e-05, "loss": 2.4954, "mean_token_accuracy": 0.41379310488700866, "step": 101825 }, { "epoch": 0.10256425751958269, "grad_norm": 17.115912407385828, "learning_rate": 4.9660939337408e-05, "loss": 2.5609, "mean_token_accuracy": 0.4103448212146759, "step": 101830 }, { "epoch": 0.10256929357268686, "grad_norm": 13.939205403960424, "learning_rate": 4.966087452767536e-05, "loss": 2.5106, "mean_token_accuracy": 0.4172413766384125, "step": 101835 }, { "epoch": 0.10257432962579104, "grad_norm": 11.497435487720708, "learning_rate": 4.9660809711796306e-05, "loss": 2.4063, "mean_token_accuracy": 0.4310344815254211, "step": 101840 }, { "epoch": 0.10257936567889521, "grad_norm": 9.858974802556869, "learning_rate": 4.966074488977088e-05, "loss": 2.4905, "mean_token_accuracy": 0.4344827592372894, "step": 101845 }, { "epoch": 0.10258440173199938, "grad_norm": 10.65864954646837, "learning_rate": 4.9660680061599084e-05, "loss": 2.3768, "mean_token_accuracy": 0.4206896543502808, "step": 101850 }, { "epoch": 0.10258943778510356, "grad_norm": 11.426020433645894, "learning_rate": 4.9660615227280954e-05, "loss": 2.4336, "mean_token_accuracy": 0.4413793087005615, "step": 101855 }, { "epoch": 0.10259447383820773, "grad_norm": 11.348431755087745, "learning_rate": 4.966055038681648e-05, "loss": 2.6029, "mean_token_accuracy": 0.4379310250282288, "step": 101860 }, { "epoch": 0.1025995098913119, "grad_norm": 9.167094048011469, "learning_rate": 4.96604855402057e-05, "loss": 2.4337, "mean_token_accuracy": 0.44682395458221436, "step": 101865 }, { "epoch": 0.10260454594441608, "grad_norm": 12.77370967740232, "learning_rate": 4.9660420687448624e-05, "loss": 2.7136, "mean_token_accuracy": 0.38275861740112305, "step": 101870 }, { "epoch": 0.10260958199752025, "grad_norm": 11.39784735322758, "learning_rate": 4.9660355828545274e-05, "loss": 2.7993, "mean_token_accuracy": 0.3896551787853241, "step": 101875 }, { "epoch": 0.10261461805062443, "grad_norm": 9.42788743546398, "learning_rate": 4.966029096349567e-05, "loss": 2.5035, "mean_token_accuracy": 0.417241370677948, "step": 101880 }, { "epoch": 0.1026196541037286, "grad_norm": 16.510028063328747, "learning_rate": 4.966022609229982e-05, "loss": 2.8042, "mean_token_accuracy": 0.36206896901130675, "step": 101885 }, { "epoch": 0.10262469015683276, "grad_norm": 10.321451525604472, "learning_rate": 4.9660161214957754e-05, "loss": 2.3351, "mean_token_accuracy": 0.48620688915252686, "step": 101890 }, { "epoch": 0.10262972620993693, "grad_norm": 9.621585518575282, "learning_rate": 4.966009633146948e-05, "loss": 1.9495, "mean_token_accuracy": 0.4965517222881317, "step": 101895 }, { "epoch": 0.10263476226304111, "grad_norm": 10.22010924434354, "learning_rate": 4.9660031441835025e-05, "loss": 2.2038, "mean_token_accuracy": 0.5034482657909394, "step": 101900 }, { "epoch": 0.10263979831614528, "grad_norm": 9.859462023288978, "learning_rate": 4.96599665460544e-05, "loss": 2.4479, "mean_token_accuracy": 0.4413793087005615, "step": 101905 }, { "epoch": 0.10264483436924945, "grad_norm": 10.978269983895451, "learning_rate": 4.9659901644127626e-05, "loss": 2.5835, "mean_token_accuracy": 0.39098607897758486, "step": 101910 }, { "epoch": 0.10264987042235363, "grad_norm": 12.484107422676011, "learning_rate": 4.965983673605472e-05, "loss": 2.8407, "mean_token_accuracy": 0.3827586233615875, "step": 101915 }, { "epoch": 0.1026549064754578, "grad_norm": 9.319840339828954, "learning_rate": 4.9659771821835705e-05, "loss": 2.1539, "mean_token_accuracy": 0.4492610812187195, "step": 101920 }, { "epoch": 0.10265994252856198, "grad_norm": 18.990370251890706, "learning_rate": 4.9659706901470594e-05, "loss": 2.5973, "mean_token_accuracy": 0.4137930989265442, "step": 101925 }, { "epoch": 0.10266497858166615, "grad_norm": 11.767563176004774, "learning_rate": 4.96596419749594e-05, "loss": 2.7638, "mean_token_accuracy": 0.3896551728248596, "step": 101930 }, { "epoch": 0.10267001463477032, "grad_norm": 9.250690645177698, "learning_rate": 4.9659577042302145e-05, "loss": 2.4109, "mean_token_accuracy": 0.4344827651977539, "step": 101935 }, { "epoch": 0.1026750506878745, "grad_norm": 13.465977049780664, "learning_rate": 4.9659512103498855e-05, "loss": 2.9523, "mean_token_accuracy": 0.398064124584198, "step": 101940 }, { "epoch": 0.10268008674097867, "grad_norm": 12.779978983341879, "learning_rate": 4.965944715854954e-05, "loss": 2.5364, "mean_token_accuracy": 0.4186327874660492, "step": 101945 }, { "epoch": 0.10268512279408284, "grad_norm": 8.497088425683993, "learning_rate": 4.9659382207454224e-05, "loss": 2.1507, "mean_token_accuracy": 0.47586206793785096, "step": 101950 }, { "epoch": 0.10269015884718702, "grad_norm": 10.939774519452017, "learning_rate": 4.9659317250212917e-05, "loss": 2.6772, "mean_token_accuracy": 0.41724138259887694, "step": 101955 }, { "epoch": 0.10269519490029118, "grad_norm": 12.280715256808495, "learning_rate": 4.965925228682563e-05, "loss": 2.309, "mean_token_accuracy": 0.4220810651779175, "step": 101960 }, { "epoch": 0.10270023095339535, "grad_norm": 22.130645483761572, "learning_rate": 4.965918731729241e-05, "loss": 2.4027, "mean_token_accuracy": 0.44137930274009707, "step": 101965 }, { "epoch": 0.10270526700649953, "grad_norm": 14.658707686083057, "learning_rate": 4.965912234161325e-05, "loss": 2.8321, "mean_token_accuracy": 0.37931033968925476, "step": 101970 }, { "epoch": 0.1027103030596037, "grad_norm": 10.223582360701371, "learning_rate": 4.965905735978817e-05, "loss": 2.669, "mean_token_accuracy": 0.35172413289546967, "step": 101975 }, { "epoch": 0.10271533911270787, "grad_norm": 9.000549878270615, "learning_rate": 4.96589923718172e-05, "loss": 2.2625, "mean_token_accuracy": 0.41724138259887694, "step": 101980 }, { "epoch": 0.10272037516581205, "grad_norm": 11.357538538101815, "learning_rate": 4.965892737770034e-05, "loss": 2.6219, "mean_token_accuracy": 0.3931034505367279, "step": 101985 }, { "epoch": 0.10272541121891622, "grad_norm": 10.25022959832566, "learning_rate": 4.965886237743763e-05, "loss": 2.5494, "mean_token_accuracy": 0.39110708236694336, "step": 101990 }, { "epoch": 0.1027304472720204, "grad_norm": 10.529763776620856, "learning_rate": 4.965879737102907e-05, "loss": 2.3375, "mean_token_accuracy": 0.4068965494632721, "step": 101995 }, { "epoch": 0.10273548332512457, "grad_norm": 10.923046048960414, "learning_rate": 4.9658732358474684e-05, "loss": 2.3635, "mean_token_accuracy": 0.41379311084747317, "step": 102000 }, { "epoch": 0.10274051937822874, "grad_norm": 9.27500908781363, "learning_rate": 4.96586673397745e-05, "loss": 2.3628, "mean_token_accuracy": 0.44482759237289426, "step": 102005 }, { "epoch": 0.10274555543133292, "grad_norm": 9.633407643152006, "learning_rate": 4.965860231492852e-05, "loss": 2.8986, "mean_token_accuracy": 0.44482759237289426, "step": 102010 }, { "epoch": 0.10275059148443709, "grad_norm": 7.5630789289690465, "learning_rate": 4.9658537283936774e-05, "loss": 2.3514, "mean_token_accuracy": 0.45759226083755494, "step": 102015 }, { "epoch": 0.10275562753754126, "grad_norm": 11.504885787997875, "learning_rate": 4.965847224679927e-05, "loss": 2.2804, "mean_token_accuracy": 0.4310344815254211, "step": 102020 }, { "epoch": 0.10276066359064544, "grad_norm": 22.000718110991656, "learning_rate": 4.965840720351603e-05, "loss": 2.887, "mean_token_accuracy": 0.43448275327682495, "step": 102025 }, { "epoch": 0.1027656996437496, "grad_norm": 9.415149749126341, "learning_rate": 4.965834215408707e-05, "loss": 2.367, "mean_token_accuracy": 0.42413792610168455, "step": 102030 }, { "epoch": 0.10277073569685377, "grad_norm": 10.134115044373447, "learning_rate": 4.965827709851242e-05, "loss": 2.276, "mean_token_accuracy": 0.45172414779663084, "step": 102035 }, { "epoch": 0.10277577174995794, "grad_norm": 10.408724625688393, "learning_rate": 4.965821203679209e-05, "loss": 2.1607, "mean_token_accuracy": 0.43103448748588563, "step": 102040 }, { "epoch": 0.10278080780306212, "grad_norm": 10.781107238036869, "learning_rate": 4.965814696892609e-05, "loss": 2.5701, "mean_token_accuracy": 0.41379310488700866, "step": 102045 }, { "epoch": 0.10278584385616629, "grad_norm": 10.918861678490645, "learning_rate": 4.965808189491445e-05, "loss": 2.2644, "mean_token_accuracy": 0.42413792610168455, "step": 102050 }, { "epoch": 0.10279087990927047, "grad_norm": 8.356097719679958, "learning_rate": 4.965801681475717e-05, "loss": 2.1046, "mean_token_accuracy": 0.4793103516101837, "step": 102055 }, { "epoch": 0.10279591596237464, "grad_norm": 11.286571209524185, "learning_rate": 4.96579517284543e-05, "loss": 2.2013, "mean_token_accuracy": 0.49999999403953554, "step": 102060 }, { "epoch": 0.10280095201547881, "grad_norm": 9.781892291497881, "learning_rate": 4.9657886636005835e-05, "loss": 2.1821, "mean_token_accuracy": 0.43448275327682495, "step": 102065 }, { "epoch": 0.10280598806858299, "grad_norm": 10.454231894049318, "learning_rate": 4.965782153741179e-05, "loss": 2.6903, "mean_token_accuracy": 0.4206896543502808, "step": 102070 }, { "epoch": 0.10281102412168716, "grad_norm": 13.977495116606674, "learning_rate": 4.96577564326722e-05, "loss": 2.1795, "mean_token_accuracy": 0.4517241358757019, "step": 102075 }, { "epoch": 0.10281606017479133, "grad_norm": 10.916049243674708, "learning_rate": 4.965769132178707e-05, "loss": 2.5148, "mean_token_accuracy": 0.39310345649719236, "step": 102080 }, { "epoch": 0.10282109622789551, "grad_norm": 11.019145188539882, "learning_rate": 4.965762620475642e-05, "loss": 2.4799, "mean_token_accuracy": 0.4034482717514038, "step": 102085 }, { "epoch": 0.10282613228099968, "grad_norm": 10.471593732993124, "learning_rate": 4.9657561081580274e-05, "loss": 2.1258, "mean_token_accuracy": 0.458620685338974, "step": 102090 }, { "epoch": 0.10283116833410386, "grad_norm": 9.632936805314667, "learning_rate": 4.9657495952258634e-05, "loss": 2.2233, "mean_token_accuracy": 0.4275862067937851, "step": 102095 }, { "epoch": 0.10283620438720802, "grad_norm": 12.075287331955318, "learning_rate": 4.965743081679154e-05, "loss": 2.3132, "mean_token_accuracy": 0.47241378426551817, "step": 102100 }, { "epoch": 0.10284124044031219, "grad_norm": 10.053007290233133, "learning_rate": 4.9657365675179e-05, "loss": 3.0249, "mean_token_accuracy": 0.4124621868133545, "step": 102105 }, { "epoch": 0.10284627649341636, "grad_norm": 10.108612896490703, "learning_rate": 4.9657300527421026e-05, "loss": 2.4482, "mean_token_accuracy": 0.3896551728248596, "step": 102110 }, { "epoch": 0.10285131254652054, "grad_norm": 11.122819943444814, "learning_rate": 4.965723537351765e-05, "loss": 2.543, "mean_token_accuracy": 0.4137930929660797, "step": 102115 }, { "epoch": 0.10285634859962471, "grad_norm": 10.945181653672073, "learning_rate": 4.965717021346887e-05, "loss": 2.4663, "mean_token_accuracy": 0.45862069725990295, "step": 102120 }, { "epoch": 0.10286138465272888, "grad_norm": 9.588363496597273, "learning_rate": 4.9657105047274735e-05, "loss": 2.1413, "mean_token_accuracy": 0.4776164531707764, "step": 102125 }, { "epoch": 0.10286642070583306, "grad_norm": 13.137631775570863, "learning_rate": 4.965703987493523e-05, "loss": 2.4318, "mean_token_accuracy": 0.37586206793785093, "step": 102130 }, { "epoch": 0.10287145675893723, "grad_norm": 11.256421228621463, "learning_rate": 4.965697469645039e-05, "loss": 2.744, "mean_token_accuracy": 0.4103448331356049, "step": 102135 }, { "epoch": 0.1028764928120414, "grad_norm": 10.607895119433163, "learning_rate": 4.965690951182023e-05, "loss": 2.841, "mean_token_accuracy": 0.4, "step": 102140 }, { "epoch": 0.10288152886514558, "grad_norm": 11.257702317682202, "learning_rate": 4.965684432104477e-05, "loss": 2.7679, "mean_token_accuracy": 0.3965517282485962, "step": 102145 }, { "epoch": 0.10288656491824975, "grad_norm": 10.201335101923332, "learning_rate": 4.965677912412402e-05, "loss": 3.1029, "mean_token_accuracy": 0.3724137932062149, "step": 102150 }, { "epoch": 0.10289160097135393, "grad_norm": 11.691532097259172, "learning_rate": 4.965671392105802e-05, "loss": 2.7562, "mean_token_accuracy": 0.4467029690742493, "step": 102155 }, { "epoch": 0.1028966370244581, "grad_norm": 10.92796435176211, "learning_rate": 4.965664871184676e-05, "loss": 2.2167, "mean_token_accuracy": 0.4777374446392059, "step": 102160 }, { "epoch": 0.10290167307756228, "grad_norm": 10.51862890936034, "learning_rate": 4.9656583496490275e-05, "loss": 2.2038, "mean_token_accuracy": 0.42413793206214906, "step": 102165 }, { "epoch": 0.10290670913066643, "grad_norm": 14.117083089812956, "learning_rate": 4.965651827498858e-05, "loss": 2.9368, "mean_token_accuracy": 0.42413793206214906, "step": 102170 }, { "epoch": 0.10291174518377061, "grad_norm": 9.525585719769627, "learning_rate": 4.965645304734169e-05, "loss": 2.0411, "mean_token_accuracy": 0.48275861144065857, "step": 102175 }, { "epoch": 0.10291678123687478, "grad_norm": 11.178360780100101, "learning_rate": 4.965638781354962e-05, "loss": 2.1904, "mean_token_accuracy": 0.4413793087005615, "step": 102180 }, { "epoch": 0.10292181728997896, "grad_norm": 10.293867303868764, "learning_rate": 4.96563225736124e-05, "loss": 2.2929, "mean_token_accuracy": 0.47931034564971925, "step": 102185 }, { "epoch": 0.10292685334308313, "grad_norm": 9.916686573688567, "learning_rate": 4.965625732753004e-05, "loss": 2.1097, "mean_token_accuracy": 0.4862068951129913, "step": 102190 }, { "epoch": 0.1029318893961873, "grad_norm": 12.2876593804011, "learning_rate": 4.965619207530256e-05, "loss": 2.0313, "mean_token_accuracy": 0.512583190202713, "step": 102195 }, { "epoch": 0.10293692544929148, "grad_norm": 10.686750626569669, "learning_rate": 4.965612681692998e-05, "loss": 2.5175, "mean_token_accuracy": 0.42413793206214906, "step": 102200 }, { "epoch": 0.10294196150239565, "grad_norm": 11.06822083002831, "learning_rate": 4.965606155241231e-05, "loss": 2.3551, "mean_token_accuracy": 0.38965516686439516, "step": 102205 }, { "epoch": 0.10294699755549983, "grad_norm": 12.736464350132138, "learning_rate": 4.965599628174958e-05, "loss": 3.0601, "mean_token_accuracy": 0.34137930870056155, "step": 102210 }, { "epoch": 0.102952033608604, "grad_norm": 8.848482387845467, "learning_rate": 4.965593100494179e-05, "loss": 2.2923, "mean_token_accuracy": 0.4551724135875702, "step": 102215 }, { "epoch": 0.10295706966170817, "grad_norm": 11.158299392439307, "learning_rate": 4.965586572198898e-05, "loss": 2.6138, "mean_token_accuracy": 0.4, "step": 102220 }, { "epoch": 0.10296210571481235, "grad_norm": 10.227388314430089, "learning_rate": 4.9655800432891156e-05, "loss": 2.4652, "mean_token_accuracy": 0.37586207389831544, "step": 102225 }, { "epoch": 0.10296714176791652, "grad_norm": 11.121329429618935, "learning_rate": 4.965573513764834e-05, "loss": 2.103, "mean_token_accuracy": 0.4758620738983154, "step": 102230 }, { "epoch": 0.1029721778210207, "grad_norm": 13.31074725819182, "learning_rate": 4.9655669836260546e-05, "loss": 2.9709, "mean_token_accuracy": 0.37931033968925476, "step": 102235 }, { "epoch": 0.10297721387412485, "grad_norm": 10.57385465195964, "learning_rate": 4.965560452872779e-05, "loss": 2.8341, "mean_token_accuracy": 0.38965516686439516, "step": 102240 }, { "epoch": 0.10298224992722903, "grad_norm": 10.886100239209368, "learning_rate": 4.9655539215050104e-05, "loss": 2.2853, "mean_token_accuracy": 0.39310345649719236, "step": 102245 }, { "epoch": 0.1029872859803332, "grad_norm": 10.70721907608428, "learning_rate": 4.965547389522749e-05, "loss": 2.5388, "mean_token_accuracy": 0.44827587008476255, "step": 102250 }, { "epoch": 0.10299232203343738, "grad_norm": 9.1492192171996, "learning_rate": 4.965540856925998e-05, "loss": 2.3169, "mean_token_accuracy": 0.42068964838981626, "step": 102255 }, { "epoch": 0.10299735808654155, "grad_norm": 11.588311642934736, "learning_rate": 4.965534323714757e-05, "loss": 2.2744, "mean_token_accuracy": 0.4793103516101837, "step": 102260 }, { "epoch": 0.10300239413964572, "grad_norm": 10.638209323478575, "learning_rate": 4.96552778988903e-05, "loss": 2.4668, "mean_token_accuracy": 0.4034482717514038, "step": 102265 }, { "epoch": 0.1030074301927499, "grad_norm": 14.192960015896805, "learning_rate": 4.965521255448818e-05, "loss": 2.7249, "mean_token_accuracy": 0.3620689630508423, "step": 102270 }, { "epoch": 0.10301246624585407, "grad_norm": 11.741020354618106, "learning_rate": 4.965514720394124e-05, "loss": 2.6517, "mean_token_accuracy": 0.3931034505367279, "step": 102275 }, { "epoch": 0.10301750229895824, "grad_norm": 10.103938788632266, "learning_rate": 4.965508184724948e-05, "loss": 2.4217, "mean_token_accuracy": 0.4344827592372894, "step": 102280 }, { "epoch": 0.10302253835206242, "grad_norm": 9.435916129455332, "learning_rate": 4.965501648441292e-05, "loss": 2.482, "mean_token_accuracy": 0.439443439245224, "step": 102285 }, { "epoch": 0.10302757440516659, "grad_norm": 10.856898010993282, "learning_rate": 4.965495111543159e-05, "loss": 2.3643, "mean_token_accuracy": 0.4206896543502808, "step": 102290 }, { "epoch": 0.10303261045827077, "grad_norm": 10.73676591610329, "learning_rate": 4.96548857403055e-05, "loss": 2.667, "mean_token_accuracy": 0.4275861978530884, "step": 102295 }, { "epoch": 0.10303764651137494, "grad_norm": 12.054086517693046, "learning_rate": 4.9654820359034667e-05, "loss": 2.3956, "mean_token_accuracy": 0.46896552443504336, "step": 102300 }, { "epoch": 0.10304268256447911, "grad_norm": 21.745721914804694, "learning_rate": 4.965475497161912e-05, "loss": 2.7046, "mean_token_accuracy": 0.39310344457626345, "step": 102305 }, { "epoch": 0.10304771861758327, "grad_norm": 12.650487035239586, "learning_rate": 4.965468957805886e-05, "loss": 2.3852, "mean_token_accuracy": 0.4034482777118683, "step": 102310 }, { "epoch": 0.10305275467068745, "grad_norm": 13.08184521403198, "learning_rate": 4.965462417835392e-05, "loss": 2.4584, "mean_token_accuracy": 0.44482759237289426, "step": 102315 }, { "epoch": 0.10305779072379162, "grad_norm": 15.234797038153893, "learning_rate": 4.96545587725043e-05, "loss": 2.5349, "mean_token_accuracy": 0.38965517580509185, "step": 102320 }, { "epoch": 0.1030628267768958, "grad_norm": 17.217485075191032, "learning_rate": 4.9654493360510054e-05, "loss": 2.9051, "mean_token_accuracy": 0.3862068891525269, "step": 102325 }, { "epoch": 0.10306786282999997, "grad_norm": 9.105017808039984, "learning_rate": 4.965442794237115e-05, "loss": 2.5363, "mean_token_accuracy": 0.4344827592372894, "step": 102330 }, { "epoch": 0.10307289888310414, "grad_norm": 11.451312095636137, "learning_rate": 4.9654362518087654e-05, "loss": 2.6433, "mean_token_accuracy": 0.41724138259887694, "step": 102335 }, { "epoch": 0.10307793493620832, "grad_norm": 11.475994716990174, "learning_rate": 4.965429708765955e-05, "loss": 2.8815, "mean_token_accuracy": 0.37047791481018066, "step": 102340 }, { "epoch": 0.10308297098931249, "grad_norm": 11.22939912890726, "learning_rate": 4.965423165108687e-05, "loss": 2.7322, "mean_token_accuracy": 0.42068966031074523, "step": 102345 }, { "epoch": 0.10308800704241666, "grad_norm": 11.083428163653416, "learning_rate": 4.965416620836964e-05, "loss": 2.4685, "mean_token_accuracy": 0.4103448212146759, "step": 102350 }, { "epoch": 0.10309304309552084, "grad_norm": 11.22904067861591, "learning_rate": 4.965410075950786e-05, "loss": 2.4457, "mean_token_accuracy": 0.4536600112915039, "step": 102355 }, { "epoch": 0.10309807914862501, "grad_norm": 11.042799264635574, "learning_rate": 4.9654035304501567e-05, "loss": 2.5452, "mean_token_accuracy": 0.39655172228813174, "step": 102360 }, { "epoch": 0.10310311520172918, "grad_norm": 11.604895082807737, "learning_rate": 4.9653969843350764e-05, "loss": 2.5327, "mean_token_accuracy": 0.36896551251411436, "step": 102365 }, { "epoch": 0.10310815125483336, "grad_norm": 8.972818498717327, "learning_rate": 4.965390437605547e-05, "loss": 2.1956, "mean_token_accuracy": 0.4620689690113068, "step": 102370 }, { "epoch": 0.10311318730793753, "grad_norm": 13.044439230023187, "learning_rate": 4.9653838902615716e-05, "loss": 2.4239, "mean_token_accuracy": 0.4275861978530884, "step": 102375 }, { "epoch": 0.10311822336104169, "grad_norm": 11.155043759351242, "learning_rate": 4.965377342303151e-05, "loss": 2.544, "mean_token_accuracy": 0.44482758045196535, "step": 102380 }, { "epoch": 0.10312325941414587, "grad_norm": 11.443821357480022, "learning_rate": 4.9653707937302876e-05, "loss": 2.7777, "mean_token_accuracy": 0.37586206793785093, "step": 102385 }, { "epoch": 0.10312829546725004, "grad_norm": 11.512084109565542, "learning_rate": 4.965364244542982e-05, "loss": 2.5911, "mean_token_accuracy": 0.43103448748588563, "step": 102390 }, { "epoch": 0.10313333152035421, "grad_norm": 8.834290493130307, "learning_rate": 4.965357694741237e-05, "loss": 2.437, "mean_token_accuracy": 0.36551723778247835, "step": 102395 }, { "epoch": 0.10313836757345839, "grad_norm": 11.404413400892183, "learning_rate": 4.965351144325055e-05, "loss": 2.3684, "mean_token_accuracy": 0.4620689690113068, "step": 102400 }, { "epoch": 0.10314340362656256, "grad_norm": 9.690302487377703, "learning_rate": 4.965344593294437e-05, "loss": 2.6497, "mean_token_accuracy": 0.36896551847457887, "step": 102405 }, { "epoch": 0.10314843967966673, "grad_norm": 9.410942211405763, "learning_rate": 4.9653380416493845e-05, "loss": 2.3542, "mean_token_accuracy": 0.42758620977401735, "step": 102410 }, { "epoch": 0.10315347573277091, "grad_norm": 10.334421098922991, "learning_rate": 4.9653314893898995e-05, "loss": 2.6954, "mean_token_accuracy": 0.35172413289546967, "step": 102415 }, { "epoch": 0.10315851178587508, "grad_norm": 12.287584805356952, "learning_rate": 4.965324936515984e-05, "loss": 2.4355, "mean_token_accuracy": 0.417241370677948, "step": 102420 }, { "epoch": 0.10316354783897926, "grad_norm": 11.606875750707568, "learning_rate": 4.96531838302764e-05, "loss": 2.6862, "mean_token_accuracy": 0.3862068891525269, "step": 102425 }, { "epoch": 0.10316858389208343, "grad_norm": 9.6642033044946, "learning_rate": 4.96531182892487e-05, "loss": 2.3706, "mean_token_accuracy": 0.4068965494632721, "step": 102430 }, { "epoch": 0.1031736199451876, "grad_norm": 16.15070223826042, "learning_rate": 4.9653052742076745e-05, "loss": 2.7245, "mean_token_accuracy": 0.4068965494632721, "step": 102435 }, { "epoch": 0.10317865599829178, "grad_norm": 12.292067190496162, "learning_rate": 4.965298718876055e-05, "loss": 2.7839, "mean_token_accuracy": 0.3620689630508423, "step": 102440 }, { "epoch": 0.10318369205139595, "grad_norm": 12.262468834402549, "learning_rate": 4.965292162930015e-05, "loss": 2.6092, "mean_token_accuracy": 0.41724138855934145, "step": 102445 }, { "epoch": 0.10318872810450011, "grad_norm": 10.366988528147912, "learning_rate": 4.965285606369555e-05, "loss": 2.1988, "mean_token_accuracy": 0.41379310488700866, "step": 102450 }, { "epoch": 0.10319376415760428, "grad_norm": 9.550317414248871, "learning_rate": 4.965279049194678e-05, "loss": 2.3536, "mean_token_accuracy": 0.3896551728248596, "step": 102455 }, { "epoch": 0.10319880021070846, "grad_norm": 13.132805084269354, "learning_rate": 4.9652724914053845e-05, "loss": 2.3134, "mean_token_accuracy": 0.493103438615799, "step": 102460 }, { "epoch": 0.10320383626381263, "grad_norm": 13.844490654754868, "learning_rate": 4.965265933001678e-05, "loss": 2.7033, "mean_token_accuracy": 0.4344827473163605, "step": 102465 }, { "epoch": 0.1032088723169168, "grad_norm": 10.165563134331066, "learning_rate": 4.965259373983558e-05, "loss": 2.5848, "mean_token_accuracy": 0.38275861740112305, "step": 102470 }, { "epoch": 0.10321390837002098, "grad_norm": 11.0973737556596, "learning_rate": 4.9652528143510274e-05, "loss": 2.3131, "mean_token_accuracy": 0.4344827592372894, "step": 102475 }, { "epoch": 0.10321894442312515, "grad_norm": 9.691557745696265, "learning_rate": 4.965246254104089e-05, "loss": 2.1327, "mean_token_accuracy": 0.5206896543502808, "step": 102480 }, { "epoch": 0.10322398047622933, "grad_norm": 12.20701557338449, "learning_rate": 4.965239693242742e-05, "loss": 2.4231, "mean_token_accuracy": 0.4206896543502808, "step": 102485 }, { "epoch": 0.1032290165293335, "grad_norm": 12.365695155282843, "learning_rate": 4.9652331317669926e-05, "loss": 2.7968, "mean_token_accuracy": 0.3275862067937851, "step": 102490 }, { "epoch": 0.10323405258243767, "grad_norm": 11.42862035879425, "learning_rate": 4.9652265696768384e-05, "loss": 2.5835, "mean_token_accuracy": 0.41379310488700866, "step": 102495 }, { "epoch": 0.10323908863554185, "grad_norm": 8.460444567284652, "learning_rate": 4.965220006972284e-05, "loss": 1.9352, "mean_token_accuracy": 0.46896551847457885, "step": 102500 }, { "epoch": 0.10324412468864602, "grad_norm": 11.582043784227944, "learning_rate": 4.965213443653329e-05, "loss": 2.3879, "mean_token_accuracy": 0.4517241299152374, "step": 102505 }, { "epoch": 0.1032491607417502, "grad_norm": 10.155256065840705, "learning_rate": 4.965206879719977e-05, "loss": 2.2055, "mean_token_accuracy": 0.4931034505367279, "step": 102510 }, { "epoch": 0.10325419679485437, "grad_norm": 10.938398392110008, "learning_rate": 4.965200315172228e-05, "loss": 2.4685, "mean_token_accuracy": 0.42413793206214906, "step": 102515 }, { "epoch": 0.10325923284795853, "grad_norm": 9.291433428053487, "learning_rate": 4.965193750010086e-05, "loss": 2.3029, "mean_token_accuracy": 0.42758620977401735, "step": 102520 }, { "epoch": 0.1032642689010627, "grad_norm": 10.643813554795633, "learning_rate": 4.965187184233551e-05, "loss": 2.2657, "mean_token_accuracy": 0.42413793206214906, "step": 102525 }, { "epoch": 0.10326930495416688, "grad_norm": 11.060384727781962, "learning_rate": 4.9651806178426265e-05, "loss": 2.3702, "mean_token_accuracy": 0.4297035694122314, "step": 102530 }, { "epoch": 0.10327434100727105, "grad_norm": 16.888993833240633, "learning_rate": 4.965174050837313e-05, "loss": 2.6972, "mean_token_accuracy": 0.41034482717514037, "step": 102535 }, { "epoch": 0.10327937706037522, "grad_norm": 9.448533773582648, "learning_rate": 4.9651674832176125e-05, "loss": 2.2382, "mean_token_accuracy": 0.47586206197738645, "step": 102540 }, { "epoch": 0.1032844131134794, "grad_norm": 9.512584620871289, "learning_rate": 4.9651609149835265e-05, "loss": 2.6679, "mean_token_accuracy": 0.4172413766384125, "step": 102545 }, { "epoch": 0.10328944916658357, "grad_norm": 13.64653361353644, "learning_rate": 4.9651543461350584e-05, "loss": 2.4553, "mean_token_accuracy": 0.4620689630508423, "step": 102550 }, { "epoch": 0.10329448521968775, "grad_norm": 11.438827036420195, "learning_rate": 4.965147776672209e-05, "loss": 2.4752, "mean_token_accuracy": 0.4496672749519348, "step": 102555 }, { "epoch": 0.10329952127279192, "grad_norm": 10.146353314890844, "learning_rate": 4.96514120659498e-05, "loss": 2.8717, "mean_token_accuracy": 0.3517241418361664, "step": 102560 }, { "epoch": 0.1033045573258961, "grad_norm": 11.140627280171406, "learning_rate": 4.965134635903373e-05, "loss": 2.6728, "mean_token_accuracy": 0.4068965554237366, "step": 102565 }, { "epoch": 0.10330959337900027, "grad_norm": 11.410044974639801, "learning_rate": 4.96512806459739e-05, "loss": 2.2708, "mean_token_accuracy": 0.4448275864124298, "step": 102570 }, { "epoch": 0.10331462943210444, "grad_norm": 8.977165008458515, "learning_rate": 4.965121492677032e-05, "loss": 2.352, "mean_token_accuracy": 0.37586206793785093, "step": 102575 }, { "epoch": 0.10331966548520861, "grad_norm": 9.506870663721585, "learning_rate": 4.9651149201423036e-05, "loss": 2.1491, "mean_token_accuracy": 0.4379310369491577, "step": 102580 }, { "epoch": 0.10332470153831277, "grad_norm": 10.27802258758621, "learning_rate": 4.965108346993204e-05, "loss": 2.4839, "mean_token_accuracy": 0.44482758045196535, "step": 102585 }, { "epoch": 0.10332973759141695, "grad_norm": 9.179126107326997, "learning_rate": 4.9651017732297363e-05, "loss": 2.7501, "mean_token_accuracy": 0.40532365441322327, "step": 102590 }, { "epoch": 0.10333477364452112, "grad_norm": 9.963692042329413, "learning_rate": 4.9650951988519015e-05, "loss": 2.1578, "mean_token_accuracy": 0.43793103098869324, "step": 102595 }, { "epoch": 0.1033398096976253, "grad_norm": 8.942288152229755, "learning_rate": 4.9650886238597015e-05, "loss": 2.1868, "mean_token_accuracy": 0.47241380214691164, "step": 102600 }, { "epoch": 0.10334484575072947, "grad_norm": 10.93475842217757, "learning_rate": 4.9650820482531384e-05, "loss": 2.2634, "mean_token_accuracy": 0.43284936547279357, "step": 102605 }, { "epoch": 0.10334988180383364, "grad_norm": 12.283703315383988, "learning_rate": 4.965075472032215e-05, "loss": 2.3561, "mean_token_accuracy": 0.4379310369491577, "step": 102610 }, { "epoch": 0.10335491785693782, "grad_norm": 10.175706091107438, "learning_rate": 4.965068895196931e-05, "loss": 2.2654, "mean_token_accuracy": 0.47931034564971925, "step": 102615 }, { "epoch": 0.10335995391004199, "grad_norm": 13.829327287560446, "learning_rate": 4.96506231774729e-05, "loss": 2.4558, "mean_token_accuracy": 0.3551724195480347, "step": 102620 }, { "epoch": 0.10336498996314616, "grad_norm": 36.56604074730109, "learning_rate": 4.965055739683293e-05, "loss": 2.4292, "mean_token_accuracy": 0.4620689630508423, "step": 102625 }, { "epoch": 0.10337002601625034, "grad_norm": 11.171458291252886, "learning_rate": 4.965049161004942e-05, "loss": 2.439, "mean_token_accuracy": 0.42758620381355283, "step": 102630 }, { "epoch": 0.10337506206935451, "grad_norm": 10.630095197094661, "learning_rate": 4.9650425817122394e-05, "loss": 2.5001, "mean_token_accuracy": 0.441379314661026, "step": 102635 }, { "epoch": 0.10338009812245869, "grad_norm": 11.123369177335173, "learning_rate": 4.9650360018051856e-05, "loss": 2.6794, "mean_token_accuracy": 0.37241379618644715, "step": 102640 }, { "epoch": 0.10338513417556286, "grad_norm": 8.401817875232402, "learning_rate": 4.9650294212837836e-05, "loss": 2.1332, "mean_token_accuracy": 0.42413793206214906, "step": 102645 }, { "epoch": 0.10339017022866703, "grad_norm": 11.943173112985958, "learning_rate": 4.965022840148036e-05, "loss": 2.7018, "mean_token_accuracy": 0.4344827592372894, "step": 102650 }, { "epoch": 0.1033952062817712, "grad_norm": 12.505414167079445, "learning_rate": 4.9650162583979415e-05, "loss": 2.6027, "mean_token_accuracy": 0.42758620381355283, "step": 102655 }, { "epoch": 0.10340024233487537, "grad_norm": 21.47987013681267, "learning_rate": 4.965009676033505e-05, "loss": 2.79, "mean_token_accuracy": 0.37931033968925476, "step": 102660 }, { "epoch": 0.10340527838797954, "grad_norm": 10.206726569838661, "learning_rate": 4.965003093054728e-05, "loss": 2.7186, "mean_token_accuracy": 0.4172413766384125, "step": 102665 }, { "epoch": 0.10341031444108371, "grad_norm": 10.521354699204924, "learning_rate": 4.9649965094616106e-05, "loss": 2.5722, "mean_token_accuracy": 0.4413793087005615, "step": 102670 }, { "epoch": 0.10341535049418789, "grad_norm": 9.998131703431875, "learning_rate": 4.9649899252541556e-05, "loss": 2.5432, "mean_token_accuracy": 0.4379310250282288, "step": 102675 }, { "epoch": 0.10342038654729206, "grad_norm": 10.319603400035327, "learning_rate": 4.9649833404323666e-05, "loss": 2.1164, "mean_token_accuracy": 0.4620689690113068, "step": 102680 }, { "epoch": 0.10342542260039624, "grad_norm": 11.182615632355347, "learning_rate": 4.9649767549962416e-05, "loss": 2.7234, "mean_token_accuracy": 0.4344827592372894, "step": 102685 }, { "epoch": 0.10343045865350041, "grad_norm": 10.896269753901313, "learning_rate": 4.964970168945786e-05, "loss": 2.5915, "mean_token_accuracy": 0.37586206793785093, "step": 102690 }, { "epoch": 0.10343549470660458, "grad_norm": 10.736927029675403, "learning_rate": 4.964963582280999e-05, "loss": 2.0535, "mean_token_accuracy": 0.4949788272380829, "step": 102695 }, { "epoch": 0.10344053075970876, "grad_norm": 9.482490996003563, "learning_rate": 4.964956995001884e-05, "loss": 1.9643, "mean_token_accuracy": 0.4931034445762634, "step": 102700 }, { "epoch": 0.10344556681281293, "grad_norm": 12.004747237594303, "learning_rate": 4.964950407108443e-05, "loss": 2.7128, "mean_token_accuracy": 0.3758620619773865, "step": 102705 }, { "epoch": 0.1034506028659171, "grad_norm": 10.358029056520946, "learning_rate": 4.964943818600677e-05, "loss": 2.217, "mean_token_accuracy": 0.4517241418361664, "step": 102710 }, { "epoch": 0.10345563891902128, "grad_norm": 12.833175990620939, "learning_rate": 4.964937229478587e-05, "loss": 2.7707, "mean_token_accuracy": 0.36896551251411436, "step": 102715 }, { "epoch": 0.10346067497212545, "grad_norm": 12.048965530177313, "learning_rate": 4.9649306397421776e-05, "loss": 2.2048, "mean_token_accuracy": 0.44482758045196535, "step": 102720 }, { "epoch": 0.10346571102522961, "grad_norm": 11.588323127330552, "learning_rate": 4.964924049391448e-05, "loss": 2.4735, "mean_token_accuracy": 0.4034482717514038, "step": 102725 }, { "epoch": 0.10347074707833379, "grad_norm": 13.710773499176616, "learning_rate": 4.964917458426401e-05, "loss": 2.7472, "mean_token_accuracy": 0.3896551728248596, "step": 102730 }, { "epoch": 0.10347578313143796, "grad_norm": 13.268886204756269, "learning_rate": 4.9649108668470386e-05, "loss": 2.7009, "mean_token_accuracy": 0.3655172437429428, "step": 102735 }, { "epoch": 0.10348081918454213, "grad_norm": 15.896686541395285, "learning_rate": 4.964904274653362e-05, "loss": 2.7674, "mean_token_accuracy": 0.4034482717514038, "step": 102740 }, { "epoch": 0.10348585523764631, "grad_norm": 9.749419147201362, "learning_rate": 4.9648976818453743e-05, "loss": 2.1021, "mean_token_accuracy": 0.4551724076271057, "step": 102745 }, { "epoch": 0.10349089129075048, "grad_norm": 10.351683510126103, "learning_rate": 4.964891088423075e-05, "loss": 2.4928, "mean_token_accuracy": 0.44482758045196535, "step": 102750 }, { "epoch": 0.10349592734385465, "grad_norm": 11.140251616950433, "learning_rate": 4.964884494386468e-05, "loss": 2.4965, "mean_token_accuracy": 0.4206896543502808, "step": 102755 }, { "epoch": 0.10350096339695883, "grad_norm": 11.8491191589136, "learning_rate": 4.964877899735555e-05, "loss": 2.2765, "mean_token_accuracy": 0.44482759237289426, "step": 102760 }, { "epoch": 0.103505999450063, "grad_norm": 13.058745086239194, "learning_rate": 4.964871304470337e-05, "loss": 2.6243, "mean_token_accuracy": 0.39310344457626345, "step": 102765 }, { "epoch": 0.10351103550316718, "grad_norm": 10.827598293098102, "learning_rate": 4.964864708590816e-05, "loss": 2.1959, "mean_token_accuracy": 0.43103447556495667, "step": 102770 }, { "epoch": 0.10351607155627135, "grad_norm": 13.409402588385651, "learning_rate": 4.964858112096995e-05, "loss": 2.7859, "mean_token_accuracy": 0.3482758641242981, "step": 102775 }, { "epoch": 0.10352110760937552, "grad_norm": 10.533910111706248, "learning_rate": 4.964851514988874e-05, "loss": 2.3645, "mean_token_accuracy": 0.38620689511299133, "step": 102780 }, { "epoch": 0.1035261436624797, "grad_norm": 12.019038677153114, "learning_rate": 4.964844917266456e-05, "loss": 2.8183, "mean_token_accuracy": 0.37241379022598264, "step": 102785 }, { "epoch": 0.10353117971558387, "grad_norm": 9.451477302906017, "learning_rate": 4.9648383189297414e-05, "loss": 2.6351, "mean_token_accuracy": 0.38620689511299133, "step": 102790 }, { "epoch": 0.10353621576868803, "grad_norm": 11.311353459201282, "learning_rate": 4.964831719978735e-05, "loss": 2.6062, "mean_token_accuracy": 0.42413793206214906, "step": 102795 }, { "epoch": 0.1035412518217922, "grad_norm": 10.269240646645464, "learning_rate": 4.9648251204134354e-05, "loss": 2.1797, "mean_token_accuracy": 0.47241379618644713, "step": 102800 }, { "epoch": 0.10354628787489638, "grad_norm": 11.131598349902763, "learning_rate": 4.964818520233846e-05, "loss": 2.4014, "mean_token_accuracy": 0.4344827592372894, "step": 102805 }, { "epoch": 0.10355132392800055, "grad_norm": 10.834083475202164, "learning_rate": 4.964811919439968e-05, "loss": 2.2055, "mean_token_accuracy": 0.45591133236885073, "step": 102810 }, { "epoch": 0.10355635998110473, "grad_norm": 10.530199647379593, "learning_rate": 4.964805318031805e-05, "loss": 2.5862, "mean_token_accuracy": 0.3999999940395355, "step": 102815 }, { "epoch": 0.1035613960342089, "grad_norm": 10.98968036115313, "learning_rate": 4.9647987160093565e-05, "loss": 2.5896, "mean_token_accuracy": 0.3965517282485962, "step": 102820 }, { "epoch": 0.10356643208731307, "grad_norm": 11.773844974577928, "learning_rate": 4.9647921133726255e-05, "loss": 2.193, "mean_token_accuracy": 0.45972906351089476, "step": 102825 }, { "epoch": 0.10357146814041725, "grad_norm": 10.907432719035597, "learning_rate": 4.964785510121614e-05, "loss": 2.5704, "mean_token_accuracy": 0.4137930989265442, "step": 102830 }, { "epoch": 0.10357650419352142, "grad_norm": 9.594904047676632, "learning_rate": 4.964778906256323e-05, "loss": 2.5079, "mean_token_accuracy": 0.40689654350280763, "step": 102835 }, { "epoch": 0.1035815402466256, "grad_norm": 11.817506377087932, "learning_rate": 4.964772301776755e-05, "loss": 1.9084, "mean_token_accuracy": 0.4689655065536499, "step": 102840 }, { "epoch": 0.10358657629972977, "grad_norm": 10.436809406700329, "learning_rate": 4.964765696682912e-05, "loss": 2.2675, "mean_token_accuracy": 0.4517241358757019, "step": 102845 }, { "epoch": 0.10359161235283394, "grad_norm": 9.744801261941682, "learning_rate": 4.9647590909747946e-05, "loss": 2.3756, "mean_token_accuracy": 0.4310344815254211, "step": 102850 }, { "epoch": 0.10359664840593812, "grad_norm": 10.67594776021183, "learning_rate": 4.964752484652406e-05, "loss": 2.5473, "mean_token_accuracy": 0.38275861740112305, "step": 102855 }, { "epoch": 0.10360168445904229, "grad_norm": 10.95796771953436, "learning_rate": 4.964745877715747e-05, "loss": 2.7631, "mean_token_accuracy": 0.3448275804519653, "step": 102860 }, { "epoch": 0.10360672051214645, "grad_norm": 10.294691886952323, "learning_rate": 4.964739270164821e-05, "loss": 2.616, "mean_token_accuracy": 0.36206896007061007, "step": 102865 }, { "epoch": 0.10361175656525062, "grad_norm": 11.622773085958949, "learning_rate": 4.964732661999629e-05, "loss": 2.5797, "mean_token_accuracy": 0.41034482717514037, "step": 102870 }, { "epoch": 0.1036167926183548, "grad_norm": 21.016569424734772, "learning_rate": 4.964726053220171e-05, "loss": 2.6472, "mean_token_accuracy": 0.4517241358757019, "step": 102875 }, { "epoch": 0.10362182867145897, "grad_norm": 10.159564637084017, "learning_rate": 4.9647194438264515e-05, "loss": 2.6604, "mean_token_accuracy": 0.31724137663841245, "step": 102880 }, { "epoch": 0.10362686472456314, "grad_norm": 10.361291726826597, "learning_rate": 4.964712833818472e-05, "loss": 2.2221, "mean_token_accuracy": 0.41724138259887694, "step": 102885 }, { "epoch": 0.10363190077766732, "grad_norm": 10.137609089141609, "learning_rate": 4.964706223196232e-05, "loss": 2.8024, "mean_token_accuracy": 0.38275861740112305, "step": 102890 }, { "epoch": 0.10363693683077149, "grad_norm": 9.163925465658757, "learning_rate": 4.964699611959736e-05, "loss": 3.0137, "mean_token_accuracy": 0.38620689511299133, "step": 102895 }, { "epoch": 0.10364197288387567, "grad_norm": 9.93383430476696, "learning_rate": 4.964693000108985e-05, "loss": 2.4878, "mean_token_accuracy": 0.43272837400436404, "step": 102900 }, { "epoch": 0.10364700893697984, "grad_norm": 8.456397507965278, "learning_rate": 4.9646863876439796e-05, "loss": 2.2444, "mean_token_accuracy": 0.45862067937850953, "step": 102905 }, { "epoch": 0.10365204499008401, "grad_norm": 12.527954859910455, "learning_rate": 4.9646797745647236e-05, "loss": 2.6061, "mean_token_accuracy": 0.3689655244350433, "step": 102910 }, { "epoch": 0.10365708104318819, "grad_norm": 10.092031234987058, "learning_rate": 4.964673160871217e-05, "loss": 2.7003, "mean_token_accuracy": 0.4000000059604645, "step": 102915 }, { "epoch": 0.10366211709629236, "grad_norm": 10.787233802403007, "learning_rate": 4.964666546563464e-05, "loss": 1.8639, "mean_token_accuracy": 0.5697044312953949, "step": 102920 }, { "epoch": 0.10366715314939653, "grad_norm": 10.915366378775854, "learning_rate": 4.964659931641464e-05, "loss": 2.8096, "mean_token_accuracy": 0.38620689511299133, "step": 102925 }, { "epoch": 0.10367218920250071, "grad_norm": 11.843694460287882, "learning_rate": 4.96465331610522e-05, "loss": 2.3656, "mean_token_accuracy": 0.3914095640182495, "step": 102930 }, { "epoch": 0.10367722525560487, "grad_norm": 12.34386225549301, "learning_rate": 4.9646466999547335e-05, "loss": 2.2592, "mean_token_accuracy": 0.4206896543502808, "step": 102935 }, { "epoch": 0.10368226130870904, "grad_norm": 12.651232048921464, "learning_rate": 4.9646400831900066e-05, "loss": 2.4771, "mean_token_accuracy": 0.4013309121131897, "step": 102940 }, { "epoch": 0.10368729736181322, "grad_norm": 10.265638717915435, "learning_rate": 4.9646334658110406e-05, "loss": 2.4267, "mean_token_accuracy": 0.44827585816383364, "step": 102945 }, { "epoch": 0.10369233341491739, "grad_norm": 12.147893068494495, "learning_rate": 4.964626847817838e-05, "loss": 2.5088, "mean_token_accuracy": 0.44482758045196535, "step": 102950 }, { "epoch": 0.10369736946802156, "grad_norm": 11.516871813047802, "learning_rate": 4.964620229210401e-05, "loss": 2.6737, "mean_token_accuracy": 0.3876588046550751, "step": 102955 }, { "epoch": 0.10370240552112574, "grad_norm": 9.598558584090025, "learning_rate": 4.96461360998873e-05, "loss": 2.8774, "mean_token_accuracy": 0.36896551251411436, "step": 102960 }, { "epoch": 0.10370744157422991, "grad_norm": 12.125545601326763, "learning_rate": 4.9646069901528276e-05, "loss": 2.2847, "mean_token_accuracy": 0.4448275864124298, "step": 102965 }, { "epoch": 0.10371247762733408, "grad_norm": 11.559501571674797, "learning_rate": 4.964600369702696e-05, "loss": 2.4679, "mean_token_accuracy": 0.4551724135875702, "step": 102970 }, { "epoch": 0.10371751368043826, "grad_norm": 10.613762754498005, "learning_rate": 4.964593748638337e-05, "loss": 2.4601, "mean_token_accuracy": 0.4517241299152374, "step": 102975 }, { "epoch": 0.10372254973354243, "grad_norm": 9.179121786804597, "learning_rate": 4.964587126959752e-05, "loss": 2.0388, "mean_token_accuracy": 0.47241380214691164, "step": 102980 }, { "epoch": 0.1037275857866466, "grad_norm": 13.746243739614954, "learning_rate": 4.964580504666943e-05, "loss": 2.6289, "mean_token_accuracy": 0.37586206793785093, "step": 102985 }, { "epoch": 0.10373262183975078, "grad_norm": 11.85659496551069, "learning_rate": 4.964573881759911e-05, "loss": 2.418, "mean_token_accuracy": 0.41379310488700866, "step": 102990 }, { "epoch": 0.10373765789285495, "grad_norm": 10.488008864164938, "learning_rate": 4.96456725823866e-05, "loss": 2.5342, "mean_token_accuracy": 0.41034482717514037, "step": 102995 }, { "epoch": 0.10374269394595913, "grad_norm": 11.29669362449236, "learning_rate": 4.96456063410319e-05, "loss": 2.3771, "mean_token_accuracy": 0.4172413766384125, "step": 103000 }, { "epoch": 0.10374772999906329, "grad_norm": 12.217376306539848, "learning_rate": 4.9645540093535037e-05, "loss": 2.5203, "mean_token_accuracy": 0.4470659375190735, "step": 103005 }, { "epoch": 0.10375276605216746, "grad_norm": 9.961374498156212, "learning_rate": 4.9645473839896014e-05, "loss": 2.5732, "mean_token_accuracy": 0.3965517163276672, "step": 103010 }, { "epoch": 0.10375780210527163, "grad_norm": 11.544591464007157, "learning_rate": 4.964540758011487e-05, "loss": 2.6246, "mean_token_accuracy": 0.38620689511299133, "step": 103015 }, { "epoch": 0.10376283815837581, "grad_norm": 11.66084024621183, "learning_rate": 4.9645341314191614e-05, "loss": 2.3964, "mean_token_accuracy": 0.4482758641242981, "step": 103020 }, { "epoch": 0.10376787421147998, "grad_norm": 11.087490562297699, "learning_rate": 4.964527504212627e-05, "loss": 2.0646, "mean_token_accuracy": 0.4847549915313721, "step": 103025 }, { "epoch": 0.10377291026458416, "grad_norm": 11.17779495633632, "learning_rate": 4.964520876391885e-05, "loss": 2.4559, "mean_token_accuracy": 0.3965517282485962, "step": 103030 }, { "epoch": 0.10377794631768833, "grad_norm": 14.16836855289918, "learning_rate": 4.9645142479569365e-05, "loss": 2.7772, "mean_token_accuracy": 0.3655172407627106, "step": 103035 }, { "epoch": 0.1037829823707925, "grad_norm": 18.170207200643063, "learning_rate": 4.964507618907784e-05, "loss": 2.8981, "mean_token_accuracy": 0.3620689630508423, "step": 103040 }, { "epoch": 0.10378801842389668, "grad_norm": 10.609405445528665, "learning_rate": 4.964500989244431e-05, "loss": 2.1909, "mean_token_accuracy": 0.4620689690113068, "step": 103045 }, { "epoch": 0.10379305447700085, "grad_norm": 10.819538578165798, "learning_rate": 4.964494358966877e-05, "loss": 2.5672, "mean_token_accuracy": 0.38965516686439516, "step": 103050 }, { "epoch": 0.10379809053010502, "grad_norm": 14.278955876243346, "learning_rate": 4.9644877280751246e-05, "loss": 2.5587, "mean_token_accuracy": 0.3896551728248596, "step": 103055 }, { "epoch": 0.1038031265832092, "grad_norm": 12.953895390448283, "learning_rate": 4.964481096569176e-05, "loss": 2.4801, "mean_token_accuracy": 0.41724138259887694, "step": 103060 }, { "epoch": 0.10380816263631337, "grad_norm": 9.862541243696544, "learning_rate": 4.9644744644490323e-05, "loss": 2.4888, "mean_token_accuracy": 0.43103448748588563, "step": 103065 }, { "epoch": 0.10381319868941755, "grad_norm": 10.524393373886394, "learning_rate": 4.964467831714697e-05, "loss": 2.2784, "mean_token_accuracy": 0.4172413766384125, "step": 103070 }, { "epoch": 0.1038182347425217, "grad_norm": 20.148968872601777, "learning_rate": 4.96446119836617e-05, "loss": 2.8623, "mean_token_accuracy": 0.4185117959976196, "step": 103075 }, { "epoch": 0.10382327079562588, "grad_norm": 11.300556852011367, "learning_rate": 4.964454564403455e-05, "loss": 2.9313, "mean_token_accuracy": 0.3206896483898163, "step": 103080 }, { "epoch": 0.10382830684873005, "grad_norm": 9.115530990852688, "learning_rate": 4.964447929826551e-05, "loss": 2.6246, "mean_token_accuracy": 0.3913490653038025, "step": 103085 }, { "epoch": 0.10383334290183423, "grad_norm": 11.346985371573272, "learning_rate": 4.964441294635462e-05, "loss": 2.5378, "mean_token_accuracy": 0.4, "step": 103090 }, { "epoch": 0.1038383789549384, "grad_norm": 12.383008048615729, "learning_rate": 4.96443465883019e-05, "loss": 2.7198, "mean_token_accuracy": 0.33793102502822875, "step": 103095 }, { "epoch": 0.10384341500804257, "grad_norm": 10.278049099499064, "learning_rate": 4.964428022410737e-05, "loss": 2.3052, "mean_token_accuracy": 0.4068965554237366, "step": 103100 }, { "epoch": 0.10384845106114675, "grad_norm": 9.842128740586753, "learning_rate": 4.964421385377103e-05, "loss": 2.4356, "mean_token_accuracy": 0.37586206793785093, "step": 103105 }, { "epoch": 0.10385348711425092, "grad_norm": 12.431769659594135, "learning_rate": 4.964414747729291e-05, "loss": 2.6686, "mean_token_accuracy": 0.38620689511299133, "step": 103110 }, { "epoch": 0.1038585231673551, "grad_norm": 9.527892922749471, "learning_rate": 4.964408109467303e-05, "loss": 2.4081, "mean_token_accuracy": 0.42758620381355283, "step": 103115 }, { "epoch": 0.10386355922045927, "grad_norm": 12.013315758723625, "learning_rate": 4.964401470591141e-05, "loss": 2.6244, "mean_token_accuracy": 0.3743496656417847, "step": 103120 }, { "epoch": 0.10386859527356344, "grad_norm": 12.059398000966743, "learning_rate": 4.964394831100806e-05, "loss": 2.5144, "mean_token_accuracy": 0.39655172526836396, "step": 103125 }, { "epoch": 0.10387363132666762, "grad_norm": 10.236682310414727, "learning_rate": 4.9643881909963005e-05, "loss": 1.9798, "mean_token_accuracy": 0.476477837562561, "step": 103130 }, { "epoch": 0.10387866737977179, "grad_norm": 11.948750204413411, "learning_rate": 4.964381550277626e-05, "loss": 2.333, "mean_token_accuracy": 0.39310344457626345, "step": 103135 }, { "epoch": 0.10388370343287597, "grad_norm": 10.389392272985065, "learning_rate": 4.964374908944785e-05, "loss": 2.4147, "mean_token_accuracy": 0.42413793206214906, "step": 103140 }, { "epoch": 0.10388873948598012, "grad_norm": 10.482548569784681, "learning_rate": 4.9643682669977784e-05, "loss": 2.168, "mean_token_accuracy": 0.47586206793785096, "step": 103145 }, { "epoch": 0.1038937755390843, "grad_norm": 9.366579341243721, "learning_rate": 4.964361624436609e-05, "loss": 2.9311, "mean_token_accuracy": 0.417241370677948, "step": 103150 }, { "epoch": 0.10389881159218847, "grad_norm": 8.877242022777903, "learning_rate": 4.9643549812612775e-05, "loss": 2.3232, "mean_token_accuracy": 0.4517241418361664, "step": 103155 }, { "epoch": 0.10390384764529265, "grad_norm": 11.63038409830123, "learning_rate": 4.9643483374717876e-05, "loss": 2.6384, "mean_token_accuracy": 0.42758620381355283, "step": 103160 }, { "epoch": 0.10390888369839682, "grad_norm": 8.924805549176025, "learning_rate": 4.9643416930681385e-05, "loss": 3.1786, "mean_token_accuracy": 0.37586206793785093, "step": 103165 }, { "epoch": 0.103913919751501, "grad_norm": 9.810659204540482, "learning_rate": 4.964335048050334e-05, "loss": 2.398, "mean_token_accuracy": 0.4137930989265442, "step": 103170 }, { "epoch": 0.10391895580460517, "grad_norm": 13.268822926787431, "learning_rate": 4.9643284024183755e-05, "loss": 2.4014, "mean_token_accuracy": 0.4517241358757019, "step": 103175 }, { "epoch": 0.10392399185770934, "grad_norm": 9.64431378841381, "learning_rate": 4.964321756172265e-05, "loss": 2.1192, "mean_token_accuracy": 0.5103448331356049, "step": 103180 }, { "epoch": 0.10392902791081352, "grad_norm": 10.46694535595462, "learning_rate": 4.9643151093120036e-05, "loss": 2.6512, "mean_token_accuracy": 0.3758620649576187, "step": 103185 }, { "epoch": 0.10393406396391769, "grad_norm": 16.359528268712353, "learning_rate": 4.964308461837594e-05, "loss": 2.5574, "mean_token_accuracy": 0.4206896543502808, "step": 103190 }, { "epoch": 0.10393910001702186, "grad_norm": 10.325443510330471, "learning_rate": 4.964301813749037e-05, "loss": 2.6767, "mean_token_accuracy": 0.39310344457626345, "step": 103195 }, { "epoch": 0.10394413607012604, "grad_norm": 10.306307195822047, "learning_rate": 4.964295165046336e-05, "loss": 2.5239, "mean_token_accuracy": 0.4310344815254211, "step": 103200 }, { "epoch": 0.10394917212323021, "grad_norm": 10.154949405566802, "learning_rate": 4.964288515729492e-05, "loss": 2.8357, "mean_token_accuracy": 0.38275861740112305, "step": 103205 }, { "epoch": 0.10395420817633438, "grad_norm": 10.246948459832236, "learning_rate": 4.964281865798506e-05, "loss": 2.4723, "mean_token_accuracy": 0.41379310488700866, "step": 103210 }, { "epoch": 0.10395924422943854, "grad_norm": 12.44744602847921, "learning_rate": 4.964275215253381e-05, "loss": 2.5516, "mean_token_accuracy": 0.42068964540958403, "step": 103215 }, { "epoch": 0.10396428028254272, "grad_norm": 18.60790535718843, "learning_rate": 4.964268564094119e-05, "loss": 2.6742, "mean_token_accuracy": 0.4413793087005615, "step": 103220 }, { "epoch": 0.10396931633564689, "grad_norm": 10.163224950449452, "learning_rate": 4.964261912320721e-05, "loss": 2.4327, "mean_token_accuracy": 0.4275861978530884, "step": 103225 }, { "epoch": 0.10397435238875107, "grad_norm": 10.62199361822042, "learning_rate": 4.964255259933189e-05, "loss": 2.8677, "mean_token_accuracy": 0.38620689511299133, "step": 103230 }, { "epoch": 0.10397938844185524, "grad_norm": 10.06233262961867, "learning_rate": 4.9642486069315253e-05, "loss": 2.4183, "mean_token_accuracy": 0.4344827592372894, "step": 103235 }, { "epoch": 0.10398442449495941, "grad_norm": 10.668441515428324, "learning_rate": 4.9642419533157316e-05, "loss": 2.2822, "mean_token_accuracy": 0.4310344815254211, "step": 103240 }, { "epoch": 0.10398946054806359, "grad_norm": 10.880775073365328, "learning_rate": 4.9642352990858084e-05, "loss": 3.0879, "mean_token_accuracy": 0.3965517282485962, "step": 103245 }, { "epoch": 0.10399449660116776, "grad_norm": 10.999367407275313, "learning_rate": 4.9642286442417606e-05, "loss": 2.473, "mean_token_accuracy": 0.4482758641242981, "step": 103250 }, { "epoch": 0.10399953265427193, "grad_norm": 14.956567271060655, "learning_rate": 4.9642219887835875e-05, "loss": 2.7544, "mean_token_accuracy": 0.3931034505367279, "step": 103255 }, { "epoch": 0.10400456870737611, "grad_norm": 13.48617843293436, "learning_rate": 4.964215332711292e-05, "loss": 2.5589, "mean_token_accuracy": 0.4310344815254211, "step": 103260 }, { "epoch": 0.10400960476048028, "grad_norm": 8.576824025256261, "learning_rate": 4.964208676024875e-05, "loss": 2.2712, "mean_token_accuracy": 0.4551724135875702, "step": 103265 }, { "epoch": 0.10401464081358446, "grad_norm": 13.463537277148754, "learning_rate": 4.9642020187243394e-05, "loss": 2.3939, "mean_token_accuracy": 0.39655172228813174, "step": 103270 }, { "epoch": 0.10401967686668863, "grad_norm": 8.691586043168915, "learning_rate": 4.964195360809686e-05, "loss": 2.2159, "mean_token_accuracy": 0.47586206197738645, "step": 103275 }, { "epoch": 0.1040247129197928, "grad_norm": 11.218305997294125, "learning_rate": 4.9641887022809175e-05, "loss": 2.2314, "mean_token_accuracy": 0.44482759237289426, "step": 103280 }, { "epoch": 0.10402974897289696, "grad_norm": 9.440437200746915, "learning_rate": 4.964182043138036e-05, "loss": 2.4635, "mean_token_accuracy": 0.3862069010734558, "step": 103285 }, { "epoch": 0.10403478502600114, "grad_norm": 12.155968677040468, "learning_rate": 4.9641753833810425e-05, "loss": 2.3421, "mean_token_accuracy": 0.4, "step": 103290 }, { "epoch": 0.10403982107910531, "grad_norm": 14.98645656481463, "learning_rate": 4.9641687230099394e-05, "loss": 2.5056, "mean_token_accuracy": 0.4413793087005615, "step": 103295 }, { "epoch": 0.10404485713220948, "grad_norm": 17.883162164846706, "learning_rate": 4.964162062024728e-05, "loss": 2.527, "mean_token_accuracy": 0.4329098641872406, "step": 103300 }, { "epoch": 0.10404989318531366, "grad_norm": 9.82871302895975, "learning_rate": 4.96415540042541e-05, "loss": 2.493, "mean_token_accuracy": 0.4034482777118683, "step": 103305 }, { "epoch": 0.10405492923841783, "grad_norm": 10.211613205743413, "learning_rate": 4.9641487382119886e-05, "loss": 2.4909, "mean_token_accuracy": 0.37931033968925476, "step": 103310 }, { "epoch": 0.104059965291522, "grad_norm": 10.002671639656016, "learning_rate": 4.964142075384465e-05, "loss": 2.56, "mean_token_accuracy": 0.3911070764064789, "step": 103315 }, { "epoch": 0.10406500134462618, "grad_norm": 10.486757320972604, "learning_rate": 4.96413541194284e-05, "loss": 2.0799, "mean_token_accuracy": 0.5094373881816864, "step": 103320 }, { "epoch": 0.10407003739773035, "grad_norm": 10.49286509392905, "learning_rate": 4.9641287478871174e-05, "loss": 2.3625, "mean_token_accuracy": 0.4586206912994385, "step": 103325 }, { "epoch": 0.10407507345083453, "grad_norm": 11.817766591551393, "learning_rate": 4.964122083217297e-05, "loss": 2.342, "mean_token_accuracy": 0.43103448748588563, "step": 103330 }, { "epoch": 0.1040801095039387, "grad_norm": 9.992153067497501, "learning_rate": 4.964115417933382e-05, "loss": 2.0207, "mean_token_accuracy": 0.4986085832118988, "step": 103335 }, { "epoch": 0.10408514555704287, "grad_norm": 11.92204613269951, "learning_rate": 4.964108752035374e-05, "loss": 2.6386, "mean_token_accuracy": 0.37241379022598264, "step": 103340 }, { "epoch": 0.10409018161014705, "grad_norm": 8.146902150215247, "learning_rate": 4.964102085523274e-05, "loss": 2.3077, "mean_token_accuracy": 0.4482758641242981, "step": 103345 }, { "epoch": 0.10409521766325122, "grad_norm": 10.133016040599395, "learning_rate": 4.964095418397085e-05, "loss": 2.2462, "mean_token_accuracy": 0.4275861978530884, "step": 103350 }, { "epoch": 0.10410025371635538, "grad_norm": 13.3906812671072, "learning_rate": 4.964088750656808e-05, "loss": 2.5773, "mean_token_accuracy": 0.41379310488700866, "step": 103355 }, { "epoch": 0.10410528976945956, "grad_norm": 10.007831257830533, "learning_rate": 4.964082082302445e-05, "loss": 2.5446, "mean_token_accuracy": 0.41379310488700866, "step": 103360 }, { "epoch": 0.10411032582256373, "grad_norm": 12.2229837756531, "learning_rate": 4.964075413333999e-05, "loss": 2.1582, "mean_token_accuracy": 0.48620688915252686, "step": 103365 }, { "epoch": 0.1041153618756679, "grad_norm": 10.342947958259925, "learning_rate": 4.964068743751471e-05, "loss": 2.2866, "mean_token_accuracy": 0.4586206912994385, "step": 103370 }, { "epoch": 0.10412039792877208, "grad_norm": 12.110625930506135, "learning_rate": 4.9640620735548614e-05, "loss": 3.1656, "mean_token_accuracy": 0.3620689630508423, "step": 103375 }, { "epoch": 0.10412543398187625, "grad_norm": 10.53502498754386, "learning_rate": 4.964055402744174e-05, "loss": 2.8981, "mean_token_accuracy": 0.39310344457626345, "step": 103380 }, { "epoch": 0.10413047003498042, "grad_norm": 14.418509527586824, "learning_rate": 4.964048731319411e-05, "loss": 2.7263, "mean_token_accuracy": 0.38275861740112305, "step": 103385 }, { "epoch": 0.1041355060880846, "grad_norm": 9.623282878133681, "learning_rate": 4.964042059280573e-05, "loss": 2.4296, "mean_token_accuracy": 0.4172413766384125, "step": 103390 }, { "epoch": 0.10414054214118877, "grad_norm": 10.259714571318074, "learning_rate": 4.964035386627661e-05, "loss": 2.5875, "mean_token_accuracy": 0.34137930274009703, "step": 103395 }, { "epoch": 0.10414557819429295, "grad_norm": 11.38552948638494, "learning_rate": 4.9640287133606794e-05, "loss": 2.3321, "mean_token_accuracy": 0.46551724076271056, "step": 103400 }, { "epoch": 0.10415061424739712, "grad_norm": 10.547823130894201, "learning_rate": 4.964022039479628e-05, "loss": 2.4525, "mean_token_accuracy": 0.47931034564971925, "step": 103405 }, { "epoch": 0.10415565030050129, "grad_norm": 11.321733970183939, "learning_rate": 4.964015364984509e-05, "loss": 2.7999, "mean_token_accuracy": 0.4034482777118683, "step": 103410 }, { "epoch": 0.10416068635360547, "grad_norm": 11.600272548937992, "learning_rate": 4.9640086898753254e-05, "loss": 2.4958, "mean_token_accuracy": 0.38620689511299133, "step": 103415 }, { "epoch": 0.10416572240670964, "grad_norm": 10.777140456521964, "learning_rate": 4.964002014152077e-05, "loss": 2.5404, "mean_token_accuracy": 0.3862068891525269, "step": 103420 }, { "epoch": 0.1041707584598138, "grad_norm": 11.119819852100127, "learning_rate": 4.9639953378147685e-05, "loss": 2.4379, "mean_token_accuracy": 0.4223835408687592, "step": 103425 }, { "epoch": 0.10417579451291797, "grad_norm": 7.941035122173889, "learning_rate": 4.9639886608633993e-05, "loss": 1.5436, "mean_token_accuracy": 0.5575922548770904, "step": 103430 }, { "epoch": 0.10418083056602215, "grad_norm": 11.522293897442974, "learning_rate": 4.963981983297972e-05, "loss": 2.4231, "mean_token_accuracy": 0.42068966031074523, "step": 103435 }, { "epoch": 0.10418586661912632, "grad_norm": 13.592013846310603, "learning_rate": 4.963975305118488e-05, "loss": 2.431, "mean_token_accuracy": 0.39655172228813174, "step": 103440 }, { "epoch": 0.1041909026722305, "grad_norm": 9.173513121450622, "learning_rate": 4.963968626324951e-05, "loss": 2.3383, "mean_token_accuracy": 0.4485178530216217, "step": 103445 }, { "epoch": 0.10419593872533467, "grad_norm": 11.145550894835978, "learning_rate": 4.963961946917361e-05, "loss": 2.2309, "mean_token_accuracy": 0.4275861978530884, "step": 103450 }, { "epoch": 0.10420097477843884, "grad_norm": 11.089754074483869, "learning_rate": 4.9639552668957206e-05, "loss": 2.6872, "mean_token_accuracy": 0.4137930989265442, "step": 103455 }, { "epoch": 0.10420601083154302, "grad_norm": 8.44635633157336, "learning_rate": 4.963948586260031e-05, "loss": 2.4519, "mean_token_accuracy": 0.41379310488700866, "step": 103460 }, { "epoch": 0.10421104688464719, "grad_norm": 10.975085593705144, "learning_rate": 4.963941905010295e-05, "loss": 2.2306, "mean_token_accuracy": 0.4517241299152374, "step": 103465 }, { "epoch": 0.10421608293775136, "grad_norm": 9.340644400696902, "learning_rate": 4.9639352231465136e-05, "loss": 2.5608, "mean_token_accuracy": 0.37586206793785093, "step": 103470 }, { "epoch": 0.10422111899085554, "grad_norm": 22.183797640315863, "learning_rate": 4.9639285406686894e-05, "loss": 3.0508, "mean_token_accuracy": 0.3827586114406586, "step": 103475 }, { "epoch": 0.10422615504395971, "grad_norm": 13.619188693516692, "learning_rate": 4.9639218575768233e-05, "loss": 2.2544, "mean_token_accuracy": 0.4463399887084961, "step": 103480 }, { "epoch": 0.10423119109706389, "grad_norm": 11.330051944073848, "learning_rate": 4.963915173870917e-05, "loss": 2.21, "mean_token_accuracy": 0.47586206793785096, "step": 103485 }, { "epoch": 0.10423622715016806, "grad_norm": 11.535812835645796, "learning_rate": 4.963908489550975e-05, "loss": 2.5493, "mean_token_accuracy": 0.4206896543502808, "step": 103490 }, { "epoch": 0.10424126320327222, "grad_norm": 9.591861953394002, "learning_rate": 4.963901804616996e-05, "loss": 2.5748, "mean_token_accuracy": 0.4344827592372894, "step": 103495 }, { "epoch": 0.10424629925637639, "grad_norm": 10.27634254909053, "learning_rate": 4.9638951190689833e-05, "loss": 2.455, "mean_token_accuracy": 0.4137930989265442, "step": 103500 }, { "epoch": 0.10425133530948057, "grad_norm": 13.906251838594448, "learning_rate": 4.963888432906938e-05, "loss": 2.4018, "mean_token_accuracy": 0.4241379380226135, "step": 103505 }, { "epoch": 0.10425637136258474, "grad_norm": 10.205707547763863, "learning_rate": 4.9638817461308634e-05, "loss": 2.7354, "mean_token_accuracy": 0.3724137932062149, "step": 103510 }, { "epoch": 0.10426140741568891, "grad_norm": 11.229334354443871, "learning_rate": 4.96387505874076e-05, "loss": 2.597, "mean_token_accuracy": 0.43448275327682495, "step": 103515 }, { "epoch": 0.10426644346879309, "grad_norm": 10.581523680019457, "learning_rate": 4.9638683707366304e-05, "loss": 2.5355, "mean_token_accuracy": 0.41506351828575133, "step": 103520 }, { "epoch": 0.10427147952189726, "grad_norm": 15.318133913874002, "learning_rate": 4.963861682118476e-05, "loss": 2.7423, "mean_token_accuracy": 0.382758629322052, "step": 103525 }, { "epoch": 0.10427651557500144, "grad_norm": 10.378484211995836, "learning_rate": 4.963854992886298e-05, "loss": 2.2227, "mean_token_accuracy": 0.42758620977401735, "step": 103530 }, { "epoch": 0.10428155162810561, "grad_norm": 10.711756366934662, "learning_rate": 4.9638483030401e-05, "loss": 2.3336, "mean_token_accuracy": 0.43793103098869324, "step": 103535 }, { "epoch": 0.10428658768120978, "grad_norm": 10.033906857823823, "learning_rate": 4.9638416125798825e-05, "loss": 2.0944, "mean_token_accuracy": 0.495099812746048, "step": 103540 }, { "epoch": 0.10429162373431396, "grad_norm": 10.084271402057777, "learning_rate": 4.963834921505648e-05, "loss": 2.4423, "mean_token_accuracy": 0.43793103098869324, "step": 103545 }, { "epoch": 0.10429665978741813, "grad_norm": 12.389113586671364, "learning_rate": 4.963828229817398e-05, "loss": 2.8161, "mean_token_accuracy": 0.3965517282485962, "step": 103550 }, { "epoch": 0.1043016958405223, "grad_norm": 12.23171169453411, "learning_rate": 4.963821537515134e-05, "loss": 2.432, "mean_token_accuracy": 0.4310344815254211, "step": 103555 }, { "epoch": 0.10430673189362648, "grad_norm": 10.378459731312915, "learning_rate": 4.9638148445988596e-05, "loss": 2.5288, "mean_token_accuracy": 0.42413793206214906, "step": 103560 }, { "epoch": 0.10431176794673064, "grad_norm": 13.36084284854327, "learning_rate": 4.963808151068575e-05, "loss": 2.7507, "mean_token_accuracy": 0.39310344457626345, "step": 103565 }, { "epoch": 0.10431680399983481, "grad_norm": 12.177270616466435, "learning_rate": 4.9638014569242813e-05, "loss": 2.6571, "mean_token_accuracy": 0.3620689630508423, "step": 103570 }, { "epoch": 0.10432184005293899, "grad_norm": 12.600076777861132, "learning_rate": 4.9637947621659825e-05, "loss": 2.4698, "mean_token_accuracy": 0.42879613041877745, "step": 103575 }, { "epoch": 0.10432687610604316, "grad_norm": 10.650463763307181, "learning_rate": 4.9637880667936795e-05, "loss": 2.6863, "mean_token_accuracy": 0.4379310369491577, "step": 103580 }, { "epoch": 0.10433191215914733, "grad_norm": 11.371756029700144, "learning_rate": 4.963781370807374e-05, "loss": 2.4485, "mean_token_accuracy": 0.42413793206214906, "step": 103585 }, { "epoch": 0.1043369482122515, "grad_norm": 11.348176211773698, "learning_rate": 4.963774674207067e-05, "loss": 2.3483, "mean_token_accuracy": 0.42068966031074523, "step": 103590 }, { "epoch": 0.10434198426535568, "grad_norm": 10.473499532902943, "learning_rate": 4.963767976992763e-05, "loss": 2.5315, "mean_token_accuracy": 0.42413793206214906, "step": 103595 }, { "epoch": 0.10434702031845985, "grad_norm": 10.181923659945623, "learning_rate": 4.963761279164461e-05, "loss": 2.3881, "mean_token_accuracy": 0.4413793087005615, "step": 103600 }, { "epoch": 0.10435205637156403, "grad_norm": 11.741191079925311, "learning_rate": 4.963754580722165e-05, "loss": 2.4332, "mean_token_accuracy": 0.44827587008476255, "step": 103605 }, { "epoch": 0.1043570924246682, "grad_norm": 12.14507891519842, "learning_rate": 4.963747881665875e-05, "loss": 2.9841, "mean_token_accuracy": 0.34482758641242983, "step": 103610 }, { "epoch": 0.10436212847777238, "grad_norm": 12.220318888751688, "learning_rate": 4.9637411819955944e-05, "loss": 2.5066, "mean_token_accuracy": 0.39310344457626345, "step": 103615 }, { "epoch": 0.10436716453087655, "grad_norm": 12.76438227022332, "learning_rate": 4.9637344817113243e-05, "loss": 2.4688, "mean_token_accuracy": 0.3999999940395355, "step": 103620 }, { "epoch": 0.10437220058398072, "grad_norm": 9.686551430316133, "learning_rate": 4.963727780813066e-05, "loss": 2.6688, "mean_token_accuracy": 0.3965517163276672, "step": 103625 }, { "epoch": 0.1043772366370849, "grad_norm": 12.19324147446133, "learning_rate": 4.963721079300823e-05, "loss": 2.4444, "mean_token_accuracy": 0.44694494009017943, "step": 103630 }, { "epoch": 0.10438227269018906, "grad_norm": 9.404065912769726, "learning_rate": 4.963714377174595e-05, "loss": 2.2919, "mean_token_accuracy": 0.42068964838981626, "step": 103635 }, { "epoch": 0.10438730874329323, "grad_norm": 8.7383613542677, "learning_rate": 4.9637076744343864e-05, "loss": 2.4102, "mean_token_accuracy": 0.4137930989265442, "step": 103640 }, { "epoch": 0.1043923447963974, "grad_norm": 12.843332349912515, "learning_rate": 4.963700971080197e-05, "loss": 2.2671, "mean_token_accuracy": 0.4517241358757019, "step": 103645 }, { "epoch": 0.10439738084950158, "grad_norm": 11.796621544836372, "learning_rate": 4.96369426711203e-05, "loss": 2.4795, "mean_token_accuracy": 0.3862068891525269, "step": 103650 }, { "epoch": 0.10440241690260575, "grad_norm": 11.140150458946463, "learning_rate": 4.9636875625298855e-05, "loss": 2.2771, "mean_token_accuracy": 0.43793103098869324, "step": 103655 }, { "epoch": 0.10440745295570993, "grad_norm": 10.922218200206508, "learning_rate": 4.963680857333767e-05, "loss": 2.0578, "mean_token_accuracy": 0.4620689690113068, "step": 103660 }, { "epoch": 0.1044124890088141, "grad_norm": 36.46360879936684, "learning_rate": 4.963674151523677e-05, "loss": 2.6587, "mean_token_accuracy": 0.441379314661026, "step": 103665 }, { "epoch": 0.10441752506191827, "grad_norm": 14.067742648750556, "learning_rate": 4.963667445099615e-05, "loss": 2.7886, "mean_token_accuracy": 0.38620689511299133, "step": 103670 }, { "epoch": 0.10442256111502245, "grad_norm": 13.928579320046438, "learning_rate": 4.963660738061584e-05, "loss": 2.2611, "mean_token_accuracy": 0.44482759237289426, "step": 103675 }, { "epoch": 0.10442759716812662, "grad_norm": 9.44874343203612, "learning_rate": 4.963654030409586e-05, "loss": 2.5797, "mean_token_accuracy": 0.44137930274009707, "step": 103680 }, { "epoch": 0.1044326332212308, "grad_norm": 10.673237390557992, "learning_rate": 4.963647322143623e-05, "loss": 2.4717, "mean_token_accuracy": 0.3896551728248596, "step": 103685 }, { "epoch": 0.10443766927433497, "grad_norm": 11.82852704694365, "learning_rate": 4.963640613263698e-05, "loss": 2.3012, "mean_token_accuracy": 0.41379310488700866, "step": 103690 }, { "epoch": 0.10444270532743914, "grad_norm": 12.157273296885288, "learning_rate": 4.9636339037698094e-05, "loss": 2.7572, "mean_token_accuracy": 0.43242589831352235, "step": 103695 }, { "epoch": 0.10444774138054332, "grad_norm": 14.101796927600006, "learning_rate": 4.9636271936619616e-05, "loss": 2.6262, "mean_token_accuracy": 0.35172413289546967, "step": 103700 }, { "epoch": 0.10445277743364748, "grad_norm": 14.167107548156475, "learning_rate": 4.963620482940157e-05, "loss": 2.2663, "mean_token_accuracy": 0.4502722263336182, "step": 103705 }, { "epoch": 0.10445781348675165, "grad_norm": 10.17840737953548, "learning_rate": 4.9636137716043956e-05, "loss": 1.9708, "mean_token_accuracy": 0.48275861144065857, "step": 103710 }, { "epoch": 0.10446284953985582, "grad_norm": 13.1848292823169, "learning_rate": 4.963607059654681e-05, "loss": 2.4803, "mean_token_accuracy": 0.4068965494632721, "step": 103715 }, { "epoch": 0.10446788559296, "grad_norm": 10.127751539255769, "learning_rate": 4.9636003470910134e-05, "loss": 2.2646, "mean_token_accuracy": 0.48620688915252686, "step": 103720 }, { "epoch": 0.10447292164606417, "grad_norm": 11.570488402059526, "learning_rate": 4.963593633913396e-05, "loss": 2.525, "mean_token_accuracy": 0.4222625494003296, "step": 103725 }, { "epoch": 0.10447795769916834, "grad_norm": 11.42208981661114, "learning_rate": 4.96358692012183e-05, "loss": 2.2482, "mean_token_accuracy": 0.43793103098869324, "step": 103730 }, { "epoch": 0.10448299375227252, "grad_norm": 8.893750605117278, "learning_rate": 4.963580205716317e-05, "loss": 2.612, "mean_token_accuracy": 0.41034482717514037, "step": 103735 }, { "epoch": 0.10448802980537669, "grad_norm": 9.89667229528441, "learning_rate": 4.96357349069686e-05, "loss": 2.0072, "mean_token_accuracy": 0.4852389574050903, "step": 103740 }, { "epoch": 0.10449306585848087, "grad_norm": 9.82896367214625, "learning_rate": 4.9635667750634594e-05, "loss": 2.4494, "mean_token_accuracy": 0.4034482717514038, "step": 103745 }, { "epoch": 0.10449810191158504, "grad_norm": 9.003076552507387, "learning_rate": 4.963560058816118e-05, "loss": 2.5537, "mean_token_accuracy": 0.3965517163276672, "step": 103750 }, { "epoch": 0.10450313796468921, "grad_norm": 11.461572428129221, "learning_rate": 4.963553341954838e-05, "loss": 2.6165, "mean_token_accuracy": 0.3793103456497192, "step": 103755 }, { "epoch": 0.10450817401779339, "grad_norm": 16.95036199849308, "learning_rate": 4.9635466244796204e-05, "loss": 2.4711, "mean_token_accuracy": 0.425952810049057, "step": 103760 }, { "epoch": 0.10451321007089756, "grad_norm": 7.344111676692995, "learning_rate": 4.963539906390468e-05, "loss": 2.1086, "mean_token_accuracy": 0.47931033968925474, "step": 103765 }, { "epoch": 0.10451824612400173, "grad_norm": 12.03461989892936, "learning_rate": 4.963533187687382e-05, "loss": 2.1615, "mean_token_accuracy": 0.495099812746048, "step": 103770 }, { "epoch": 0.1045232821771059, "grad_norm": 9.535352672246937, "learning_rate": 4.9635264683703627e-05, "loss": 2.1203, "mean_token_accuracy": 0.441379314661026, "step": 103775 }, { "epoch": 0.10452831823021007, "grad_norm": 8.972267724154506, "learning_rate": 4.963519748439415e-05, "loss": 2.2181, "mean_token_accuracy": 0.4482758641242981, "step": 103780 }, { "epoch": 0.10453335428331424, "grad_norm": 9.20649779723848, "learning_rate": 4.963513027894538e-05, "loss": 2.3419, "mean_token_accuracy": 0.46061705946922304, "step": 103785 }, { "epoch": 0.10453839033641842, "grad_norm": 11.397274773826782, "learning_rate": 4.963506306735737e-05, "loss": 2.3216, "mean_token_accuracy": 0.47931033968925474, "step": 103790 }, { "epoch": 0.10454342638952259, "grad_norm": 9.657594127647421, "learning_rate": 4.96349958496301e-05, "loss": 2.3275, "mean_token_accuracy": 0.4620689630508423, "step": 103795 }, { "epoch": 0.10454846244262676, "grad_norm": 10.113205202715854, "learning_rate": 4.963492862576361e-05, "loss": 2.6431, "mean_token_accuracy": 0.38965516686439516, "step": 103800 }, { "epoch": 0.10455349849573094, "grad_norm": 9.476346875198999, "learning_rate": 4.963486139575792e-05, "loss": 2.2249, "mean_token_accuracy": 0.47241379618644713, "step": 103805 }, { "epoch": 0.10455853454883511, "grad_norm": 10.17892326259015, "learning_rate": 4.9634794159613044e-05, "loss": 2.4943, "mean_token_accuracy": 0.4482758641242981, "step": 103810 }, { "epoch": 0.10456357060193928, "grad_norm": 10.77684541891807, "learning_rate": 4.9634726917328996e-05, "loss": 2.7576, "mean_token_accuracy": 0.38965516686439516, "step": 103815 }, { "epoch": 0.10456860665504346, "grad_norm": 12.59925814778215, "learning_rate": 4.96346596689058e-05, "loss": 2.6967, "mean_token_accuracy": 0.3862069010734558, "step": 103820 }, { "epoch": 0.10457364270814763, "grad_norm": 12.83608469894313, "learning_rate": 4.963459241434347e-05, "loss": 2.9241, "mean_token_accuracy": 0.3448275804519653, "step": 103825 }, { "epoch": 0.1045786787612518, "grad_norm": 8.834855075990559, "learning_rate": 4.9634525153642035e-05, "loss": 2.2304, "mean_token_accuracy": 0.45396249294281005, "step": 103830 }, { "epoch": 0.10458371481435598, "grad_norm": 10.627287815257999, "learning_rate": 4.96344578868015e-05, "loss": 2.2472, "mean_token_accuracy": 0.4429521977901459, "step": 103835 }, { "epoch": 0.10458875086746015, "grad_norm": 8.50141838204776, "learning_rate": 4.96343906138219e-05, "loss": 2.3461, "mean_token_accuracy": 0.4813067078590393, "step": 103840 }, { "epoch": 0.10459378692056431, "grad_norm": 11.275918176377852, "learning_rate": 4.9634323334703236e-05, "loss": 2.2656, "mean_token_accuracy": 0.4592256486415863, "step": 103845 }, { "epoch": 0.10459882297366849, "grad_norm": 10.296569154783302, "learning_rate": 4.9634256049445534e-05, "loss": 2.7804, "mean_token_accuracy": 0.4188747704029083, "step": 103850 }, { "epoch": 0.10460385902677266, "grad_norm": 11.388375264799716, "learning_rate": 4.9634188758048816e-05, "loss": 2.4708, "mean_token_accuracy": 0.44482759237289426, "step": 103855 }, { "epoch": 0.10460889507987683, "grad_norm": 12.311459088558646, "learning_rate": 4.96341214605131e-05, "loss": 2.468, "mean_token_accuracy": 0.4448275983333588, "step": 103860 }, { "epoch": 0.10461393113298101, "grad_norm": 10.150749440153362, "learning_rate": 4.96340541568384e-05, "loss": 2.3655, "mean_token_accuracy": 0.4103448301553726, "step": 103865 }, { "epoch": 0.10461896718608518, "grad_norm": 12.186425261492035, "learning_rate": 4.963398684702474e-05, "loss": 2.825, "mean_token_accuracy": 0.41379310488700866, "step": 103870 }, { "epoch": 0.10462400323918936, "grad_norm": 15.864778150880378, "learning_rate": 4.9633919531072146e-05, "loss": 2.6345, "mean_token_accuracy": 0.3758620619773865, "step": 103875 }, { "epoch": 0.10462903929229353, "grad_norm": 12.775242450317522, "learning_rate": 4.963385220898061e-05, "loss": 2.4882, "mean_token_accuracy": 0.4172413766384125, "step": 103880 }, { "epoch": 0.1046340753453977, "grad_norm": 8.836555736129975, "learning_rate": 4.963378488075017e-05, "loss": 2.671, "mean_token_accuracy": 0.44827585220336913, "step": 103885 }, { "epoch": 0.10463911139850188, "grad_norm": 11.426439924731422, "learning_rate": 4.963371754638085e-05, "loss": 2.3924, "mean_token_accuracy": 0.4399878978729248, "step": 103890 }, { "epoch": 0.10464414745160605, "grad_norm": 10.138186121060816, "learning_rate": 4.963365020587266e-05, "loss": 2.7743, "mean_token_accuracy": 0.3965517163276672, "step": 103895 }, { "epoch": 0.10464918350471022, "grad_norm": 10.180488070865366, "learning_rate": 4.9633582859225617e-05, "loss": 2.4515, "mean_token_accuracy": 0.43103447556495667, "step": 103900 }, { "epoch": 0.1046542195578144, "grad_norm": 10.343279252894899, "learning_rate": 4.9633515506439745e-05, "loss": 2.2611, "mean_token_accuracy": 0.4344827473163605, "step": 103905 }, { "epoch": 0.10465925561091857, "grad_norm": 11.82328202583085, "learning_rate": 4.963344814751505e-05, "loss": 2.4676, "mean_token_accuracy": 0.4172413766384125, "step": 103910 }, { "epoch": 0.10466429166402273, "grad_norm": 9.697109450703039, "learning_rate": 4.963338078245157e-05, "loss": 2.3514, "mean_token_accuracy": 0.4379310369491577, "step": 103915 }, { "epoch": 0.1046693277171269, "grad_norm": 12.33348395039808, "learning_rate": 4.963331341124931e-05, "loss": 2.5811, "mean_token_accuracy": 0.4241379380226135, "step": 103920 }, { "epoch": 0.10467436377023108, "grad_norm": 10.727621179600156, "learning_rate": 4.963324603390829e-05, "loss": 2.53, "mean_token_accuracy": 0.3827586233615875, "step": 103925 }, { "epoch": 0.10467939982333525, "grad_norm": 11.050456638740005, "learning_rate": 4.9633178650428534e-05, "loss": 2.1409, "mean_token_accuracy": 0.4586206912994385, "step": 103930 }, { "epoch": 0.10468443587643943, "grad_norm": 10.215214947431406, "learning_rate": 4.9633111260810065e-05, "loss": 2.3731, "mean_token_accuracy": 0.44827585220336913, "step": 103935 }, { "epoch": 0.1046894719295436, "grad_norm": 12.2797076162197, "learning_rate": 4.963304386505289e-05, "loss": 3.0348, "mean_token_accuracy": 0.37586206793785093, "step": 103940 }, { "epoch": 0.10469450798264777, "grad_norm": 12.396413275175545, "learning_rate": 4.963297646315704e-05, "loss": 2.4188, "mean_token_accuracy": 0.4379310429096222, "step": 103945 }, { "epoch": 0.10469954403575195, "grad_norm": 13.493329298902268, "learning_rate": 4.9632909055122514e-05, "loss": 2.5714, "mean_token_accuracy": 0.37586206793785093, "step": 103950 }, { "epoch": 0.10470458008885612, "grad_norm": 20.143616129975943, "learning_rate": 4.963284164094935e-05, "loss": 2.524, "mean_token_accuracy": 0.38275861740112305, "step": 103955 }, { "epoch": 0.1047096161419603, "grad_norm": 12.228609158097202, "learning_rate": 4.963277422063756e-05, "loss": 2.3223, "mean_token_accuracy": 0.4379310369491577, "step": 103960 }, { "epoch": 0.10471465219506447, "grad_norm": 11.537071422058181, "learning_rate": 4.963270679418716e-05, "loss": 2.376, "mean_token_accuracy": 0.4413793087005615, "step": 103965 }, { "epoch": 0.10471968824816864, "grad_norm": 10.531307624490426, "learning_rate": 4.963263936159818e-05, "loss": 2.2471, "mean_token_accuracy": 0.46551724672317507, "step": 103970 }, { "epoch": 0.10472472430127282, "grad_norm": 11.332010079471702, "learning_rate": 4.9632571922870616e-05, "loss": 2.3787, "mean_token_accuracy": 0.4034482717514038, "step": 103975 }, { "epoch": 0.10472976035437699, "grad_norm": 10.238605881439101, "learning_rate": 4.9632504478004504e-05, "loss": 2.3162, "mean_token_accuracy": 0.41724138259887694, "step": 103980 }, { "epoch": 0.10473479640748115, "grad_norm": 22.071766369195398, "learning_rate": 4.9632437026999864e-05, "loss": 2.4119, "mean_token_accuracy": 0.41034482717514037, "step": 103985 }, { "epoch": 0.10473983246058532, "grad_norm": 16.04930805860322, "learning_rate": 4.963236956985671e-05, "loss": 2.4662, "mean_token_accuracy": 0.4137930989265442, "step": 103990 }, { "epoch": 0.1047448685136895, "grad_norm": 11.466228927049901, "learning_rate": 4.9632302106575056e-05, "loss": 2.5193, "mean_token_accuracy": 0.41724138259887694, "step": 103995 }, { "epoch": 0.10474990456679367, "grad_norm": 9.816737589081203, "learning_rate": 4.963223463715492e-05, "loss": 2.2841, "mean_token_accuracy": 0.4517241358757019, "step": 104000 }, { "epoch": 0.10475494061989785, "grad_norm": 14.948246793100575, "learning_rate": 4.963216716159634e-05, "loss": 2.7182, "mean_token_accuracy": 0.4517241358757019, "step": 104005 }, { "epoch": 0.10475997667300202, "grad_norm": 13.051587827248236, "learning_rate": 4.963209967989932e-05, "loss": 2.55, "mean_token_accuracy": 0.41161524057388305, "step": 104010 }, { "epoch": 0.1047650127261062, "grad_norm": 12.039451179042391, "learning_rate": 4.9632032192063876e-05, "loss": 2.2666, "mean_token_accuracy": 0.44355716109275817, "step": 104015 }, { "epoch": 0.10477004877921037, "grad_norm": 12.13252582412816, "learning_rate": 4.963196469809002e-05, "loss": 2.4723, "mean_token_accuracy": 0.4034482717514038, "step": 104020 }, { "epoch": 0.10477508483231454, "grad_norm": 9.56408220808534, "learning_rate": 4.96318971979778e-05, "loss": 2.5118, "mean_token_accuracy": 0.40000000298023225, "step": 104025 }, { "epoch": 0.10478012088541871, "grad_norm": 11.037321589730412, "learning_rate": 4.963182969172721e-05, "loss": 2.5299, "mean_token_accuracy": 0.3862069010734558, "step": 104030 }, { "epoch": 0.10478515693852289, "grad_norm": 12.726437126198556, "learning_rate": 4.9631762179338265e-05, "loss": 2.981, "mean_token_accuracy": 0.36896551251411436, "step": 104035 }, { "epoch": 0.10479019299162706, "grad_norm": 9.330428238882309, "learning_rate": 4.9631694660811e-05, "loss": 2.4234, "mean_token_accuracy": 0.4, "step": 104040 }, { "epoch": 0.10479522904473124, "grad_norm": 12.35139380693233, "learning_rate": 4.963162713614542e-05, "loss": 2.1832, "mean_token_accuracy": 0.4551724076271057, "step": 104045 }, { "epoch": 0.10480026509783541, "grad_norm": 11.979356145690286, "learning_rate": 4.963155960534156e-05, "loss": 2.5356, "mean_token_accuracy": 0.44827585220336913, "step": 104050 }, { "epoch": 0.10480530115093957, "grad_norm": 11.626126946679923, "learning_rate": 4.963149206839943e-05, "loss": 2.4335, "mean_token_accuracy": 0.4310344815254211, "step": 104055 }, { "epoch": 0.10481033720404374, "grad_norm": 10.55813657532861, "learning_rate": 4.9631424525319046e-05, "loss": 2.5451, "mean_token_accuracy": 0.4275862127542496, "step": 104060 }, { "epoch": 0.10481537325714792, "grad_norm": 9.846870308188988, "learning_rate": 4.963135697610042e-05, "loss": 2.5082, "mean_token_accuracy": 0.4482758641242981, "step": 104065 }, { "epoch": 0.10482040931025209, "grad_norm": 11.495942412233441, "learning_rate": 4.963128942074358e-05, "loss": 2.4912, "mean_token_accuracy": 0.38620689511299133, "step": 104070 }, { "epoch": 0.10482544536335626, "grad_norm": 9.321278029663754, "learning_rate": 4.9631221859248556e-05, "loss": 2.0798, "mean_token_accuracy": 0.5034482717514038, "step": 104075 }, { "epoch": 0.10483048141646044, "grad_norm": 12.654234140705524, "learning_rate": 4.9631154291615345e-05, "loss": 2.2259, "mean_token_accuracy": 0.4918330192565918, "step": 104080 }, { "epoch": 0.10483551746956461, "grad_norm": 8.676855807659111, "learning_rate": 4.9631086717843985e-05, "loss": 2.2767, "mean_token_accuracy": 0.42413793206214906, "step": 104085 }, { "epoch": 0.10484055352266879, "grad_norm": 10.062172691741328, "learning_rate": 4.963101913793448e-05, "loss": 2.3631, "mean_token_accuracy": 0.4379310250282288, "step": 104090 }, { "epoch": 0.10484558957577296, "grad_norm": 19.394394574490114, "learning_rate": 4.963095155188685e-05, "loss": 2.7214, "mean_token_accuracy": 0.3482758641242981, "step": 104095 }, { "epoch": 0.10485062562887713, "grad_norm": 11.064985386540059, "learning_rate": 4.963088395970113e-05, "loss": 2.1542, "mean_token_accuracy": 0.47586206197738645, "step": 104100 }, { "epoch": 0.10485566168198131, "grad_norm": 12.81755036922968, "learning_rate": 4.963081636137732e-05, "loss": 2.751, "mean_token_accuracy": 0.3517241358757019, "step": 104105 }, { "epoch": 0.10486069773508548, "grad_norm": 9.06593498323187, "learning_rate": 4.963074875691545e-05, "loss": 2.3601, "mean_token_accuracy": 0.41379310488700866, "step": 104110 }, { "epoch": 0.10486573378818966, "grad_norm": 12.581203768010441, "learning_rate": 4.963068114631553e-05, "loss": 2.8013, "mean_token_accuracy": 0.39655172228813174, "step": 104115 }, { "epoch": 0.10487076984129383, "grad_norm": 12.819218417071824, "learning_rate": 4.963061352957758e-05, "loss": 2.3518, "mean_token_accuracy": 0.4119782209396362, "step": 104120 }, { "epoch": 0.10487580589439799, "grad_norm": 10.9190985692, "learning_rate": 4.963054590670162e-05, "loss": 2.3678, "mean_token_accuracy": 0.441379314661026, "step": 104125 }, { "epoch": 0.10488084194750216, "grad_norm": 11.534833327034596, "learning_rate": 4.9630478277687675e-05, "loss": 2.7484, "mean_token_accuracy": 0.3999999940395355, "step": 104130 }, { "epoch": 0.10488587800060634, "grad_norm": 11.499511754363747, "learning_rate": 4.963041064253576e-05, "loss": 2.9454, "mean_token_accuracy": 0.36896551251411436, "step": 104135 }, { "epoch": 0.10489091405371051, "grad_norm": 11.366043390532214, "learning_rate": 4.963034300124589e-05, "loss": 2.2627, "mean_token_accuracy": 0.4620689630508423, "step": 104140 }, { "epoch": 0.10489595010681468, "grad_norm": 13.412084507353034, "learning_rate": 4.963027535381809e-05, "loss": 2.8307, "mean_token_accuracy": 0.36206896007061007, "step": 104145 }, { "epoch": 0.10490098615991886, "grad_norm": 10.956585587672876, "learning_rate": 4.963020770025238e-05, "loss": 2.1864, "mean_token_accuracy": 0.4068965554237366, "step": 104150 }, { "epoch": 0.10490602221302303, "grad_norm": 8.756257263100425, "learning_rate": 4.963014004054876e-05, "loss": 2.5224, "mean_token_accuracy": 0.482758617401123, "step": 104155 }, { "epoch": 0.1049110582661272, "grad_norm": 9.666583919047543, "learning_rate": 4.963007237470728e-05, "loss": 2.5129, "mean_token_accuracy": 0.42413793206214906, "step": 104160 }, { "epoch": 0.10491609431923138, "grad_norm": 10.450762512692284, "learning_rate": 4.963000470272793e-05, "loss": 2.6904, "mean_token_accuracy": 0.4172413766384125, "step": 104165 }, { "epoch": 0.10492113037233555, "grad_norm": 10.55127333999612, "learning_rate": 4.9629937024610744e-05, "loss": 2.4627, "mean_token_accuracy": 0.3965517282485962, "step": 104170 }, { "epoch": 0.10492616642543973, "grad_norm": 10.397115012695442, "learning_rate": 4.962986934035575e-05, "loss": 2.2181, "mean_token_accuracy": 0.40344828367233276, "step": 104175 }, { "epoch": 0.1049312024785439, "grad_norm": 10.206228920985115, "learning_rate": 4.962980164996293e-05, "loss": 2.2698, "mean_token_accuracy": 0.44482758045196535, "step": 104180 }, { "epoch": 0.10493623853164807, "grad_norm": 11.600334409972222, "learning_rate": 4.962973395343234e-05, "loss": 2.8156, "mean_token_accuracy": 0.41379311084747317, "step": 104185 }, { "epoch": 0.10494127458475225, "grad_norm": 10.21850808763546, "learning_rate": 4.962966625076399e-05, "loss": 2.5689, "mean_token_accuracy": 0.42413793206214906, "step": 104190 }, { "epoch": 0.10494631063785641, "grad_norm": 10.3588736542016, "learning_rate": 4.962959854195789e-05, "loss": 2.3702, "mean_token_accuracy": 0.41724138259887694, "step": 104195 }, { "epoch": 0.10495134669096058, "grad_norm": 10.060905824278123, "learning_rate": 4.962953082701407e-05, "loss": 2.7718, "mean_token_accuracy": 0.3655172407627106, "step": 104200 }, { "epoch": 0.10495638274406476, "grad_norm": 11.411455235343968, "learning_rate": 4.962946310593253e-05, "loss": 2.2699, "mean_token_accuracy": 0.4620689690113068, "step": 104205 }, { "epoch": 0.10496141879716893, "grad_norm": 10.467943632835208, "learning_rate": 4.96293953787133e-05, "loss": 2.4553, "mean_token_accuracy": 0.4000000059604645, "step": 104210 }, { "epoch": 0.1049664548502731, "grad_norm": 9.863013480701674, "learning_rate": 4.9629327645356405e-05, "loss": 2.3218, "mean_token_accuracy": 0.4344827592372894, "step": 104215 }, { "epoch": 0.10497149090337728, "grad_norm": 9.570439695297525, "learning_rate": 4.962925990586186e-05, "loss": 2.4081, "mean_token_accuracy": 0.43448275327682495, "step": 104220 }, { "epoch": 0.10497652695648145, "grad_norm": 10.416331869635563, "learning_rate": 4.962919216022968e-05, "loss": 2.6841, "mean_token_accuracy": 0.3551724195480347, "step": 104225 }, { "epoch": 0.10498156300958562, "grad_norm": 10.639930018155725, "learning_rate": 4.962912440845989e-05, "loss": 2.0856, "mean_token_accuracy": 0.4704433500766754, "step": 104230 }, { "epoch": 0.1049865990626898, "grad_norm": 9.45027781800723, "learning_rate": 4.9629056650552497e-05, "loss": 2.4314, "mean_token_accuracy": 0.4068965494632721, "step": 104235 }, { "epoch": 0.10499163511579397, "grad_norm": 15.264633429674324, "learning_rate": 4.962898888650753e-05, "loss": 2.6629, "mean_token_accuracy": 0.3758620649576187, "step": 104240 }, { "epoch": 0.10499667116889815, "grad_norm": 11.310133921733465, "learning_rate": 4.962892111632502e-05, "loss": 2.6282, "mean_token_accuracy": 0.39842710494995115, "step": 104245 }, { "epoch": 0.10500170722200232, "grad_norm": 9.72143240421472, "learning_rate": 4.962885334000495e-05, "loss": 2.4102, "mean_token_accuracy": 0.4123411953449249, "step": 104250 }, { "epoch": 0.10500674327510649, "grad_norm": 9.02671059113164, "learning_rate": 4.9628785557547376e-05, "loss": 2.5854, "mean_token_accuracy": 0.4551724076271057, "step": 104255 }, { "epoch": 0.10501177932821067, "grad_norm": 10.936300577015556, "learning_rate": 4.9628717768952286e-05, "loss": 2.553, "mean_token_accuracy": 0.39086509943008424, "step": 104260 }, { "epoch": 0.10501681538131483, "grad_norm": 13.13127863390209, "learning_rate": 4.9628649974219723e-05, "loss": 2.5706, "mean_token_accuracy": 0.35862069129943847, "step": 104265 }, { "epoch": 0.105021851434419, "grad_norm": 12.216592917636945, "learning_rate": 4.96285821733497e-05, "loss": 2.2482, "mean_token_accuracy": 0.4344827592372894, "step": 104270 }, { "epoch": 0.10502688748752317, "grad_norm": 10.293780999079074, "learning_rate": 4.962851436634222e-05, "loss": 2.1938, "mean_token_accuracy": 0.441379314661026, "step": 104275 }, { "epoch": 0.10503192354062735, "grad_norm": 14.330754022928506, "learning_rate": 4.962844655319732e-05, "loss": 2.614, "mean_token_accuracy": 0.4034482717514038, "step": 104280 }, { "epoch": 0.10503695959373152, "grad_norm": 9.812876636797895, "learning_rate": 4.962837873391502e-05, "loss": 1.9529, "mean_token_accuracy": 0.520992124080658, "step": 104285 }, { "epoch": 0.1050419956468357, "grad_norm": 17.135634112779773, "learning_rate": 4.9628310908495326e-05, "loss": 2.7436, "mean_token_accuracy": 0.4137930989265442, "step": 104290 }, { "epoch": 0.10504703169993987, "grad_norm": 10.385988512392446, "learning_rate": 4.9628243076938264e-05, "loss": 2.5008, "mean_token_accuracy": 0.4482758641242981, "step": 104295 }, { "epoch": 0.10505206775304404, "grad_norm": 9.807478457153017, "learning_rate": 4.9628175239243845e-05, "loss": 2.4519, "mean_token_accuracy": 0.42068966031074523, "step": 104300 }, { "epoch": 0.10505710380614822, "grad_norm": 9.084020543276518, "learning_rate": 4.9628107395412096e-05, "loss": 2.3274, "mean_token_accuracy": 0.3896551728248596, "step": 104305 }, { "epoch": 0.10506213985925239, "grad_norm": 11.68858665292888, "learning_rate": 4.962803954544304e-05, "loss": 2.3425, "mean_token_accuracy": 0.43103448748588563, "step": 104310 }, { "epoch": 0.10506717591235656, "grad_norm": 10.238277599128011, "learning_rate": 4.9627971689336684e-05, "loss": 2.0116, "mean_token_accuracy": 0.45517241954803467, "step": 104315 }, { "epoch": 0.10507221196546074, "grad_norm": 8.623054930425678, "learning_rate": 4.9627903827093054e-05, "loss": 1.9555, "mean_token_accuracy": 0.4936479091644287, "step": 104320 }, { "epoch": 0.10507724801856491, "grad_norm": 10.719437973646045, "learning_rate": 4.962783595871217e-05, "loss": 2.794, "mean_token_accuracy": 0.3896551728248596, "step": 104325 }, { "epoch": 0.10508228407166909, "grad_norm": 19.229301521538122, "learning_rate": 4.9627768084194044e-05, "loss": 2.4967, "mean_token_accuracy": 0.4084089517593384, "step": 104330 }, { "epoch": 0.10508732012477325, "grad_norm": 9.65723479161499, "learning_rate": 4.9627700203538696e-05, "loss": 2.1691, "mean_token_accuracy": 0.4482758641242981, "step": 104335 }, { "epoch": 0.10509235617787742, "grad_norm": 8.805551982575079, "learning_rate": 4.9627632316746154e-05, "loss": 2.0863, "mean_token_accuracy": 0.5068965494632721, "step": 104340 }, { "epoch": 0.10509739223098159, "grad_norm": 9.666568512834388, "learning_rate": 4.962756442381643e-05, "loss": 2.1471, "mean_token_accuracy": 0.493103438615799, "step": 104345 }, { "epoch": 0.10510242828408577, "grad_norm": 12.872396051499596, "learning_rate": 4.962749652474954e-05, "loss": 2.8309, "mean_token_accuracy": 0.36896551847457887, "step": 104350 }, { "epoch": 0.10510746433718994, "grad_norm": 11.63484894566667, "learning_rate": 4.9627428619545514e-05, "loss": 2.44, "mean_token_accuracy": 0.37586206793785093, "step": 104355 }, { "epoch": 0.10511250039029411, "grad_norm": 11.430611182435214, "learning_rate": 4.962736070820435e-05, "loss": 2.0939, "mean_token_accuracy": 0.47931034564971925, "step": 104360 }, { "epoch": 0.10511753644339829, "grad_norm": 10.623681452284707, "learning_rate": 4.9627292790726096e-05, "loss": 1.9904, "mean_token_accuracy": 0.44482759237289426, "step": 104365 }, { "epoch": 0.10512257249650246, "grad_norm": 10.061719151977647, "learning_rate": 4.962722486711074e-05, "loss": 2.6797, "mean_token_accuracy": 0.37586207389831544, "step": 104370 }, { "epoch": 0.10512760854960664, "grad_norm": 12.94275104905402, "learning_rate": 4.962715693735833e-05, "loss": 2.2372, "mean_token_accuracy": 0.44482758045196535, "step": 104375 }, { "epoch": 0.10513264460271081, "grad_norm": 11.861088513103544, "learning_rate": 4.9627089001468865e-05, "loss": 2.5491, "mean_token_accuracy": 0.3999999940395355, "step": 104380 }, { "epoch": 0.10513768065581498, "grad_norm": 8.238414969436992, "learning_rate": 4.9627021059442355e-05, "loss": 1.7623, "mean_token_accuracy": 0.5424876868724823, "step": 104385 }, { "epoch": 0.10514271670891916, "grad_norm": 10.089772643188878, "learning_rate": 4.9626953111278854e-05, "loss": 2.5224, "mean_token_accuracy": 0.4310344815254211, "step": 104390 }, { "epoch": 0.10514775276202333, "grad_norm": 12.43101837941754, "learning_rate": 4.962688515697834e-05, "loss": 2.098, "mean_token_accuracy": 0.4551724135875702, "step": 104395 }, { "epoch": 0.1051527888151275, "grad_norm": 11.939319118217934, "learning_rate": 4.962681719654087e-05, "loss": 2.3852, "mean_token_accuracy": 0.4551724135875702, "step": 104400 }, { "epoch": 0.10515782486823166, "grad_norm": 12.213084854459414, "learning_rate": 4.962674922996644e-05, "loss": 2.4956, "mean_token_accuracy": 0.4379310429096222, "step": 104405 }, { "epoch": 0.10516286092133584, "grad_norm": 10.405037374973546, "learning_rate": 4.962668125725507e-05, "loss": 2.2493, "mean_token_accuracy": 0.4344827592372894, "step": 104410 }, { "epoch": 0.10516789697444001, "grad_norm": 11.700716407650718, "learning_rate": 4.962661327840678e-05, "loss": 2.3471, "mean_token_accuracy": 0.4206896543502808, "step": 104415 }, { "epoch": 0.10517293302754419, "grad_norm": 11.285773180230967, "learning_rate": 4.96265452934216e-05, "loss": 2.5454, "mean_token_accuracy": 0.38275861740112305, "step": 104420 }, { "epoch": 0.10517796908064836, "grad_norm": 10.643787686356456, "learning_rate": 4.9626477302299534e-05, "loss": 2.1005, "mean_token_accuracy": 0.441379314661026, "step": 104425 }, { "epoch": 0.10518300513375253, "grad_norm": 11.803988382830147, "learning_rate": 4.962640930504061e-05, "loss": 2.3415, "mean_token_accuracy": 0.4533575356006622, "step": 104430 }, { "epoch": 0.1051880411868567, "grad_norm": 9.346115168494713, "learning_rate": 4.962634130164483e-05, "loss": 2.3295, "mean_token_accuracy": 0.4290986001491547, "step": 104435 }, { "epoch": 0.10519307723996088, "grad_norm": 11.257395626140907, "learning_rate": 4.962627329211225e-05, "loss": 2.2176, "mean_token_accuracy": 0.42413792610168455, "step": 104440 }, { "epoch": 0.10519811329306505, "grad_norm": 10.824926258074312, "learning_rate": 4.962620527644285e-05, "loss": 2.5731, "mean_token_accuracy": 0.4068965494632721, "step": 104445 }, { "epoch": 0.10520314934616923, "grad_norm": 11.269678800878701, "learning_rate": 4.9626137254636675e-05, "loss": 2.98, "mean_token_accuracy": 0.3206896483898163, "step": 104450 }, { "epoch": 0.1052081853992734, "grad_norm": 9.341629381584324, "learning_rate": 4.962606922669372e-05, "loss": 2.2644, "mean_token_accuracy": 0.4745311439037323, "step": 104455 }, { "epoch": 0.10521322145237758, "grad_norm": 11.700680338270931, "learning_rate": 4.962600119261403e-05, "loss": 2.3472, "mean_token_accuracy": 0.4310344815254211, "step": 104460 }, { "epoch": 0.10521825750548175, "grad_norm": 11.168420741253183, "learning_rate": 4.9625933152397604e-05, "loss": 2.3918, "mean_token_accuracy": 0.45517241954803467, "step": 104465 }, { "epoch": 0.10522329355858592, "grad_norm": 11.073665060641494, "learning_rate": 4.9625865106044466e-05, "loss": 2.235, "mean_token_accuracy": 0.4310344696044922, "step": 104470 }, { "epoch": 0.10522832961169008, "grad_norm": 11.14349013105833, "learning_rate": 4.962579705355464e-05, "loss": 2.4706, "mean_token_accuracy": 0.3896551728248596, "step": 104475 }, { "epoch": 0.10523336566479426, "grad_norm": 15.492503794684588, "learning_rate": 4.962572899492814e-05, "loss": 2.5848, "mean_token_accuracy": 0.4103448331356049, "step": 104480 }, { "epoch": 0.10523840171789843, "grad_norm": 10.376270434511785, "learning_rate": 4.962566093016499e-05, "loss": 2.5172, "mean_token_accuracy": 0.36551723778247835, "step": 104485 }, { "epoch": 0.1052434377710026, "grad_norm": 10.411506146859866, "learning_rate": 4.96255928592652e-05, "loss": 2.5499, "mean_token_accuracy": 0.41724138259887694, "step": 104490 }, { "epoch": 0.10524847382410678, "grad_norm": 10.64188409034023, "learning_rate": 4.96255247822288e-05, "loss": 2.5712, "mean_token_accuracy": 0.43641863465309144, "step": 104495 }, { "epoch": 0.10525350987721095, "grad_norm": 10.7727777010905, "learning_rate": 4.96254566990558e-05, "loss": 2.5249, "mean_token_accuracy": 0.41379311084747317, "step": 104500 }, { "epoch": 0.10525854593031513, "grad_norm": 9.46284445099597, "learning_rate": 4.962538860974622e-05, "loss": 2.1961, "mean_token_accuracy": 0.4689655125141144, "step": 104505 }, { "epoch": 0.1052635819834193, "grad_norm": 10.823538629885405, "learning_rate": 4.9625320514300085e-05, "loss": 2.2574, "mean_token_accuracy": 0.41379310488700866, "step": 104510 }, { "epoch": 0.10526861803652347, "grad_norm": 13.48198615315915, "learning_rate": 4.962525241271741e-05, "loss": 2.9341, "mean_token_accuracy": 0.3931034505367279, "step": 104515 }, { "epoch": 0.10527365408962765, "grad_norm": 10.922012810188601, "learning_rate": 4.962518430499822e-05, "loss": 2.3436, "mean_token_accuracy": 0.42613430619239806, "step": 104520 }, { "epoch": 0.10527869014273182, "grad_norm": 9.469779872193012, "learning_rate": 4.962511619114251e-05, "loss": 2.5683, "mean_token_accuracy": 0.41724138259887694, "step": 104525 }, { "epoch": 0.105283726195836, "grad_norm": 10.473719154934223, "learning_rate": 4.9625048071150325e-05, "loss": 3.2372, "mean_token_accuracy": 0.3620689660310745, "step": 104530 }, { "epoch": 0.10528876224894017, "grad_norm": 9.587430195189388, "learning_rate": 4.9624979945021685e-05, "loss": 2.6169, "mean_token_accuracy": 0.41379310488700866, "step": 104535 }, { "epoch": 0.10529379830204434, "grad_norm": 11.77243450265687, "learning_rate": 4.962491181275658e-05, "loss": 2.1481, "mean_token_accuracy": 0.44827585816383364, "step": 104540 }, { "epoch": 0.1052988343551485, "grad_norm": 13.20106916984188, "learning_rate": 4.9624843674355066e-05, "loss": 2.5211, "mean_token_accuracy": 0.41379310488700866, "step": 104545 }, { "epoch": 0.10530387040825268, "grad_norm": 10.043084635997937, "learning_rate": 4.9624775529817134e-05, "loss": 2.4595, "mean_token_accuracy": 0.43793103098869324, "step": 104550 }, { "epoch": 0.10530890646135685, "grad_norm": 18.76786820225853, "learning_rate": 4.962470737914282e-05, "loss": 2.7603, "mean_token_accuracy": 0.358620685338974, "step": 104555 }, { "epoch": 0.10531394251446102, "grad_norm": 12.54169573967831, "learning_rate": 4.962463922233213e-05, "loss": 2.7367, "mean_token_accuracy": 0.4517241418361664, "step": 104560 }, { "epoch": 0.1053189785675652, "grad_norm": 10.909113543475502, "learning_rate": 4.9624571059385094e-05, "loss": 2.6188, "mean_token_accuracy": 0.3931034505367279, "step": 104565 }, { "epoch": 0.10532401462066937, "grad_norm": 9.96207734278988, "learning_rate": 4.962450289030172e-05, "loss": 2.3455, "mean_token_accuracy": 0.47241379618644713, "step": 104570 }, { "epoch": 0.10532905067377354, "grad_norm": 10.90432089142969, "learning_rate": 4.962443471508204e-05, "loss": 2.7206, "mean_token_accuracy": 0.3517241358757019, "step": 104575 }, { "epoch": 0.10533408672687772, "grad_norm": 10.54067863793279, "learning_rate": 4.962436653372606e-05, "loss": 2.317, "mean_token_accuracy": 0.4034482777118683, "step": 104580 }, { "epoch": 0.10533912277998189, "grad_norm": 9.9133158633996, "learning_rate": 4.9624298346233804e-05, "loss": 2.5115, "mean_token_accuracy": 0.3827586144208908, "step": 104585 }, { "epoch": 0.10534415883308607, "grad_norm": 12.40972735281351, "learning_rate": 4.962423015260529e-05, "loss": 2.8339, "mean_token_accuracy": 0.42758620977401735, "step": 104590 }, { "epoch": 0.10534919488619024, "grad_norm": 11.987546463643906, "learning_rate": 4.962416195284054e-05, "loss": 2.4714, "mean_token_accuracy": 0.43684210777282717, "step": 104595 }, { "epoch": 0.10535423093929441, "grad_norm": 9.83523822454958, "learning_rate": 4.962409374693957e-05, "loss": 2.5817, "mean_token_accuracy": 0.4172413766384125, "step": 104600 }, { "epoch": 0.10535926699239859, "grad_norm": 10.858031263847476, "learning_rate": 4.9624025534902404e-05, "loss": 2.5035, "mean_token_accuracy": 0.3909255862236023, "step": 104605 }, { "epoch": 0.10536430304550276, "grad_norm": 11.622443928297898, "learning_rate": 4.962395731672905e-05, "loss": 2.3513, "mean_token_accuracy": 0.4068965494632721, "step": 104610 }, { "epoch": 0.10536933909860692, "grad_norm": 12.848475172146022, "learning_rate": 4.962388909241954e-05, "loss": 2.3392, "mean_token_accuracy": 0.4344827651977539, "step": 104615 }, { "epoch": 0.1053743751517111, "grad_norm": 10.36120287179352, "learning_rate": 4.962382086197389e-05, "loss": 2.7308, "mean_token_accuracy": 0.3551724135875702, "step": 104620 }, { "epoch": 0.10537941120481527, "grad_norm": 10.985492743745747, "learning_rate": 4.962375262539211e-05, "loss": 2.4592, "mean_token_accuracy": 0.3999999940395355, "step": 104625 }, { "epoch": 0.10538444725791944, "grad_norm": 13.275102379195742, "learning_rate": 4.962368438267423e-05, "loss": 2.6649, "mean_token_accuracy": 0.4, "step": 104630 }, { "epoch": 0.10538948331102362, "grad_norm": 10.97049861843048, "learning_rate": 4.9623616133820256e-05, "loss": 2.3995, "mean_token_accuracy": 0.41379310488700866, "step": 104635 }, { "epoch": 0.10539451936412779, "grad_norm": 11.661108321111179, "learning_rate": 4.962354787883022e-05, "loss": 2.3673, "mean_token_accuracy": 0.44827585816383364, "step": 104640 }, { "epoch": 0.10539955541723196, "grad_norm": 12.174696233824838, "learning_rate": 4.962347961770413e-05, "loss": 2.665, "mean_token_accuracy": 0.4413793087005615, "step": 104645 }, { "epoch": 0.10540459147033614, "grad_norm": 10.869964958454787, "learning_rate": 4.962341135044202e-05, "loss": 2.2938, "mean_token_accuracy": 0.4482758641242981, "step": 104650 }, { "epoch": 0.10540962752344031, "grad_norm": 8.462168269319575, "learning_rate": 4.962334307704389e-05, "loss": 2.2435, "mean_token_accuracy": 0.47931034564971925, "step": 104655 }, { "epoch": 0.10541466357654448, "grad_norm": 11.181526119449435, "learning_rate": 4.962327479750977e-05, "loss": 2.6877, "mean_token_accuracy": 0.39655172228813174, "step": 104660 }, { "epoch": 0.10541969962964866, "grad_norm": 11.300944835898926, "learning_rate": 4.962320651183968e-05, "loss": 2.2873, "mean_token_accuracy": 0.4359346628189087, "step": 104665 }, { "epoch": 0.10542473568275283, "grad_norm": 10.561806510308658, "learning_rate": 4.9623138220033633e-05, "loss": 3.2234, "mean_token_accuracy": 0.37737447023391724, "step": 104670 }, { "epoch": 0.105429771735857, "grad_norm": 10.238636926645285, "learning_rate": 4.962306992209166e-05, "loss": 2.4781, "mean_token_accuracy": 0.4172413766384125, "step": 104675 }, { "epoch": 0.10543480778896118, "grad_norm": 11.43516663557122, "learning_rate": 4.9623001618013756e-05, "loss": 2.9079, "mean_token_accuracy": 0.3689655065536499, "step": 104680 }, { "epoch": 0.10543984384206534, "grad_norm": 10.749287119223046, "learning_rate": 4.962293330779997e-05, "loss": 2.2969, "mean_token_accuracy": 0.4068965554237366, "step": 104685 }, { "epoch": 0.10544487989516951, "grad_norm": 10.461361133467605, "learning_rate": 4.962286499145029e-05, "loss": 2.2206, "mean_token_accuracy": 0.4230490028858185, "step": 104690 }, { "epoch": 0.10544991594827369, "grad_norm": 10.53091488839773, "learning_rate": 4.962279666896476e-05, "loss": 2.4326, "mean_token_accuracy": 0.43297035694122316, "step": 104695 }, { "epoch": 0.10545495200137786, "grad_norm": 12.379212629863284, "learning_rate": 4.9622728340343386e-05, "loss": 2.4512, "mean_token_accuracy": 0.4344827592372894, "step": 104700 }, { "epoch": 0.10545998805448203, "grad_norm": 8.729504539453158, "learning_rate": 4.9622660005586195e-05, "loss": 2.2001, "mean_token_accuracy": 0.47241378426551817, "step": 104705 }, { "epoch": 0.10546502410758621, "grad_norm": 10.32949120703033, "learning_rate": 4.9622591664693205e-05, "loss": 2.6687, "mean_token_accuracy": 0.3448275804519653, "step": 104710 }, { "epoch": 0.10547006016069038, "grad_norm": 9.460294170557734, "learning_rate": 4.9622523317664424e-05, "loss": 2.1135, "mean_token_accuracy": 0.4241379380226135, "step": 104715 }, { "epoch": 0.10547509621379456, "grad_norm": 13.97331300135685, "learning_rate": 4.9622454964499885e-05, "loss": 2.9182, "mean_token_accuracy": 0.36896551847457887, "step": 104720 }, { "epoch": 0.10548013226689873, "grad_norm": 8.801153722767756, "learning_rate": 4.96223866051996e-05, "loss": 2.092, "mean_token_accuracy": 0.482758617401123, "step": 104725 }, { "epoch": 0.1054851683200029, "grad_norm": 12.10756195488822, "learning_rate": 4.962231823976359e-05, "loss": 2.6237, "mean_token_accuracy": 0.4, "step": 104730 }, { "epoch": 0.10549020437310708, "grad_norm": 11.07534289901473, "learning_rate": 4.9622249868191866e-05, "loss": 2.5145, "mean_token_accuracy": 0.44337567687034607, "step": 104735 }, { "epoch": 0.10549524042621125, "grad_norm": 11.511118031889584, "learning_rate": 4.962218149048445e-05, "loss": 2.7609, "mean_token_accuracy": 0.4, "step": 104740 }, { "epoch": 0.10550027647931542, "grad_norm": 9.695354287989908, "learning_rate": 4.9622113106641376e-05, "loss": 2.4246, "mean_token_accuracy": 0.4052631616592407, "step": 104745 }, { "epoch": 0.1055053125324196, "grad_norm": 12.345215988806293, "learning_rate": 4.9622044716662644e-05, "loss": 2.6129, "mean_token_accuracy": 0.42758620381355283, "step": 104750 }, { "epoch": 0.10551034858552376, "grad_norm": 11.269323579776556, "learning_rate": 4.9621976320548283e-05, "loss": 2.3762, "mean_token_accuracy": 0.4448275864124298, "step": 104755 }, { "epoch": 0.10551538463862793, "grad_norm": 9.817481281374286, "learning_rate": 4.962190791829831e-05, "loss": 2.4085, "mean_token_accuracy": 0.42413792610168455, "step": 104760 }, { "epoch": 0.1055204206917321, "grad_norm": 10.982604451325106, "learning_rate": 4.962183950991274e-05, "loss": 2.4533, "mean_token_accuracy": 0.44137930274009707, "step": 104765 }, { "epoch": 0.10552545674483628, "grad_norm": 13.306689968402468, "learning_rate": 4.9621771095391604e-05, "loss": 2.3624, "mean_token_accuracy": 0.41379310488700866, "step": 104770 }, { "epoch": 0.10553049279794045, "grad_norm": 13.141821337319119, "learning_rate": 4.9621702674734904e-05, "loss": 2.4011, "mean_token_accuracy": 0.4398669183254242, "step": 104775 }, { "epoch": 0.10553552885104463, "grad_norm": 10.786315989336098, "learning_rate": 4.962163424794267e-05, "loss": 2.7761, "mean_token_accuracy": 0.3896551728248596, "step": 104780 }, { "epoch": 0.1055405649041488, "grad_norm": 10.6102419213239, "learning_rate": 4.962156581501492e-05, "loss": 2.183, "mean_token_accuracy": 0.46551724076271056, "step": 104785 }, { "epoch": 0.10554560095725297, "grad_norm": 13.516152712457925, "learning_rate": 4.962149737595166e-05, "loss": 2.54, "mean_token_accuracy": 0.4413793087005615, "step": 104790 }, { "epoch": 0.10555063701035715, "grad_norm": 10.823859038659698, "learning_rate": 4.9621428930752934e-05, "loss": 2.4085, "mean_token_accuracy": 0.4517241418361664, "step": 104795 }, { "epoch": 0.10555567306346132, "grad_norm": 8.051955577275516, "learning_rate": 4.962136047941875e-05, "loss": 2.4135, "mean_token_accuracy": 0.46896551847457885, "step": 104800 }, { "epoch": 0.1055607091165655, "grad_norm": 11.083787475872414, "learning_rate": 4.9621292021949114e-05, "loss": 2.3255, "mean_token_accuracy": 0.44827585816383364, "step": 104805 }, { "epoch": 0.10556574516966967, "grad_norm": 11.089800852299916, "learning_rate": 4.9621223558344056e-05, "loss": 2.1278, "mean_token_accuracy": 0.44827585220336913, "step": 104810 }, { "epoch": 0.10557078122277384, "grad_norm": 16.303213202834048, "learning_rate": 4.962115508860359e-05, "loss": 2.2118, "mean_token_accuracy": 0.43103448748588563, "step": 104815 }, { "epoch": 0.10557581727587802, "grad_norm": 9.635345101044063, "learning_rate": 4.9621086612727744e-05, "loss": 1.879, "mean_token_accuracy": 0.4862068951129913, "step": 104820 }, { "epoch": 0.10558085332898218, "grad_norm": 11.405114978774881, "learning_rate": 4.962101813071654e-05, "loss": 2.2892, "mean_token_accuracy": 0.4482758641242981, "step": 104825 }, { "epoch": 0.10558588938208635, "grad_norm": 39.30089286162542, "learning_rate": 4.962094964256997e-05, "loss": 2.6542, "mean_token_accuracy": 0.45722927451133727, "step": 104830 }, { "epoch": 0.10559092543519052, "grad_norm": 13.975043636295133, "learning_rate": 4.9620881148288085e-05, "loss": 2.5085, "mean_token_accuracy": 0.4655172526836395, "step": 104835 }, { "epoch": 0.1055959614882947, "grad_norm": 10.352586111100576, "learning_rate": 4.9620812647870894e-05, "loss": 2.2482, "mean_token_accuracy": 0.4572292804718018, "step": 104840 }, { "epoch": 0.10560099754139887, "grad_norm": 12.515984210502726, "learning_rate": 4.9620744141318405e-05, "loss": 2.329, "mean_token_accuracy": 0.47241379618644713, "step": 104845 }, { "epoch": 0.10560603359450305, "grad_norm": 14.284755088394794, "learning_rate": 4.9620675628630654e-05, "loss": 2.4442, "mean_token_accuracy": 0.4172413766384125, "step": 104850 }, { "epoch": 0.10561106964760722, "grad_norm": 9.837977078318639, "learning_rate": 4.962060710980765e-05, "loss": 2.0779, "mean_token_accuracy": 0.49655171632766726, "step": 104855 }, { "epoch": 0.1056161057007114, "grad_norm": 9.098912515875515, "learning_rate": 4.9620538584849404e-05, "loss": 2.7032, "mean_token_accuracy": 0.417241370677948, "step": 104860 }, { "epoch": 0.10562114175381557, "grad_norm": 9.863230341870077, "learning_rate": 4.962047005375595e-05, "loss": 2.4562, "mean_token_accuracy": 0.42413793206214906, "step": 104865 }, { "epoch": 0.10562617780691974, "grad_norm": 9.085318534133192, "learning_rate": 4.96204015165273e-05, "loss": 2.4252, "mean_token_accuracy": 0.3999999940395355, "step": 104870 }, { "epoch": 0.10563121386002391, "grad_norm": 9.755402938567999, "learning_rate": 4.9620332973163476e-05, "loss": 2.3463, "mean_token_accuracy": 0.3999999940395355, "step": 104875 }, { "epoch": 0.10563624991312809, "grad_norm": 15.077860369430121, "learning_rate": 4.962026442366449e-05, "loss": 2.3191, "mean_token_accuracy": 0.4206896543502808, "step": 104880 }, { "epoch": 0.10564128596623226, "grad_norm": 10.259733066517377, "learning_rate": 4.9620195868030375e-05, "loss": 2.4412, "mean_token_accuracy": 0.37931033968925476, "step": 104885 }, { "epoch": 0.10564632201933644, "grad_norm": 8.506638040850353, "learning_rate": 4.962012730626113e-05, "loss": 2.3846, "mean_token_accuracy": 0.4724137902259827, "step": 104890 }, { "epoch": 0.1056513580724406, "grad_norm": 12.024224121183295, "learning_rate": 4.962005873835679e-05, "loss": 2.0721, "mean_token_accuracy": 0.5004926145076751, "step": 104895 }, { "epoch": 0.10565639412554477, "grad_norm": 11.149523757050849, "learning_rate": 4.9619990164317374e-05, "loss": 2.253, "mean_token_accuracy": 0.441379314661026, "step": 104900 }, { "epoch": 0.10566143017864894, "grad_norm": 11.416983244837931, "learning_rate": 4.961992158414289e-05, "loss": 2.5954, "mean_token_accuracy": 0.3827586233615875, "step": 104905 }, { "epoch": 0.10566646623175312, "grad_norm": 11.021363502121414, "learning_rate": 4.9619852997833365e-05, "loss": 2.4318, "mean_token_accuracy": 0.37586207389831544, "step": 104910 }, { "epoch": 0.10567150228485729, "grad_norm": 11.370701414301896, "learning_rate": 4.961978440538882e-05, "loss": 2.6969, "mean_token_accuracy": 0.36376285552978516, "step": 104915 }, { "epoch": 0.10567653833796146, "grad_norm": 13.866042022192232, "learning_rate": 4.9619715806809267e-05, "loss": 2.3446, "mean_token_accuracy": 0.44137930274009707, "step": 104920 }, { "epoch": 0.10568157439106564, "grad_norm": 17.036802978387495, "learning_rate": 4.961964720209473e-05, "loss": 2.4377, "mean_token_accuracy": 0.39655173420906065, "step": 104925 }, { "epoch": 0.10568661044416981, "grad_norm": 13.35022357804328, "learning_rate": 4.961957859124522e-05, "loss": 2.1544, "mean_token_accuracy": 0.4551724135875702, "step": 104930 }, { "epoch": 0.10569164649727399, "grad_norm": 10.74752020647704, "learning_rate": 4.961950997426077e-05, "loss": 2.4397, "mean_token_accuracy": 0.4413793087005615, "step": 104935 }, { "epoch": 0.10569668255037816, "grad_norm": 14.679087123588063, "learning_rate": 4.961944135114139e-05, "loss": 2.1548, "mean_token_accuracy": 0.4944581389427185, "step": 104940 }, { "epoch": 0.10570171860348233, "grad_norm": 10.893804127277205, "learning_rate": 4.96193727218871e-05, "loss": 2.6017, "mean_token_accuracy": 0.37586207389831544, "step": 104945 }, { "epoch": 0.10570675465658651, "grad_norm": 9.856905481399101, "learning_rate": 4.961930408649791e-05, "loss": 2.4833, "mean_token_accuracy": 0.3965517282485962, "step": 104950 }, { "epoch": 0.10571179070969068, "grad_norm": 12.909340183488235, "learning_rate": 4.961923544497386e-05, "loss": 2.7324, "mean_token_accuracy": 0.3896551728248596, "step": 104955 }, { "epoch": 0.10571682676279485, "grad_norm": 11.999709155867427, "learning_rate": 4.9619166797314954e-05, "loss": 2.5658, "mean_token_accuracy": 0.3827586114406586, "step": 104960 }, { "epoch": 0.10572186281589901, "grad_norm": 11.45126738377503, "learning_rate": 4.961909814352122e-05, "loss": 2.37, "mean_token_accuracy": 0.42068966031074523, "step": 104965 }, { "epoch": 0.10572689886900319, "grad_norm": 13.605116506280586, "learning_rate": 4.9619029483592665e-05, "loss": 2.3484, "mean_token_accuracy": 0.4172413766384125, "step": 104970 }, { "epoch": 0.10573193492210736, "grad_norm": 10.673263994220422, "learning_rate": 4.961896081752932e-05, "loss": 2.5726, "mean_token_accuracy": 0.4379310250282288, "step": 104975 }, { "epoch": 0.10573697097521154, "grad_norm": 10.224274248464523, "learning_rate": 4.961889214533119e-05, "loss": 2.4232, "mean_token_accuracy": 0.45517241954803467, "step": 104980 }, { "epoch": 0.10574200702831571, "grad_norm": 10.10453858373152, "learning_rate": 4.961882346699831e-05, "loss": 2.2098, "mean_token_accuracy": 0.41379310488700866, "step": 104985 }, { "epoch": 0.10574704308141988, "grad_norm": 10.847356554116907, "learning_rate": 4.961875478253069e-05, "loss": 2.6371, "mean_token_accuracy": 0.37241379618644715, "step": 104990 }, { "epoch": 0.10575207913452406, "grad_norm": 9.706681182326673, "learning_rate": 4.9618686091928345e-05, "loss": 2.2699, "mean_token_accuracy": 0.42758620381355283, "step": 104995 }, { "epoch": 0.10575711518762823, "grad_norm": 10.38859414274225, "learning_rate": 4.9618617395191305e-05, "loss": 2.5912, "mean_token_accuracy": 0.4034482777118683, "step": 105000 }, { "epoch": 0.1057621512407324, "grad_norm": 11.418707185344394, "learning_rate": 4.961854869231959e-05, "loss": 2.5848, "mean_token_accuracy": 0.42758620381355283, "step": 105005 }, { "epoch": 0.10576718729383658, "grad_norm": 10.347604298356547, "learning_rate": 4.961847998331321e-05, "loss": 2.1672, "mean_token_accuracy": 0.510344821214676, "step": 105010 }, { "epoch": 0.10577222334694075, "grad_norm": 11.456624173197307, "learning_rate": 4.9618411268172185e-05, "loss": 2.742, "mean_token_accuracy": 0.3517241418361664, "step": 105015 }, { "epoch": 0.10577725940004493, "grad_norm": 9.322663367072932, "learning_rate": 4.9618342546896527e-05, "loss": 2.2623, "mean_token_accuracy": 0.4329098641872406, "step": 105020 }, { "epoch": 0.1057822954531491, "grad_norm": 11.520760448156823, "learning_rate": 4.9618273819486275e-05, "loss": 2.222, "mean_token_accuracy": 0.43103448748588563, "step": 105025 }, { "epoch": 0.10578733150625327, "grad_norm": 12.394725831205681, "learning_rate": 4.961820508594144e-05, "loss": 2.8287, "mean_token_accuracy": 0.4172413766384125, "step": 105030 }, { "epoch": 0.10579236755935743, "grad_norm": 17.065761537363592, "learning_rate": 4.961813634626203e-05, "loss": 2.4802, "mean_token_accuracy": 0.4551724135875702, "step": 105035 }, { "epoch": 0.10579740361246161, "grad_norm": 10.103761519271226, "learning_rate": 4.961806760044808e-05, "loss": 2.9535, "mean_token_accuracy": 0.3724137932062149, "step": 105040 }, { "epoch": 0.10580243966556578, "grad_norm": 10.049954772048478, "learning_rate": 4.9617998848499594e-05, "loss": 2.2423, "mean_token_accuracy": 0.43793103098869324, "step": 105045 }, { "epoch": 0.10580747571866995, "grad_norm": 11.120205487737183, "learning_rate": 4.96179300904166e-05, "loss": 3.0154, "mean_token_accuracy": 0.4275862067937851, "step": 105050 }, { "epoch": 0.10581251177177413, "grad_norm": 9.19907989531605, "learning_rate": 4.9617861326199115e-05, "loss": 2.3271, "mean_token_accuracy": 0.41379310488700866, "step": 105055 }, { "epoch": 0.1058175478248783, "grad_norm": 9.668116411039161, "learning_rate": 4.9617792555847166e-05, "loss": 2.4488, "mean_token_accuracy": 0.3896551728248596, "step": 105060 }, { "epoch": 0.10582258387798248, "grad_norm": 12.161778669010873, "learning_rate": 4.961772377936076e-05, "loss": 2.3818, "mean_token_accuracy": 0.41893526911735535, "step": 105065 }, { "epoch": 0.10582761993108665, "grad_norm": 9.469732473525996, "learning_rate": 4.9617654996739914e-05, "loss": 2.3649, "mean_token_accuracy": 0.4034482777118683, "step": 105070 }, { "epoch": 0.10583265598419082, "grad_norm": 11.738179206051273, "learning_rate": 4.961758620798467e-05, "loss": 2.0533, "mean_token_accuracy": 0.4517241358757019, "step": 105075 }, { "epoch": 0.105837692037295, "grad_norm": 11.436384264354333, "learning_rate": 4.961751741309502e-05, "loss": 2.6769, "mean_token_accuracy": 0.3862068891525269, "step": 105080 }, { "epoch": 0.10584272809039917, "grad_norm": 8.892214731727295, "learning_rate": 4.9617448612071e-05, "loss": 2.4675, "mean_token_accuracy": 0.4241379380226135, "step": 105085 }, { "epoch": 0.10584776414350335, "grad_norm": 14.601768913496445, "learning_rate": 4.9617379804912614e-05, "loss": 2.4999, "mean_token_accuracy": 0.41379310488700866, "step": 105090 }, { "epoch": 0.10585280019660752, "grad_norm": 14.151238869399718, "learning_rate": 4.961731099161988e-05, "loss": 2.9225, "mean_token_accuracy": 0.37241379022598264, "step": 105095 }, { "epoch": 0.10585783624971169, "grad_norm": 12.828700198136676, "learning_rate": 4.961724217219285e-05, "loss": 2.5272, "mean_token_accuracy": 0.4068965494632721, "step": 105100 }, { "epoch": 0.10586287230281585, "grad_norm": 10.93421308188576, "learning_rate": 4.961717334663151e-05, "loss": 2.3744, "mean_token_accuracy": 0.4068965554237366, "step": 105105 }, { "epoch": 0.10586790835592003, "grad_norm": 11.308857749649773, "learning_rate": 4.96171045149359e-05, "loss": 2.0331, "mean_token_accuracy": 0.48965516686439514, "step": 105110 }, { "epoch": 0.1058729444090242, "grad_norm": 11.17492842316766, "learning_rate": 4.9617035677106014e-05, "loss": 2.33, "mean_token_accuracy": 0.4344827592372894, "step": 105115 }, { "epoch": 0.10587798046212837, "grad_norm": 10.886734741446126, "learning_rate": 4.9616966833141895e-05, "loss": 2.1474, "mean_token_accuracy": 0.4551724076271057, "step": 105120 }, { "epoch": 0.10588301651523255, "grad_norm": 9.34640769941015, "learning_rate": 4.961689798304355e-05, "loss": 2.2887, "mean_token_accuracy": 0.4862068951129913, "step": 105125 }, { "epoch": 0.10588805256833672, "grad_norm": 11.885704967343518, "learning_rate": 4.9616829126811e-05, "loss": 2.673, "mean_token_accuracy": 0.33793102502822875, "step": 105130 }, { "epoch": 0.1058930886214409, "grad_norm": 9.044865313366774, "learning_rate": 4.961676026444426e-05, "loss": 2.4155, "mean_token_accuracy": 0.4310344815254211, "step": 105135 }, { "epoch": 0.10589812467454507, "grad_norm": 12.30889076324346, "learning_rate": 4.961669139594336e-05, "loss": 2.7935, "mean_token_accuracy": 0.3793103456497192, "step": 105140 }, { "epoch": 0.10590316072764924, "grad_norm": 10.925506315490697, "learning_rate": 4.961662252130831e-05, "loss": 2.1736, "mean_token_accuracy": 0.5006049573421478, "step": 105145 }, { "epoch": 0.10590819678075342, "grad_norm": 12.598723717332883, "learning_rate": 4.9616553640539135e-05, "loss": 2.29, "mean_token_accuracy": 0.4851309359073639, "step": 105150 }, { "epoch": 0.10591323283385759, "grad_norm": 11.628122445942168, "learning_rate": 4.961648475363585e-05, "loss": 2.3636, "mean_token_accuracy": 0.39655172228813174, "step": 105155 }, { "epoch": 0.10591826888696176, "grad_norm": 11.083621451349437, "learning_rate": 4.961641586059848e-05, "loss": 2.4006, "mean_token_accuracy": 0.4448275864124298, "step": 105160 }, { "epoch": 0.10592330494006594, "grad_norm": 11.36896968345212, "learning_rate": 4.961634696142704e-05, "loss": 2.5781, "mean_token_accuracy": 0.39310344457626345, "step": 105165 }, { "epoch": 0.10592834099317011, "grad_norm": 9.99275332003944, "learning_rate": 4.961627805612154e-05, "loss": 2.1454, "mean_token_accuracy": 0.45329703092575074, "step": 105170 }, { "epoch": 0.10593337704627427, "grad_norm": 12.295185471839522, "learning_rate": 4.961620914468202e-05, "loss": 2.6159, "mean_token_accuracy": 0.3862068921327591, "step": 105175 }, { "epoch": 0.10593841309937845, "grad_norm": 15.089669187153085, "learning_rate": 4.9616140227108476e-05, "loss": 2.9091, "mean_token_accuracy": 0.3896551787853241, "step": 105180 }, { "epoch": 0.10594344915248262, "grad_norm": 12.425555679652831, "learning_rate": 4.961607130340094e-05, "loss": 2.4273, "mean_token_accuracy": 0.41724138259887694, "step": 105185 }, { "epoch": 0.10594848520558679, "grad_norm": 9.991696131651292, "learning_rate": 4.961600237355943e-05, "loss": 2.2178, "mean_token_accuracy": 0.4379310369491577, "step": 105190 }, { "epoch": 0.10595352125869097, "grad_norm": 13.056928082819637, "learning_rate": 4.961593343758397e-05, "loss": 2.5414, "mean_token_accuracy": 0.3862069010734558, "step": 105195 }, { "epoch": 0.10595855731179514, "grad_norm": 12.656115923246702, "learning_rate": 4.961586449547456e-05, "loss": 2.7699, "mean_token_accuracy": 0.37241379618644715, "step": 105200 }, { "epoch": 0.10596359336489931, "grad_norm": 11.125883502101003, "learning_rate": 4.961579554723124e-05, "loss": 2.5101, "mean_token_accuracy": 0.4103448212146759, "step": 105205 }, { "epoch": 0.10596862941800349, "grad_norm": 10.473859195726073, "learning_rate": 4.961572659285403e-05, "loss": 2.4145, "mean_token_accuracy": 0.432667875289917, "step": 105210 }, { "epoch": 0.10597366547110766, "grad_norm": 12.819109730456917, "learning_rate": 4.9615657632342935e-05, "loss": 2.3883, "mean_token_accuracy": 0.4689655125141144, "step": 105215 }, { "epoch": 0.10597870152421184, "grad_norm": 9.737457267989155, "learning_rate": 4.961558866569798e-05, "loss": 2.1959, "mean_token_accuracy": 0.45517241954803467, "step": 105220 }, { "epoch": 0.10598373757731601, "grad_norm": 11.944480824546707, "learning_rate": 4.961551969291918e-05, "loss": 2.4549, "mean_token_accuracy": 0.39655172228813174, "step": 105225 }, { "epoch": 0.10598877363042018, "grad_norm": 13.39909587970602, "learning_rate": 4.9615450714006554e-05, "loss": 2.515, "mean_token_accuracy": 0.44482758045196535, "step": 105230 }, { "epoch": 0.10599380968352436, "grad_norm": 11.198920903498093, "learning_rate": 4.961538172896013e-05, "loss": 2.5482, "mean_token_accuracy": 0.4172413766384125, "step": 105235 }, { "epoch": 0.10599884573662853, "grad_norm": 11.951487202630013, "learning_rate": 4.9615312737779926e-05, "loss": 2.7598, "mean_token_accuracy": 0.36896551251411436, "step": 105240 }, { "epoch": 0.10600388178973269, "grad_norm": 13.470815415442516, "learning_rate": 4.961524374046595e-05, "loss": 2.6538, "mean_token_accuracy": 0.3655172437429428, "step": 105245 }, { "epoch": 0.10600891784283686, "grad_norm": 11.016017163838463, "learning_rate": 4.9615174737018236e-05, "loss": 2.6624, "mean_token_accuracy": 0.4103448212146759, "step": 105250 }, { "epoch": 0.10601395389594104, "grad_norm": 10.284956892607294, "learning_rate": 4.961510572743679e-05, "loss": 2.1685, "mean_token_accuracy": 0.4551724076271057, "step": 105255 }, { "epoch": 0.10601898994904521, "grad_norm": 13.621210071624485, "learning_rate": 4.961503671172165e-05, "loss": 2.4654, "mean_token_accuracy": 0.38275861740112305, "step": 105260 }, { "epoch": 0.10602402600214939, "grad_norm": 11.308854201371862, "learning_rate": 4.96149676898728e-05, "loss": 2.5186, "mean_token_accuracy": 0.3620689630508423, "step": 105265 }, { "epoch": 0.10602906205525356, "grad_norm": 11.034911051757247, "learning_rate": 4.96148986618903e-05, "loss": 2.1912, "mean_token_accuracy": 0.4862068951129913, "step": 105270 }, { "epoch": 0.10603409810835773, "grad_norm": 12.432000512609143, "learning_rate": 4.961482962777414e-05, "loss": 2.5644, "mean_token_accuracy": 0.42413792610168455, "step": 105275 }, { "epoch": 0.1060391341614619, "grad_norm": 10.35387927682903, "learning_rate": 4.9614760587524355e-05, "loss": 2.4656, "mean_token_accuracy": 0.42413792610168455, "step": 105280 }, { "epoch": 0.10604417021456608, "grad_norm": 11.600779834109012, "learning_rate": 4.9614691541140964e-05, "loss": 2.7816, "mean_token_accuracy": 0.3862069010734558, "step": 105285 }, { "epoch": 0.10604920626767025, "grad_norm": 9.50925266287983, "learning_rate": 4.961462248862397e-05, "loss": 2.5536, "mean_token_accuracy": 0.4241379201412201, "step": 105290 }, { "epoch": 0.10605424232077443, "grad_norm": 10.037863619727432, "learning_rate": 4.961455342997341e-05, "loss": 2.983, "mean_token_accuracy": 0.4206896543502808, "step": 105295 }, { "epoch": 0.1060592783738786, "grad_norm": 10.916374475968524, "learning_rate": 4.961448436518929e-05, "loss": 2.1646, "mean_token_accuracy": 0.4965517222881317, "step": 105300 }, { "epoch": 0.10606431442698278, "grad_norm": 10.63419228515586, "learning_rate": 4.961441529427164e-05, "loss": 2.5277, "mean_token_accuracy": 0.36206896901130675, "step": 105305 }, { "epoch": 0.10606935048008695, "grad_norm": 11.860842572942735, "learning_rate": 4.961434621722048e-05, "loss": 2.2351, "mean_token_accuracy": 0.5034482836723327, "step": 105310 }, { "epoch": 0.10607438653319111, "grad_norm": 10.613797104784211, "learning_rate": 4.9614277134035816e-05, "loss": 2.3155, "mean_token_accuracy": 0.41724138259887694, "step": 105315 }, { "epoch": 0.10607942258629528, "grad_norm": 10.796840745443847, "learning_rate": 4.9614208044717667e-05, "loss": 2.3432, "mean_token_accuracy": 0.4310344815254211, "step": 105320 }, { "epoch": 0.10608445863939946, "grad_norm": 10.8506939966427, "learning_rate": 4.9614138949266064e-05, "loss": 2.6897, "mean_token_accuracy": 0.4034482717514038, "step": 105325 }, { "epoch": 0.10608949469250363, "grad_norm": 11.259507604331944, "learning_rate": 4.9614069847681034e-05, "loss": 2.1789, "mean_token_accuracy": 0.4379310369491577, "step": 105330 }, { "epoch": 0.1060945307456078, "grad_norm": 16.963744397502197, "learning_rate": 4.961400073996257e-05, "loss": 2.6779, "mean_token_accuracy": 0.39310345649719236, "step": 105335 }, { "epoch": 0.10609956679871198, "grad_norm": 10.599918125224239, "learning_rate": 4.961393162611071e-05, "loss": 2.3201, "mean_token_accuracy": 0.41379310488700866, "step": 105340 }, { "epoch": 0.10610460285181615, "grad_norm": 13.463836883453, "learning_rate": 4.9613862506125475e-05, "loss": 2.7117, "mean_token_accuracy": 0.3793103456497192, "step": 105345 }, { "epoch": 0.10610963890492033, "grad_norm": 10.605284951449091, "learning_rate": 4.9613793380006875e-05, "loss": 2.3708, "mean_token_accuracy": 0.4448275864124298, "step": 105350 }, { "epoch": 0.1061146749580245, "grad_norm": 13.286983318225928, "learning_rate": 4.961372424775492e-05, "loss": 2.8408, "mean_token_accuracy": 0.4137930989265442, "step": 105355 }, { "epoch": 0.10611971101112867, "grad_norm": 10.659024874570177, "learning_rate": 4.961365510936965e-05, "loss": 2.2983, "mean_token_accuracy": 0.41379310488700866, "step": 105360 }, { "epoch": 0.10612474706423285, "grad_norm": 10.239351153667839, "learning_rate": 4.9613585964851084e-05, "loss": 2.5403, "mean_token_accuracy": 0.43103448748588563, "step": 105365 }, { "epoch": 0.10612978311733702, "grad_norm": 9.778370845000953, "learning_rate": 4.9613516814199225e-05, "loss": 2.1441, "mean_token_accuracy": 0.4862068951129913, "step": 105370 }, { "epoch": 0.1061348191704412, "grad_norm": 11.085666650011644, "learning_rate": 4.9613447657414095e-05, "loss": 2.5558, "mean_token_accuracy": 0.41923774480819703, "step": 105375 }, { "epoch": 0.10613985522354537, "grad_norm": 15.21483270012711, "learning_rate": 4.961337849449573e-05, "loss": 2.6565, "mean_token_accuracy": 0.4068965494632721, "step": 105380 }, { "epoch": 0.10614489127664953, "grad_norm": 12.536474340951818, "learning_rate": 4.9613309325444125e-05, "loss": 2.7464, "mean_token_accuracy": 0.4137930989265442, "step": 105385 }, { "epoch": 0.1061499273297537, "grad_norm": 14.373195057216673, "learning_rate": 4.961324015025932e-05, "loss": 2.7939, "mean_token_accuracy": 0.32068965435028074, "step": 105390 }, { "epoch": 0.10615496338285788, "grad_norm": 7.794521083882906, "learning_rate": 4.9613170968941316e-05, "loss": 2.2107, "mean_token_accuracy": 0.44137930274009707, "step": 105395 }, { "epoch": 0.10615999943596205, "grad_norm": 13.502608002782837, "learning_rate": 4.9613101781490145e-05, "loss": 2.3439, "mean_token_accuracy": 0.41379310488700866, "step": 105400 }, { "epoch": 0.10616503548906622, "grad_norm": 11.00293083121396, "learning_rate": 4.9613032587905824e-05, "loss": 2.6104, "mean_token_accuracy": 0.39310344457626345, "step": 105405 }, { "epoch": 0.1061700715421704, "grad_norm": 12.561234825180192, "learning_rate": 4.961296338818837e-05, "loss": 2.3741, "mean_token_accuracy": 0.43103447556495667, "step": 105410 }, { "epoch": 0.10617510759527457, "grad_norm": 12.108726898952854, "learning_rate": 4.9612894182337805e-05, "loss": 2.5407, "mean_token_accuracy": 0.4310344815254211, "step": 105415 }, { "epoch": 0.10618014364837874, "grad_norm": 11.3990008502189, "learning_rate": 4.961282497035415e-05, "loss": 2.7171, "mean_token_accuracy": 0.3965517282485962, "step": 105420 }, { "epoch": 0.10618517970148292, "grad_norm": 9.414912725330982, "learning_rate": 4.961275575223741e-05, "loss": 2.3447, "mean_token_accuracy": 0.42068964838981626, "step": 105425 }, { "epoch": 0.10619021575458709, "grad_norm": 10.50253492514507, "learning_rate": 4.961268652798762e-05, "loss": 2.1962, "mean_token_accuracy": 0.4517241358757019, "step": 105430 }, { "epoch": 0.10619525180769127, "grad_norm": 10.619512878157053, "learning_rate": 4.9612617297604795e-05, "loss": 2.292, "mean_token_accuracy": 0.48620688915252686, "step": 105435 }, { "epoch": 0.10620028786079544, "grad_norm": 16.64499333926843, "learning_rate": 4.961254806108895e-05, "loss": 2.5417, "mean_token_accuracy": 0.3965517282485962, "step": 105440 }, { "epoch": 0.10620532391389961, "grad_norm": 11.782473629050147, "learning_rate": 4.9612478818440116e-05, "loss": 2.5476, "mean_token_accuracy": 0.3862068891525269, "step": 105445 }, { "epoch": 0.10621035996700379, "grad_norm": 9.963823262948056, "learning_rate": 4.96124095696583e-05, "loss": 2.2809, "mean_token_accuracy": 0.4413793087005615, "step": 105450 }, { "epoch": 0.10621539602010795, "grad_norm": 10.65502581556437, "learning_rate": 4.961234031474351e-05, "loss": 2.7316, "mean_token_accuracy": 0.38275861740112305, "step": 105455 }, { "epoch": 0.10622043207321212, "grad_norm": 9.713704383903654, "learning_rate": 4.96122710536958e-05, "loss": 2.3658, "mean_token_accuracy": 0.42413792610168455, "step": 105460 }, { "epoch": 0.1062254681263163, "grad_norm": 9.503679212412685, "learning_rate": 4.9612201786515156e-05, "loss": 2.0081, "mean_token_accuracy": 0.5189957678318023, "step": 105465 }, { "epoch": 0.10623050417942047, "grad_norm": 12.759896204469833, "learning_rate": 4.961213251320162e-05, "loss": 2.4544, "mean_token_accuracy": 0.4206896543502808, "step": 105470 }, { "epoch": 0.10623554023252464, "grad_norm": 8.492496292863187, "learning_rate": 4.96120632337552e-05, "loss": 2.222, "mean_token_accuracy": 0.4562807857990265, "step": 105475 }, { "epoch": 0.10624057628562882, "grad_norm": 10.393793841584083, "learning_rate": 4.9611993948175904e-05, "loss": 2.3483, "mean_token_accuracy": 0.45862069725990295, "step": 105480 }, { "epoch": 0.10624561233873299, "grad_norm": 10.0208717486039, "learning_rate": 4.961192465646378e-05, "loss": 2.3656, "mean_token_accuracy": 0.4241379201412201, "step": 105485 }, { "epoch": 0.10625064839183716, "grad_norm": 10.499095883016969, "learning_rate": 4.9611855358618824e-05, "loss": 2.4608, "mean_token_accuracy": 0.37931033968925476, "step": 105490 }, { "epoch": 0.10625568444494134, "grad_norm": 10.260554888578302, "learning_rate": 4.961178605464106e-05, "loss": 2.3661, "mean_token_accuracy": 0.41034482717514037, "step": 105495 }, { "epoch": 0.10626072049804551, "grad_norm": 8.381159521609332, "learning_rate": 4.961171674453051e-05, "loss": 2.4305, "mean_token_accuracy": 0.4103448331356049, "step": 105500 }, { "epoch": 0.10626575655114968, "grad_norm": 10.240063427387415, "learning_rate": 4.961164742828721e-05, "loss": 2.3052, "mean_token_accuracy": 0.44361767172813416, "step": 105505 }, { "epoch": 0.10627079260425386, "grad_norm": 14.493319553815224, "learning_rate": 4.961157810591114e-05, "loss": 2.4815, "mean_token_accuracy": 0.41034482717514037, "step": 105510 }, { "epoch": 0.10627582865735803, "grad_norm": 13.59889139638753, "learning_rate": 4.9611508777402346e-05, "loss": 2.5607, "mean_token_accuracy": 0.44827585816383364, "step": 105515 }, { "epoch": 0.1062808647104622, "grad_norm": 11.754689613278279, "learning_rate": 4.961143944276085e-05, "loss": 2.599, "mean_token_accuracy": 0.441379314661026, "step": 105520 }, { "epoch": 0.10628590076356637, "grad_norm": 10.770112403080116, "learning_rate": 4.961137010198666e-05, "loss": 2.1231, "mean_token_accuracy": 0.4586206912994385, "step": 105525 }, { "epoch": 0.10629093681667054, "grad_norm": 10.718579797369966, "learning_rate": 4.961130075507979e-05, "loss": 2.2288, "mean_token_accuracy": 0.47773745059967043, "step": 105530 }, { "epoch": 0.10629597286977471, "grad_norm": 10.329629135147139, "learning_rate": 4.961123140204028e-05, "loss": 2.3146, "mean_token_accuracy": 0.42068966031074523, "step": 105535 }, { "epoch": 0.10630100892287889, "grad_norm": 9.794321499993579, "learning_rate": 4.9611162042868134e-05, "loss": 2.1824, "mean_token_accuracy": 0.4172413766384125, "step": 105540 }, { "epoch": 0.10630604497598306, "grad_norm": 10.215088042946645, "learning_rate": 4.9611092677563374e-05, "loss": 2.0877, "mean_token_accuracy": 0.45517241954803467, "step": 105545 }, { "epoch": 0.10631108102908723, "grad_norm": 14.690943723624414, "learning_rate": 4.961102330612602e-05, "loss": 2.9215, "mean_token_accuracy": 0.3793103456497192, "step": 105550 }, { "epoch": 0.10631611708219141, "grad_norm": 10.91961207736115, "learning_rate": 4.9610953928556095e-05, "loss": 2.2084, "mean_token_accuracy": 0.441379314661026, "step": 105555 }, { "epoch": 0.10632115313529558, "grad_norm": 11.774315441355565, "learning_rate": 4.961088454485362e-05, "loss": 2.7718, "mean_token_accuracy": 0.4068965494632721, "step": 105560 }, { "epoch": 0.10632618918839976, "grad_norm": 12.845555990087119, "learning_rate": 4.9610815155018594e-05, "loss": 2.5161, "mean_token_accuracy": 0.4068965554237366, "step": 105565 }, { "epoch": 0.10633122524150393, "grad_norm": 10.969263196048981, "learning_rate": 4.961074575905106e-05, "loss": 2.2892, "mean_token_accuracy": 0.441379314661026, "step": 105570 }, { "epoch": 0.1063362612946081, "grad_norm": 12.889627573376501, "learning_rate": 4.961067635695102e-05, "loss": 2.522, "mean_token_accuracy": 0.40465819239616396, "step": 105575 }, { "epoch": 0.10634129734771228, "grad_norm": 10.391521625971489, "learning_rate": 4.9610606948718506e-05, "loss": 2.3197, "mean_token_accuracy": 0.4413793087005615, "step": 105580 }, { "epoch": 0.10634633340081645, "grad_norm": 9.615394528583272, "learning_rate": 4.961053753435354e-05, "loss": 2.1895, "mean_token_accuracy": 0.4517241299152374, "step": 105585 }, { "epoch": 0.10635136945392062, "grad_norm": 8.10619015543136, "learning_rate": 4.9610468113856124e-05, "loss": 2.2081, "mean_token_accuracy": 0.493719220161438, "step": 105590 }, { "epoch": 0.10635640550702478, "grad_norm": 10.511974420793832, "learning_rate": 4.961039868722629e-05, "loss": 2.6032, "mean_token_accuracy": 0.43103448748588563, "step": 105595 }, { "epoch": 0.10636144156012896, "grad_norm": 10.222124705448744, "learning_rate": 4.961032925446406e-05, "loss": 2.5101, "mean_token_accuracy": 0.43103448748588563, "step": 105600 }, { "epoch": 0.10636647761323313, "grad_norm": 23.42722016373519, "learning_rate": 4.961025981556943e-05, "loss": 2.6944, "mean_token_accuracy": 0.42068966031074523, "step": 105605 }, { "epoch": 0.1063715136663373, "grad_norm": 13.195504360518031, "learning_rate": 4.961019037054245e-05, "loss": 2.8729, "mean_token_accuracy": 0.3551724135875702, "step": 105610 }, { "epoch": 0.10637654971944148, "grad_norm": 11.924925565640642, "learning_rate": 4.961012091938313e-05, "loss": 2.296, "mean_token_accuracy": 0.4620689630508423, "step": 105615 }, { "epoch": 0.10638158577254565, "grad_norm": 11.562062057140933, "learning_rate": 4.9610051462091486e-05, "loss": 2.4037, "mean_token_accuracy": 0.43793103098869324, "step": 105620 }, { "epoch": 0.10638662182564983, "grad_norm": 13.330768663913016, "learning_rate": 4.9609981998667534e-05, "loss": 2.9719, "mean_token_accuracy": 0.358620685338974, "step": 105625 }, { "epoch": 0.106391657878754, "grad_norm": 11.510533390902443, "learning_rate": 4.9609912529111294e-05, "loss": 3.0845, "mean_token_accuracy": 0.324137932062149, "step": 105630 }, { "epoch": 0.10639669393185817, "grad_norm": 11.336166139199856, "learning_rate": 4.9609843053422786e-05, "loss": 2.2834, "mean_token_accuracy": 0.4448275864124298, "step": 105635 }, { "epoch": 0.10640172998496235, "grad_norm": 10.63952827321493, "learning_rate": 4.960977357160204e-05, "loss": 2.8423, "mean_token_accuracy": 0.39310343861579894, "step": 105640 }, { "epoch": 0.10640676603806652, "grad_norm": 10.936849103297401, "learning_rate": 4.960970408364904e-05, "loss": 2.3336, "mean_token_accuracy": 0.458620685338974, "step": 105645 }, { "epoch": 0.1064118020911707, "grad_norm": 10.556678027162631, "learning_rate": 4.960963458956386e-05, "loss": 2.4622, "mean_token_accuracy": 0.4103448212146759, "step": 105650 }, { "epoch": 0.10641683814427487, "grad_norm": 9.883236959531065, "learning_rate": 4.9609565089346476e-05, "loss": 2.1086, "mean_token_accuracy": 0.4206896543502808, "step": 105655 }, { "epoch": 0.10642187419737904, "grad_norm": 14.404082389049533, "learning_rate": 4.960949558299693e-05, "loss": 2.9326, "mean_token_accuracy": 0.3758620619773865, "step": 105660 }, { "epoch": 0.1064269102504832, "grad_norm": 10.168499193146074, "learning_rate": 4.9609426070515226e-05, "loss": 2.2846, "mean_token_accuracy": 0.42589232325553894, "step": 105665 }, { "epoch": 0.10643194630358738, "grad_norm": 10.892191226795626, "learning_rate": 4.960935655190139e-05, "loss": 2.5668, "mean_token_accuracy": 0.3965517282485962, "step": 105670 }, { "epoch": 0.10643698235669155, "grad_norm": 10.143857847713587, "learning_rate": 4.960928702715545e-05, "loss": 2.1427, "mean_token_accuracy": 0.4517241358757019, "step": 105675 }, { "epoch": 0.10644201840979572, "grad_norm": 10.899831503693116, "learning_rate": 4.96092174962774e-05, "loss": 2.4924, "mean_token_accuracy": 0.4172413766384125, "step": 105680 }, { "epoch": 0.1064470544628999, "grad_norm": 9.179687584947143, "learning_rate": 4.960914795926729e-05, "loss": 2.0653, "mean_token_accuracy": 0.4986690878868103, "step": 105685 }, { "epoch": 0.10645209051600407, "grad_norm": 11.28341117584674, "learning_rate": 4.960907841612512e-05, "loss": 2.4174, "mean_token_accuracy": 0.40689656138420105, "step": 105690 }, { "epoch": 0.10645712656910825, "grad_norm": 12.171949813679081, "learning_rate": 4.960900886685092e-05, "loss": 2.2726, "mean_token_accuracy": 0.4620689690113068, "step": 105695 }, { "epoch": 0.10646216262221242, "grad_norm": 14.024559736456316, "learning_rate": 4.9608939311444696e-05, "loss": 2.5324, "mean_token_accuracy": 0.3965517282485962, "step": 105700 }, { "epoch": 0.1064671986753166, "grad_norm": 10.475452493906293, "learning_rate": 4.960886974990648e-05, "loss": 2.8657, "mean_token_accuracy": 0.3896551728248596, "step": 105705 }, { "epoch": 0.10647223472842077, "grad_norm": 9.217486913995485, "learning_rate": 4.9608800182236284e-05, "loss": 2.2551, "mean_token_accuracy": 0.4517241418361664, "step": 105710 }, { "epoch": 0.10647727078152494, "grad_norm": 7.74224869736242, "learning_rate": 4.9608730608434134e-05, "loss": 1.949, "mean_token_accuracy": 0.5151477873325347, "step": 105715 }, { "epoch": 0.10648230683462911, "grad_norm": 14.009155484137054, "learning_rate": 4.960866102850004e-05, "loss": 2.7425, "mean_token_accuracy": 0.35862069129943847, "step": 105720 }, { "epoch": 0.10648734288773329, "grad_norm": 9.48476867867452, "learning_rate": 4.960859144243403e-05, "loss": 2.0394, "mean_token_accuracy": 0.49122806787490847, "step": 105725 }, { "epoch": 0.10649237894083746, "grad_norm": 11.010740838657517, "learning_rate": 4.960852185023612e-05, "loss": 2.2702, "mean_token_accuracy": 0.4586206912994385, "step": 105730 }, { "epoch": 0.10649741499394162, "grad_norm": 11.02368755871764, "learning_rate": 4.960845225190633e-05, "loss": 2.5832, "mean_token_accuracy": 0.40689654350280763, "step": 105735 }, { "epoch": 0.1065024510470458, "grad_norm": 10.056229824156834, "learning_rate": 4.960838264744467e-05, "loss": 2.2277, "mean_token_accuracy": 0.4379310369491577, "step": 105740 }, { "epoch": 0.10650748710014997, "grad_norm": 12.45825661268633, "learning_rate": 4.9608313036851184e-05, "loss": 2.9005, "mean_token_accuracy": 0.3793103456497192, "step": 105745 }, { "epoch": 0.10651252315325414, "grad_norm": 10.239135865581074, "learning_rate": 4.960824342012586e-05, "loss": 2.1918, "mean_token_accuracy": 0.47931033968925474, "step": 105750 }, { "epoch": 0.10651755920635832, "grad_norm": 10.642204973958417, "learning_rate": 4.9608173797268735e-05, "loss": 2.243, "mean_token_accuracy": 0.44827585816383364, "step": 105755 }, { "epoch": 0.10652259525946249, "grad_norm": 10.59360479051198, "learning_rate": 4.960810416827983e-05, "loss": 2.5923, "mean_token_accuracy": 0.4034482777118683, "step": 105760 }, { "epoch": 0.10652763131256666, "grad_norm": 8.951112576899517, "learning_rate": 4.960803453315915e-05, "loss": 2.0167, "mean_token_accuracy": 0.5076225101947784, "step": 105765 }, { "epoch": 0.10653266736567084, "grad_norm": 11.327368266147833, "learning_rate": 4.960796489190674e-05, "loss": 2.165, "mean_token_accuracy": 0.4713248610496521, "step": 105770 }, { "epoch": 0.10653770341877501, "grad_norm": 10.751595775869605, "learning_rate": 4.960789524452259e-05, "loss": 3.0152, "mean_token_accuracy": 0.38620689809322356, "step": 105775 }, { "epoch": 0.10654273947187919, "grad_norm": 10.134679720416521, "learning_rate": 4.960782559100674e-05, "loss": 2.5519, "mean_token_accuracy": 0.3896551728248596, "step": 105780 }, { "epoch": 0.10654777552498336, "grad_norm": 14.89436097122948, "learning_rate": 4.9607755931359204e-05, "loss": 2.7041, "mean_token_accuracy": 0.4154264897108078, "step": 105785 }, { "epoch": 0.10655281157808753, "grad_norm": 11.671515814058447, "learning_rate": 4.960768626557999e-05, "loss": 2.3587, "mean_token_accuracy": 0.4398669183254242, "step": 105790 }, { "epoch": 0.10655784763119171, "grad_norm": 9.428897918744239, "learning_rate": 4.960761659366913e-05, "loss": 2.2364, "mean_token_accuracy": 0.40344828367233276, "step": 105795 }, { "epoch": 0.10656288368429588, "grad_norm": 10.661186134068625, "learning_rate": 4.960754691562664e-05, "loss": 2.1223, "mean_token_accuracy": 0.4896551787853241, "step": 105800 }, { "epoch": 0.10656791973740004, "grad_norm": 15.771477211011103, "learning_rate": 4.9607477231452545e-05, "loss": 2.5723, "mean_token_accuracy": 0.4344827592372894, "step": 105805 }, { "epoch": 0.10657295579050421, "grad_norm": 9.904939115355846, "learning_rate": 4.9607407541146854e-05, "loss": 2.2659, "mean_token_accuracy": 0.4103448212146759, "step": 105810 }, { "epoch": 0.10657799184360839, "grad_norm": 12.325862312722297, "learning_rate": 4.960733784470959e-05, "loss": 2.3011, "mean_token_accuracy": 0.4103448212146759, "step": 105815 }, { "epoch": 0.10658302789671256, "grad_norm": 9.290105810914442, "learning_rate": 4.9607268142140776e-05, "loss": 2.3861, "mean_token_accuracy": 0.43103448748588563, "step": 105820 }, { "epoch": 0.10658806394981674, "grad_norm": 13.122035911356026, "learning_rate": 4.960719843344043e-05, "loss": 2.3222, "mean_token_accuracy": 0.42758620977401735, "step": 105825 }, { "epoch": 0.10659310000292091, "grad_norm": 10.803169377196573, "learning_rate": 4.960712871860857e-05, "loss": 2.2713, "mean_token_accuracy": 0.44827587008476255, "step": 105830 }, { "epoch": 0.10659813605602508, "grad_norm": 9.666909029526721, "learning_rate": 4.960705899764522e-05, "loss": 2.2915, "mean_token_accuracy": 0.3965517163276672, "step": 105835 }, { "epoch": 0.10660317210912926, "grad_norm": 10.077490101975984, "learning_rate": 4.960698927055038e-05, "loss": 2.4332, "mean_token_accuracy": 0.41034482717514037, "step": 105840 }, { "epoch": 0.10660820816223343, "grad_norm": 9.293324603500935, "learning_rate": 4.960691953732409e-05, "loss": 2.2629, "mean_token_accuracy": 0.4310344815254211, "step": 105845 }, { "epoch": 0.1066132442153376, "grad_norm": 9.346917581556548, "learning_rate": 4.960684979796637e-05, "loss": 2.3457, "mean_token_accuracy": 0.44827585816383364, "step": 105850 }, { "epoch": 0.10661828026844178, "grad_norm": 10.554066058941908, "learning_rate": 4.960678005247723e-05, "loss": 2.4649, "mean_token_accuracy": 0.3827586233615875, "step": 105855 }, { "epoch": 0.10662331632154595, "grad_norm": 10.08565646644747, "learning_rate": 4.960671030085669e-05, "loss": 2.2852, "mean_token_accuracy": 0.3999999940395355, "step": 105860 }, { "epoch": 0.10662835237465013, "grad_norm": 10.991491638556516, "learning_rate": 4.960664054310477e-05, "loss": 2.3441, "mean_token_accuracy": 0.4448275864124298, "step": 105865 }, { "epoch": 0.1066333884277543, "grad_norm": 12.157366812144062, "learning_rate": 4.9606570779221494e-05, "loss": 2.1713, "mean_token_accuracy": 0.45674530863761903, "step": 105870 }, { "epoch": 0.10663842448085846, "grad_norm": 10.160379790920366, "learning_rate": 4.960650100920688e-05, "loss": 2.0676, "mean_token_accuracy": 0.49999998807907103, "step": 105875 }, { "epoch": 0.10664346053396263, "grad_norm": 10.594469968303201, "learning_rate": 4.960643123306094e-05, "loss": 2.8621, "mean_token_accuracy": 0.37241379618644715, "step": 105880 }, { "epoch": 0.10664849658706681, "grad_norm": 10.492713765073432, "learning_rate": 4.96063614507837e-05, "loss": 2.5596, "mean_token_accuracy": 0.43103448748588563, "step": 105885 }, { "epoch": 0.10665353264017098, "grad_norm": 11.07504603392353, "learning_rate": 4.960629166237518e-05, "loss": 2.1858, "mean_token_accuracy": 0.4642468154430389, "step": 105890 }, { "epoch": 0.10665856869327515, "grad_norm": 10.428499407613012, "learning_rate": 4.9606221867835396e-05, "loss": 2.5399, "mean_token_accuracy": 0.41379310488700866, "step": 105895 }, { "epoch": 0.10666360474637933, "grad_norm": 13.77881257173632, "learning_rate": 4.960615206716437e-05, "loss": 2.5363, "mean_token_accuracy": 0.40852994918823243, "step": 105900 }, { "epoch": 0.1066686407994835, "grad_norm": 10.345392802571451, "learning_rate": 4.9606082260362116e-05, "loss": 2.5579, "mean_token_accuracy": 0.4206896543502808, "step": 105905 }, { "epoch": 0.10667367685258768, "grad_norm": 13.557675106105078, "learning_rate": 4.9606012447428666e-05, "loss": 2.2094, "mean_token_accuracy": 0.4310344815254211, "step": 105910 }, { "epoch": 0.10667871290569185, "grad_norm": 14.314976740800002, "learning_rate": 4.9605942628364027e-05, "loss": 2.5085, "mean_token_accuracy": 0.37586206793785093, "step": 105915 }, { "epoch": 0.10668374895879602, "grad_norm": 11.8181993652702, "learning_rate": 4.9605872803168224e-05, "loss": 2.2983, "mean_token_accuracy": 0.4655172348022461, "step": 105920 }, { "epoch": 0.1066887850119002, "grad_norm": 11.343552154104566, "learning_rate": 4.960580297184127e-05, "loss": 3.028, "mean_token_accuracy": 0.337931028008461, "step": 105925 }, { "epoch": 0.10669382106500437, "grad_norm": 12.292892553855758, "learning_rate": 4.9605733134383195e-05, "loss": 3.0418, "mean_token_accuracy": 0.3774954617023468, "step": 105930 }, { "epoch": 0.10669885711810854, "grad_norm": 12.814627635984294, "learning_rate": 4.960566329079401e-05, "loss": 2.5443, "mean_token_accuracy": 0.4344827592372894, "step": 105935 }, { "epoch": 0.10670389317121272, "grad_norm": 8.75248970029647, "learning_rate": 4.960559344107373e-05, "loss": 2.3261, "mean_token_accuracy": 0.4206896543502808, "step": 105940 }, { "epoch": 0.10670892922431688, "grad_norm": 11.417607596483748, "learning_rate": 4.960552358522239e-05, "loss": 2.0592, "mean_token_accuracy": 0.47931034564971925, "step": 105945 }, { "epoch": 0.10671396527742105, "grad_norm": 10.111870792220772, "learning_rate": 4.960545372324e-05, "loss": 2.5146, "mean_token_accuracy": 0.358620685338974, "step": 105950 }, { "epoch": 0.10671900133052523, "grad_norm": 9.850160258794245, "learning_rate": 4.9605383855126575e-05, "loss": 2.3184, "mean_token_accuracy": 0.412522679567337, "step": 105955 }, { "epoch": 0.1067240373836294, "grad_norm": 17.821630861894306, "learning_rate": 4.9605313980882144e-05, "loss": 3.4998, "mean_token_accuracy": 0.334482753276825, "step": 105960 }, { "epoch": 0.10672907343673357, "grad_norm": 11.355975336235863, "learning_rate": 4.960524410050672e-05, "loss": 2.3843, "mean_token_accuracy": 0.42068964838981626, "step": 105965 }, { "epoch": 0.10673410948983775, "grad_norm": 11.183338936268772, "learning_rate": 4.960517421400032e-05, "loss": 3.2897, "mean_token_accuracy": 0.31379310190677645, "step": 105970 }, { "epoch": 0.10673914554294192, "grad_norm": 10.3078057862167, "learning_rate": 4.960510432136298e-05, "loss": 2.2432, "mean_token_accuracy": 0.4551724135875702, "step": 105975 }, { "epoch": 0.1067441815960461, "grad_norm": 10.843678159346208, "learning_rate": 4.9605034422594695e-05, "loss": 2.3662, "mean_token_accuracy": 0.4034482777118683, "step": 105980 }, { "epoch": 0.10674921764915027, "grad_norm": 9.871976758010373, "learning_rate": 4.96049645176955e-05, "loss": 2.4233, "mean_token_accuracy": 0.4641863226890564, "step": 105985 }, { "epoch": 0.10675425370225444, "grad_norm": 11.668965947357728, "learning_rate": 4.960489460666541e-05, "loss": 2.1161, "mean_token_accuracy": 0.47241379618644713, "step": 105990 }, { "epoch": 0.10675928975535862, "grad_norm": 9.717714476370647, "learning_rate": 4.9604824689504445e-05, "loss": 2.5415, "mean_token_accuracy": 0.4034482777118683, "step": 105995 }, { "epoch": 0.10676432580846279, "grad_norm": 10.017032205451983, "learning_rate": 4.9604754766212625e-05, "loss": 2.6212, "mean_token_accuracy": 0.3931034505367279, "step": 106000 }, { "epoch": 0.10676936186156696, "grad_norm": 11.555550626982852, "learning_rate": 4.960468483678996e-05, "loss": 2.1882, "mean_token_accuracy": 0.4586206912994385, "step": 106005 }, { "epoch": 0.10677439791467114, "grad_norm": 9.750141990981302, "learning_rate": 4.96046149012365e-05, "loss": 2.3654, "mean_token_accuracy": 0.42758620381355283, "step": 106010 }, { "epoch": 0.1067794339677753, "grad_norm": 9.966254099722603, "learning_rate": 4.960454495955222e-05, "loss": 2.2491, "mean_token_accuracy": 0.44137929677963256, "step": 106015 }, { "epoch": 0.10678447002087947, "grad_norm": 9.19069934302603, "learning_rate": 4.9604475011737174e-05, "loss": 2.2026, "mean_token_accuracy": 0.4379310369491577, "step": 106020 }, { "epoch": 0.10678950607398364, "grad_norm": 8.92999780093296, "learning_rate": 4.9604405057791367e-05, "loss": 2.0855, "mean_token_accuracy": 0.49655171632766726, "step": 106025 }, { "epoch": 0.10679454212708782, "grad_norm": 11.594527455518849, "learning_rate": 4.9604335097714816e-05, "loss": 2.2019, "mean_token_accuracy": 0.4344827651977539, "step": 106030 }, { "epoch": 0.10679957818019199, "grad_norm": 11.231909746995255, "learning_rate": 4.960426513150756e-05, "loss": 2.4957, "mean_token_accuracy": 0.42758620977401735, "step": 106035 }, { "epoch": 0.10680461423329617, "grad_norm": 10.60092254365431, "learning_rate": 4.960419515916958e-05, "loss": 2.2344, "mean_token_accuracy": 0.4586206912994385, "step": 106040 }, { "epoch": 0.10680965028640034, "grad_norm": 10.89283291457191, "learning_rate": 4.960412518070093e-05, "loss": 2.2898, "mean_token_accuracy": 0.42068964838981626, "step": 106045 }, { "epoch": 0.10681468633950451, "grad_norm": 11.15762957271298, "learning_rate": 4.9604055196101624e-05, "loss": 2.7114, "mean_token_accuracy": 0.4157894730567932, "step": 106050 }, { "epoch": 0.10681972239260869, "grad_norm": 8.172697546037096, "learning_rate": 4.960398520537167e-05, "loss": 2.366, "mean_token_accuracy": 0.44682395458221436, "step": 106055 }, { "epoch": 0.10682475844571286, "grad_norm": 11.559083532195913, "learning_rate": 4.96039152085111e-05, "loss": 2.3883, "mean_token_accuracy": 0.4620689690113068, "step": 106060 }, { "epoch": 0.10682979449881704, "grad_norm": 10.810911381803772, "learning_rate": 4.9603845205519925e-05, "loss": 2.2879, "mean_token_accuracy": 0.39655172526836396, "step": 106065 }, { "epoch": 0.10683483055192121, "grad_norm": 13.772707847197502, "learning_rate": 4.960377519639816e-05, "loss": 2.7858, "mean_token_accuracy": 0.38620689809322356, "step": 106070 }, { "epoch": 0.10683986660502538, "grad_norm": 16.89154582576504, "learning_rate": 4.960370518114584e-05, "loss": 3.0403, "mean_token_accuracy": 0.4034482777118683, "step": 106075 }, { "epoch": 0.10684490265812956, "grad_norm": 12.273779166174588, "learning_rate": 4.960363515976296e-05, "loss": 2.5523, "mean_token_accuracy": 0.3814881980419159, "step": 106080 }, { "epoch": 0.10684993871123372, "grad_norm": 10.545258833736206, "learning_rate": 4.960356513224957e-05, "loss": 1.9563, "mean_token_accuracy": 0.506896561384201, "step": 106085 }, { "epoch": 0.10685497476433789, "grad_norm": 9.494618917363432, "learning_rate": 4.960349509860566e-05, "loss": 2.4414, "mean_token_accuracy": 0.4241379380226135, "step": 106090 }, { "epoch": 0.10686001081744206, "grad_norm": 9.699515332554189, "learning_rate": 4.960342505883127e-05, "loss": 2.651, "mean_token_accuracy": 0.3689655065536499, "step": 106095 }, { "epoch": 0.10686504687054624, "grad_norm": 9.225856898683219, "learning_rate": 4.960335501292642e-05, "loss": 2.2099, "mean_token_accuracy": 0.41379310488700866, "step": 106100 }, { "epoch": 0.10687008292365041, "grad_norm": 13.655133750090595, "learning_rate": 4.960328496089111e-05, "loss": 2.7623, "mean_token_accuracy": 0.4034482717514038, "step": 106105 }, { "epoch": 0.10687511897675459, "grad_norm": 10.576333421835402, "learning_rate": 4.9603214902725375e-05, "loss": 2.4774, "mean_token_accuracy": 0.4114531993865967, "step": 106110 }, { "epoch": 0.10688015502985876, "grad_norm": 9.96006598630805, "learning_rate": 4.9603144838429234e-05, "loss": 2.349, "mean_token_accuracy": 0.3965517163276672, "step": 106115 }, { "epoch": 0.10688519108296293, "grad_norm": 10.53316904756797, "learning_rate": 4.960307476800271e-05, "loss": 2.3149, "mean_token_accuracy": 0.44694494009017943, "step": 106120 }, { "epoch": 0.1068902271360671, "grad_norm": 10.250091928136863, "learning_rate": 4.9603004691445806e-05, "loss": 2.1367, "mean_token_accuracy": 0.4813672065734863, "step": 106125 }, { "epoch": 0.10689526318917128, "grad_norm": 9.788375258606946, "learning_rate": 4.960293460875855e-05, "loss": 2.1996, "mean_token_accuracy": 0.4896551787853241, "step": 106130 }, { "epoch": 0.10690029924227545, "grad_norm": 11.068595653552201, "learning_rate": 4.960286451994096e-05, "loss": 2.4378, "mean_token_accuracy": 0.4137930989265442, "step": 106135 }, { "epoch": 0.10690533529537963, "grad_norm": 11.564967844533834, "learning_rate": 4.960279442499306e-05, "loss": 2.2954, "mean_token_accuracy": 0.42413792610168455, "step": 106140 }, { "epoch": 0.1069103713484838, "grad_norm": 10.215552343545129, "learning_rate": 4.960272432391487e-05, "loss": 2.0954, "mean_token_accuracy": 0.46551724672317507, "step": 106145 }, { "epoch": 0.10691540740158798, "grad_norm": 18.567800987719952, "learning_rate": 4.960265421670641e-05, "loss": 2.4066, "mean_token_accuracy": 0.4222625494003296, "step": 106150 }, { "epoch": 0.10692044345469214, "grad_norm": 12.28733449130241, "learning_rate": 4.9602584103367694e-05, "loss": 1.969, "mean_token_accuracy": 0.4620689630508423, "step": 106155 }, { "epoch": 0.10692547950779631, "grad_norm": 10.080060803221418, "learning_rate": 4.960251398389875e-05, "loss": 2.1451, "mean_token_accuracy": 0.4758620738983154, "step": 106160 }, { "epoch": 0.10693051556090048, "grad_norm": 12.6990985086516, "learning_rate": 4.960244385829958e-05, "loss": 2.4214, "mean_token_accuracy": 0.43448275327682495, "step": 106165 }, { "epoch": 0.10693555161400466, "grad_norm": 9.355155322848296, "learning_rate": 4.960237372657022e-05, "loss": 2.4064, "mean_token_accuracy": 0.4689655125141144, "step": 106170 }, { "epoch": 0.10694058766710883, "grad_norm": 12.179542378794698, "learning_rate": 4.9602303588710686e-05, "loss": 3.1062, "mean_token_accuracy": 0.3620689630508423, "step": 106175 }, { "epoch": 0.106945623720213, "grad_norm": 11.682613518361826, "learning_rate": 4.960223344472099e-05, "loss": 2.4927, "mean_token_accuracy": 0.3999999940395355, "step": 106180 }, { "epoch": 0.10695065977331718, "grad_norm": 11.92787282939523, "learning_rate": 4.960216329460116e-05, "loss": 2.6219, "mean_token_accuracy": 0.3655172407627106, "step": 106185 }, { "epoch": 0.10695569582642135, "grad_norm": 8.813097699009763, "learning_rate": 4.9602093138351216e-05, "loss": 2.547, "mean_token_accuracy": 0.3827586084604263, "step": 106190 }, { "epoch": 0.10696073187952553, "grad_norm": 9.720737737116847, "learning_rate": 4.9602022975971166e-05, "loss": 2.6435, "mean_token_accuracy": 0.3827586233615875, "step": 106195 }, { "epoch": 0.1069657679326297, "grad_norm": 9.203666714240121, "learning_rate": 4.9601952807461044e-05, "loss": 2.347, "mean_token_accuracy": 0.3896551728248596, "step": 106200 }, { "epoch": 0.10697080398573387, "grad_norm": 10.551161485115642, "learning_rate": 4.960188263282087e-05, "loss": 2.3377, "mean_token_accuracy": 0.4620689570903778, "step": 106205 }, { "epoch": 0.10697584003883805, "grad_norm": 12.18837055252694, "learning_rate": 4.9601812452050645e-05, "loss": 2.2996, "mean_token_accuracy": 0.42068966031074523, "step": 106210 }, { "epoch": 0.10698087609194222, "grad_norm": 10.647184005148105, "learning_rate": 4.96017422651504e-05, "loss": 2.3411, "mean_token_accuracy": 0.4206896543502808, "step": 106215 }, { "epoch": 0.1069859121450464, "grad_norm": 10.591931149445683, "learning_rate": 4.960167207212016e-05, "loss": 2.0449, "mean_token_accuracy": 0.4915305435657501, "step": 106220 }, { "epoch": 0.10699094819815055, "grad_norm": 12.62236676550205, "learning_rate": 4.9601601872959934e-05, "loss": 2.0983, "mean_token_accuracy": 0.4689655125141144, "step": 106225 }, { "epoch": 0.10699598425125473, "grad_norm": 9.151261576963176, "learning_rate": 4.960153166766975e-05, "loss": 2.6565, "mean_token_accuracy": 0.36896551847457887, "step": 106230 }, { "epoch": 0.1070010203043589, "grad_norm": 11.335415882908068, "learning_rate": 4.960146145624962e-05, "loss": 2.5267, "mean_token_accuracy": 0.41034482717514037, "step": 106235 }, { "epoch": 0.10700605635746308, "grad_norm": 12.610846103990081, "learning_rate": 4.9601391238699574e-05, "loss": 2.7983, "mean_token_accuracy": 0.3724137842655182, "step": 106240 }, { "epoch": 0.10701109241056725, "grad_norm": 9.512504353904053, "learning_rate": 4.9601321015019615e-05, "loss": 2.5021, "mean_token_accuracy": 0.42758620381355283, "step": 106245 }, { "epoch": 0.10701612846367142, "grad_norm": 11.262034357659095, "learning_rate": 4.960125078520977e-05, "loss": 2.2957, "mean_token_accuracy": 0.4379310369491577, "step": 106250 }, { "epoch": 0.1070211645167756, "grad_norm": 10.281610013330932, "learning_rate": 4.9601180549270076e-05, "loss": 2.4475, "mean_token_accuracy": 0.47586206793785096, "step": 106255 }, { "epoch": 0.10702620056987977, "grad_norm": 10.632703885607167, "learning_rate": 4.960111030720053e-05, "loss": 2.1376, "mean_token_accuracy": 0.4954023063182831, "step": 106260 }, { "epoch": 0.10703123662298394, "grad_norm": 13.97067586289822, "learning_rate": 4.9601040059001155e-05, "loss": 2.4335, "mean_token_accuracy": 0.4448275864124298, "step": 106265 }, { "epoch": 0.10703627267608812, "grad_norm": 12.21876480455352, "learning_rate": 4.960096980467198e-05, "loss": 2.7622, "mean_token_accuracy": 0.3999999940395355, "step": 106270 }, { "epoch": 0.10704130872919229, "grad_norm": 11.958643687740581, "learning_rate": 4.9600899544213016e-05, "loss": 2.4066, "mean_token_accuracy": 0.4502117395401001, "step": 106275 }, { "epoch": 0.10704634478229647, "grad_norm": 12.505872551999165, "learning_rate": 4.960082927762428e-05, "loss": 2.4707, "mean_token_accuracy": 0.4068965554237366, "step": 106280 }, { "epoch": 0.10705138083540064, "grad_norm": 12.86858903315689, "learning_rate": 4.960075900490581e-05, "loss": 2.5142, "mean_token_accuracy": 0.41034482717514037, "step": 106285 }, { "epoch": 0.1070564168885048, "grad_norm": 9.864573964343156, "learning_rate": 4.9600688726057596e-05, "loss": 2.6396, "mean_token_accuracy": 0.4103448212146759, "step": 106290 }, { "epoch": 0.10706145294160897, "grad_norm": 11.333264249833642, "learning_rate": 4.960061844107968e-05, "loss": 2.484, "mean_token_accuracy": 0.38275861740112305, "step": 106295 }, { "epoch": 0.10706648899471315, "grad_norm": 11.846136773835408, "learning_rate": 4.960054814997207e-05, "loss": 2.6541, "mean_token_accuracy": 0.3827586233615875, "step": 106300 }, { "epoch": 0.10707152504781732, "grad_norm": 10.375181675040198, "learning_rate": 4.96004778527348e-05, "loss": 2.45, "mean_token_accuracy": 0.45710828304290774, "step": 106305 }, { "epoch": 0.1070765611009215, "grad_norm": 12.30809107495536, "learning_rate": 4.9600407549367876e-05, "loss": 2.8306, "mean_token_accuracy": 0.37586207389831544, "step": 106310 }, { "epoch": 0.10708159715402567, "grad_norm": 8.901571025778285, "learning_rate": 4.960033723987132e-05, "loss": 2.6224, "mean_token_accuracy": 0.36206896007061007, "step": 106315 }, { "epoch": 0.10708663320712984, "grad_norm": 11.254247772851109, "learning_rate": 4.960026692424516e-05, "loss": 2.3727, "mean_token_accuracy": 0.4103448331356049, "step": 106320 }, { "epoch": 0.10709166926023402, "grad_norm": 10.602474859357526, "learning_rate": 4.9600196602489395e-05, "loss": 2.4554, "mean_token_accuracy": 0.3827586233615875, "step": 106325 }, { "epoch": 0.10709670531333819, "grad_norm": 13.974560603666632, "learning_rate": 4.9600126274604074e-05, "loss": 2.4257, "mean_token_accuracy": 0.44482758045196535, "step": 106330 }, { "epoch": 0.10710174136644236, "grad_norm": 8.092058920024067, "learning_rate": 4.960005594058919e-05, "loss": 2.0517, "mean_token_accuracy": 0.5137931048870087, "step": 106335 }, { "epoch": 0.10710677741954654, "grad_norm": 9.468858471077091, "learning_rate": 4.9599985600444775e-05, "loss": 2.3183, "mean_token_accuracy": 0.4931034505367279, "step": 106340 }, { "epoch": 0.10711181347265071, "grad_norm": 6.812811816305926, "learning_rate": 4.959991525417085e-05, "loss": 2.2093, "mean_token_accuracy": 0.4701567471027374, "step": 106345 }, { "epoch": 0.10711684952575488, "grad_norm": 9.989062824667348, "learning_rate": 4.959984490176742e-05, "loss": 2.5022, "mean_token_accuracy": 0.41379311084747317, "step": 106350 }, { "epoch": 0.10712188557885906, "grad_norm": 9.576582406198384, "learning_rate": 4.959977454323453e-05, "loss": 2.5167, "mean_token_accuracy": 0.4275861978530884, "step": 106355 }, { "epoch": 0.10712692163196322, "grad_norm": 10.156951963421292, "learning_rate": 4.9599704178572173e-05, "loss": 2.148, "mean_token_accuracy": 0.47241378426551817, "step": 106360 }, { "epoch": 0.10713195768506739, "grad_norm": 15.215475566162258, "learning_rate": 4.959963380778039e-05, "loss": 2.8452, "mean_token_accuracy": 0.38275861740112305, "step": 106365 }, { "epoch": 0.10713699373817157, "grad_norm": 12.155713386710275, "learning_rate": 4.959956343085919e-05, "loss": 2.6524, "mean_token_accuracy": 0.39122806787490844, "step": 106370 }, { "epoch": 0.10714202979127574, "grad_norm": 10.922229977426966, "learning_rate": 4.959949304780859e-05, "loss": 2.5111, "mean_token_accuracy": 0.4034482777118683, "step": 106375 }, { "epoch": 0.10714706584437991, "grad_norm": 12.981070557993153, "learning_rate": 4.959942265862861e-05, "loss": 2.4031, "mean_token_accuracy": 0.4172413766384125, "step": 106380 }, { "epoch": 0.10715210189748409, "grad_norm": 13.378562793767566, "learning_rate": 4.959935226331928e-05, "loss": 2.3787, "mean_token_accuracy": 0.3931034505367279, "step": 106385 }, { "epoch": 0.10715713795058826, "grad_norm": 9.97687030117511, "learning_rate": 4.959928186188061e-05, "loss": 2.5343, "mean_token_accuracy": 0.4, "step": 106390 }, { "epoch": 0.10716217400369243, "grad_norm": 9.040406108841525, "learning_rate": 4.959921145431262e-05, "loss": 2.1169, "mean_token_accuracy": 0.4965517222881317, "step": 106395 }, { "epoch": 0.10716721005679661, "grad_norm": 10.305001881807378, "learning_rate": 4.9599141040615335e-05, "loss": 2.5092, "mean_token_accuracy": 0.4673926174640656, "step": 106400 }, { "epoch": 0.10717224610990078, "grad_norm": 9.437873331732726, "learning_rate": 4.959907062078877e-05, "loss": 2.2606, "mean_token_accuracy": 0.44827587008476255, "step": 106405 }, { "epoch": 0.10717728216300496, "grad_norm": 11.109560962306661, "learning_rate": 4.9599000194832944e-05, "loss": 2.4531, "mean_token_accuracy": 0.39310344457626345, "step": 106410 }, { "epoch": 0.10718231821610913, "grad_norm": 9.300760891524575, "learning_rate": 4.959892976274788e-05, "loss": 2.463, "mean_token_accuracy": 0.43448275327682495, "step": 106415 }, { "epoch": 0.1071873542692133, "grad_norm": 10.85006747763822, "learning_rate": 4.9598859324533584e-05, "loss": 2.4694, "mean_token_accuracy": 0.4430732011795044, "step": 106420 }, { "epoch": 0.10719239032231748, "grad_norm": 12.325406028623624, "learning_rate": 4.9598788880190104e-05, "loss": 2.8229, "mean_token_accuracy": 0.42758620381355283, "step": 106425 }, { "epoch": 0.10719742637542164, "grad_norm": 10.177527305236834, "learning_rate": 4.9598718429717425e-05, "loss": 2.1901, "mean_token_accuracy": 0.4572897732257843, "step": 106430 }, { "epoch": 0.10720246242852581, "grad_norm": 9.957698541120218, "learning_rate": 4.95986479731156e-05, "loss": 2.47, "mean_token_accuracy": 0.4206896543502808, "step": 106435 }, { "epoch": 0.10720749848162998, "grad_norm": 13.722865477115791, "learning_rate": 4.959857751038462e-05, "loss": 2.8377, "mean_token_accuracy": 0.4068965494632721, "step": 106440 }, { "epoch": 0.10721253453473416, "grad_norm": 10.862217325426737, "learning_rate": 4.9598507041524525e-05, "loss": 2.4196, "mean_token_accuracy": 0.4103448331356049, "step": 106445 }, { "epoch": 0.10721757058783833, "grad_norm": 10.783111750898163, "learning_rate": 4.9598436566535325e-05, "loss": 2.0976, "mean_token_accuracy": 0.5156684756278992, "step": 106450 }, { "epoch": 0.1072226066409425, "grad_norm": 9.586846385596319, "learning_rate": 4.9598366085417035e-05, "loss": 2.2582, "mean_token_accuracy": 0.4965517222881317, "step": 106455 }, { "epoch": 0.10722764269404668, "grad_norm": 10.344148331896527, "learning_rate": 4.959829559816969e-05, "loss": 2.387, "mean_token_accuracy": 0.4068965554237366, "step": 106460 }, { "epoch": 0.10723267874715085, "grad_norm": 12.102759267611043, "learning_rate": 4.95982251047933e-05, "loss": 2.648, "mean_token_accuracy": 0.36896551847457887, "step": 106465 }, { "epoch": 0.10723771480025503, "grad_norm": 16.544150644709934, "learning_rate": 4.959815460528788e-05, "loss": 2.5696, "mean_token_accuracy": 0.4551724135875702, "step": 106470 }, { "epoch": 0.1072427508533592, "grad_norm": 11.737588974468213, "learning_rate": 4.959808409965346e-05, "loss": 2.2859, "mean_token_accuracy": 0.41034482717514037, "step": 106475 }, { "epoch": 0.10724778690646337, "grad_norm": 9.432847999561684, "learning_rate": 4.9598013587890045e-05, "loss": 2.8732, "mean_token_accuracy": 0.3862069010734558, "step": 106480 }, { "epoch": 0.10725282295956755, "grad_norm": 10.15694550816408, "learning_rate": 4.959794306999767e-05, "loss": 2.4563, "mean_token_accuracy": 0.42068966031074523, "step": 106485 }, { "epoch": 0.10725785901267172, "grad_norm": 12.170937370608891, "learning_rate": 4.959787254597634e-05, "loss": 2.2018, "mean_token_accuracy": 0.43448275327682495, "step": 106490 }, { "epoch": 0.1072628950657759, "grad_norm": 12.104171816278498, "learning_rate": 4.959780201582609e-05, "loss": 2.9951, "mean_token_accuracy": 0.39310345649719236, "step": 106495 }, { "epoch": 0.10726793111888006, "grad_norm": 8.904005847271538, "learning_rate": 4.959773147954693e-05, "loss": 2.0593, "mean_token_accuracy": 0.5137931108474731, "step": 106500 }, { "epoch": 0.10727296717198423, "grad_norm": 10.590525083425986, "learning_rate": 4.959766093713889e-05, "loss": 2.2341, "mean_token_accuracy": 0.4137930989265442, "step": 106505 }, { "epoch": 0.1072780032250884, "grad_norm": 10.64108891856517, "learning_rate": 4.959759038860197e-05, "loss": 2.5243, "mean_token_accuracy": 0.4172413766384125, "step": 106510 }, { "epoch": 0.10728303927819258, "grad_norm": 13.137321110980233, "learning_rate": 4.9597519833936205e-05, "loss": 2.1885, "mean_token_accuracy": 0.441379314661026, "step": 106515 }, { "epoch": 0.10728807533129675, "grad_norm": 10.62191902858775, "learning_rate": 4.959744927314161e-05, "loss": 2.5678, "mean_token_accuracy": 0.3793103456497192, "step": 106520 }, { "epoch": 0.10729311138440092, "grad_norm": 12.031802667360553, "learning_rate": 4.9597378706218206e-05, "loss": 2.4642, "mean_token_accuracy": 0.3793103456497192, "step": 106525 }, { "epoch": 0.1072981474375051, "grad_norm": 10.344008858056721, "learning_rate": 4.9597308133166016e-05, "loss": 2.6041, "mean_token_accuracy": 0.38965516686439516, "step": 106530 }, { "epoch": 0.10730318349060927, "grad_norm": 23.621930592021844, "learning_rate": 4.959723755398504e-05, "loss": 2.6777, "mean_token_accuracy": 0.34137930274009703, "step": 106535 }, { "epoch": 0.10730821954371345, "grad_norm": 8.125423020146227, "learning_rate": 4.959716696867532e-05, "loss": 2.3515, "mean_token_accuracy": 0.43448275327682495, "step": 106540 }, { "epoch": 0.10731325559681762, "grad_norm": 7.905561744551934, "learning_rate": 4.959709637723688e-05, "loss": 2.6121, "mean_token_accuracy": 0.42413792610168455, "step": 106545 }, { "epoch": 0.1073182916499218, "grad_norm": 10.076402592319168, "learning_rate": 4.959702577966972e-05, "loss": 2.7255, "mean_token_accuracy": 0.33103448152542114, "step": 106550 }, { "epoch": 0.10732332770302597, "grad_norm": 12.837750551964897, "learning_rate": 4.959695517597387e-05, "loss": 2.2592, "mean_token_accuracy": 0.45517241954803467, "step": 106555 }, { "epoch": 0.10732836375613014, "grad_norm": 13.091453100239086, "learning_rate": 4.959688456614934e-05, "loss": 2.4559, "mean_token_accuracy": 0.382758629322052, "step": 106560 }, { "epoch": 0.10733339980923431, "grad_norm": 12.032691621125032, "learning_rate": 4.959681395019616e-05, "loss": 2.7772, "mean_token_accuracy": 0.3137931048870087, "step": 106565 }, { "epoch": 0.10733843586233847, "grad_norm": 13.330493760758372, "learning_rate": 4.959674332811434e-05, "loss": 2.609, "mean_token_accuracy": 0.4533575356006622, "step": 106570 }, { "epoch": 0.10734347191544265, "grad_norm": 13.666181170991402, "learning_rate": 4.959667269990391e-05, "loss": 2.4683, "mean_token_accuracy": 0.4344827651977539, "step": 106575 }, { "epoch": 0.10734850796854682, "grad_norm": 10.696599630127945, "learning_rate": 4.9596602065564895e-05, "loss": 2.3656, "mean_token_accuracy": 0.4551724076271057, "step": 106580 }, { "epoch": 0.107353544021651, "grad_norm": 11.217610448709276, "learning_rate": 4.9596531425097305e-05, "loss": 2.3688, "mean_token_accuracy": 0.4034482777118683, "step": 106585 }, { "epoch": 0.10735858007475517, "grad_norm": 9.85562931862014, "learning_rate": 4.959646077850115e-05, "loss": 2.2313, "mean_token_accuracy": 0.4599753737449646, "step": 106590 }, { "epoch": 0.10736361612785934, "grad_norm": 12.895972277728696, "learning_rate": 4.959639012577646e-05, "loss": 2.4179, "mean_token_accuracy": 0.4413793087005615, "step": 106595 }, { "epoch": 0.10736865218096352, "grad_norm": 10.924053609702735, "learning_rate": 4.959631946692325e-05, "loss": 2.3044, "mean_token_accuracy": 0.4172413766384125, "step": 106600 }, { "epoch": 0.10737368823406769, "grad_norm": 10.733186332823689, "learning_rate": 4.959624880194155e-05, "loss": 2.3178, "mean_token_accuracy": 0.4034482717514038, "step": 106605 }, { "epoch": 0.10737872428717186, "grad_norm": 13.530467681064485, "learning_rate": 4.9596178130831365e-05, "loss": 2.3154, "mean_token_accuracy": 0.45650331377983094, "step": 106610 }, { "epoch": 0.10738376034027604, "grad_norm": 14.403014485381881, "learning_rate": 4.959610745359274e-05, "loss": 2.4041, "mean_token_accuracy": 0.46551724672317507, "step": 106615 }, { "epoch": 0.10738879639338021, "grad_norm": 10.393808097004916, "learning_rate": 4.959603677022566e-05, "loss": 2.4378, "mean_token_accuracy": 0.4379310429096222, "step": 106620 }, { "epoch": 0.10739383244648439, "grad_norm": 12.35202844295194, "learning_rate": 4.959596608073017e-05, "loss": 2.2705, "mean_token_accuracy": 0.4310344815254211, "step": 106625 }, { "epoch": 0.10739886849958856, "grad_norm": 11.293775159140248, "learning_rate": 4.959589538510628e-05, "loss": 2.4262, "mean_token_accuracy": 0.41379310488700866, "step": 106630 }, { "epoch": 0.10740390455269273, "grad_norm": 11.732765882356322, "learning_rate": 4.959582468335401e-05, "loss": 2.7593, "mean_token_accuracy": 0.36896551847457887, "step": 106635 }, { "epoch": 0.1074089406057969, "grad_norm": 9.468588879874773, "learning_rate": 4.959575397547338e-05, "loss": 2.5064, "mean_token_accuracy": 0.403448274731636, "step": 106640 }, { "epoch": 0.10741397665890107, "grad_norm": 11.944422244346457, "learning_rate": 4.959568326146442e-05, "loss": 2.3013, "mean_token_accuracy": 0.44482759237289426, "step": 106645 }, { "epoch": 0.10741901271200524, "grad_norm": 11.017043266183217, "learning_rate": 4.9595612541327125e-05, "loss": 2.7699, "mean_token_accuracy": 0.3862069010734558, "step": 106650 }, { "epoch": 0.10742404876510941, "grad_norm": 10.449653059428352, "learning_rate": 4.9595541815061535e-05, "loss": 2.563, "mean_token_accuracy": 0.37586206793785093, "step": 106655 }, { "epoch": 0.10742908481821359, "grad_norm": 9.994158753533286, "learning_rate": 4.959547108266766e-05, "loss": 2.5532, "mean_token_accuracy": 0.37586206793785093, "step": 106660 }, { "epoch": 0.10743412087131776, "grad_norm": 10.48592145820035, "learning_rate": 4.959540034414553e-05, "loss": 2.2758, "mean_token_accuracy": 0.38275861740112305, "step": 106665 }, { "epoch": 0.10743915692442194, "grad_norm": 10.062621367223123, "learning_rate": 4.959532959949515e-05, "loss": 2.2209, "mean_token_accuracy": 0.4517241299152374, "step": 106670 }, { "epoch": 0.10744419297752611, "grad_norm": 11.223274517915318, "learning_rate": 4.959525884871656e-05, "loss": 2.8167, "mean_token_accuracy": 0.39310344457626345, "step": 106675 }, { "epoch": 0.10744922903063028, "grad_norm": 10.011428549098868, "learning_rate": 4.959518809180976e-05, "loss": 2.2912, "mean_token_accuracy": 0.4551724135875702, "step": 106680 }, { "epoch": 0.10745426508373446, "grad_norm": 10.646608262919381, "learning_rate": 4.9595117328774783e-05, "loss": 2.1735, "mean_token_accuracy": 0.4310344815254211, "step": 106685 }, { "epoch": 0.10745930113683863, "grad_norm": 10.927723365878006, "learning_rate": 4.959504655961164e-05, "loss": 2.6942, "mean_token_accuracy": 0.39655172228813174, "step": 106690 }, { "epoch": 0.1074643371899428, "grad_norm": 9.848513366683308, "learning_rate": 4.959497578432035e-05, "loss": 2.3722, "mean_token_accuracy": 0.4413793087005615, "step": 106695 }, { "epoch": 0.10746937324304698, "grad_norm": 9.745369565534425, "learning_rate": 4.959490500290094e-05, "loss": 2.3078, "mean_token_accuracy": 0.43103448748588563, "step": 106700 }, { "epoch": 0.10747440929615115, "grad_norm": 10.453926616018494, "learning_rate": 4.959483421535342e-05, "loss": 2.2058, "mean_token_accuracy": 0.4517241299152374, "step": 106705 }, { "epoch": 0.10747944534925531, "grad_norm": 12.161459202232635, "learning_rate": 4.959476342167782e-05, "loss": 2.3153, "mean_token_accuracy": 0.43103448748588563, "step": 106710 }, { "epoch": 0.10748448140235949, "grad_norm": 9.649767345890206, "learning_rate": 4.9594692621874154e-05, "loss": 2.4339, "mean_token_accuracy": 0.38620689511299133, "step": 106715 }, { "epoch": 0.10748951745546366, "grad_norm": 12.886503610995915, "learning_rate": 4.959462181594245e-05, "loss": 2.1258, "mean_token_accuracy": 0.49655171632766726, "step": 106720 }, { "epoch": 0.10749455350856783, "grad_norm": 11.555338059611536, "learning_rate": 4.9594551003882705e-05, "loss": 2.6321, "mean_token_accuracy": 0.37931033968925476, "step": 106725 }, { "epoch": 0.10749958956167201, "grad_norm": 9.640672181942858, "learning_rate": 4.959448018569497e-05, "loss": 2.7943, "mean_token_accuracy": 0.42413793206214906, "step": 106730 }, { "epoch": 0.10750462561477618, "grad_norm": 11.32226200521982, "learning_rate": 4.959440936137923e-05, "loss": 2.6046, "mean_token_accuracy": 0.435571676492691, "step": 106735 }, { "epoch": 0.10750966166788035, "grad_norm": 10.195056195676251, "learning_rate": 4.9594338530935536e-05, "loss": 2.2792, "mean_token_accuracy": 0.4137930989265442, "step": 106740 }, { "epoch": 0.10751469772098453, "grad_norm": 9.98543662903377, "learning_rate": 4.9594267694363895e-05, "loss": 2.9484, "mean_token_accuracy": 0.36896551847457887, "step": 106745 }, { "epoch": 0.1075197337740887, "grad_norm": 10.326730138561052, "learning_rate": 4.959419685166432e-05, "loss": 2.4506, "mean_token_accuracy": 0.41379310488700866, "step": 106750 }, { "epoch": 0.10752476982719288, "grad_norm": 11.294552537674972, "learning_rate": 4.959412600283685e-05, "loss": 2.1495, "mean_token_accuracy": 0.47241378426551817, "step": 106755 }, { "epoch": 0.10752980588029705, "grad_norm": 12.551294045295297, "learning_rate": 4.9594055147881473e-05, "loss": 2.8227, "mean_token_accuracy": 0.3724137872457504, "step": 106760 }, { "epoch": 0.10753484193340122, "grad_norm": 10.354612437864445, "learning_rate": 4.959398428679824e-05, "loss": 2.5441, "mean_token_accuracy": 0.3965517163276672, "step": 106765 }, { "epoch": 0.1075398779865054, "grad_norm": 12.029865720983064, "learning_rate": 4.959391341958715e-05, "loss": 2.6199, "mean_token_accuracy": 0.42758620977401735, "step": 106770 }, { "epoch": 0.10754491403960957, "grad_norm": 9.381046228614197, "learning_rate": 4.9593842546248235e-05, "loss": 2.3842, "mean_token_accuracy": 0.4310344815254211, "step": 106775 }, { "epoch": 0.10754995009271373, "grad_norm": 9.958304397898976, "learning_rate": 4.959377166678152e-05, "loss": 2.1766, "mean_token_accuracy": 0.4379310369491577, "step": 106780 }, { "epoch": 0.1075549861458179, "grad_norm": 14.621644808857548, "learning_rate": 4.9593700781187004e-05, "loss": 2.907, "mean_token_accuracy": 0.3655172407627106, "step": 106785 }, { "epoch": 0.10756002219892208, "grad_norm": 39.435746885334055, "learning_rate": 4.9593629889464715e-05, "loss": 3.0036, "mean_token_accuracy": 0.4, "step": 106790 }, { "epoch": 0.10756505825202625, "grad_norm": 12.416538995700153, "learning_rate": 4.959355899161468e-05, "loss": 2.7264, "mean_token_accuracy": 0.4310344815254211, "step": 106795 }, { "epoch": 0.10757009430513043, "grad_norm": 10.089464802630003, "learning_rate": 4.9593488087636915e-05, "loss": 2.1758, "mean_token_accuracy": 0.458620685338974, "step": 106800 }, { "epoch": 0.1075751303582346, "grad_norm": 10.988388477522694, "learning_rate": 4.9593417177531436e-05, "loss": 2.6094, "mean_token_accuracy": 0.41379310488700866, "step": 106805 }, { "epoch": 0.10758016641133877, "grad_norm": 10.302927322332708, "learning_rate": 4.959334626129827e-05, "loss": 2.5436, "mean_token_accuracy": 0.4068965554237366, "step": 106810 }, { "epoch": 0.10758520246444295, "grad_norm": 10.84781891306715, "learning_rate": 4.9593275338937425e-05, "loss": 2.2994, "mean_token_accuracy": 0.4034482777118683, "step": 106815 }, { "epoch": 0.10759023851754712, "grad_norm": 10.649820333278269, "learning_rate": 4.9593204410448926e-05, "loss": 2.2966, "mean_token_accuracy": 0.4689655125141144, "step": 106820 }, { "epoch": 0.1075952745706513, "grad_norm": 10.733918534391444, "learning_rate": 4.959313347583281e-05, "loss": 2.2193, "mean_token_accuracy": 0.4517241358757019, "step": 106825 }, { "epoch": 0.10760031062375547, "grad_norm": 10.886491885111527, "learning_rate": 4.959306253508907e-05, "loss": 2.4809, "mean_token_accuracy": 0.41034482717514037, "step": 106830 }, { "epoch": 0.10760534667685964, "grad_norm": 13.075971436146078, "learning_rate": 4.9592991588217734e-05, "loss": 2.9847, "mean_token_accuracy": 0.4172413766384125, "step": 106835 }, { "epoch": 0.10761038272996382, "grad_norm": 11.286498635031593, "learning_rate": 4.9592920635218825e-05, "loss": 2.1673, "mean_token_accuracy": 0.4896551728248596, "step": 106840 }, { "epoch": 0.10761541878306799, "grad_norm": 9.85853254442693, "learning_rate": 4.959284967609237e-05, "loss": 2.0912, "mean_token_accuracy": 0.42710224390029905, "step": 106845 }, { "epoch": 0.10762045483617215, "grad_norm": 10.54336939385977, "learning_rate": 4.959277871083838e-05, "loss": 2.452, "mean_token_accuracy": 0.4310344815254211, "step": 106850 }, { "epoch": 0.10762549088927632, "grad_norm": 13.397413412069337, "learning_rate": 4.9592707739456866e-05, "loss": 2.6721, "mean_token_accuracy": 0.324137932062149, "step": 106855 }, { "epoch": 0.1076305269423805, "grad_norm": 11.608632425523028, "learning_rate": 4.9592636761947866e-05, "loss": 2.5292, "mean_token_accuracy": 0.42758620977401735, "step": 106860 }, { "epoch": 0.10763556299548467, "grad_norm": 11.9895769676642, "learning_rate": 4.959256577831138e-05, "loss": 2.5368, "mean_token_accuracy": 0.36206896007061007, "step": 106865 }, { "epoch": 0.10764059904858884, "grad_norm": 14.227945077870011, "learning_rate": 4.959249478854745e-05, "loss": 2.3987, "mean_token_accuracy": 0.39655172228813174, "step": 106870 }, { "epoch": 0.10764563510169302, "grad_norm": 9.415251898660706, "learning_rate": 4.959242379265608e-05, "loss": 2.2808, "mean_token_accuracy": 0.4551724076271057, "step": 106875 }, { "epoch": 0.10765067115479719, "grad_norm": 12.354147006948123, "learning_rate": 4.959235279063729e-05, "loss": 2.1985, "mean_token_accuracy": 0.4655172348022461, "step": 106880 }, { "epoch": 0.10765570720790137, "grad_norm": 11.299348128399007, "learning_rate": 4.959228178249111e-05, "loss": 2.7676, "mean_token_accuracy": 0.3620689630508423, "step": 106885 }, { "epoch": 0.10766074326100554, "grad_norm": 10.16039787844633, "learning_rate": 4.959221076821755e-05, "loss": 2.5441, "mean_token_accuracy": 0.38620689511299133, "step": 106890 }, { "epoch": 0.10766577931410971, "grad_norm": 11.012813586769083, "learning_rate": 4.959213974781663e-05, "loss": 2.3132, "mean_token_accuracy": 0.4448275864124298, "step": 106895 }, { "epoch": 0.10767081536721389, "grad_norm": 11.268510508450252, "learning_rate": 4.9592068721288373e-05, "loss": 2.4029, "mean_token_accuracy": 0.39655172228813174, "step": 106900 }, { "epoch": 0.10767585142031806, "grad_norm": 9.226826916605239, "learning_rate": 4.9591997688632806e-05, "loss": 2.0827, "mean_token_accuracy": 0.4569267988204956, "step": 106905 }, { "epoch": 0.10768088747342223, "grad_norm": 12.642494166586225, "learning_rate": 4.959192664984993e-05, "loss": 2.3647, "mean_token_accuracy": 0.4379310250282288, "step": 106910 }, { "epoch": 0.10768592352652641, "grad_norm": 11.138630939629552, "learning_rate": 4.959185560493979e-05, "loss": 2.4635, "mean_token_accuracy": 0.43581366539001465, "step": 106915 }, { "epoch": 0.10769095957963057, "grad_norm": 9.700827587130128, "learning_rate": 4.959178455390239e-05, "loss": 2.5871, "mean_token_accuracy": 0.4000000059604645, "step": 106920 }, { "epoch": 0.10769599563273474, "grad_norm": 11.045386543859502, "learning_rate": 4.959171349673773e-05, "loss": 2.2538, "mean_token_accuracy": 0.42413792610168455, "step": 106925 }, { "epoch": 0.10770103168583892, "grad_norm": 11.94376585504857, "learning_rate": 4.959164243344587e-05, "loss": 2.5539, "mean_token_accuracy": 0.4137930989265442, "step": 106930 }, { "epoch": 0.10770606773894309, "grad_norm": 10.980915290607202, "learning_rate": 4.9591571364026804e-05, "loss": 2.3658, "mean_token_accuracy": 0.41379310488700866, "step": 106935 }, { "epoch": 0.10771110379204726, "grad_norm": 11.535089263460891, "learning_rate": 4.9591500288480565e-05, "loss": 2.3859, "mean_token_accuracy": 0.4137930989265442, "step": 106940 }, { "epoch": 0.10771613984515144, "grad_norm": 11.110720294691056, "learning_rate": 4.959142920680716e-05, "loss": 2.3338, "mean_token_accuracy": 0.4534180283546448, "step": 106945 }, { "epoch": 0.10772117589825561, "grad_norm": 14.82000985595911, "learning_rate": 4.959135811900662e-05, "loss": 2.6034, "mean_token_accuracy": 0.38620689809322356, "step": 106950 }, { "epoch": 0.10772621195135978, "grad_norm": 10.571544165444235, "learning_rate": 4.959128702507895e-05, "loss": 2.2071, "mean_token_accuracy": 0.48620688915252686, "step": 106955 }, { "epoch": 0.10773124800446396, "grad_norm": 12.637791143581376, "learning_rate": 4.9591215925024185e-05, "loss": 2.5277, "mean_token_accuracy": 0.42413793206214906, "step": 106960 }, { "epoch": 0.10773628405756813, "grad_norm": 14.220807513351382, "learning_rate": 4.9591144818842336e-05, "loss": 2.5348, "mean_token_accuracy": 0.41379310488700866, "step": 106965 }, { "epoch": 0.1077413201106723, "grad_norm": 11.702181506459732, "learning_rate": 4.9591073706533424e-05, "loss": 2.6193, "mean_token_accuracy": 0.443254691362381, "step": 106970 }, { "epoch": 0.10774635616377648, "grad_norm": 10.326327762732863, "learning_rate": 4.959100258809748e-05, "loss": 2.1686, "mean_token_accuracy": 0.4310344815254211, "step": 106975 }, { "epoch": 0.10775139221688065, "grad_norm": 11.336737911831042, "learning_rate": 4.959093146353451e-05, "loss": 2.6062, "mean_token_accuracy": 0.36896551251411436, "step": 106980 }, { "epoch": 0.10775642826998483, "grad_norm": 9.72767969604264, "learning_rate": 4.9590860332844535e-05, "loss": 2.5936, "mean_token_accuracy": 0.39310344457626345, "step": 106985 }, { "epoch": 0.10776146432308899, "grad_norm": 8.205227644145948, "learning_rate": 4.959078919602758e-05, "loss": 2.4432, "mean_token_accuracy": 0.42413793206214906, "step": 106990 }, { "epoch": 0.10776650037619316, "grad_norm": 10.508058166432223, "learning_rate": 4.959071805308366e-05, "loss": 2.1863, "mean_token_accuracy": 0.46551724076271056, "step": 106995 }, { "epoch": 0.10777153642929733, "grad_norm": 10.408608414489235, "learning_rate": 4.959064690401279e-05, "loss": 2.4637, "mean_token_accuracy": 0.4083484590053558, "step": 107000 }, { "epoch": 0.10777657248240151, "grad_norm": 12.599113631693305, "learning_rate": 4.959057574881501e-05, "loss": 2.4122, "mean_token_accuracy": 0.3827586233615875, "step": 107005 }, { "epoch": 0.10778160853550568, "grad_norm": 10.046104973558526, "learning_rate": 4.959050458749032e-05, "loss": 2.2215, "mean_token_accuracy": 0.4551724255084991, "step": 107010 }, { "epoch": 0.10778664458860986, "grad_norm": 10.38936285245647, "learning_rate": 4.959043342003875e-05, "loss": 2.4585, "mean_token_accuracy": 0.4413793087005615, "step": 107015 }, { "epoch": 0.10779168064171403, "grad_norm": 11.887184611399931, "learning_rate": 4.959036224646032e-05, "loss": 2.4733, "mean_token_accuracy": 0.4379310369491577, "step": 107020 }, { "epoch": 0.1077967166948182, "grad_norm": 12.150976664691568, "learning_rate": 4.9590291066755026e-05, "loss": 2.7771, "mean_token_accuracy": 0.4034482777118683, "step": 107025 }, { "epoch": 0.10780175274792238, "grad_norm": 15.485262959152017, "learning_rate": 4.959021988092292e-05, "loss": 2.726, "mean_token_accuracy": 0.41379310488700866, "step": 107030 }, { "epoch": 0.10780678880102655, "grad_norm": 9.956924656575037, "learning_rate": 4.959014868896401e-05, "loss": 2.0671, "mean_token_accuracy": 0.458620685338974, "step": 107035 }, { "epoch": 0.10781182485413073, "grad_norm": 9.516171518316742, "learning_rate": 4.959007749087831e-05, "loss": 3.0218, "mean_token_accuracy": 0.3758620619773865, "step": 107040 }, { "epoch": 0.1078168609072349, "grad_norm": 10.872518206226435, "learning_rate": 4.9590006286665854e-05, "loss": 2.521, "mean_token_accuracy": 0.38620689511299133, "step": 107045 }, { "epoch": 0.10782189696033907, "grad_norm": 11.21976720925354, "learning_rate": 4.958993507632665e-05, "loss": 2.6692, "mean_token_accuracy": 0.38965516686439516, "step": 107050 }, { "epoch": 0.10782693301344325, "grad_norm": 10.75634765566282, "learning_rate": 4.9589863859860716e-05, "loss": 2.2986, "mean_token_accuracy": 0.38620689511299133, "step": 107055 }, { "epoch": 0.1078319690665474, "grad_norm": 12.063245540241597, "learning_rate": 4.9589792637268066e-05, "loss": 2.4978, "mean_token_accuracy": 0.4068965554237366, "step": 107060 }, { "epoch": 0.10783700511965158, "grad_norm": 9.748706292483627, "learning_rate": 4.9589721408548746e-05, "loss": 1.93, "mean_token_accuracy": 0.4862069070339203, "step": 107065 }, { "epoch": 0.10784204117275575, "grad_norm": 10.949890092092266, "learning_rate": 4.9589650173702755e-05, "loss": 2.4979, "mean_token_accuracy": 0.38965516686439516, "step": 107070 }, { "epoch": 0.10784707722585993, "grad_norm": 12.802860486650173, "learning_rate": 4.958957893273011e-05, "loss": 2.1315, "mean_token_accuracy": 0.44482758045196535, "step": 107075 }, { "epoch": 0.1078521132789641, "grad_norm": 16.689067428247053, "learning_rate": 4.958950768563084e-05, "loss": 2.508, "mean_token_accuracy": 0.4103448212146759, "step": 107080 }, { "epoch": 0.10785714933206828, "grad_norm": 9.142764262337046, "learning_rate": 4.9589436432404975e-05, "loss": 2.3, "mean_token_accuracy": 0.44827585816383364, "step": 107085 }, { "epoch": 0.10786218538517245, "grad_norm": 10.321886647706958, "learning_rate": 4.958936517305251e-05, "loss": 2.1765, "mean_token_accuracy": 0.42068964838981626, "step": 107090 }, { "epoch": 0.10786722143827662, "grad_norm": 10.576006120563074, "learning_rate": 4.958929390757348e-05, "loss": 2.3124, "mean_token_accuracy": 0.44996975660324096, "step": 107095 }, { "epoch": 0.1078722574913808, "grad_norm": 8.491517345020146, "learning_rate": 4.9589222635967905e-05, "loss": 2.3533, "mean_token_accuracy": 0.46606169939041137, "step": 107100 }, { "epoch": 0.10787729354448497, "grad_norm": 14.648448004805473, "learning_rate": 4.9589151358235796e-05, "loss": 2.5556, "mean_token_accuracy": 0.45033273100852966, "step": 107105 }, { "epoch": 0.10788232959758914, "grad_norm": 11.384283404655084, "learning_rate": 4.9589080074377185e-05, "loss": 2.522, "mean_token_accuracy": 0.4241379380226135, "step": 107110 }, { "epoch": 0.10788736565069332, "grad_norm": 11.85457718177044, "learning_rate": 4.958900878439208e-05, "loss": 2.475, "mean_token_accuracy": 0.3620689630508423, "step": 107115 }, { "epoch": 0.10789240170379749, "grad_norm": 9.900306314179742, "learning_rate": 4.958893748828051e-05, "loss": 2.4336, "mean_token_accuracy": 0.4472474277019501, "step": 107120 }, { "epoch": 0.10789743775690167, "grad_norm": 11.402912262842616, "learning_rate": 4.958886618604248e-05, "loss": 2.2369, "mean_token_accuracy": 0.4551724135875702, "step": 107125 }, { "epoch": 0.10790247381000583, "grad_norm": 10.47367821767516, "learning_rate": 4.9588794877678036e-05, "loss": 2.7329, "mean_token_accuracy": 0.4344827592372894, "step": 107130 }, { "epoch": 0.10790750986311, "grad_norm": 11.258819322469655, "learning_rate": 4.9588723563187177e-05, "loss": 3.357, "mean_token_accuracy": 0.37241379618644715, "step": 107135 }, { "epoch": 0.10791254591621417, "grad_norm": 12.559829967512895, "learning_rate": 4.9588652242569925e-05, "loss": 2.2833, "mean_token_accuracy": 0.4517241299152374, "step": 107140 }, { "epoch": 0.10791758196931835, "grad_norm": 15.834871708418579, "learning_rate": 4.958858091582631e-05, "loss": 2.9651, "mean_token_accuracy": 0.37931033968925476, "step": 107145 }, { "epoch": 0.10792261802242252, "grad_norm": 11.218242847107856, "learning_rate": 4.958850958295633e-05, "loss": 2.3789, "mean_token_accuracy": 0.39655172228813174, "step": 107150 }, { "epoch": 0.1079276540755267, "grad_norm": 9.559395743077209, "learning_rate": 4.9588438243960036e-05, "loss": 2.1937, "mean_token_accuracy": 0.4344827592372894, "step": 107155 }, { "epoch": 0.10793269012863087, "grad_norm": 10.598965202174151, "learning_rate": 4.958836689883742e-05, "loss": 2.5258, "mean_token_accuracy": 0.44827585816383364, "step": 107160 }, { "epoch": 0.10793772618173504, "grad_norm": 19.833252508455786, "learning_rate": 4.958829554758852e-05, "loss": 2.7702, "mean_token_accuracy": 0.39310344457626345, "step": 107165 }, { "epoch": 0.10794276223483922, "grad_norm": 11.90751503448679, "learning_rate": 4.9588224190213344e-05, "loss": 2.5352, "mean_token_accuracy": 0.39310344457626345, "step": 107170 }, { "epoch": 0.10794779828794339, "grad_norm": 12.029558370933948, "learning_rate": 4.958815282671191e-05, "loss": 2.3734, "mean_token_accuracy": 0.46896552443504336, "step": 107175 }, { "epoch": 0.10795283434104756, "grad_norm": 12.213706627105474, "learning_rate": 4.9588081457084256e-05, "loss": 2.9199, "mean_token_accuracy": 0.3758620649576187, "step": 107180 }, { "epoch": 0.10795787039415174, "grad_norm": 11.566061077418949, "learning_rate": 4.95880100813304e-05, "loss": 2.5143, "mean_token_accuracy": 0.41034482717514037, "step": 107185 }, { "epoch": 0.10796290644725591, "grad_norm": 11.797199762362448, "learning_rate": 4.9587938699450335e-05, "loss": 2.6118, "mean_token_accuracy": 0.39310345649719236, "step": 107190 }, { "epoch": 0.10796794250036008, "grad_norm": 10.789797786022955, "learning_rate": 4.95878673114441e-05, "loss": 2.0403, "mean_token_accuracy": 0.458620685338974, "step": 107195 }, { "epoch": 0.10797297855346424, "grad_norm": 11.74311871053466, "learning_rate": 4.958779591731171e-05, "loss": 2.8219, "mean_token_accuracy": 0.3879612863063812, "step": 107200 }, { "epoch": 0.10797801460656842, "grad_norm": 11.680201625989499, "learning_rate": 4.9587724517053196e-05, "loss": 2.4864, "mean_token_accuracy": 0.4034482717514038, "step": 107205 }, { "epoch": 0.10798305065967259, "grad_norm": 11.068890728030484, "learning_rate": 4.9587653110668566e-05, "loss": 2.4819, "mean_token_accuracy": 0.4206896543502808, "step": 107210 }, { "epoch": 0.10798808671277677, "grad_norm": 12.626547126885658, "learning_rate": 4.9587581698157834e-05, "loss": 2.4119, "mean_token_accuracy": 0.4034482717514038, "step": 107215 }, { "epoch": 0.10799312276588094, "grad_norm": 10.823577352195663, "learning_rate": 4.958751027952104e-05, "loss": 2.8991, "mean_token_accuracy": 0.3793103456497192, "step": 107220 }, { "epoch": 0.10799815881898511, "grad_norm": 10.272992506054853, "learning_rate": 4.9587438854758196e-05, "loss": 2.8285, "mean_token_accuracy": 0.38777979016304015, "step": 107225 }, { "epoch": 0.10800319487208929, "grad_norm": 10.346500315759233, "learning_rate": 4.958736742386931e-05, "loss": 2.3054, "mean_token_accuracy": 0.43793103098869324, "step": 107230 }, { "epoch": 0.10800823092519346, "grad_norm": 12.922690792302639, "learning_rate": 4.958729598685441e-05, "loss": 2.5809, "mean_token_accuracy": 0.41379310488700866, "step": 107235 }, { "epoch": 0.10801326697829763, "grad_norm": 9.993981741315281, "learning_rate": 4.9587224543713516e-05, "loss": 2.1847, "mean_token_accuracy": 0.4241379350423813, "step": 107240 }, { "epoch": 0.10801830303140181, "grad_norm": 10.025685651026366, "learning_rate": 4.958715309444665e-05, "loss": 2.565, "mean_token_accuracy": 0.4241379380226135, "step": 107245 }, { "epoch": 0.10802333908450598, "grad_norm": 8.565518092204236, "learning_rate": 4.958708163905383e-05, "loss": 2.2594, "mean_token_accuracy": 0.475862056016922, "step": 107250 }, { "epoch": 0.10802837513761016, "grad_norm": 11.040162681653403, "learning_rate": 4.958701017753508e-05, "loss": 2.341, "mean_token_accuracy": 0.4137930989265442, "step": 107255 }, { "epoch": 0.10803341119071433, "grad_norm": 10.464873015943722, "learning_rate": 4.958693870989041e-05, "loss": 2.1986, "mean_token_accuracy": 0.4931034505367279, "step": 107260 }, { "epoch": 0.1080384472438185, "grad_norm": 9.44870375624461, "learning_rate": 4.958686723611985e-05, "loss": 2.6182, "mean_token_accuracy": 0.44482758045196535, "step": 107265 }, { "epoch": 0.10804348329692266, "grad_norm": 10.092844744653002, "learning_rate": 4.95867957562234e-05, "loss": 2.5479, "mean_token_accuracy": 0.41379310488700866, "step": 107270 }, { "epoch": 0.10804851935002684, "grad_norm": 10.377126181360559, "learning_rate": 4.9586724270201115e-05, "loss": 2.4641, "mean_token_accuracy": 0.4241379380226135, "step": 107275 }, { "epoch": 0.10805355540313101, "grad_norm": 11.093827759210939, "learning_rate": 4.9586652778052976e-05, "loss": 2.5624, "mean_token_accuracy": 0.4, "step": 107280 }, { "epoch": 0.10805859145623518, "grad_norm": 11.300982499222169, "learning_rate": 4.958658127977904e-05, "loss": 2.311, "mean_token_accuracy": 0.4517241358757019, "step": 107285 }, { "epoch": 0.10806362750933936, "grad_norm": 10.759340402018251, "learning_rate": 4.9586509775379294e-05, "loss": 2.9253, "mean_token_accuracy": 0.3241379350423813, "step": 107290 }, { "epoch": 0.10806866356244353, "grad_norm": 15.289329173842033, "learning_rate": 4.958643826485377e-05, "loss": 2.5459, "mean_token_accuracy": 0.40689656138420105, "step": 107295 }, { "epoch": 0.1080736996155477, "grad_norm": 10.64537646012742, "learning_rate": 4.958636674820251e-05, "loss": 2.3555, "mean_token_accuracy": 0.4517241358757019, "step": 107300 }, { "epoch": 0.10807873566865188, "grad_norm": 10.439646458196718, "learning_rate": 4.9586295225425496e-05, "loss": 2.2438, "mean_token_accuracy": 0.46400484442710876, "step": 107305 }, { "epoch": 0.10808377172175605, "grad_norm": 10.373069338254593, "learning_rate": 4.9586223696522774e-05, "loss": 2.0092, "mean_token_accuracy": 0.49458128213882446, "step": 107310 }, { "epoch": 0.10808880777486023, "grad_norm": 10.22462487396539, "learning_rate": 4.958615216149435e-05, "loss": 2.1586, "mean_token_accuracy": 0.4901477873325348, "step": 107315 }, { "epoch": 0.1080938438279644, "grad_norm": 11.445289551072111, "learning_rate": 4.9586080620340254e-05, "loss": 2.3405, "mean_token_accuracy": 0.4310344815254211, "step": 107320 }, { "epoch": 0.10809887988106857, "grad_norm": 11.589002846779666, "learning_rate": 4.9586009073060495e-05, "loss": 2.377, "mean_token_accuracy": 0.42413792610168455, "step": 107325 }, { "epoch": 0.10810391593417275, "grad_norm": 10.062132257376236, "learning_rate": 4.95859375196551e-05, "loss": 2.3485, "mean_token_accuracy": 0.4068965554237366, "step": 107330 }, { "epoch": 0.10810895198727692, "grad_norm": 9.46868935589262, "learning_rate": 4.95858659601241e-05, "loss": 2.8643, "mean_token_accuracy": 0.4137930989265442, "step": 107335 }, { "epoch": 0.10811398804038108, "grad_norm": 9.494639027989525, "learning_rate": 4.958579439446749e-05, "loss": 2.4966, "mean_token_accuracy": 0.4034482717514038, "step": 107340 }, { "epoch": 0.10811902409348526, "grad_norm": 12.802094574690742, "learning_rate": 4.9585722822685306e-05, "loss": 2.9213, "mean_token_accuracy": 0.3931034505367279, "step": 107345 }, { "epoch": 0.10812406014658943, "grad_norm": 12.523136556930131, "learning_rate": 4.958565124477756e-05, "loss": 2.2792, "mean_token_accuracy": 0.4379310429096222, "step": 107350 }, { "epoch": 0.1081290961996936, "grad_norm": 12.434475197743826, "learning_rate": 4.958557966074428e-05, "loss": 2.4196, "mean_token_accuracy": 0.43448275327682495, "step": 107355 }, { "epoch": 0.10813413225279778, "grad_norm": 8.667422303567143, "learning_rate": 4.958550807058548e-05, "loss": 2.288, "mean_token_accuracy": 0.4172413766384125, "step": 107360 }, { "epoch": 0.10813916830590195, "grad_norm": 9.785923036680265, "learning_rate": 4.958543647430118e-05, "loss": 2.5693, "mean_token_accuracy": 0.38275861740112305, "step": 107365 }, { "epoch": 0.10814420435900612, "grad_norm": 10.746296508199606, "learning_rate": 4.958536487189141e-05, "loss": 2.6697, "mean_token_accuracy": 0.43448275327682495, "step": 107370 }, { "epoch": 0.1081492404121103, "grad_norm": 10.650991381878647, "learning_rate": 4.958529326335618e-05, "loss": 2.5991, "mean_token_accuracy": 0.4, "step": 107375 }, { "epoch": 0.10815427646521447, "grad_norm": 10.524250231328603, "learning_rate": 4.9585221648695505e-05, "loss": 2.2478, "mean_token_accuracy": 0.4793103516101837, "step": 107380 }, { "epoch": 0.10815931251831865, "grad_norm": 15.039030097582273, "learning_rate": 4.9585150027909415e-05, "loss": 2.7766, "mean_token_accuracy": 0.4206896543502808, "step": 107385 }, { "epoch": 0.10816434857142282, "grad_norm": 10.887860819406933, "learning_rate": 4.9585078400997934e-05, "loss": 2.6311, "mean_token_accuracy": 0.45009074211120603, "step": 107390 }, { "epoch": 0.10816938462452699, "grad_norm": 9.152435867834567, "learning_rate": 4.958500676796106e-05, "loss": 2.7181, "mean_token_accuracy": 0.38620689511299133, "step": 107395 }, { "epoch": 0.10817442067763117, "grad_norm": 13.203597254523038, "learning_rate": 4.958493512879884e-05, "loss": 2.459, "mean_token_accuracy": 0.4643073260784149, "step": 107400 }, { "epoch": 0.10817945673073534, "grad_norm": 10.613120559609271, "learning_rate": 4.9584863483511274e-05, "loss": 2.2338, "mean_token_accuracy": 0.4310344815254211, "step": 107405 }, { "epoch": 0.1081844927838395, "grad_norm": 9.302783715417787, "learning_rate": 4.958479183209839e-05, "loss": 2.1151, "mean_token_accuracy": 0.5, "step": 107410 }, { "epoch": 0.10818952883694367, "grad_norm": 9.660760906329786, "learning_rate": 4.95847201745602e-05, "loss": 2.3339, "mean_token_accuracy": 0.43103448748588563, "step": 107415 }, { "epoch": 0.10819456489004785, "grad_norm": 12.463404645136231, "learning_rate": 4.9584648510896736e-05, "loss": 2.6261, "mean_token_accuracy": 0.3620689630508423, "step": 107420 }, { "epoch": 0.10819960094315202, "grad_norm": 9.683566349908325, "learning_rate": 4.958457684110801e-05, "loss": 2.4024, "mean_token_accuracy": 0.4448275864124298, "step": 107425 }, { "epoch": 0.1082046369962562, "grad_norm": 10.035247930991305, "learning_rate": 4.958450516519406e-05, "loss": 2.3781, "mean_token_accuracy": 0.45517240166664125, "step": 107430 }, { "epoch": 0.10820967304936037, "grad_norm": 11.622224003970778, "learning_rate": 4.9584433483154874e-05, "loss": 2.4197, "mean_token_accuracy": 0.46896551847457885, "step": 107435 }, { "epoch": 0.10821470910246454, "grad_norm": 9.264001350381193, "learning_rate": 4.958436179499049e-05, "loss": 2.0957, "mean_token_accuracy": 0.48100423216819765, "step": 107440 }, { "epoch": 0.10821974515556872, "grad_norm": 10.420695390730026, "learning_rate": 4.9584290100700926e-05, "loss": 2.4615, "mean_token_accuracy": 0.42413793206214906, "step": 107445 }, { "epoch": 0.10822478120867289, "grad_norm": 9.294736832060329, "learning_rate": 4.95842184002862e-05, "loss": 2.3118, "mean_token_accuracy": 0.38275861740112305, "step": 107450 }, { "epoch": 0.10822981726177706, "grad_norm": 12.687697838901109, "learning_rate": 4.958414669374634e-05, "loss": 2.7852, "mean_token_accuracy": 0.3724137842655182, "step": 107455 }, { "epoch": 0.10823485331488124, "grad_norm": 10.104240226412362, "learning_rate": 4.958407498108136e-05, "loss": 2.0405, "mean_token_accuracy": 0.4620689630508423, "step": 107460 }, { "epoch": 0.10823988936798541, "grad_norm": 8.860770956740469, "learning_rate": 4.9584003262291274e-05, "loss": 2.3939, "mean_token_accuracy": 0.42068964838981626, "step": 107465 }, { "epoch": 0.10824492542108959, "grad_norm": 9.79202651283673, "learning_rate": 4.95839315373761e-05, "loss": 2.5215, "mean_token_accuracy": 0.3655172407627106, "step": 107470 }, { "epoch": 0.10824996147419376, "grad_norm": 9.14099140106788, "learning_rate": 4.958385980633587e-05, "loss": 2.6976, "mean_token_accuracy": 0.4258923172950745, "step": 107475 }, { "epoch": 0.10825499752729792, "grad_norm": 11.05207810881123, "learning_rate": 4.958378806917061e-05, "loss": 2.6333, "mean_token_accuracy": 0.358620685338974, "step": 107480 }, { "epoch": 0.10826003358040209, "grad_norm": 10.51688792104066, "learning_rate": 4.958371632588032e-05, "loss": 2.3226, "mean_token_accuracy": 0.4570477962493896, "step": 107485 }, { "epoch": 0.10826506963350627, "grad_norm": 9.038423443608297, "learning_rate": 4.958364457646504e-05, "loss": 2.2169, "mean_token_accuracy": 0.44827585816383364, "step": 107490 }, { "epoch": 0.10827010568661044, "grad_norm": 10.07695589821479, "learning_rate": 4.958357282092477e-05, "loss": 2.5001, "mean_token_accuracy": 0.42068966031074523, "step": 107495 }, { "epoch": 0.10827514173971461, "grad_norm": 12.1911758363293, "learning_rate": 4.958350105925953e-05, "loss": 2.6414, "mean_token_accuracy": 0.37241379022598264, "step": 107500 }, { "epoch": 0.10828017779281879, "grad_norm": 9.30245249092957, "learning_rate": 4.9583429291469366e-05, "loss": 2.6197, "mean_token_accuracy": 0.3793103516101837, "step": 107505 }, { "epoch": 0.10828521384592296, "grad_norm": 12.387305577628005, "learning_rate": 4.958335751755427e-05, "loss": 2.4096, "mean_token_accuracy": 0.4, "step": 107510 }, { "epoch": 0.10829024989902714, "grad_norm": 11.891341128770739, "learning_rate": 4.9583285737514266e-05, "loss": 2.1559, "mean_token_accuracy": 0.4551724076271057, "step": 107515 }, { "epoch": 0.10829528595213131, "grad_norm": 13.987866228561792, "learning_rate": 4.95832139513494e-05, "loss": 2.2902, "mean_token_accuracy": 0.4448275864124298, "step": 107520 }, { "epoch": 0.10830032200523548, "grad_norm": 10.823924971041762, "learning_rate": 4.958314215905965e-05, "loss": 2.3386, "mean_token_accuracy": 0.42758620977401735, "step": 107525 }, { "epoch": 0.10830535805833966, "grad_norm": 10.563332099407061, "learning_rate": 4.9583070360645075e-05, "loss": 2.5251, "mean_token_accuracy": 0.4206896543502808, "step": 107530 }, { "epoch": 0.10831039411144383, "grad_norm": 10.299329133286241, "learning_rate": 4.958299855610567e-05, "loss": 2.9887, "mean_token_accuracy": 0.3896551728248596, "step": 107535 }, { "epoch": 0.108315430164548, "grad_norm": 9.24378194962503, "learning_rate": 4.958292674544146e-05, "loss": 2.3368, "mean_token_accuracy": 0.4724137902259827, "step": 107540 }, { "epoch": 0.10832046621765218, "grad_norm": 8.922517584232857, "learning_rate": 4.958285492865247e-05, "loss": 2.2844, "mean_token_accuracy": 0.47931033968925474, "step": 107545 }, { "epoch": 0.10832550227075634, "grad_norm": 9.494566560284643, "learning_rate": 4.958278310573873e-05, "loss": 2.3539, "mean_token_accuracy": 0.43103447556495667, "step": 107550 }, { "epoch": 0.10833053832386051, "grad_norm": 11.233770414286804, "learning_rate": 4.958271127670023e-05, "loss": 2.3762, "mean_token_accuracy": 0.4344827651977539, "step": 107555 }, { "epoch": 0.10833557437696469, "grad_norm": 10.873409003105886, "learning_rate": 4.958263944153702e-05, "loss": 2.5282, "mean_token_accuracy": 0.44137930274009707, "step": 107560 }, { "epoch": 0.10834061043006886, "grad_norm": 10.94725529590566, "learning_rate": 4.9582567600249104e-05, "loss": 2.2092, "mean_token_accuracy": 0.4517241299152374, "step": 107565 }, { "epoch": 0.10834564648317303, "grad_norm": 9.956894360677891, "learning_rate": 4.958249575283651e-05, "loss": 2.5438, "mean_token_accuracy": 0.42758620977401735, "step": 107570 }, { "epoch": 0.1083506825362772, "grad_norm": 9.852720301979515, "learning_rate": 4.958242389929925e-05, "loss": 2.3212, "mean_token_accuracy": 0.4137930989265442, "step": 107575 }, { "epoch": 0.10835571858938138, "grad_norm": 10.142005304332944, "learning_rate": 4.958235203963734e-05, "loss": 2.2497, "mean_token_accuracy": 0.48820326924324037, "step": 107580 }, { "epoch": 0.10836075464248555, "grad_norm": 13.449839586034551, "learning_rate": 4.958228017385081e-05, "loss": 2.5428, "mean_token_accuracy": 0.41724138259887694, "step": 107585 }, { "epoch": 0.10836579069558973, "grad_norm": 12.866381576266825, "learning_rate": 4.958220830193968e-05, "loss": 2.8029, "mean_token_accuracy": 0.3379310339689255, "step": 107590 }, { "epoch": 0.1083708267486939, "grad_norm": 10.541573799016785, "learning_rate": 4.958213642390396e-05, "loss": 2.4042, "mean_token_accuracy": 0.4137930989265442, "step": 107595 }, { "epoch": 0.10837586280179808, "grad_norm": 10.25566746548862, "learning_rate": 4.958206453974369e-05, "loss": 2.6518, "mean_token_accuracy": 0.41379310488700866, "step": 107600 }, { "epoch": 0.10838089885490225, "grad_norm": 9.698676517309917, "learning_rate": 4.958199264945887e-05, "loss": 2.3868, "mean_token_accuracy": 0.47586206197738645, "step": 107605 }, { "epoch": 0.10838593490800642, "grad_norm": 10.419011441160416, "learning_rate": 4.9581920753049534e-05, "loss": 2.5083, "mean_token_accuracy": 0.41034483909606934, "step": 107610 }, { "epoch": 0.1083909709611106, "grad_norm": 9.52083331818641, "learning_rate": 4.958184885051569e-05, "loss": 2.4758, "mean_token_accuracy": 0.38275861740112305, "step": 107615 }, { "epoch": 0.10839600701421476, "grad_norm": 11.776694282578525, "learning_rate": 4.958177694185736e-05, "loss": 2.2945, "mean_token_accuracy": 0.44137930274009707, "step": 107620 }, { "epoch": 0.10840104306731893, "grad_norm": 9.902060165033578, "learning_rate": 4.958170502707457e-05, "loss": 2.1022, "mean_token_accuracy": 0.45517241954803467, "step": 107625 }, { "epoch": 0.1084060791204231, "grad_norm": 9.657241804594197, "learning_rate": 4.958163310616734e-05, "loss": 2.5984, "mean_token_accuracy": 0.4172413766384125, "step": 107630 }, { "epoch": 0.10841111517352728, "grad_norm": 10.076574902292391, "learning_rate": 4.958156117913568e-05, "loss": 2.3847, "mean_token_accuracy": 0.3862069010734558, "step": 107635 }, { "epoch": 0.10841615122663145, "grad_norm": 9.996452728064519, "learning_rate": 4.958148924597961e-05, "loss": 2.4251, "mean_token_accuracy": 0.4413793087005615, "step": 107640 }, { "epoch": 0.10842118727973563, "grad_norm": 12.901895317726764, "learning_rate": 4.958141730669917e-05, "loss": 2.7025, "mean_token_accuracy": 0.3931034505367279, "step": 107645 }, { "epoch": 0.1084262233328398, "grad_norm": 10.908951882145884, "learning_rate": 4.9581345361294364e-05, "loss": 2.383, "mean_token_accuracy": 0.42413793206214906, "step": 107650 }, { "epoch": 0.10843125938594397, "grad_norm": 10.019730527859553, "learning_rate": 4.958127340976521e-05, "loss": 2.3888, "mean_token_accuracy": 0.4896551787853241, "step": 107655 }, { "epoch": 0.10843629543904815, "grad_norm": 11.282415504630528, "learning_rate": 4.958120145211174e-05, "loss": 2.7264, "mean_token_accuracy": 0.39655172228813174, "step": 107660 }, { "epoch": 0.10844133149215232, "grad_norm": 9.205569010808901, "learning_rate": 4.958112948833396e-05, "loss": 2.3047, "mean_token_accuracy": 0.4000000059604645, "step": 107665 }, { "epoch": 0.1084463675452565, "grad_norm": 11.587499296846909, "learning_rate": 4.95810575184319e-05, "loss": 2.4643, "mean_token_accuracy": 0.46473078727722167, "step": 107670 }, { "epoch": 0.10845140359836067, "grad_norm": 8.930365861313936, "learning_rate": 4.958098554240557e-05, "loss": 2.091, "mean_token_accuracy": 0.3965517282485962, "step": 107675 }, { "epoch": 0.10845643965146484, "grad_norm": 9.940522958349861, "learning_rate": 4.9580913560255e-05, "loss": 2.1943, "mean_token_accuracy": 0.4275862157344818, "step": 107680 }, { "epoch": 0.10846147570456902, "grad_norm": 14.954939526481805, "learning_rate": 4.9580841571980204e-05, "loss": 2.3722, "mean_token_accuracy": 0.41379311084747317, "step": 107685 }, { "epoch": 0.10846651175767318, "grad_norm": 10.043515394005372, "learning_rate": 4.958076957758121e-05, "loss": 2.2527, "mean_token_accuracy": 0.42758620977401735, "step": 107690 }, { "epoch": 0.10847154781077735, "grad_norm": 11.07833058342766, "learning_rate": 4.958069757705803e-05, "loss": 2.7994, "mean_token_accuracy": 0.3862069010734558, "step": 107695 }, { "epoch": 0.10847658386388152, "grad_norm": 10.436887162517603, "learning_rate": 4.958062557041068e-05, "loss": 2.6833, "mean_token_accuracy": 0.4137930989265442, "step": 107700 }, { "epoch": 0.1084816199169857, "grad_norm": 13.109083800302747, "learning_rate": 4.9580553557639197e-05, "loss": 2.1981, "mean_token_accuracy": 0.417241370677948, "step": 107705 }, { "epoch": 0.10848665597008987, "grad_norm": 8.999146228067858, "learning_rate": 4.9580481538743575e-05, "loss": 2.2286, "mean_token_accuracy": 0.44827587008476255, "step": 107710 }, { "epoch": 0.10849169202319404, "grad_norm": 12.205726235544475, "learning_rate": 4.9580409513723865e-05, "loss": 2.7248, "mean_token_accuracy": 0.40689654350280763, "step": 107715 }, { "epoch": 0.10849672807629822, "grad_norm": 13.211418989216797, "learning_rate": 4.958033748258006e-05, "loss": 2.6636, "mean_token_accuracy": 0.3965517282485962, "step": 107720 }, { "epoch": 0.10850176412940239, "grad_norm": 9.055874683385968, "learning_rate": 4.9580265445312205e-05, "loss": 2.3515, "mean_token_accuracy": 0.4206896543502808, "step": 107725 }, { "epoch": 0.10850680018250657, "grad_norm": 14.681977442729286, "learning_rate": 4.958019340192029e-05, "loss": 2.5968, "mean_token_accuracy": 0.42577131986618044, "step": 107730 }, { "epoch": 0.10851183623561074, "grad_norm": 11.48838623259037, "learning_rate": 4.958012135240436e-05, "loss": 2.2207, "mean_token_accuracy": 0.4551724135875702, "step": 107735 }, { "epoch": 0.10851687228871491, "grad_norm": 9.048892188515945, "learning_rate": 4.958004929676442e-05, "loss": 2.3158, "mean_token_accuracy": 0.38620689511299133, "step": 107740 }, { "epoch": 0.10852190834181909, "grad_norm": 10.25570001694402, "learning_rate": 4.95799772350005e-05, "loss": 2.1282, "mean_token_accuracy": 0.42758620381355283, "step": 107745 }, { "epoch": 0.10852694439492326, "grad_norm": 10.248551393993596, "learning_rate": 4.957990516711262e-05, "loss": 2.6414, "mean_token_accuracy": 0.358620685338974, "step": 107750 }, { "epoch": 0.10853198044802743, "grad_norm": 12.22966268992081, "learning_rate": 4.9579833093100786e-05, "loss": 2.2885, "mean_token_accuracy": 0.5034482836723327, "step": 107755 }, { "epoch": 0.1085370165011316, "grad_norm": 11.13492676504733, "learning_rate": 4.9579761012965034e-05, "loss": 2.0114, "mean_token_accuracy": 0.4862068951129913, "step": 107760 }, { "epoch": 0.10854205255423577, "grad_norm": 10.190497565216308, "learning_rate": 4.957968892670538e-05, "loss": 2.2403, "mean_token_accuracy": 0.44664247035980226, "step": 107765 }, { "epoch": 0.10854708860733994, "grad_norm": 9.719465157318897, "learning_rate": 4.9579616834321835e-05, "loss": 2.4884, "mean_token_accuracy": 0.37241379022598264, "step": 107770 }, { "epoch": 0.10855212466044412, "grad_norm": 12.564084130798074, "learning_rate": 4.9579544735814434e-05, "loss": 2.347, "mean_token_accuracy": 0.4482758641242981, "step": 107775 }, { "epoch": 0.10855716071354829, "grad_norm": 11.284598125268072, "learning_rate": 4.957947263118318e-05, "loss": 2.5532, "mean_token_accuracy": 0.3931034505367279, "step": 107780 }, { "epoch": 0.10856219676665246, "grad_norm": 9.601179675761317, "learning_rate": 4.957940052042811e-05, "loss": 2.1785, "mean_token_accuracy": 0.48965516686439514, "step": 107785 }, { "epoch": 0.10856723281975664, "grad_norm": 10.334860227560275, "learning_rate": 4.957932840354923e-05, "loss": 2.875, "mean_token_accuracy": 0.4103448212146759, "step": 107790 }, { "epoch": 0.10857226887286081, "grad_norm": 9.734078012842598, "learning_rate": 4.957925628054657e-05, "loss": 2.1836, "mean_token_accuracy": 0.39310344457626345, "step": 107795 }, { "epoch": 0.10857730492596498, "grad_norm": 9.431708358796635, "learning_rate": 4.957918415142015e-05, "loss": 2.3445, "mean_token_accuracy": 0.42068966031074523, "step": 107800 }, { "epoch": 0.10858234097906916, "grad_norm": 10.390653286525446, "learning_rate": 4.9579112016169975e-05, "loss": 2.0985, "mean_token_accuracy": 0.46896552443504336, "step": 107805 }, { "epoch": 0.10858737703217333, "grad_norm": 10.578269221935887, "learning_rate": 4.957903987479608e-05, "loss": 2.5589, "mean_token_accuracy": 0.42758620381355283, "step": 107810 }, { "epoch": 0.1085924130852775, "grad_norm": 8.145174514185971, "learning_rate": 4.957896772729848e-05, "loss": 2.3477, "mean_token_accuracy": 0.4275862157344818, "step": 107815 }, { "epoch": 0.10859744913838168, "grad_norm": 10.07165347850683, "learning_rate": 4.95788955736772e-05, "loss": 2.2564, "mean_token_accuracy": 0.4379310429096222, "step": 107820 }, { "epoch": 0.10860248519148585, "grad_norm": 9.76073721936002, "learning_rate": 4.957882341393225e-05, "loss": 2.3799, "mean_token_accuracy": 0.41428571939468384, "step": 107825 }, { "epoch": 0.10860752124459001, "grad_norm": 11.430223968135316, "learning_rate": 4.957875124806367e-05, "loss": 2.8401, "mean_token_accuracy": 0.39655172228813174, "step": 107830 }, { "epoch": 0.10861255729769419, "grad_norm": 9.179443591895861, "learning_rate": 4.957867907607145e-05, "loss": 2.3865, "mean_token_accuracy": 0.47931033968925474, "step": 107835 }, { "epoch": 0.10861759335079836, "grad_norm": 11.580930374660609, "learning_rate": 4.9578606897955636e-05, "loss": 2.2179, "mean_token_accuracy": 0.4344827592372894, "step": 107840 }, { "epoch": 0.10862262940390253, "grad_norm": 9.721771272805599, "learning_rate": 4.957853471371623e-05, "loss": 2.4192, "mean_token_accuracy": 0.43103447556495667, "step": 107845 }, { "epoch": 0.10862766545700671, "grad_norm": 10.368469971728636, "learning_rate": 4.957846252335326e-05, "loss": 1.991, "mean_token_accuracy": 0.4586206912994385, "step": 107850 }, { "epoch": 0.10863270151011088, "grad_norm": 10.965851375509738, "learning_rate": 4.9578390326866754e-05, "loss": 2.7694, "mean_token_accuracy": 0.35862069129943847, "step": 107855 }, { "epoch": 0.10863773756321506, "grad_norm": 12.479519430493118, "learning_rate": 4.957831812425672e-05, "loss": 2.4902, "mean_token_accuracy": 0.4034482717514038, "step": 107860 }, { "epoch": 0.10864277361631923, "grad_norm": 11.213427912062103, "learning_rate": 4.9578245915523187e-05, "loss": 2.2476, "mean_token_accuracy": 0.45517241954803467, "step": 107865 }, { "epoch": 0.1086478096694234, "grad_norm": 9.102651433724484, "learning_rate": 4.957817370066616e-05, "loss": 2.6581, "mean_token_accuracy": 0.41379310488700866, "step": 107870 }, { "epoch": 0.10865284572252758, "grad_norm": 9.848736678041211, "learning_rate": 4.9578101479685676e-05, "loss": 2.4875, "mean_token_accuracy": 0.41724138259887694, "step": 107875 }, { "epoch": 0.10865788177563175, "grad_norm": 12.0343968566699, "learning_rate": 4.957802925258175e-05, "loss": 2.472, "mean_token_accuracy": 0.43793103098869324, "step": 107880 }, { "epoch": 0.10866291782873592, "grad_norm": 12.737831842083631, "learning_rate": 4.957795701935439e-05, "loss": 2.669, "mean_token_accuracy": 0.43793103098869324, "step": 107885 }, { "epoch": 0.1086679538818401, "grad_norm": 11.280302186493369, "learning_rate": 4.957788478000364e-05, "loss": 2.8683, "mean_token_accuracy": 0.3793103456497192, "step": 107890 }, { "epoch": 0.10867298993494427, "grad_norm": 11.505229966531815, "learning_rate": 4.95778125345295e-05, "loss": 2.3707, "mean_token_accuracy": 0.4413793087005615, "step": 107895 }, { "epoch": 0.10867802598804843, "grad_norm": 11.696723527524535, "learning_rate": 4.9577740282932e-05, "loss": 2.4819, "mean_token_accuracy": 0.42413793206214906, "step": 107900 }, { "epoch": 0.1086830620411526, "grad_norm": 13.27828633819417, "learning_rate": 4.957766802521115e-05, "loss": 2.7083, "mean_token_accuracy": 0.4103448301553726, "step": 107905 }, { "epoch": 0.10868809809425678, "grad_norm": 11.05128518338207, "learning_rate": 4.957759576136698e-05, "loss": 2.1464, "mean_token_accuracy": 0.46551724076271056, "step": 107910 }, { "epoch": 0.10869313414736095, "grad_norm": 14.405241113682067, "learning_rate": 4.95775234913995e-05, "loss": 2.6629, "mean_token_accuracy": 0.3793103456497192, "step": 107915 }, { "epoch": 0.10869817020046513, "grad_norm": 10.87844225916082, "learning_rate": 4.9577451215308745e-05, "loss": 2.5054, "mean_token_accuracy": 0.4034482717514038, "step": 107920 }, { "epoch": 0.1087032062535693, "grad_norm": 10.614373495693224, "learning_rate": 4.957737893309472e-05, "loss": 2.6128, "mean_token_accuracy": 0.3482758581638336, "step": 107925 }, { "epoch": 0.10870824230667347, "grad_norm": 10.105755457826929, "learning_rate": 4.9577306644757455e-05, "loss": 2.1546, "mean_token_accuracy": 0.4758620738983154, "step": 107930 }, { "epoch": 0.10871327835977765, "grad_norm": 11.37408020714365, "learning_rate": 4.957723435029697e-05, "loss": 2.3592, "mean_token_accuracy": 0.45862069725990295, "step": 107935 }, { "epoch": 0.10871831441288182, "grad_norm": 11.717717882526417, "learning_rate": 4.957716204971328e-05, "loss": 2.2561, "mean_token_accuracy": 0.47931033968925474, "step": 107940 }, { "epoch": 0.108723350465986, "grad_norm": 10.639334449477898, "learning_rate": 4.95770897430064e-05, "loss": 2.464, "mean_token_accuracy": 0.41724138259887694, "step": 107945 }, { "epoch": 0.10872838651909017, "grad_norm": 11.474340397399446, "learning_rate": 4.957701743017636e-05, "loss": 2.4107, "mean_token_accuracy": 0.41034482717514037, "step": 107950 }, { "epoch": 0.10873342257219434, "grad_norm": 12.57865789778267, "learning_rate": 4.957694511122318e-05, "loss": 2.4403, "mean_token_accuracy": 0.41379310488700866, "step": 107955 }, { "epoch": 0.10873845862529852, "grad_norm": 10.182736488404696, "learning_rate": 4.957687278614688e-05, "loss": 2.2516, "mean_token_accuracy": 0.4310344815254211, "step": 107960 }, { "epoch": 0.10874349467840269, "grad_norm": 10.297561916473837, "learning_rate": 4.957680045494747e-05, "loss": 2.2555, "mean_token_accuracy": 0.4938423693180084, "step": 107965 }, { "epoch": 0.10874853073150685, "grad_norm": 10.132930233663659, "learning_rate": 4.957672811762498e-05, "loss": 2.1096, "mean_token_accuracy": 0.42758620977401735, "step": 107970 }, { "epoch": 0.10875356678461102, "grad_norm": 10.729307032032356, "learning_rate": 4.957665577417942e-05, "loss": 2.7136, "mean_token_accuracy": 0.38965516686439516, "step": 107975 }, { "epoch": 0.1087586028377152, "grad_norm": 11.38983820438597, "learning_rate": 4.957658342461082e-05, "loss": 2.4992, "mean_token_accuracy": 0.4344827592372894, "step": 107980 }, { "epoch": 0.10876363889081937, "grad_norm": 11.025366738862676, "learning_rate": 4.957651106891921e-05, "loss": 2.6043, "mean_token_accuracy": 0.4034482777118683, "step": 107985 }, { "epoch": 0.10876867494392355, "grad_norm": 10.46297519838561, "learning_rate": 4.957643870710459e-05, "loss": 2.6122, "mean_token_accuracy": 0.3655172407627106, "step": 107990 }, { "epoch": 0.10877371099702772, "grad_norm": 9.541571150209489, "learning_rate": 4.9576366339166983e-05, "loss": 2.1058, "mean_token_accuracy": 0.5019963681697845, "step": 107995 }, { "epoch": 0.1087787470501319, "grad_norm": 10.95452656592583, "learning_rate": 4.9576293965106415e-05, "loss": 2.934, "mean_token_accuracy": 0.34137930870056155, "step": 108000 }, { "epoch": 0.10878378310323607, "grad_norm": 9.403151743631476, "learning_rate": 4.9576221584922904e-05, "loss": 2.0866, "mean_token_accuracy": 0.48275862336158754, "step": 108005 }, { "epoch": 0.10878881915634024, "grad_norm": 10.21360845422915, "learning_rate": 4.9576149198616475e-05, "loss": 2.3717, "mean_token_accuracy": 0.4172413766384125, "step": 108010 }, { "epoch": 0.10879385520944442, "grad_norm": 11.573570887355926, "learning_rate": 4.9576076806187144e-05, "loss": 2.3018, "mean_token_accuracy": 0.41724138259887694, "step": 108015 }, { "epoch": 0.10879889126254859, "grad_norm": 9.457313843012107, "learning_rate": 4.9576004407634916e-05, "loss": 2.5998, "mean_token_accuracy": 0.441379314661026, "step": 108020 }, { "epoch": 0.10880392731565276, "grad_norm": 10.661575915492778, "learning_rate": 4.9575932002959847e-05, "loss": 2.341, "mean_token_accuracy": 0.39310344457626345, "step": 108025 }, { "epoch": 0.10880896336875694, "grad_norm": 14.987939893896437, "learning_rate": 4.957585959216193e-05, "loss": 2.3976, "mean_token_accuracy": 0.38620689511299133, "step": 108030 }, { "epoch": 0.10881399942186111, "grad_norm": 10.495172534890404, "learning_rate": 4.957578717524118e-05, "loss": 1.9939, "mean_token_accuracy": 0.5160314619541169, "step": 108035 }, { "epoch": 0.10881903547496527, "grad_norm": 10.073257899056186, "learning_rate": 4.957571475219763e-05, "loss": 2.4188, "mean_token_accuracy": 0.4344827651977539, "step": 108040 }, { "epoch": 0.10882407152806944, "grad_norm": 10.020221787427186, "learning_rate": 4.957564232303131e-05, "loss": 2.2604, "mean_token_accuracy": 0.4758620738983154, "step": 108045 }, { "epoch": 0.10882910758117362, "grad_norm": 14.692738473639386, "learning_rate": 4.957556988774223e-05, "loss": 2.5038, "mean_token_accuracy": 0.458620685338974, "step": 108050 }, { "epoch": 0.10883414363427779, "grad_norm": 12.013371097987012, "learning_rate": 4.9575497446330396e-05, "loss": 2.1864, "mean_token_accuracy": 0.4344827592372894, "step": 108055 }, { "epoch": 0.10883917968738197, "grad_norm": 11.996179751429695, "learning_rate": 4.9575424998795846e-05, "loss": 2.5556, "mean_token_accuracy": 0.36896551847457887, "step": 108060 }, { "epoch": 0.10884421574048614, "grad_norm": 9.82080505469875, "learning_rate": 4.957535254513859e-05, "loss": 2.2525, "mean_token_accuracy": 0.4620689690113068, "step": 108065 }, { "epoch": 0.10884925179359031, "grad_norm": 15.273675682413538, "learning_rate": 4.957528008535865e-05, "loss": 2.7929, "mean_token_accuracy": 0.4068965494632721, "step": 108070 }, { "epoch": 0.10885428784669449, "grad_norm": 11.676115863539788, "learning_rate": 4.957520761945606e-05, "loss": 2.2338, "mean_token_accuracy": 0.4172413766384125, "step": 108075 }, { "epoch": 0.10885932389979866, "grad_norm": 12.064138271427389, "learning_rate": 4.957513514743082e-05, "loss": 2.3401, "mean_token_accuracy": 0.4275861978530884, "step": 108080 }, { "epoch": 0.10886435995290283, "grad_norm": 10.05242403700605, "learning_rate": 4.9575062669282956e-05, "loss": 2.4797, "mean_token_accuracy": 0.4068965494632721, "step": 108085 }, { "epoch": 0.10886939600600701, "grad_norm": 9.396606420006043, "learning_rate": 4.9574990185012506e-05, "loss": 2.4669, "mean_token_accuracy": 0.3827586233615875, "step": 108090 }, { "epoch": 0.10887443205911118, "grad_norm": 11.650425855942851, "learning_rate": 4.9574917694619463e-05, "loss": 2.615, "mean_token_accuracy": 0.37586206793785093, "step": 108095 }, { "epoch": 0.10887946811221536, "grad_norm": 10.267526455166738, "learning_rate": 4.9574845198103864e-05, "loss": 2.2319, "mean_token_accuracy": 0.4172413766384125, "step": 108100 }, { "epoch": 0.10888450416531953, "grad_norm": 12.498345415347659, "learning_rate": 4.957477269546572e-05, "loss": 2.2986, "mean_token_accuracy": 0.4379310369491577, "step": 108105 }, { "epoch": 0.10888954021842369, "grad_norm": 8.205448714540209, "learning_rate": 4.957470018670506e-05, "loss": 2.1284, "mean_token_accuracy": 0.5068965494632721, "step": 108110 }, { "epoch": 0.10889457627152786, "grad_norm": 19.647114577009354, "learning_rate": 4.95746276718219e-05, "loss": 2.5784, "mean_token_accuracy": 0.39655172228813174, "step": 108115 }, { "epoch": 0.10889961232463204, "grad_norm": 12.97147890919276, "learning_rate": 4.957455515081626e-05, "loss": 2.5496, "mean_token_accuracy": 0.3931034505367279, "step": 108120 }, { "epoch": 0.10890464837773621, "grad_norm": 9.935039590879072, "learning_rate": 4.957448262368815e-05, "loss": 2.5451, "mean_token_accuracy": 0.4137930989265442, "step": 108125 }, { "epoch": 0.10890968443084038, "grad_norm": 10.483668187022491, "learning_rate": 4.9574410090437604e-05, "loss": 2.3306, "mean_token_accuracy": 0.4482758641242981, "step": 108130 }, { "epoch": 0.10891472048394456, "grad_norm": 8.826081389787808, "learning_rate": 4.9574337551064645e-05, "loss": 2.2358, "mean_token_accuracy": 0.4413793087005615, "step": 108135 }, { "epoch": 0.10891975653704873, "grad_norm": 14.689322150219958, "learning_rate": 4.9574265005569284e-05, "loss": 2.3884, "mean_token_accuracy": 0.5103448331356049, "step": 108140 }, { "epoch": 0.1089247925901529, "grad_norm": 11.144727887098208, "learning_rate": 4.957419245395155e-05, "loss": 2.2089, "mean_token_accuracy": 0.4724137902259827, "step": 108145 }, { "epoch": 0.10892982864325708, "grad_norm": 10.544490285058664, "learning_rate": 4.957411989621144e-05, "loss": 2.1723, "mean_token_accuracy": 0.458620685338974, "step": 108150 }, { "epoch": 0.10893486469636125, "grad_norm": 9.467608452738634, "learning_rate": 4.9574047332349e-05, "loss": 2.4276, "mean_token_accuracy": 0.4000000059604645, "step": 108155 }, { "epoch": 0.10893990074946543, "grad_norm": 8.898884737868197, "learning_rate": 4.957397476236424e-05, "loss": 2.0143, "mean_token_accuracy": 0.4985480964183807, "step": 108160 }, { "epoch": 0.1089449368025696, "grad_norm": 15.059485773345463, "learning_rate": 4.9573902186257174e-05, "loss": 2.5793, "mean_token_accuracy": 0.4172413766384125, "step": 108165 }, { "epoch": 0.10894997285567377, "grad_norm": 9.320827974452195, "learning_rate": 4.957382960402784e-05, "loss": 2.1197, "mean_token_accuracy": 0.4571687877178192, "step": 108170 }, { "epoch": 0.10895500890877795, "grad_norm": 8.973035806766127, "learning_rate": 4.9573757015676234e-05, "loss": 2.6466, "mean_token_accuracy": 0.38965516686439516, "step": 108175 }, { "epoch": 0.10896004496188211, "grad_norm": 10.79885469224029, "learning_rate": 4.95736844212024e-05, "loss": 2.6738, "mean_token_accuracy": 0.39310344457626345, "step": 108180 }, { "epoch": 0.10896508101498628, "grad_norm": 11.975679355972376, "learning_rate": 4.9573611820606345e-05, "loss": 2.3399, "mean_token_accuracy": 0.45517241954803467, "step": 108185 }, { "epoch": 0.10897011706809046, "grad_norm": 9.281850799655222, "learning_rate": 4.957353921388809e-05, "loss": 3.0748, "mean_token_accuracy": 0.39655172228813174, "step": 108190 }, { "epoch": 0.10897515312119463, "grad_norm": 11.565441338285277, "learning_rate": 4.957346660104766e-05, "loss": 2.6548, "mean_token_accuracy": 0.4068965494632721, "step": 108195 }, { "epoch": 0.1089801891742988, "grad_norm": 10.197908561640979, "learning_rate": 4.957339398208507e-05, "loss": 2.2878, "mean_token_accuracy": 0.48275861144065857, "step": 108200 }, { "epoch": 0.10898522522740298, "grad_norm": 13.350505600424414, "learning_rate": 4.957332135700034e-05, "loss": 2.4207, "mean_token_accuracy": 0.42758620381355283, "step": 108205 }, { "epoch": 0.10899026128050715, "grad_norm": 10.519676093018191, "learning_rate": 4.957324872579349e-05, "loss": 2.4808, "mean_token_accuracy": 0.4366606116294861, "step": 108210 }, { "epoch": 0.10899529733361132, "grad_norm": 10.50880654358656, "learning_rate": 4.957317608846454e-05, "loss": 2.7415, "mean_token_accuracy": 0.38620689511299133, "step": 108215 }, { "epoch": 0.1090003333867155, "grad_norm": 13.56973323792708, "learning_rate": 4.957310344501352e-05, "loss": 2.8227, "mean_token_accuracy": 0.3862068921327591, "step": 108220 }, { "epoch": 0.10900536943981967, "grad_norm": 8.816985028773987, "learning_rate": 4.957303079544044e-05, "loss": 2.2547, "mean_token_accuracy": 0.46551724076271056, "step": 108225 }, { "epoch": 0.10901040549292385, "grad_norm": 9.696729429715983, "learning_rate": 4.957295813974532e-05, "loss": 2.3033, "mean_token_accuracy": 0.42413793206214906, "step": 108230 }, { "epoch": 0.10901544154602802, "grad_norm": 11.55662873057817, "learning_rate": 4.957288547792819e-05, "loss": 2.6538, "mean_token_accuracy": 0.3931034505367279, "step": 108235 }, { "epoch": 0.10902047759913219, "grad_norm": 11.430457806643274, "learning_rate": 4.957281280998905e-05, "loss": 2.5991, "mean_token_accuracy": 0.382758629322052, "step": 108240 }, { "epoch": 0.10902551365223637, "grad_norm": 12.761544139807148, "learning_rate": 4.957274013592794e-05, "loss": 2.4026, "mean_token_accuracy": 0.4482758641242981, "step": 108245 }, { "epoch": 0.10903054970534053, "grad_norm": 10.93873973240141, "learning_rate": 4.957266745574488e-05, "loss": 2.3708, "mean_token_accuracy": 0.42413793206214906, "step": 108250 }, { "epoch": 0.1090355857584447, "grad_norm": 9.972063477556208, "learning_rate": 4.9572594769439875e-05, "loss": 2.285, "mean_token_accuracy": 0.4310344696044922, "step": 108255 }, { "epoch": 0.10904062181154887, "grad_norm": 13.358316529814356, "learning_rate": 4.9572522077012954e-05, "loss": 2.4205, "mean_token_accuracy": 0.42758620381355283, "step": 108260 }, { "epoch": 0.10904565786465305, "grad_norm": 11.587656475147119, "learning_rate": 4.957244937846414e-05, "loss": 2.5325, "mean_token_accuracy": 0.42413793206214906, "step": 108265 }, { "epoch": 0.10905069391775722, "grad_norm": 7.870348937029143, "learning_rate": 4.957237667379345e-05, "loss": 2.0006, "mean_token_accuracy": 0.4931034445762634, "step": 108270 }, { "epoch": 0.1090557299708614, "grad_norm": 12.540386994390301, "learning_rate": 4.95723039630009e-05, "loss": 2.498, "mean_token_accuracy": 0.4206896543502808, "step": 108275 }, { "epoch": 0.10906076602396557, "grad_norm": 12.365801220103963, "learning_rate": 4.9572231246086515e-05, "loss": 2.3619, "mean_token_accuracy": 0.47586206197738645, "step": 108280 }, { "epoch": 0.10906580207706974, "grad_norm": 15.286300865465538, "learning_rate": 4.957215852305032e-05, "loss": 2.9182, "mean_token_accuracy": 0.38965516686439516, "step": 108285 }, { "epoch": 0.10907083813017392, "grad_norm": 12.390272458637462, "learning_rate": 4.957208579389233e-05, "loss": 2.6025, "mean_token_accuracy": 0.3896551728248596, "step": 108290 }, { "epoch": 0.10907587418327809, "grad_norm": 9.10308311971539, "learning_rate": 4.9572013058612554e-05, "loss": 2.4029, "mean_token_accuracy": 0.42413793206214906, "step": 108295 }, { "epoch": 0.10908091023638226, "grad_norm": 9.723218202371056, "learning_rate": 4.957194031721102e-05, "loss": 1.9334, "mean_token_accuracy": 0.4670901417732239, "step": 108300 }, { "epoch": 0.10908594628948644, "grad_norm": 10.91961489154031, "learning_rate": 4.9571867569687765e-05, "loss": 2.0924, "mean_token_accuracy": 0.41724138855934145, "step": 108305 }, { "epoch": 0.10909098234259061, "grad_norm": 10.053105494412218, "learning_rate": 4.957179481604279e-05, "loss": 2.8632, "mean_token_accuracy": 0.43623714447021483, "step": 108310 }, { "epoch": 0.10909601839569479, "grad_norm": 13.117692457570408, "learning_rate": 4.957172205627612e-05, "loss": 2.4038, "mean_token_accuracy": 0.45517241954803467, "step": 108315 }, { "epoch": 0.10910105444879895, "grad_norm": 9.433686975924559, "learning_rate": 4.9571649290387776e-05, "loss": 2.1009, "mean_token_accuracy": 0.4965517342090607, "step": 108320 }, { "epoch": 0.10910609050190312, "grad_norm": 13.404124523328223, "learning_rate": 4.957157651837778e-05, "loss": 2.2973, "mean_token_accuracy": 0.4847549915313721, "step": 108325 }, { "epoch": 0.10911112655500729, "grad_norm": 10.215457499307105, "learning_rate": 4.957150374024615e-05, "loss": 2.3049, "mean_token_accuracy": 0.48275862336158754, "step": 108330 }, { "epoch": 0.10911616260811147, "grad_norm": 10.884165755558707, "learning_rate": 4.95714309559929e-05, "loss": 2.4353, "mean_token_accuracy": 0.40344828367233276, "step": 108335 }, { "epoch": 0.10912119866121564, "grad_norm": 10.961152783339772, "learning_rate": 4.957135816561806e-05, "loss": 2.4183, "mean_token_accuracy": 0.4482758641242981, "step": 108340 }, { "epoch": 0.10912623471431981, "grad_norm": 10.339602000872851, "learning_rate": 4.957128536912165e-05, "loss": 2.1201, "mean_token_accuracy": 0.4896551787853241, "step": 108345 }, { "epoch": 0.10913127076742399, "grad_norm": 9.372026241108806, "learning_rate": 4.9571212566503675e-05, "loss": 1.9588, "mean_token_accuracy": 0.5137930989265442, "step": 108350 }, { "epoch": 0.10913630682052816, "grad_norm": 10.981711469328777, "learning_rate": 4.9571139757764176e-05, "loss": 2.1345, "mean_token_accuracy": 0.4482758641242981, "step": 108355 }, { "epoch": 0.10914134287363234, "grad_norm": 10.200577394268366, "learning_rate": 4.957106694290316e-05, "loss": 2.3385, "mean_token_accuracy": 0.4517241358757019, "step": 108360 }, { "epoch": 0.10914637892673651, "grad_norm": 8.376924389264458, "learning_rate": 4.9570994121920656e-05, "loss": 2.133, "mean_token_accuracy": 0.4931034564971924, "step": 108365 }, { "epoch": 0.10915141497984068, "grad_norm": 10.327140714044267, "learning_rate": 4.957092129481668e-05, "loss": 2.5852, "mean_token_accuracy": 0.4034482717514038, "step": 108370 }, { "epoch": 0.10915645103294486, "grad_norm": 10.343691824511522, "learning_rate": 4.957084846159124e-05, "loss": 2.7169, "mean_token_accuracy": 0.4068965494632721, "step": 108375 }, { "epoch": 0.10916148708604903, "grad_norm": 12.118700244603959, "learning_rate": 4.9570775622244384e-05, "loss": 2.5423, "mean_token_accuracy": 0.4103448212146759, "step": 108380 }, { "epoch": 0.1091665231391532, "grad_norm": 10.901960424015048, "learning_rate": 4.9570702776776106e-05, "loss": 2.6154, "mean_token_accuracy": 0.3586206942796707, "step": 108385 }, { "epoch": 0.10917155919225736, "grad_norm": 9.660631562939315, "learning_rate": 4.957062992518644e-05, "loss": 2.5239, "mean_token_accuracy": 0.41034482717514037, "step": 108390 }, { "epoch": 0.10917659524536154, "grad_norm": 10.805359119674723, "learning_rate": 4.95705570674754e-05, "loss": 3.0089, "mean_token_accuracy": 0.3448275774717331, "step": 108395 }, { "epoch": 0.10918163129846571, "grad_norm": 10.562073921113637, "learning_rate": 4.9570484203643016e-05, "loss": 2.2378, "mean_token_accuracy": 0.5110837459564209, "step": 108400 }, { "epoch": 0.10918666735156989, "grad_norm": 9.499923768028935, "learning_rate": 4.957041133368929e-05, "loss": 2.2979, "mean_token_accuracy": 0.42758620977401735, "step": 108405 }, { "epoch": 0.10919170340467406, "grad_norm": 13.03946893888272, "learning_rate": 4.9570338457614265e-05, "loss": 2.2928, "mean_token_accuracy": 0.4310344934463501, "step": 108410 }, { "epoch": 0.10919673945777823, "grad_norm": 11.417979671059793, "learning_rate": 4.957026557541794e-05, "loss": 2.5613, "mean_token_accuracy": 0.39310345649719236, "step": 108415 }, { "epoch": 0.1092017755108824, "grad_norm": 8.99716566265736, "learning_rate": 4.957019268710035e-05, "loss": 1.9817, "mean_token_accuracy": 0.4724137902259827, "step": 108420 }, { "epoch": 0.10920681156398658, "grad_norm": 12.568965942508944, "learning_rate": 4.95701197926615e-05, "loss": 2.7194, "mean_token_accuracy": 0.38275861740112305, "step": 108425 }, { "epoch": 0.10921184761709075, "grad_norm": 10.503866668828742, "learning_rate": 4.957004689210143e-05, "loss": 2.321, "mean_token_accuracy": 0.4172413766384125, "step": 108430 }, { "epoch": 0.10921688367019493, "grad_norm": 11.453502483966966, "learning_rate": 4.956997398542015e-05, "loss": 2.4259, "mean_token_accuracy": 0.4379310369491577, "step": 108435 }, { "epoch": 0.1092219197232991, "grad_norm": 9.504855925697074, "learning_rate": 4.956990107261768e-05, "loss": 2.5629, "mean_token_accuracy": 0.4034482717514038, "step": 108440 }, { "epoch": 0.10922695577640328, "grad_norm": 7.927636634948441, "learning_rate": 4.956982815369403e-05, "loss": 2.1775, "mean_token_accuracy": 0.47241379618644713, "step": 108445 }, { "epoch": 0.10923199182950745, "grad_norm": 9.68166631441675, "learning_rate": 4.9569755228649244e-05, "loss": 2.6143, "mean_token_accuracy": 0.39310344457626345, "step": 108450 }, { "epoch": 0.10923702788261162, "grad_norm": 9.734905257771333, "learning_rate": 4.9569682297483316e-05, "loss": 2.2195, "mean_token_accuracy": 0.47241380214691164, "step": 108455 }, { "epoch": 0.10924206393571578, "grad_norm": 13.6352396891129, "learning_rate": 4.9569609360196294e-05, "loss": 2.6495, "mean_token_accuracy": 0.3551724165678024, "step": 108460 }, { "epoch": 0.10924709998881996, "grad_norm": 10.145225661141481, "learning_rate": 4.956953641678818e-05, "loss": 2.1389, "mean_token_accuracy": 0.47241379618644713, "step": 108465 }, { "epoch": 0.10925213604192413, "grad_norm": 11.423748588562828, "learning_rate": 4.956946346725899e-05, "loss": 2.341, "mean_token_accuracy": 0.4137930989265442, "step": 108470 }, { "epoch": 0.1092571720950283, "grad_norm": 10.463783815585675, "learning_rate": 4.956939051160875e-05, "loss": 2.1605, "mean_token_accuracy": 0.44137930274009707, "step": 108475 }, { "epoch": 0.10926220814813248, "grad_norm": 11.02651498479961, "learning_rate": 4.956931754983749e-05, "loss": 2.5377, "mean_token_accuracy": 0.4241379380226135, "step": 108480 }, { "epoch": 0.10926724420123665, "grad_norm": 11.648402800939554, "learning_rate": 4.9569244581945225e-05, "loss": 2.6916, "mean_token_accuracy": 0.36346037983894347, "step": 108485 }, { "epoch": 0.10927228025434083, "grad_norm": 12.758083638189552, "learning_rate": 4.956917160793197e-05, "loss": 2.677, "mean_token_accuracy": 0.35172412991523744, "step": 108490 }, { "epoch": 0.109277316307445, "grad_norm": 10.346411580881599, "learning_rate": 4.956909862779774e-05, "loss": 2.1845, "mean_token_accuracy": 0.4655172348022461, "step": 108495 }, { "epoch": 0.10928235236054917, "grad_norm": 12.511417127285094, "learning_rate": 4.956902564154258e-05, "loss": 2.2436, "mean_token_accuracy": 0.48620688915252686, "step": 108500 }, { "epoch": 0.10928738841365335, "grad_norm": 8.916031877945324, "learning_rate": 4.956895264916648e-05, "loss": 2.6102, "mean_token_accuracy": 0.3758620619773865, "step": 108505 }, { "epoch": 0.10929242446675752, "grad_norm": 11.873683213908357, "learning_rate": 4.9568879650669475e-05, "loss": 2.4268, "mean_token_accuracy": 0.3931034505367279, "step": 108510 }, { "epoch": 0.1092974605198617, "grad_norm": 9.888570492973518, "learning_rate": 4.956880664605159e-05, "loss": 2.3682, "mean_token_accuracy": 0.4137930989265442, "step": 108515 }, { "epoch": 0.10930249657296587, "grad_norm": 10.962545228941444, "learning_rate": 4.956873363531283e-05, "loss": 2.7676, "mean_token_accuracy": 0.36896551847457887, "step": 108520 }, { "epoch": 0.10930753262607004, "grad_norm": 9.229155459405217, "learning_rate": 4.9568660618453235e-05, "loss": 1.9578, "mean_token_accuracy": 0.4551724135875702, "step": 108525 }, { "epoch": 0.1093125686791742, "grad_norm": 11.714011720261418, "learning_rate": 4.9568587595472805e-05, "loss": 2.5371, "mean_token_accuracy": 0.40532365441322327, "step": 108530 }, { "epoch": 0.10931760473227838, "grad_norm": 8.694130699884665, "learning_rate": 4.956851456637157e-05, "loss": 2.124, "mean_token_accuracy": 0.43103447556495667, "step": 108535 }, { "epoch": 0.10932264078538255, "grad_norm": 9.940895009659549, "learning_rate": 4.956844153114956e-05, "loss": 1.9123, "mean_token_accuracy": 0.5275861978530884, "step": 108540 }, { "epoch": 0.10932767683848672, "grad_norm": 11.782744428796434, "learning_rate": 4.9568368489806773e-05, "loss": 2.5556, "mean_token_accuracy": 0.4620689630508423, "step": 108545 }, { "epoch": 0.1093327128915909, "grad_norm": 7.097656365248149, "learning_rate": 4.956829544234325e-05, "loss": 1.9566, "mean_token_accuracy": 0.515517246723175, "step": 108550 }, { "epoch": 0.10933774894469507, "grad_norm": 16.880874774381244, "learning_rate": 4.9568222388759e-05, "loss": 2.4144, "mean_token_accuracy": 0.37586206793785093, "step": 108555 }, { "epoch": 0.10934278499779924, "grad_norm": 11.434189214434877, "learning_rate": 4.9568149329054056e-05, "loss": 2.4591, "mean_token_accuracy": 0.4034482717514038, "step": 108560 }, { "epoch": 0.10934782105090342, "grad_norm": 13.50847539939889, "learning_rate": 4.9568076263228416e-05, "loss": 2.6547, "mean_token_accuracy": 0.3999999940395355, "step": 108565 }, { "epoch": 0.10935285710400759, "grad_norm": 10.963449505652344, "learning_rate": 4.9568003191282115e-05, "loss": 2.4893, "mean_token_accuracy": 0.4551724135875702, "step": 108570 }, { "epoch": 0.10935789315711177, "grad_norm": 11.571722551782264, "learning_rate": 4.9567930113215175e-05, "loss": 2.3046, "mean_token_accuracy": 0.4068965494632721, "step": 108575 }, { "epoch": 0.10936292921021594, "grad_norm": 11.300867966490793, "learning_rate": 4.95678570290276e-05, "loss": 2.8549, "mean_token_accuracy": 0.39999999701976774, "step": 108580 }, { "epoch": 0.10936796526332011, "grad_norm": 8.203230108640994, "learning_rate": 4.9567783938719436e-05, "loss": 2.4367, "mean_token_accuracy": 0.4853599429130554, "step": 108585 }, { "epoch": 0.10937300131642429, "grad_norm": 10.602353265511875, "learning_rate": 4.9567710842290685e-05, "loss": 2.5164, "mean_token_accuracy": 0.38275861740112305, "step": 108590 }, { "epoch": 0.10937803736952846, "grad_norm": 10.36125809228581, "learning_rate": 4.9567637739741375e-05, "loss": 2.3382, "mean_token_accuracy": 0.43103448748588563, "step": 108595 }, { "epoch": 0.10938307342263262, "grad_norm": 9.579450353640178, "learning_rate": 4.956756463107152e-05, "loss": 2.6928, "mean_token_accuracy": 0.3931034505367279, "step": 108600 }, { "epoch": 0.1093881094757368, "grad_norm": 7.815804526294398, "learning_rate": 4.956749151628115e-05, "loss": 2.4032, "mean_token_accuracy": 0.4483968555927277, "step": 108605 }, { "epoch": 0.10939314552884097, "grad_norm": 11.501560278260502, "learning_rate": 4.956741839537027e-05, "loss": 2.3212, "mean_token_accuracy": 0.44137930274009707, "step": 108610 }, { "epoch": 0.10939818158194514, "grad_norm": 12.306308563207965, "learning_rate": 4.956734526833891e-05, "loss": 2.0798, "mean_token_accuracy": 0.4570477843284607, "step": 108615 }, { "epoch": 0.10940321763504932, "grad_norm": 9.596085647339196, "learning_rate": 4.95672721351871e-05, "loss": 2.2661, "mean_token_accuracy": 0.4379310250282288, "step": 108620 }, { "epoch": 0.10940825368815349, "grad_norm": 10.933406617928826, "learning_rate": 4.956719899591484e-05, "loss": 2.5301, "mean_token_accuracy": 0.37241379618644715, "step": 108625 }, { "epoch": 0.10941328974125766, "grad_norm": 12.024663090448541, "learning_rate": 4.956712585052216e-05, "loss": 2.0425, "mean_token_accuracy": 0.49655171036720275, "step": 108630 }, { "epoch": 0.10941832579436184, "grad_norm": 11.365256647370954, "learning_rate": 4.956705269900908e-05, "loss": 2.3019, "mean_token_accuracy": 0.44482759237289426, "step": 108635 }, { "epoch": 0.10942336184746601, "grad_norm": 11.117497196982137, "learning_rate": 4.956697954137563e-05, "loss": 2.1995, "mean_token_accuracy": 0.4137930989265442, "step": 108640 }, { "epoch": 0.10942839790057018, "grad_norm": 11.286968685953827, "learning_rate": 4.9566906377621814e-05, "loss": 2.5183, "mean_token_accuracy": 0.3896551728248596, "step": 108645 }, { "epoch": 0.10943343395367436, "grad_norm": 10.844175633416276, "learning_rate": 4.9566833207747656e-05, "loss": 2.4981, "mean_token_accuracy": 0.41379310488700866, "step": 108650 }, { "epoch": 0.10943847000677853, "grad_norm": 10.260610069847061, "learning_rate": 4.9566760031753176e-05, "loss": 2.4458, "mean_token_accuracy": 0.4344827592372894, "step": 108655 }, { "epoch": 0.1094435060598827, "grad_norm": 11.182699354731659, "learning_rate": 4.956668684963841e-05, "loss": 2.7002, "mean_token_accuracy": 0.42589232325553894, "step": 108660 }, { "epoch": 0.10944854211298688, "grad_norm": 8.458551738372089, "learning_rate": 4.956661366140336e-05, "loss": 2.508, "mean_token_accuracy": 0.37586207389831544, "step": 108665 }, { "epoch": 0.10945357816609104, "grad_norm": 11.192587152765487, "learning_rate": 4.956654046704805e-05, "loss": 2.5648, "mean_token_accuracy": 0.38100423812866213, "step": 108670 }, { "epoch": 0.10945861421919521, "grad_norm": 11.511134975528474, "learning_rate": 4.9566467266572514e-05, "loss": 2.5, "mean_token_accuracy": 0.3999999940395355, "step": 108675 }, { "epoch": 0.10946365027229939, "grad_norm": 12.47021857143296, "learning_rate": 4.956639405997675e-05, "loss": 2.0222, "mean_token_accuracy": 0.5413793087005615, "step": 108680 }, { "epoch": 0.10946868632540356, "grad_norm": 11.419565921442782, "learning_rate": 4.956632084726079e-05, "loss": 2.4063, "mean_token_accuracy": 0.38275861740112305, "step": 108685 }, { "epoch": 0.10947372237850773, "grad_norm": 11.398851867381527, "learning_rate": 4.9566247628424655e-05, "loss": 2.2905, "mean_token_accuracy": 0.45517241954803467, "step": 108690 }, { "epoch": 0.10947875843161191, "grad_norm": 10.492629529401714, "learning_rate": 4.9566174403468365e-05, "loss": 2.5323, "mean_token_accuracy": 0.44482758045196535, "step": 108695 }, { "epoch": 0.10948379448471608, "grad_norm": 10.0336889384859, "learning_rate": 4.956610117239194e-05, "loss": 2.2245, "mean_token_accuracy": 0.4569268047809601, "step": 108700 }, { "epoch": 0.10948883053782026, "grad_norm": 11.181414314160177, "learning_rate": 4.9566027935195396e-05, "loss": 2.2796, "mean_token_accuracy": 0.43272837400436404, "step": 108705 }, { "epoch": 0.10949386659092443, "grad_norm": 8.98544241455164, "learning_rate": 4.9565954691878765e-05, "loss": 2.2329, "mean_token_accuracy": 0.44827585816383364, "step": 108710 }, { "epoch": 0.1094989026440286, "grad_norm": 10.57836124799796, "learning_rate": 4.956588144244205e-05, "loss": 2.5064, "mean_token_accuracy": 0.4241379380226135, "step": 108715 }, { "epoch": 0.10950393869713278, "grad_norm": 11.9543074173301, "learning_rate": 4.9565808186885285e-05, "loss": 2.4164, "mean_token_accuracy": 0.4, "step": 108720 }, { "epoch": 0.10950897475023695, "grad_norm": 9.808049072316026, "learning_rate": 4.9565734925208476e-05, "loss": 2.977, "mean_token_accuracy": 0.3482758551836014, "step": 108725 }, { "epoch": 0.10951401080334112, "grad_norm": 9.833083804690995, "learning_rate": 4.956566165741167e-05, "loss": 2.1515, "mean_token_accuracy": 0.4620689630508423, "step": 108730 }, { "epoch": 0.1095190468564453, "grad_norm": 13.038763886401624, "learning_rate": 4.956558838349486e-05, "loss": 2.3303, "mean_token_accuracy": 0.43448275327682495, "step": 108735 }, { "epoch": 0.10952408290954946, "grad_norm": 10.162877259629278, "learning_rate": 4.9565515103458074e-05, "loss": 2.4305, "mean_token_accuracy": 0.4689655125141144, "step": 108740 }, { "epoch": 0.10952911896265363, "grad_norm": 11.539658194386842, "learning_rate": 4.956544181730134e-05, "loss": 2.4234, "mean_token_accuracy": 0.4, "step": 108745 }, { "epoch": 0.1095341550157578, "grad_norm": 11.546414194787163, "learning_rate": 4.9565368525024676e-05, "loss": 2.5711, "mean_token_accuracy": 0.3344827651977539, "step": 108750 }, { "epoch": 0.10953919106886198, "grad_norm": 7.491873613831233, "learning_rate": 4.95652952266281e-05, "loss": 2.024, "mean_token_accuracy": 0.48844525814056394, "step": 108755 }, { "epoch": 0.10954422712196615, "grad_norm": 10.461560499969957, "learning_rate": 4.9565221922111625e-05, "loss": 1.9001, "mean_token_accuracy": 0.5112069010734558, "step": 108760 }, { "epoch": 0.10954926317507033, "grad_norm": 9.423412894797863, "learning_rate": 4.956514861147529e-05, "loss": 2.5084, "mean_token_accuracy": 0.42758620381355283, "step": 108765 }, { "epoch": 0.1095542992281745, "grad_norm": 9.032713231007905, "learning_rate": 4.9565075294719086e-05, "loss": 2.472, "mean_token_accuracy": 0.4586206912994385, "step": 108770 }, { "epoch": 0.10955933528127867, "grad_norm": 9.549320520417819, "learning_rate": 4.956500197184307e-05, "loss": 2.336, "mean_token_accuracy": 0.4344827592372894, "step": 108775 }, { "epoch": 0.10956437133438285, "grad_norm": 8.858719270822732, "learning_rate": 4.956492864284723e-05, "loss": 2.2514, "mean_token_accuracy": 0.4448275864124298, "step": 108780 }, { "epoch": 0.10956940738748702, "grad_norm": 8.972141698119945, "learning_rate": 4.9564855307731604e-05, "loss": 2.4366, "mean_token_accuracy": 0.42413793206214906, "step": 108785 }, { "epoch": 0.1095744434405912, "grad_norm": 10.75523328238598, "learning_rate": 4.956478196649621e-05, "loss": 2.5833, "mean_token_accuracy": 0.458620685338974, "step": 108790 }, { "epoch": 0.10957947949369537, "grad_norm": 15.136250847038207, "learning_rate": 4.956470861914107e-05, "loss": 2.4689, "mean_token_accuracy": 0.43793103098869324, "step": 108795 }, { "epoch": 0.10958451554679954, "grad_norm": 11.161871078113332, "learning_rate": 4.956463526566619e-05, "loss": 2.2914, "mean_token_accuracy": 0.4034482777118683, "step": 108800 }, { "epoch": 0.10958955159990372, "grad_norm": 11.104912436014118, "learning_rate": 4.956456190607161e-05, "loss": 2.4898, "mean_token_accuracy": 0.4084089517593384, "step": 108805 }, { "epoch": 0.10959458765300788, "grad_norm": 12.26517361156502, "learning_rate": 4.9564488540357333e-05, "loss": 2.5938, "mean_token_accuracy": 0.3862068891525269, "step": 108810 }, { "epoch": 0.10959962370611205, "grad_norm": 24.670047407451257, "learning_rate": 4.9564415168523395e-05, "loss": 2.9217, "mean_token_accuracy": 0.35517241060733795, "step": 108815 }, { "epoch": 0.10960465975921622, "grad_norm": 9.313628219943586, "learning_rate": 4.9564341790569814e-05, "loss": 2.0554, "mean_token_accuracy": 0.47586206793785096, "step": 108820 }, { "epoch": 0.1096096958123204, "grad_norm": 11.91295505229531, "learning_rate": 4.9564268406496595e-05, "loss": 2.4908, "mean_token_accuracy": 0.37586207389831544, "step": 108825 }, { "epoch": 0.10961473186542457, "grad_norm": 11.376378925485312, "learning_rate": 4.9564195016303773e-05, "loss": 2.2501, "mean_token_accuracy": 0.46896552443504336, "step": 108830 }, { "epoch": 0.10961976791852875, "grad_norm": 12.06563125639636, "learning_rate": 4.956412161999136e-05, "loss": 3.1737, "mean_token_accuracy": 0.29999999403953553, "step": 108835 }, { "epoch": 0.10962480397163292, "grad_norm": 10.02406054281331, "learning_rate": 4.956404821755938e-05, "loss": 2.5961, "mean_token_accuracy": 0.3931034505367279, "step": 108840 }, { "epoch": 0.1096298400247371, "grad_norm": 10.01634283227325, "learning_rate": 4.956397480900786e-05, "loss": 2.7164, "mean_token_accuracy": 0.3448275923728943, "step": 108845 }, { "epoch": 0.10963487607784127, "grad_norm": 10.959486846850748, "learning_rate": 4.9563901394336816e-05, "loss": 2.5941, "mean_token_accuracy": 0.40689654350280763, "step": 108850 }, { "epoch": 0.10963991213094544, "grad_norm": 10.777803636831063, "learning_rate": 4.9563827973546265e-05, "loss": 2.3814, "mean_token_accuracy": 0.4206896543502808, "step": 108855 }, { "epoch": 0.10964494818404961, "grad_norm": 11.234310989495029, "learning_rate": 4.9563754546636225e-05, "loss": 2.1941, "mean_token_accuracy": 0.5137930870056152, "step": 108860 }, { "epoch": 0.10964998423715379, "grad_norm": 13.029129294493176, "learning_rate": 4.9563681113606725e-05, "loss": 2.1202, "mean_token_accuracy": 0.4482758641242981, "step": 108865 }, { "epoch": 0.10965502029025796, "grad_norm": 13.2264640952751, "learning_rate": 4.956360767445777e-05, "loss": 2.4013, "mean_token_accuracy": 0.45347853302955626, "step": 108870 }, { "epoch": 0.10966005634336214, "grad_norm": 10.588105514833211, "learning_rate": 4.95635342291894e-05, "loss": 2.5287, "mean_token_accuracy": 0.4310344815254211, "step": 108875 }, { "epoch": 0.1096650923964663, "grad_norm": 8.38986348488346, "learning_rate": 4.956346077780163e-05, "loss": 2.4858, "mean_token_accuracy": 0.42280701994895936, "step": 108880 }, { "epoch": 0.10967012844957047, "grad_norm": 11.367896400979324, "learning_rate": 4.9563387320294475e-05, "loss": 2.5733, "mean_token_accuracy": 0.4137930989265442, "step": 108885 }, { "epoch": 0.10967516450267464, "grad_norm": 10.94798172670199, "learning_rate": 4.9563313856667956e-05, "loss": 2.8079, "mean_token_accuracy": 0.3655172407627106, "step": 108890 }, { "epoch": 0.10968020055577882, "grad_norm": 9.997614792450273, "learning_rate": 4.9563240386922085e-05, "loss": 2.559, "mean_token_accuracy": 0.4363581418991089, "step": 108895 }, { "epoch": 0.10968523660888299, "grad_norm": 12.603090892798074, "learning_rate": 4.95631669110569e-05, "loss": 2.4431, "mean_token_accuracy": 0.3965517282485962, "step": 108900 }, { "epoch": 0.10969027266198716, "grad_norm": 12.577871356908824, "learning_rate": 4.9563093429072414e-05, "loss": 2.2132, "mean_token_accuracy": 0.45517241954803467, "step": 108905 }, { "epoch": 0.10969530871509134, "grad_norm": 11.31073341549833, "learning_rate": 4.956301994096865e-05, "loss": 2.2594, "mean_token_accuracy": 0.4379310369491577, "step": 108910 }, { "epoch": 0.10970034476819551, "grad_norm": 10.245542774431373, "learning_rate": 4.9562946446745626e-05, "loss": 2.4868, "mean_token_accuracy": 0.36896551847457887, "step": 108915 }, { "epoch": 0.10970538082129969, "grad_norm": 11.499372335243203, "learning_rate": 4.956287294640335e-05, "loss": 2.4993, "mean_token_accuracy": 0.41379310488700866, "step": 108920 }, { "epoch": 0.10971041687440386, "grad_norm": 11.5584277646368, "learning_rate": 4.956279943994186e-05, "loss": 2.2586, "mean_token_accuracy": 0.44361767172813416, "step": 108925 }, { "epoch": 0.10971545292750803, "grad_norm": 11.26854249102457, "learning_rate": 4.9562725927361175e-05, "loss": 2.704, "mean_token_accuracy": 0.4413793087005615, "step": 108930 }, { "epoch": 0.10972048898061221, "grad_norm": 12.244780790930013, "learning_rate": 4.9562652408661306e-05, "loss": 2.5002, "mean_token_accuracy": 0.39310344457626345, "step": 108935 }, { "epoch": 0.10972552503371638, "grad_norm": 9.966766507119857, "learning_rate": 4.9562578883842274e-05, "loss": 2.1889, "mean_token_accuracy": 0.4655172348022461, "step": 108940 }, { "epoch": 0.10973056108682055, "grad_norm": 9.487043282670616, "learning_rate": 4.956250535290411e-05, "loss": 2.4244, "mean_token_accuracy": 0.4517241418361664, "step": 108945 }, { "epoch": 0.10973559713992471, "grad_norm": 10.76844236529563, "learning_rate": 4.956243181584683e-05, "loss": 2.1593, "mean_token_accuracy": 0.4344827592372894, "step": 108950 }, { "epoch": 0.10974063319302889, "grad_norm": 14.655030719350563, "learning_rate": 4.9562358272670446e-05, "loss": 2.4538, "mean_token_accuracy": 0.43793103098869324, "step": 108955 }, { "epoch": 0.10974566924613306, "grad_norm": 9.489390959980613, "learning_rate": 4.956228472337499e-05, "loss": 2.4256, "mean_token_accuracy": 0.42068966031074523, "step": 108960 }, { "epoch": 0.10975070529923724, "grad_norm": 8.45499594880466, "learning_rate": 4.956221116796047e-05, "loss": 2.1903, "mean_token_accuracy": 0.4862068951129913, "step": 108965 }, { "epoch": 0.10975574135234141, "grad_norm": 10.663743035899332, "learning_rate": 4.9562137606426915e-05, "loss": 2.2014, "mean_token_accuracy": 0.44482759237289426, "step": 108970 }, { "epoch": 0.10976077740544558, "grad_norm": 8.931933969285565, "learning_rate": 4.9562064038774355e-05, "loss": 2.4114, "mean_token_accuracy": 0.4241379380226135, "step": 108975 }, { "epoch": 0.10976581345854976, "grad_norm": 10.06488116436584, "learning_rate": 4.956199046500279e-05, "loss": 2.3047, "mean_token_accuracy": 0.4689655065536499, "step": 108980 }, { "epoch": 0.10977084951165393, "grad_norm": 9.62908143732193, "learning_rate": 4.956191688511225e-05, "loss": 2.677, "mean_token_accuracy": 0.4034482777118683, "step": 108985 }, { "epoch": 0.1097758855647581, "grad_norm": 11.612168542583195, "learning_rate": 4.956184329910275e-05, "loss": 2.4342, "mean_token_accuracy": 0.3965517282485962, "step": 108990 }, { "epoch": 0.10978092161786228, "grad_norm": 10.823624749613211, "learning_rate": 4.956176970697432e-05, "loss": 2.7756, "mean_token_accuracy": 0.37586206793785093, "step": 108995 }, { "epoch": 0.10978595767096645, "grad_norm": 8.696694680540846, "learning_rate": 4.956169610872697e-05, "loss": 2.2633, "mean_token_accuracy": 0.443254691362381, "step": 109000 }, { "epoch": 0.10979099372407063, "grad_norm": 9.115203828898157, "learning_rate": 4.9561622504360744e-05, "loss": 2.4558, "mean_token_accuracy": 0.4068965554237366, "step": 109005 }, { "epoch": 0.1097960297771748, "grad_norm": 10.394024740651497, "learning_rate": 4.956154889387563e-05, "loss": 2.4039, "mean_token_accuracy": 0.42262552976608275, "step": 109010 }, { "epoch": 0.10980106583027897, "grad_norm": 12.558174560202506, "learning_rate": 4.956147527727167e-05, "loss": 2.4549, "mean_token_accuracy": 0.4482758641242981, "step": 109015 }, { "epoch": 0.10980610188338313, "grad_norm": 13.029078177801535, "learning_rate": 4.956140165454887e-05, "loss": 2.3651, "mean_token_accuracy": 0.4137930929660797, "step": 109020 }, { "epoch": 0.10981113793648731, "grad_norm": 8.719260535972563, "learning_rate": 4.956132802570726e-05, "loss": 2.2063, "mean_token_accuracy": 0.48620688915252686, "step": 109025 }, { "epoch": 0.10981617398959148, "grad_norm": 14.97062534915284, "learning_rate": 4.9561254390746866e-05, "loss": 2.6109, "mean_token_accuracy": 0.3862069010734558, "step": 109030 }, { "epoch": 0.10982121004269566, "grad_norm": 9.595315478970837, "learning_rate": 4.956118074966769e-05, "loss": 2.3554, "mean_token_accuracy": 0.42758620977401735, "step": 109035 }, { "epoch": 0.10982624609579983, "grad_norm": 10.244274513043388, "learning_rate": 4.956110710246977e-05, "loss": 2.2588, "mean_token_accuracy": 0.4344827473163605, "step": 109040 }, { "epoch": 0.109831282148904, "grad_norm": 11.507146602902875, "learning_rate": 4.9561033449153124e-05, "loss": 2.7435, "mean_token_accuracy": 0.3793103456497192, "step": 109045 }, { "epoch": 0.10983631820200818, "grad_norm": 8.293472591177698, "learning_rate": 4.956095978971776e-05, "loss": 1.7032, "mean_token_accuracy": 0.5572292804718018, "step": 109050 }, { "epoch": 0.10984135425511235, "grad_norm": 11.360813037703537, "learning_rate": 4.9560886124163704e-05, "loss": 2.936, "mean_token_accuracy": 0.3517241358757019, "step": 109055 }, { "epoch": 0.10984639030821652, "grad_norm": 9.135239364974659, "learning_rate": 4.9560812452490985e-05, "loss": 2.1318, "mean_token_accuracy": 0.5000000059604645, "step": 109060 }, { "epoch": 0.1098514263613207, "grad_norm": 14.214867158708788, "learning_rate": 4.956073877469961e-05, "loss": 2.4464, "mean_token_accuracy": 0.4241379380226135, "step": 109065 }, { "epoch": 0.10985646241442487, "grad_norm": 10.533282752688555, "learning_rate": 4.956066509078962e-05, "loss": 2.4081, "mean_token_accuracy": 0.4551724135875702, "step": 109070 }, { "epoch": 0.10986149846752905, "grad_norm": 11.67710087480588, "learning_rate": 4.956059140076101e-05, "loss": 2.1866, "mean_token_accuracy": 0.46442831158638, "step": 109075 }, { "epoch": 0.10986653452063322, "grad_norm": 10.756572263131078, "learning_rate": 4.9560517704613824e-05, "loss": 2.5072, "mean_token_accuracy": 0.42413792610168455, "step": 109080 }, { "epoch": 0.10987157057373739, "grad_norm": 13.000984900594185, "learning_rate": 4.9560444002348064e-05, "loss": 2.5627, "mean_token_accuracy": 0.41034482717514037, "step": 109085 }, { "epoch": 0.10987660662684155, "grad_norm": 9.61400504338785, "learning_rate": 4.956037029396376e-05, "loss": 2.5575, "mean_token_accuracy": 0.41034482717514037, "step": 109090 }, { "epoch": 0.10988164267994573, "grad_norm": 10.147762278408658, "learning_rate": 4.956029657946093e-05, "loss": 2.0324, "mean_token_accuracy": 0.5034482777118683, "step": 109095 }, { "epoch": 0.1098866787330499, "grad_norm": 11.882602427341729, "learning_rate": 4.956022285883959e-05, "loss": 2.4902, "mean_token_accuracy": 0.3931034505367279, "step": 109100 }, { "epoch": 0.10989171478615407, "grad_norm": 9.907048937390432, "learning_rate": 4.956014913209976e-05, "loss": 2.2911, "mean_token_accuracy": 0.41379311084747317, "step": 109105 }, { "epoch": 0.10989675083925825, "grad_norm": 9.123481824334343, "learning_rate": 4.956007539924148e-05, "loss": 2.3323, "mean_token_accuracy": 0.4034482717514038, "step": 109110 }, { "epoch": 0.10990178689236242, "grad_norm": 10.32365337270559, "learning_rate": 4.9560001660264746e-05, "loss": 2.3214, "mean_token_accuracy": 0.44482757449150084, "step": 109115 }, { "epoch": 0.1099068229454666, "grad_norm": 9.666834411935854, "learning_rate": 4.955992791516959e-05, "loss": 2.5506, "mean_token_accuracy": 0.4172413766384125, "step": 109120 }, { "epoch": 0.10991185899857077, "grad_norm": 10.090616537661079, "learning_rate": 4.9559854163956035e-05, "loss": 2.2862, "mean_token_accuracy": 0.4698275923728943, "step": 109125 }, { "epoch": 0.10991689505167494, "grad_norm": 11.80025952737308, "learning_rate": 4.9559780406624094e-05, "loss": 2.6937, "mean_token_accuracy": 0.4137930989265442, "step": 109130 }, { "epoch": 0.10992193110477912, "grad_norm": 9.942229468930933, "learning_rate": 4.955970664317379e-05, "loss": 2.2458, "mean_token_accuracy": 0.43793103098869324, "step": 109135 }, { "epoch": 0.10992696715788329, "grad_norm": 10.687084895712541, "learning_rate": 4.955963287360514e-05, "loss": 2.7939, "mean_token_accuracy": 0.41034482717514037, "step": 109140 }, { "epoch": 0.10993200321098746, "grad_norm": 11.331095515013114, "learning_rate": 4.955955909791818e-05, "loss": 2.1298, "mean_token_accuracy": 0.42413793206214906, "step": 109145 }, { "epoch": 0.10993703926409164, "grad_norm": 8.395247658307, "learning_rate": 4.955948531611291e-05, "loss": 2.0727, "mean_token_accuracy": 0.44664247035980226, "step": 109150 }, { "epoch": 0.10994207531719581, "grad_norm": 9.1294058608412, "learning_rate": 4.955941152818936e-05, "loss": 2.6236, "mean_token_accuracy": 0.3862069010734558, "step": 109155 }, { "epoch": 0.10994711137029997, "grad_norm": 9.572630979379975, "learning_rate": 4.955933773414755e-05, "loss": 2.0219, "mean_token_accuracy": 0.4551724135875702, "step": 109160 }, { "epoch": 0.10995214742340415, "grad_norm": 9.536677659619636, "learning_rate": 4.9559263933987504e-05, "loss": 2.0314, "mean_token_accuracy": 0.46551724672317507, "step": 109165 }, { "epoch": 0.10995718347650832, "grad_norm": 10.207024342914838, "learning_rate": 4.9559190127709235e-05, "loss": 2.594, "mean_token_accuracy": 0.3827586203813553, "step": 109170 }, { "epoch": 0.10996221952961249, "grad_norm": 10.562118068320435, "learning_rate": 4.955911631531277e-05, "loss": 2.3512, "mean_token_accuracy": 0.4310344815254211, "step": 109175 }, { "epoch": 0.10996725558271667, "grad_norm": 9.805271019172876, "learning_rate": 4.955904249679813e-05, "loss": 2.174, "mean_token_accuracy": 0.46551724076271056, "step": 109180 }, { "epoch": 0.10997229163582084, "grad_norm": 10.900952548385096, "learning_rate": 4.955896867216532e-05, "loss": 2.4664, "mean_token_accuracy": 0.4327888786792755, "step": 109185 }, { "epoch": 0.10997732768892501, "grad_norm": 9.016507243162119, "learning_rate": 4.955889484141439e-05, "loss": 2.2283, "mean_token_accuracy": 0.46896551847457885, "step": 109190 }, { "epoch": 0.10998236374202919, "grad_norm": 11.541788444179556, "learning_rate": 4.955882100454533e-05, "loss": 2.1302, "mean_token_accuracy": 0.4537810027599335, "step": 109195 }, { "epoch": 0.10998739979513336, "grad_norm": 9.921947478533077, "learning_rate": 4.955874716155818e-05, "loss": 2.1204, "mean_token_accuracy": 0.49655172824859617, "step": 109200 }, { "epoch": 0.10999243584823754, "grad_norm": 10.520993757513427, "learning_rate": 4.9558673312452954e-05, "loss": 2.3247, "mean_token_accuracy": 0.39655172228813174, "step": 109205 }, { "epoch": 0.10999747190134171, "grad_norm": 10.197429672015701, "learning_rate": 4.955859945722967e-05, "loss": 2.2475, "mean_token_accuracy": 0.42413793206214906, "step": 109210 }, { "epoch": 0.11000250795444588, "grad_norm": 11.577869560983032, "learning_rate": 4.955852559588835e-05, "loss": 2.4024, "mean_token_accuracy": 0.42758620381355283, "step": 109215 }, { "epoch": 0.11000754400755006, "grad_norm": 11.605556582082269, "learning_rate": 4.955845172842902e-05, "loss": 2.2004, "mean_token_accuracy": 0.44482759237289426, "step": 109220 }, { "epoch": 0.11001258006065423, "grad_norm": 11.703510667040309, "learning_rate": 4.955837785485169e-05, "loss": 2.5897, "mean_token_accuracy": 0.42758620977401735, "step": 109225 }, { "epoch": 0.11001761611375839, "grad_norm": 10.582449903275375, "learning_rate": 4.955830397515639e-05, "loss": 2.832, "mean_token_accuracy": 0.33793103098869326, "step": 109230 }, { "epoch": 0.11002265216686256, "grad_norm": 9.240855462927925, "learning_rate": 4.9558230089343136e-05, "loss": 2.3382, "mean_token_accuracy": 0.3965517282485962, "step": 109235 }, { "epoch": 0.11002768821996674, "grad_norm": 12.371712584977232, "learning_rate": 4.9558156197411954e-05, "loss": 2.4259, "mean_token_accuracy": 0.4, "step": 109240 }, { "epoch": 0.11003272427307091, "grad_norm": 12.781668667436623, "learning_rate": 4.9558082299362856e-05, "loss": 2.4081, "mean_token_accuracy": 0.38620689511299133, "step": 109245 }, { "epoch": 0.11003776032617509, "grad_norm": 12.500905724944795, "learning_rate": 4.9558008395195863e-05, "loss": 2.7141, "mean_token_accuracy": 0.37241379618644715, "step": 109250 }, { "epoch": 0.11004279637927926, "grad_norm": 14.045517669688499, "learning_rate": 4.955793448491101e-05, "loss": 2.6004, "mean_token_accuracy": 0.39655172228813174, "step": 109255 }, { "epoch": 0.11004783243238343, "grad_norm": 9.73210681947098, "learning_rate": 4.95578605685083e-05, "loss": 2.0796, "mean_token_accuracy": 0.4586206912994385, "step": 109260 }, { "epoch": 0.1100528684854876, "grad_norm": 16.367300958170205, "learning_rate": 4.9557786645987756e-05, "loss": 2.469, "mean_token_accuracy": 0.4310344815254211, "step": 109265 }, { "epoch": 0.11005790453859178, "grad_norm": 12.026634040401063, "learning_rate": 4.9557712717349405e-05, "loss": 2.7868, "mean_token_accuracy": 0.38275861740112305, "step": 109270 }, { "epoch": 0.11006294059169595, "grad_norm": 11.97665012264357, "learning_rate": 4.9557638782593275e-05, "loss": 2.3032, "mean_token_accuracy": 0.4620689630508423, "step": 109275 }, { "epoch": 0.11006797664480013, "grad_norm": 8.873279672474425, "learning_rate": 4.955756484171937e-05, "loss": 2.1302, "mean_token_accuracy": 0.49655171632766726, "step": 109280 }, { "epoch": 0.1100730126979043, "grad_norm": 12.927814400287867, "learning_rate": 4.955749089472771e-05, "loss": 2.5186, "mean_token_accuracy": 0.4103448331356049, "step": 109285 }, { "epoch": 0.11007804875100848, "grad_norm": 11.353392999280747, "learning_rate": 4.955741694161833e-05, "loss": 2.6087, "mean_token_accuracy": 0.3931034505367279, "step": 109290 }, { "epoch": 0.11008308480411265, "grad_norm": 8.32873067447434, "learning_rate": 4.955734298239124e-05, "loss": 2.1706, "mean_token_accuracy": 0.47241378426551817, "step": 109295 }, { "epoch": 0.11008812085721681, "grad_norm": 11.388515903319364, "learning_rate": 4.9557269017046466e-05, "loss": 2.7546, "mean_token_accuracy": 0.4068965494632721, "step": 109300 }, { "epoch": 0.11009315691032098, "grad_norm": 14.069913783782553, "learning_rate": 4.955719504558403e-05, "loss": 2.5818, "mean_token_accuracy": 0.4, "step": 109305 }, { "epoch": 0.11009819296342516, "grad_norm": 10.990039738625331, "learning_rate": 4.955712106800394e-05, "loss": 3.1273, "mean_token_accuracy": 0.38275861740112305, "step": 109310 }, { "epoch": 0.11010322901652933, "grad_norm": 12.389077670206957, "learning_rate": 4.955704708430623e-05, "loss": 2.7016, "mean_token_accuracy": 0.4413793087005615, "step": 109315 }, { "epoch": 0.1101082650696335, "grad_norm": 12.905178621033356, "learning_rate": 4.9556973094490916e-05, "loss": 2.8666, "mean_token_accuracy": 0.43448275327682495, "step": 109320 }, { "epoch": 0.11011330112273768, "grad_norm": 10.732526781395356, "learning_rate": 4.955689909855802e-05, "loss": 3.0251, "mean_token_accuracy": 0.3379310339689255, "step": 109325 }, { "epoch": 0.11011833717584185, "grad_norm": 10.722491266444997, "learning_rate": 4.9556825096507554e-05, "loss": 2.5219, "mean_token_accuracy": 0.41379310190677643, "step": 109330 }, { "epoch": 0.11012337322894603, "grad_norm": 10.846609840891913, "learning_rate": 4.9556751088339545e-05, "loss": 2.4061, "mean_token_accuracy": 0.4517241358757019, "step": 109335 }, { "epoch": 0.1101284092820502, "grad_norm": 12.023357898130989, "learning_rate": 4.955667707405403e-05, "loss": 2.3183, "mean_token_accuracy": 0.441379314661026, "step": 109340 }, { "epoch": 0.11013344533515437, "grad_norm": 10.845900272437596, "learning_rate": 4.955660305365099e-05, "loss": 2.9428, "mean_token_accuracy": 0.34482758641242983, "step": 109345 }, { "epoch": 0.11013848138825855, "grad_norm": 10.35951877913365, "learning_rate": 4.9556529027130484e-05, "loss": 2.5158, "mean_token_accuracy": 0.3931034505367279, "step": 109350 }, { "epoch": 0.11014351744136272, "grad_norm": 11.070199972009435, "learning_rate": 4.955645499449251e-05, "loss": 2.212, "mean_token_accuracy": 0.4344827592372894, "step": 109355 }, { "epoch": 0.1101485534944669, "grad_norm": 13.00614912999903, "learning_rate": 4.9556380955737094e-05, "loss": 2.0353, "mean_token_accuracy": 0.5206896543502808, "step": 109360 }, { "epoch": 0.11015358954757107, "grad_norm": 11.877169643945226, "learning_rate": 4.9556306910864274e-05, "loss": 2.5716, "mean_token_accuracy": 0.41034482717514037, "step": 109365 }, { "epoch": 0.11015862560067523, "grad_norm": 7.635389021133433, "learning_rate": 4.955623285987405e-05, "loss": 2.2498, "mean_token_accuracy": 0.4793103516101837, "step": 109370 }, { "epoch": 0.1101636616537794, "grad_norm": 11.77651774718235, "learning_rate": 4.955615880276643e-05, "loss": 2.4145, "mean_token_accuracy": 0.4206896543502808, "step": 109375 }, { "epoch": 0.11016869770688358, "grad_norm": 9.140364914867904, "learning_rate": 4.9556084739541465e-05, "loss": 2.1714, "mean_token_accuracy": 0.482758617401123, "step": 109380 }, { "epoch": 0.11017373375998775, "grad_norm": 9.969413366228334, "learning_rate": 4.9556010670199165e-05, "loss": 2.4727, "mean_token_accuracy": 0.4000000059604645, "step": 109385 }, { "epoch": 0.11017876981309192, "grad_norm": 9.655183222443686, "learning_rate": 4.955593659473955e-05, "loss": 1.9601, "mean_token_accuracy": 0.46896552443504336, "step": 109390 }, { "epoch": 0.1101838058661961, "grad_norm": 13.220295950243484, "learning_rate": 4.9555862513162626e-05, "loss": 2.6718, "mean_token_accuracy": 0.41379311084747317, "step": 109395 }, { "epoch": 0.11018884191930027, "grad_norm": 13.056702115673938, "learning_rate": 4.9555788425468435e-05, "loss": 2.5597, "mean_token_accuracy": 0.4068965494632721, "step": 109400 }, { "epoch": 0.11019387797240444, "grad_norm": 12.313428856175328, "learning_rate": 4.9555714331656986e-05, "loss": 2.4862, "mean_token_accuracy": 0.3931034505367279, "step": 109405 }, { "epoch": 0.11019891402550862, "grad_norm": 12.010561202697476, "learning_rate": 4.9555640231728305e-05, "loss": 2.3542, "mean_token_accuracy": 0.44827585220336913, "step": 109410 }, { "epoch": 0.11020395007861279, "grad_norm": 9.549941453729044, "learning_rate": 4.9555566125682414e-05, "loss": 2.2935, "mean_token_accuracy": 0.42758620977401735, "step": 109415 }, { "epoch": 0.11020898613171697, "grad_norm": 11.571261226572455, "learning_rate": 4.9555492013519325e-05, "loss": 2.3012, "mean_token_accuracy": 0.4594827651977539, "step": 109420 }, { "epoch": 0.11021402218482114, "grad_norm": 9.978467544062743, "learning_rate": 4.9555417895239066e-05, "loss": 2.2097, "mean_token_accuracy": 0.46206897497177124, "step": 109425 }, { "epoch": 0.11021905823792531, "grad_norm": 9.22213391527546, "learning_rate": 4.955534377084165e-05, "loss": 2.1686, "mean_token_accuracy": 0.4620689630508423, "step": 109430 }, { "epoch": 0.11022409429102949, "grad_norm": 15.63808825943289, "learning_rate": 4.9555269640327104e-05, "loss": 2.7615, "mean_token_accuracy": 0.3758620649576187, "step": 109435 }, { "epoch": 0.11022913034413365, "grad_norm": 10.900510942479785, "learning_rate": 4.955519550369544e-05, "loss": 2.8324, "mean_token_accuracy": 0.3655172437429428, "step": 109440 }, { "epoch": 0.11023416639723782, "grad_norm": 11.397674057538907, "learning_rate": 4.95551213609467e-05, "loss": 2.4514, "mean_token_accuracy": 0.3965517282485962, "step": 109445 }, { "epoch": 0.110239202450342, "grad_norm": 10.75236197834535, "learning_rate": 4.955504721208087e-05, "loss": 3.2443, "mean_token_accuracy": 0.36896551549434664, "step": 109450 }, { "epoch": 0.11024423850344617, "grad_norm": 9.8912308325458, "learning_rate": 4.9554973057098005e-05, "loss": 2.2946, "mean_token_accuracy": 0.44827585816383364, "step": 109455 }, { "epoch": 0.11024927455655034, "grad_norm": 8.207547283177636, "learning_rate": 4.955489889599811e-05, "loss": 2.1444, "mean_token_accuracy": 0.5295825719833374, "step": 109460 }, { "epoch": 0.11025431060965452, "grad_norm": 12.073450475059984, "learning_rate": 4.9554824728781206e-05, "loss": 2.1309, "mean_token_accuracy": 0.48620688915252686, "step": 109465 }, { "epoch": 0.11025934666275869, "grad_norm": 14.434298059952658, "learning_rate": 4.955475055544731e-05, "loss": 2.3811, "mean_token_accuracy": 0.43103447556495667, "step": 109470 }, { "epoch": 0.11026438271586286, "grad_norm": 9.406792768219908, "learning_rate": 4.955467637599645e-05, "loss": 2.4872, "mean_token_accuracy": 0.3896551728248596, "step": 109475 }, { "epoch": 0.11026941876896704, "grad_norm": 11.76848425384687, "learning_rate": 4.955460219042864e-05, "loss": 2.6666, "mean_token_accuracy": 0.3896551728248596, "step": 109480 }, { "epoch": 0.11027445482207121, "grad_norm": 9.193786222754824, "learning_rate": 4.95545279987439e-05, "loss": 2.4146, "mean_token_accuracy": 0.39491832852363584, "step": 109485 }, { "epoch": 0.11027949087517538, "grad_norm": 14.46834330140287, "learning_rate": 4.9554453800942265e-05, "loss": 2.2428, "mean_token_accuracy": 0.4275861978530884, "step": 109490 }, { "epoch": 0.11028452692827956, "grad_norm": 10.469294607553444, "learning_rate": 4.955437959702374e-05, "loss": 2.1871, "mean_token_accuracy": 0.45517240166664125, "step": 109495 }, { "epoch": 0.11028956298138373, "grad_norm": 9.827957063673365, "learning_rate": 4.955430538698835e-05, "loss": 2.0378, "mean_token_accuracy": 0.5298850536346436, "step": 109500 }, { "epoch": 0.1102945990344879, "grad_norm": 10.638137693186344, "learning_rate": 4.955423117083612e-05, "loss": 2.6076, "mean_token_accuracy": 0.4620689630508423, "step": 109505 }, { "epoch": 0.11029963508759207, "grad_norm": 11.747277135006366, "learning_rate": 4.955415694856706e-05, "loss": 2.47, "mean_token_accuracy": 0.3896551728248596, "step": 109510 }, { "epoch": 0.11030467114069624, "grad_norm": 9.601980644215656, "learning_rate": 4.95540827201812e-05, "loss": 2.5441, "mean_token_accuracy": 0.3758620619773865, "step": 109515 }, { "epoch": 0.11030970719380041, "grad_norm": 13.986962082498318, "learning_rate": 4.9554008485678564e-05, "loss": 2.4506, "mean_token_accuracy": 0.4206896543502808, "step": 109520 }, { "epoch": 0.11031474324690459, "grad_norm": 9.2992598269699, "learning_rate": 4.955393424505916e-05, "loss": 2.008, "mean_token_accuracy": 0.482758617401123, "step": 109525 }, { "epoch": 0.11031977930000876, "grad_norm": 12.521267897572608, "learning_rate": 4.9553859998323014e-05, "loss": 2.3269, "mean_token_accuracy": 0.4551724076271057, "step": 109530 }, { "epoch": 0.11032481535311293, "grad_norm": 11.551533419814339, "learning_rate": 4.955378574547015e-05, "loss": 2.3207, "mean_token_accuracy": 0.4482758641242981, "step": 109535 }, { "epoch": 0.11032985140621711, "grad_norm": 9.4768386392394, "learning_rate": 4.9553711486500586e-05, "loss": 2.3067, "mean_token_accuracy": 0.48965516686439514, "step": 109540 }, { "epoch": 0.11033488745932128, "grad_norm": 8.915524231201431, "learning_rate": 4.9553637221414345e-05, "loss": 2.1285, "mean_token_accuracy": 0.441379314661026, "step": 109545 }, { "epoch": 0.11033992351242546, "grad_norm": 9.158754359256083, "learning_rate": 4.955356295021144e-05, "loss": 2.6137, "mean_token_accuracy": 0.3655172407627106, "step": 109550 }, { "epoch": 0.11034495956552963, "grad_norm": 10.334274787792156, "learning_rate": 4.9553488672891904e-05, "loss": 2.3052, "mean_token_accuracy": 0.4517241299152374, "step": 109555 }, { "epoch": 0.1103499956186338, "grad_norm": 14.83845485644772, "learning_rate": 4.9553414389455745e-05, "loss": 2.4244, "mean_token_accuracy": 0.4655172348022461, "step": 109560 }, { "epoch": 0.11035503167173798, "grad_norm": 10.952943568136478, "learning_rate": 4.955334009990299e-05, "loss": 2.3906, "mean_token_accuracy": 0.4103448212146759, "step": 109565 }, { "epoch": 0.11036006772484215, "grad_norm": 11.095427012312488, "learning_rate": 4.955326580423367e-05, "loss": 2.4344, "mean_token_accuracy": 0.4241379380226135, "step": 109570 }, { "epoch": 0.11036510377794632, "grad_norm": 10.372240080987558, "learning_rate": 4.955319150244778e-05, "loss": 2.0447, "mean_token_accuracy": 0.5068965494632721, "step": 109575 }, { "epoch": 0.11037013983105048, "grad_norm": 9.03514956929517, "learning_rate": 4.955311719454536e-05, "loss": 2.4609, "mean_token_accuracy": 0.44137930274009707, "step": 109580 }, { "epoch": 0.11037517588415466, "grad_norm": 9.427176315848945, "learning_rate": 4.9553042880526425e-05, "loss": 2.3222, "mean_token_accuracy": 0.4689655125141144, "step": 109585 }, { "epoch": 0.11038021193725883, "grad_norm": 10.254788826424004, "learning_rate": 4.955296856039099e-05, "loss": 2.7149, "mean_token_accuracy": 0.3655172407627106, "step": 109590 }, { "epoch": 0.110385247990363, "grad_norm": 11.058537366012793, "learning_rate": 4.955289423413909e-05, "loss": 2.0562, "mean_token_accuracy": 0.5034482777118683, "step": 109595 }, { "epoch": 0.11039028404346718, "grad_norm": 10.674820923840143, "learning_rate": 4.955281990177074e-05, "loss": 2.5423, "mean_token_accuracy": 0.4344827651977539, "step": 109600 }, { "epoch": 0.11039532009657135, "grad_norm": 9.315257241279388, "learning_rate": 4.955274556328595e-05, "loss": 2.5583, "mean_token_accuracy": 0.4517241358757019, "step": 109605 }, { "epoch": 0.11040035614967553, "grad_norm": 11.023524514206002, "learning_rate": 4.955267121868476e-05, "loss": 2.5568, "mean_token_accuracy": 0.3965517282485962, "step": 109610 }, { "epoch": 0.1104053922027797, "grad_norm": 13.108712415522671, "learning_rate": 4.955259686796716e-05, "loss": 2.7166, "mean_token_accuracy": 0.41034482717514037, "step": 109615 }, { "epoch": 0.11041042825588387, "grad_norm": 9.515249960031287, "learning_rate": 4.9552522511133206e-05, "loss": 2.1482, "mean_token_accuracy": 0.458620685338974, "step": 109620 }, { "epoch": 0.11041546430898805, "grad_norm": 13.48738857296415, "learning_rate": 4.95524481481829e-05, "loss": 2.2626, "mean_token_accuracy": 0.4206896543502808, "step": 109625 }, { "epoch": 0.11042050036209222, "grad_norm": 16.671338326199596, "learning_rate": 4.955237377911627e-05, "loss": 2.5432, "mean_token_accuracy": 0.4172413766384125, "step": 109630 }, { "epoch": 0.1104255364151964, "grad_norm": 10.403399271466787, "learning_rate": 4.955229940393332e-05, "loss": 2.4631, "mean_token_accuracy": 0.41379310488700866, "step": 109635 }, { "epoch": 0.11043057246830057, "grad_norm": 11.389975719114739, "learning_rate": 4.9552225022634094e-05, "loss": 2.395, "mean_token_accuracy": 0.43448275327682495, "step": 109640 }, { "epoch": 0.11043560852140474, "grad_norm": 13.346718168449666, "learning_rate": 4.955215063521859e-05, "loss": 2.2772, "mean_token_accuracy": 0.4586206912994385, "step": 109645 }, { "epoch": 0.1104406445745089, "grad_norm": 32.283932634229714, "learning_rate": 4.955207624168685e-05, "loss": 2.7353, "mean_token_accuracy": 0.4448275864124298, "step": 109650 }, { "epoch": 0.11044568062761308, "grad_norm": 12.385272983795435, "learning_rate": 4.955200184203888e-05, "loss": 2.8038, "mean_token_accuracy": 0.3379310369491577, "step": 109655 }, { "epoch": 0.11045071668071725, "grad_norm": 11.1051776907032, "learning_rate": 4.955192743627471e-05, "loss": 2.6471, "mean_token_accuracy": 0.41379310488700866, "step": 109660 }, { "epoch": 0.11045575273382142, "grad_norm": 10.874896130766787, "learning_rate": 4.955185302439435e-05, "loss": 2.2655, "mean_token_accuracy": 0.44827585816383364, "step": 109665 }, { "epoch": 0.1104607887869256, "grad_norm": 9.292569251972836, "learning_rate": 4.955177860639783e-05, "loss": 2.3477, "mean_token_accuracy": 0.4, "step": 109670 }, { "epoch": 0.11046582484002977, "grad_norm": 9.389619648486853, "learning_rate": 4.9551704182285156e-05, "loss": 2.5465, "mean_token_accuracy": 0.41379311084747317, "step": 109675 }, { "epoch": 0.11047086089313395, "grad_norm": 10.673917268828012, "learning_rate": 4.955162975205637e-05, "loss": 2.3425, "mean_token_accuracy": 0.4137930989265442, "step": 109680 }, { "epoch": 0.11047589694623812, "grad_norm": 9.572200887804915, "learning_rate": 4.955155531571148e-05, "loss": 2.421, "mean_token_accuracy": 0.42758620381355283, "step": 109685 }, { "epoch": 0.1104809329993423, "grad_norm": 9.068268375554354, "learning_rate": 4.955148087325051e-05, "loss": 2.3713, "mean_token_accuracy": 0.42413793206214906, "step": 109690 }, { "epoch": 0.11048596905244647, "grad_norm": 11.722302539232277, "learning_rate": 4.955140642467348e-05, "loss": 2.1007, "mean_token_accuracy": 0.4571687877178192, "step": 109695 }, { "epoch": 0.11049100510555064, "grad_norm": 11.223394323575025, "learning_rate": 4.95513319699804e-05, "loss": 2.7298, "mean_token_accuracy": 0.39310344159603117, "step": 109700 }, { "epoch": 0.11049604115865481, "grad_norm": 9.904687433262863, "learning_rate": 4.955125750917131e-05, "loss": 2.3784, "mean_token_accuracy": 0.4551724135875702, "step": 109705 }, { "epoch": 0.11050107721175899, "grad_norm": 20.13622530860082, "learning_rate": 4.9551183042246217e-05, "loss": 2.8809, "mean_token_accuracy": 0.441379314661026, "step": 109710 }, { "epoch": 0.11050611326486316, "grad_norm": 11.850941322256276, "learning_rate": 4.9551108569205154e-05, "loss": 2.3217, "mean_token_accuracy": 0.4448275864124298, "step": 109715 }, { "epoch": 0.11051114931796732, "grad_norm": 9.987281615342921, "learning_rate": 4.9551034090048134e-05, "loss": 2.2064, "mean_token_accuracy": 0.4604355752468109, "step": 109720 }, { "epoch": 0.1105161853710715, "grad_norm": 13.640592864088912, "learning_rate": 4.955095960477516e-05, "loss": 2.7339, "mean_token_accuracy": 0.3620689660310745, "step": 109725 }, { "epoch": 0.11052122142417567, "grad_norm": 14.368705103113806, "learning_rate": 4.955088511338629e-05, "loss": 2.593, "mean_token_accuracy": 0.4034482777118683, "step": 109730 }, { "epoch": 0.11052625747727984, "grad_norm": 10.709924261817832, "learning_rate": 4.955081061588152e-05, "loss": 2.6429, "mean_token_accuracy": 0.39655172228813174, "step": 109735 }, { "epoch": 0.11053129353038402, "grad_norm": 10.029966508314171, "learning_rate": 4.955073611226087e-05, "loss": 2.3564, "mean_token_accuracy": 0.41379310488700866, "step": 109740 }, { "epoch": 0.11053632958348819, "grad_norm": 9.719727214402484, "learning_rate": 4.9550661602524364e-05, "loss": 2.5198, "mean_token_accuracy": 0.4379310250282288, "step": 109745 }, { "epoch": 0.11054136563659236, "grad_norm": 15.32053188174393, "learning_rate": 4.955058708667204e-05, "loss": 2.538, "mean_token_accuracy": 0.4068965554237366, "step": 109750 }, { "epoch": 0.11054640168969654, "grad_norm": 13.546835382426913, "learning_rate": 4.9550512564703895e-05, "loss": 2.2657, "mean_token_accuracy": 0.4620689690113068, "step": 109755 }, { "epoch": 0.11055143774280071, "grad_norm": 10.179518109351017, "learning_rate": 4.955043803661995e-05, "loss": 2.2496, "mean_token_accuracy": 0.44137930274009707, "step": 109760 }, { "epoch": 0.11055647379590489, "grad_norm": 9.906249218313276, "learning_rate": 4.955036350242024e-05, "loss": 2.6624, "mean_token_accuracy": 0.40344828367233276, "step": 109765 }, { "epoch": 0.11056150984900906, "grad_norm": 9.015173128222282, "learning_rate": 4.955028896210478e-05, "loss": 2.081, "mean_token_accuracy": 0.5034482836723327, "step": 109770 }, { "epoch": 0.11056654590211323, "grad_norm": 12.675304093858019, "learning_rate": 4.955021441567359e-05, "loss": 2.6159, "mean_token_accuracy": 0.42758620381355283, "step": 109775 }, { "epoch": 0.11057158195521741, "grad_norm": 11.774736227891056, "learning_rate": 4.9550139863126694e-05, "loss": 2.6381, "mean_token_accuracy": 0.4275861978530884, "step": 109780 }, { "epoch": 0.11057661800832158, "grad_norm": 11.759809373559845, "learning_rate": 4.9550065304464105e-05, "loss": 2.7168, "mean_token_accuracy": 0.4068965554237366, "step": 109785 }, { "epoch": 0.11058165406142574, "grad_norm": 11.344495015857778, "learning_rate": 4.954999073968585e-05, "loss": 2.5939, "mean_token_accuracy": 0.3827586114406586, "step": 109790 }, { "epoch": 0.11058669011452991, "grad_norm": 9.546165615006423, "learning_rate": 4.954991616879195e-05, "loss": 2.5838, "mean_token_accuracy": 0.4241379380226135, "step": 109795 }, { "epoch": 0.11059172616763409, "grad_norm": 9.58269865528971, "learning_rate": 4.954984159178241e-05, "loss": 1.9524, "mean_token_accuracy": 0.48620688915252686, "step": 109800 }, { "epoch": 0.11059676222073826, "grad_norm": 12.245640387298852, "learning_rate": 4.954976700865728e-05, "loss": 2.2869, "mean_token_accuracy": 0.43793101906776427, "step": 109805 }, { "epoch": 0.11060179827384244, "grad_norm": 12.237561594788703, "learning_rate": 4.954969241941656e-05, "loss": 2.5247, "mean_token_accuracy": 0.3896551787853241, "step": 109810 }, { "epoch": 0.11060683432694661, "grad_norm": 13.56023357104217, "learning_rate": 4.954961782406027e-05, "loss": 2.5396, "mean_token_accuracy": 0.42413793206214906, "step": 109815 }, { "epoch": 0.11061187038005078, "grad_norm": 11.363380819785297, "learning_rate": 4.954954322258845e-05, "loss": 2.4353, "mean_token_accuracy": 0.42256503105163573, "step": 109820 }, { "epoch": 0.11061690643315496, "grad_norm": 11.028061172380802, "learning_rate": 4.954946861500109e-05, "loss": 2.4616, "mean_token_accuracy": 0.45172414779663084, "step": 109825 }, { "epoch": 0.11062194248625913, "grad_norm": 9.02747523092846, "learning_rate": 4.954939400129824e-05, "loss": 2.2565, "mean_token_accuracy": 0.41379310488700866, "step": 109830 }, { "epoch": 0.1106269785393633, "grad_norm": 13.9316982551118, "learning_rate": 4.95493193814799e-05, "loss": 2.5783, "mean_token_accuracy": 0.41379311084747317, "step": 109835 }, { "epoch": 0.11063201459246748, "grad_norm": 10.591652515223547, "learning_rate": 4.95492447555461e-05, "loss": 2.7444, "mean_token_accuracy": 0.40689656138420105, "step": 109840 }, { "epoch": 0.11063705064557165, "grad_norm": 10.871162833352379, "learning_rate": 4.954917012349686e-05, "loss": 2.3832, "mean_token_accuracy": 0.482758617401123, "step": 109845 }, { "epoch": 0.11064208669867583, "grad_norm": 10.346597303766462, "learning_rate": 4.954909548533221e-05, "loss": 2.5977, "mean_token_accuracy": 0.4586206912994385, "step": 109850 }, { "epoch": 0.11064712275178, "grad_norm": 10.245106963223579, "learning_rate": 4.9549020841052153e-05, "loss": 1.9882, "mean_token_accuracy": 0.5021173536777497, "step": 109855 }, { "epoch": 0.11065215880488416, "grad_norm": 12.653765015318836, "learning_rate": 4.954894619065672e-05, "loss": 2.8519, "mean_token_accuracy": 0.3758620649576187, "step": 109860 }, { "epoch": 0.11065719485798833, "grad_norm": 10.66559733395443, "learning_rate": 4.954887153414593e-05, "loss": 2.3186, "mean_token_accuracy": 0.4482758641242981, "step": 109865 }, { "epoch": 0.11066223091109251, "grad_norm": 10.286967440091127, "learning_rate": 4.95487968715198e-05, "loss": 2.7121, "mean_token_accuracy": 0.3620689570903778, "step": 109870 }, { "epoch": 0.11066726696419668, "grad_norm": 14.504670113400602, "learning_rate": 4.9548722202778355e-05, "loss": 3.1091, "mean_token_accuracy": 0.31724137961864474, "step": 109875 }, { "epoch": 0.11067230301730085, "grad_norm": 9.268425655153992, "learning_rate": 4.9548647527921616e-05, "loss": 2.3421, "mean_token_accuracy": 0.4620689690113068, "step": 109880 }, { "epoch": 0.11067733907040503, "grad_norm": 11.762144195784408, "learning_rate": 4.9548572846949605e-05, "loss": 2.5762, "mean_token_accuracy": 0.4206896543502808, "step": 109885 }, { "epoch": 0.1106823751235092, "grad_norm": 11.084962044007144, "learning_rate": 4.9548498159862334e-05, "loss": 2.6662, "mean_token_accuracy": 0.40344826877117157, "step": 109890 }, { "epoch": 0.11068741117661338, "grad_norm": 11.613146716309696, "learning_rate": 4.954842346665983e-05, "loss": 2.6062, "mean_token_accuracy": 0.4344827592372894, "step": 109895 }, { "epoch": 0.11069244722971755, "grad_norm": 11.502046407054445, "learning_rate": 4.9548348767342115e-05, "loss": 2.2017, "mean_token_accuracy": 0.4068965494632721, "step": 109900 }, { "epoch": 0.11069748328282172, "grad_norm": 10.374903355464683, "learning_rate": 4.9548274061909215e-05, "loss": 2.3751, "mean_token_accuracy": 0.4448275864124298, "step": 109905 }, { "epoch": 0.1107025193359259, "grad_norm": 11.80128765962287, "learning_rate": 4.9548199350361136e-05, "loss": 2.3038, "mean_token_accuracy": 0.4758620738983154, "step": 109910 }, { "epoch": 0.11070755538903007, "grad_norm": 12.309000490712037, "learning_rate": 4.9548124632697914e-05, "loss": 2.144, "mean_token_accuracy": 0.4551724076271057, "step": 109915 }, { "epoch": 0.11071259144213424, "grad_norm": 12.987800322106187, "learning_rate": 4.9548049908919554e-05, "loss": 2.6532, "mean_token_accuracy": 0.3931034505367279, "step": 109920 }, { "epoch": 0.1107176274952384, "grad_norm": 9.856746840931061, "learning_rate": 4.9547975179026084e-05, "loss": 1.9359, "mean_token_accuracy": 0.49999998807907103, "step": 109925 }, { "epoch": 0.11072266354834258, "grad_norm": 24.511497974789975, "learning_rate": 4.954790044301753e-05, "loss": 2.7501, "mean_token_accuracy": 0.38275861740112305, "step": 109930 }, { "epoch": 0.11072769960144675, "grad_norm": 10.625728982011431, "learning_rate": 4.9547825700893914e-05, "loss": 2.8649, "mean_token_accuracy": 0.37586207389831544, "step": 109935 }, { "epoch": 0.11073273565455093, "grad_norm": 10.453088954549797, "learning_rate": 4.954775095265524e-05, "loss": 2.3728, "mean_token_accuracy": 0.43103448748588563, "step": 109940 }, { "epoch": 0.1107377717076551, "grad_norm": 11.0728554615507, "learning_rate": 4.9547676198301554e-05, "loss": 2.447, "mean_token_accuracy": 0.42413793206214906, "step": 109945 }, { "epoch": 0.11074280776075927, "grad_norm": 12.654626196566557, "learning_rate": 4.9547601437832844e-05, "loss": 2.8965, "mean_token_accuracy": 0.3655172407627106, "step": 109950 }, { "epoch": 0.11074784381386345, "grad_norm": 10.443391565215034, "learning_rate": 4.9547526671249166e-05, "loss": 2.5015, "mean_token_accuracy": 0.3931034505367279, "step": 109955 }, { "epoch": 0.11075287986696762, "grad_norm": 11.513641732787965, "learning_rate": 4.954745189855052e-05, "loss": 3.0081, "mean_token_accuracy": 0.34827586114406583, "step": 109960 }, { "epoch": 0.1107579159200718, "grad_norm": 9.741013950465092, "learning_rate": 4.954737711973693e-05, "loss": 1.9497, "mean_token_accuracy": 0.48965516686439514, "step": 109965 }, { "epoch": 0.11076295197317597, "grad_norm": 10.268028046277964, "learning_rate": 4.954730233480842e-05, "loss": 2.3784, "mean_token_accuracy": 0.44482758045196535, "step": 109970 }, { "epoch": 0.11076798802628014, "grad_norm": 7.991277488403565, "learning_rate": 4.9547227543765004e-05, "loss": 2.3326, "mean_token_accuracy": 0.42068966031074523, "step": 109975 }, { "epoch": 0.11077302407938432, "grad_norm": 8.914467459232396, "learning_rate": 4.954715274660671e-05, "loss": 2.0928, "mean_token_accuracy": 0.49655172824859617, "step": 109980 }, { "epoch": 0.11077806013248849, "grad_norm": 9.061479512418181, "learning_rate": 4.954707794333355e-05, "loss": 2.2215, "mean_token_accuracy": 0.4344827592372894, "step": 109985 }, { "epoch": 0.11078309618559266, "grad_norm": 9.651910742173058, "learning_rate": 4.9547003133945555e-05, "loss": 2.2611, "mean_token_accuracy": 0.4838669955730438, "step": 109990 }, { "epoch": 0.11078813223869682, "grad_norm": 11.424252864925052, "learning_rate": 4.954692831844274e-05, "loss": 2.4038, "mean_token_accuracy": 0.43448275327682495, "step": 109995 }, { "epoch": 0.110793168291801, "grad_norm": 11.518070052847287, "learning_rate": 4.954685349682513e-05, "loss": 2.4439, "mean_token_accuracy": 0.4206896543502808, "step": 110000 }, { "epoch": 0.11079820434490517, "grad_norm": 12.068718110049637, "learning_rate": 4.954677866909275e-05, "loss": 2.6058, "mean_token_accuracy": 0.3965517282485962, "step": 110005 }, { "epoch": 0.11080324039800934, "grad_norm": 10.487826596411983, "learning_rate": 4.95467038352456e-05, "loss": 2.0984, "mean_token_accuracy": 0.4620689570903778, "step": 110010 }, { "epoch": 0.11080827645111352, "grad_norm": 11.859439615410343, "learning_rate": 4.9546628995283724e-05, "loss": 2.6553, "mean_token_accuracy": 0.43793103098869324, "step": 110015 }, { "epoch": 0.11081331250421769, "grad_norm": 10.888986211809044, "learning_rate": 4.954655414920712e-05, "loss": 2.7119, "mean_token_accuracy": 0.3620689630508423, "step": 110020 }, { "epoch": 0.11081834855732187, "grad_norm": 13.535134401186484, "learning_rate": 4.9546479297015837e-05, "loss": 2.2219, "mean_token_accuracy": 0.4448275864124298, "step": 110025 }, { "epoch": 0.11082338461042604, "grad_norm": 12.382731144823627, "learning_rate": 4.9546404438709874e-05, "loss": 2.551, "mean_token_accuracy": 0.4068965554237366, "step": 110030 }, { "epoch": 0.11082842066353021, "grad_norm": 13.59385685116867, "learning_rate": 4.954632957428926e-05, "loss": 2.3917, "mean_token_accuracy": 0.4620689570903778, "step": 110035 }, { "epoch": 0.11083345671663439, "grad_norm": 9.659639169689061, "learning_rate": 4.9546254703754015e-05, "loss": 2.1192, "mean_token_accuracy": 0.5120689690113067, "step": 110040 }, { "epoch": 0.11083849276973856, "grad_norm": 11.124379741638602, "learning_rate": 4.954617982710416e-05, "loss": 2.0506, "mean_token_accuracy": 0.458620685338974, "step": 110045 }, { "epoch": 0.11084352882284274, "grad_norm": 9.97150667264149, "learning_rate": 4.9546104944339714e-05, "loss": 2.6434, "mean_token_accuracy": 0.36896551251411436, "step": 110050 }, { "epoch": 0.11084856487594691, "grad_norm": 11.568727237806593, "learning_rate": 4.9546030055460694e-05, "loss": 2.5471, "mean_token_accuracy": 0.40000000298023225, "step": 110055 }, { "epoch": 0.11085360092905108, "grad_norm": 13.942814407888182, "learning_rate": 4.9545955160467126e-05, "loss": 2.958, "mean_token_accuracy": 0.31518452167510985, "step": 110060 }, { "epoch": 0.11085863698215524, "grad_norm": 11.155265925637252, "learning_rate": 4.954588025935904e-05, "loss": 2.2793, "mean_token_accuracy": 0.4068965494632721, "step": 110065 }, { "epoch": 0.11086367303525942, "grad_norm": 10.410653943219387, "learning_rate": 4.9545805352136434e-05, "loss": 2.1551, "mean_token_accuracy": 0.4379310369491577, "step": 110070 }, { "epoch": 0.11086870908836359, "grad_norm": 9.620776123483173, "learning_rate": 4.9545730438799344e-05, "loss": 2.2818, "mean_token_accuracy": 0.4448275864124298, "step": 110075 }, { "epoch": 0.11087374514146776, "grad_norm": 11.085788917626777, "learning_rate": 4.9545655519347795e-05, "loss": 2.3998, "mean_token_accuracy": 0.46206897497177124, "step": 110080 }, { "epoch": 0.11087878119457194, "grad_norm": 10.722759915260553, "learning_rate": 4.95455805937818e-05, "loss": 1.945, "mean_token_accuracy": 0.517241370677948, "step": 110085 }, { "epoch": 0.11088381724767611, "grad_norm": 10.727426872371629, "learning_rate": 4.9545505662101385e-05, "loss": 2.2256, "mean_token_accuracy": 0.4379310429096222, "step": 110090 }, { "epoch": 0.11088885330078029, "grad_norm": 10.901751741023881, "learning_rate": 4.954543072430655e-05, "loss": 2.4993, "mean_token_accuracy": 0.4344827473163605, "step": 110095 }, { "epoch": 0.11089388935388446, "grad_norm": 10.582380533412987, "learning_rate": 4.954535578039734e-05, "loss": 2.6804, "mean_token_accuracy": 0.3931034505367279, "step": 110100 }, { "epoch": 0.11089892540698863, "grad_norm": 11.885277752466243, "learning_rate": 4.9545280830373774e-05, "loss": 2.6796, "mean_token_accuracy": 0.38620689511299133, "step": 110105 }, { "epoch": 0.1109039614600928, "grad_norm": 14.748744086151163, "learning_rate": 4.9545205874235866e-05, "loss": 2.4036, "mean_token_accuracy": 0.44482758045196535, "step": 110110 }, { "epoch": 0.11090899751319698, "grad_norm": 11.087118471490058, "learning_rate": 4.954513091198364e-05, "loss": 2.6209, "mean_token_accuracy": 0.3965517282485962, "step": 110115 }, { "epoch": 0.11091403356630115, "grad_norm": 10.10432501022113, "learning_rate": 4.954505594361711e-05, "loss": 2.3121, "mean_token_accuracy": 0.441379314661026, "step": 110120 }, { "epoch": 0.11091906961940533, "grad_norm": 9.174868000152191, "learning_rate": 4.9544980969136304e-05, "loss": 2.4145, "mean_token_accuracy": 0.4034482777118683, "step": 110125 }, { "epoch": 0.1109241056725095, "grad_norm": 11.997378902262785, "learning_rate": 4.9544905988541234e-05, "loss": 2.4038, "mean_token_accuracy": 0.41724138259887694, "step": 110130 }, { "epoch": 0.11092914172561366, "grad_norm": 9.640441241196982, "learning_rate": 4.9544831001831934e-05, "loss": 2.4542, "mean_token_accuracy": 0.4172413766384125, "step": 110135 }, { "epoch": 0.11093417777871784, "grad_norm": 11.094944454720647, "learning_rate": 4.954475600900842e-05, "loss": 2.4031, "mean_token_accuracy": 0.42413793206214906, "step": 110140 }, { "epoch": 0.11093921383182201, "grad_norm": 9.66466292718135, "learning_rate": 4.95446810100707e-05, "loss": 2.5415, "mean_token_accuracy": 0.42068964838981626, "step": 110145 }, { "epoch": 0.11094424988492618, "grad_norm": 9.004590094798614, "learning_rate": 4.954460600501881e-05, "loss": 2.1957, "mean_token_accuracy": 0.4172413766384125, "step": 110150 }, { "epoch": 0.11094928593803036, "grad_norm": 9.593047790045324, "learning_rate": 4.9544530993852775e-05, "loss": 2.1682, "mean_token_accuracy": 0.4601935863494873, "step": 110155 }, { "epoch": 0.11095432199113453, "grad_norm": 11.122775640429913, "learning_rate": 4.954445597657259e-05, "loss": 2.4374, "mean_token_accuracy": 0.42758620381355283, "step": 110160 }, { "epoch": 0.1109593580442387, "grad_norm": 11.06441242174139, "learning_rate": 4.954438095317831e-05, "loss": 2.4318, "mean_token_accuracy": 0.43448275327682495, "step": 110165 }, { "epoch": 0.11096439409734288, "grad_norm": 9.79968108215539, "learning_rate": 4.954430592366993e-05, "loss": 2.358, "mean_token_accuracy": 0.42413792610168455, "step": 110170 }, { "epoch": 0.11096943015044705, "grad_norm": 12.593161215383045, "learning_rate": 4.954423088804748e-05, "loss": 2.6868, "mean_token_accuracy": 0.4034482717514038, "step": 110175 }, { "epoch": 0.11097446620355123, "grad_norm": 11.44840356709061, "learning_rate": 4.954415584631099e-05, "loss": 2.3658, "mean_token_accuracy": 0.47586206197738645, "step": 110180 }, { "epoch": 0.1109795022566554, "grad_norm": 10.557877242503999, "learning_rate": 4.954408079846046e-05, "loss": 2.5215, "mean_token_accuracy": 0.4586206912994385, "step": 110185 }, { "epoch": 0.11098453830975957, "grad_norm": 9.337159884677273, "learning_rate": 4.954400574449592e-05, "loss": 2.3686, "mean_token_accuracy": 0.38620689511299133, "step": 110190 }, { "epoch": 0.11098957436286375, "grad_norm": 10.220651248503906, "learning_rate": 4.95439306844174e-05, "loss": 2.4958, "mean_token_accuracy": 0.40689656138420105, "step": 110195 }, { "epoch": 0.11099461041596792, "grad_norm": 15.058861993490368, "learning_rate": 4.9543855618224915e-05, "loss": 2.8307, "mean_token_accuracy": 0.3689655065536499, "step": 110200 }, { "epoch": 0.11099964646907208, "grad_norm": 11.508079660034259, "learning_rate": 4.954378054591848e-05, "loss": 2.24, "mean_token_accuracy": 0.4620689630508423, "step": 110205 }, { "epoch": 0.11100468252217625, "grad_norm": 9.96174904279089, "learning_rate": 4.954370546749812e-05, "loss": 2.7077, "mean_token_accuracy": 0.4181034445762634, "step": 110210 }, { "epoch": 0.11100971857528043, "grad_norm": 10.851027597930178, "learning_rate": 4.9543630382963856e-05, "loss": 2.5933, "mean_token_accuracy": 0.4034482717514038, "step": 110215 }, { "epoch": 0.1110147546283846, "grad_norm": 10.177923066405706, "learning_rate": 4.9543555292315706e-05, "loss": 2.2393, "mean_token_accuracy": 0.4482758641242981, "step": 110220 }, { "epoch": 0.11101979068148878, "grad_norm": 10.285213547500963, "learning_rate": 4.954348019555371e-05, "loss": 2.3333, "mean_token_accuracy": 0.4206896543502808, "step": 110225 }, { "epoch": 0.11102482673459295, "grad_norm": 10.095437591075353, "learning_rate": 4.954340509267785e-05, "loss": 2.4565, "mean_token_accuracy": 0.4206896543502808, "step": 110230 }, { "epoch": 0.11102986278769712, "grad_norm": 10.785176337978845, "learning_rate": 4.954332998368818e-05, "loss": 2.5958, "mean_token_accuracy": 0.4344827651977539, "step": 110235 }, { "epoch": 0.1110348988408013, "grad_norm": 11.698265739953348, "learning_rate": 4.9543254868584716e-05, "loss": 2.6384, "mean_token_accuracy": 0.37241379022598264, "step": 110240 }, { "epoch": 0.11103993489390547, "grad_norm": 14.712600744845648, "learning_rate": 4.9543179747367466e-05, "loss": 2.9592, "mean_token_accuracy": 0.33793103098869326, "step": 110245 }, { "epoch": 0.11104497094700964, "grad_norm": 11.633217030752702, "learning_rate": 4.954310462003646e-05, "loss": 2.3947, "mean_token_accuracy": 0.42177858352661135, "step": 110250 }, { "epoch": 0.11105000700011382, "grad_norm": 10.885593285085745, "learning_rate": 4.9543029486591706e-05, "loss": 2.8173, "mean_token_accuracy": 0.3862068921327591, "step": 110255 }, { "epoch": 0.11105504305321799, "grad_norm": 17.17325481498889, "learning_rate": 4.954295434703325e-05, "loss": 3.221, "mean_token_accuracy": 0.39310344159603117, "step": 110260 }, { "epoch": 0.11106007910632217, "grad_norm": 9.684765455382694, "learning_rate": 4.954287920136109e-05, "loss": 1.8703, "mean_token_accuracy": 0.5517241477966308, "step": 110265 }, { "epoch": 0.11106511515942634, "grad_norm": 9.948375734490787, "learning_rate": 4.954280404957526e-05, "loss": 2.7016, "mean_token_accuracy": 0.37586206793785093, "step": 110270 }, { "epoch": 0.1110701512125305, "grad_norm": 10.044869395111666, "learning_rate": 4.954272889167578e-05, "loss": 2.3212, "mean_token_accuracy": 0.45704779028892517, "step": 110275 }, { "epoch": 0.11107518726563467, "grad_norm": 12.321505575188331, "learning_rate": 4.9542653727662655e-05, "loss": 2.8457, "mean_token_accuracy": 0.3620689630508423, "step": 110280 }, { "epoch": 0.11108022331873885, "grad_norm": 8.825196663857502, "learning_rate": 4.9542578557535926e-05, "loss": 2.4642, "mean_token_accuracy": 0.44295220375061034, "step": 110285 }, { "epoch": 0.11108525937184302, "grad_norm": 10.082480663983144, "learning_rate": 4.9542503381295605e-05, "loss": 2.7871, "mean_token_accuracy": 0.379310342669487, "step": 110290 }, { "epoch": 0.1110902954249472, "grad_norm": 9.303642530805092, "learning_rate": 4.954242819894171e-05, "loss": 2.6453, "mean_token_accuracy": 0.3793103456497192, "step": 110295 }, { "epoch": 0.11109533147805137, "grad_norm": 11.223689810610527, "learning_rate": 4.9542353010474274e-05, "loss": 2.741, "mean_token_accuracy": 0.38620689511299133, "step": 110300 }, { "epoch": 0.11110036753115554, "grad_norm": 13.02158096692777, "learning_rate": 4.9542277815893304e-05, "loss": 2.9099, "mean_token_accuracy": 0.37241379022598264, "step": 110305 }, { "epoch": 0.11110540358425972, "grad_norm": 11.61095458882924, "learning_rate": 4.954220261519882e-05, "loss": 2.5561, "mean_token_accuracy": 0.4103448331356049, "step": 110310 }, { "epoch": 0.11111043963736389, "grad_norm": 9.355107755077444, "learning_rate": 4.954212740839085e-05, "loss": 2.4781, "mean_token_accuracy": 0.38275861740112305, "step": 110315 }, { "epoch": 0.11111547569046806, "grad_norm": 11.150287279466312, "learning_rate": 4.954205219546942e-05, "loss": 2.4673, "mean_token_accuracy": 0.4068965494632721, "step": 110320 }, { "epoch": 0.11112051174357224, "grad_norm": 13.397771189045741, "learning_rate": 4.954197697643454e-05, "loss": 3.1258, "mean_token_accuracy": 0.39310344457626345, "step": 110325 }, { "epoch": 0.11112554779667641, "grad_norm": 11.043361513122283, "learning_rate": 4.954190175128624e-05, "loss": 2.352, "mean_token_accuracy": 0.3827586233615875, "step": 110330 }, { "epoch": 0.11113058384978058, "grad_norm": 8.962144224146357, "learning_rate": 4.954182652002454e-05, "loss": 2.3292, "mean_token_accuracy": 0.4206896543502808, "step": 110335 }, { "epoch": 0.11113561990288476, "grad_norm": 9.948653972187333, "learning_rate": 4.9541751282649446e-05, "loss": 2.2913, "mean_token_accuracy": 0.4689655125141144, "step": 110340 }, { "epoch": 0.11114065595598892, "grad_norm": 15.077061327698045, "learning_rate": 4.9541676039161e-05, "loss": 2.5348, "mean_token_accuracy": 0.4000000059604645, "step": 110345 }, { "epoch": 0.11114569200909309, "grad_norm": 11.029063436377841, "learning_rate": 4.95416007895592e-05, "loss": 2.1476, "mean_token_accuracy": 0.4724137902259827, "step": 110350 }, { "epoch": 0.11115072806219727, "grad_norm": 8.888393755009425, "learning_rate": 4.954152553384408e-05, "loss": 2.7789, "mean_token_accuracy": 0.4103448212146759, "step": 110355 }, { "epoch": 0.11115576411530144, "grad_norm": 10.508304348408263, "learning_rate": 4.9541450272015676e-05, "loss": 1.7988, "mean_token_accuracy": 0.529885059595108, "step": 110360 }, { "epoch": 0.11116080016840561, "grad_norm": 12.335557853499209, "learning_rate": 4.954137500407399e-05, "loss": 2.278, "mean_token_accuracy": 0.4502117335796356, "step": 110365 }, { "epoch": 0.11116583622150979, "grad_norm": 10.536092121125144, "learning_rate": 4.9541299730019035e-05, "loss": 2.5118, "mean_token_accuracy": 0.4034482777118683, "step": 110370 }, { "epoch": 0.11117087227461396, "grad_norm": 10.50055006341244, "learning_rate": 4.9541224449850855e-05, "loss": 2.2827, "mean_token_accuracy": 0.42758620381355283, "step": 110375 }, { "epoch": 0.11117590832771813, "grad_norm": 10.495838385562928, "learning_rate": 4.954114916356945e-05, "loss": 2.5279, "mean_token_accuracy": 0.4344827592372894, "step": 110380 }, { "epoch": 0.11118094438082231, "grad_norm": 12.974342219083915, "learning_rate": 4.954107387117486e-05, "loss": 2.0973, "mean_token_accuracy": 0.517241382598877, "step": 110385 }, { "epoch": 0.11118598043392648, "grad_norm": 10.158284186311523, "learning_rate": 4.954099857266709e-05, "loss": 2.4258, "mean_token_accuracy": 0.4068965554237366, "step": 110390 }, { "epoch": 0.11119101648703066, "grad_norm": 11.423101564756257, "learning_rate": 4.9540923268046166e-05, "loss": 2.3213, "mean_token_accuracy": 0.4379310369491577, "step": 110395 }, { "epoch": 0.11119605254013483, "grad_norm": 11.37459651136444, "learning_rate": 4.954084795731212e-05, "loss": 2.298, "mean_token_accuracy": 0.4206896543502808, "step": 110400 }, { "epoch": 0.111201088593239, "grad_norm": 10.80007583529436, "learning_rate": 4.9540772640464944e-05, "loss": 2.7532, "mean_token_accuracy": 0.41034483909606934, "step": 110405 }, { "epoch": 0.11120612464634318, "grad_norm": 11.335589089895011, "learning_rate": 4.9540697317504686e-05, "loss": 2.1709, "mean_token_accuracy": 0.458620685338974, "step": 110410 }, { "epoch": 0.11121116069944734, "grad_norm": 12.228076525824628, "learning_rate": 4.9540621988431364e-05, "loss": 2.2282, "mean_token_accuracy": 0.4551724076271057, "step": 110415 }, { "epoch": 0.11121619675255151, "grad_norm": 8.762992080173202, "learning_rate": 4.9540546653244986e-05, "loss": 2.2797, "mean_token_accuracy": 0.482758629322052, "step": 110420 }, { "epoch": 0.11122123280565568, "grad_norm": 13.360289357424698, "learning_rate": 4.954047131194558e-05, "loss": 2.6025, "mean_token_accuracy": 0.4344827592372894, "step": 110425 }, { "epoch": 0.11122626885875986, "grad_norm": 11.793708266371718, "learning_rate": 4.954039596453317e-05, "loss": 2.2251, "mean_token_accuracy": 0.4172413766384125, "step": 110430 }, { "epoch": 0.11123130491186403, "grad_norm": 11.69601847267536, "learning_rate": 4.954032061100778e-05, "loss": 2.3525, "mean_token_accuracy": 0.4103448331356049, "step": 110435 }, { "epoch": 0.1112363409649682, "grad_norm": 10.845760133687905, "learning_rate": 4.954024525136942e-05, "loss": 2.445, "mean_token_accuracy": 0.42758620977401735, "step": 110440 }, { "epoch": 0.11124137701807238, "grad_norm": 12.493566632589582, "learning_rate": 4.954016988561811e-05, "loss": 2.4163, "mean_token_accuracy": 0.4379310429096222, "step": 110445 }, { "epoch": 0.11124641307117655, "grad_norm": 20.37054813027392, "learning_rate": 4.954009451375388e-05, "loss": 2.2182, "mean_token_accuracy": 0.44960677027702334, "step": 110450 }, { "epoch": 0.11125144912428073, "grad_norm": 11.551775185124189, "learning_rate": 4.954001913577675e-05, "loss": 2.5128, "mean_token_accuracy": 0.3931034505367279, "step": 110455 }, { "epoch": 0.1112564851773849, "grad_norm": 10.053724802856078, "learning_rate": 4.953994375168674e-05, "loss": 2.4334, "mean_token_accuracy": 0.43793103098869324, "step": 110460 }, { "epoch": 0.11126152123048907, "grad_norm": 10.242921357952964, "learning_rate": 4.953986836148387e-05, "loss": 2.4438, "mean_token_accuracy": 0.42413793206214906, "step": 110465 }, { "epoch": 0.11126655728359325, "grad_norm": 14.65582635615077, "learning_rate": 4.953979296516815e-05, "loss": 2.39, "mean_token_accuracy": 0.41034482717514037, "step": 110470 }, { "epoch": 0.11127159333669742, "grad_norm": 10.09093030157355, "learning_rate": 4.953971756273962e-05, "loss": 2.5585, "mean_token_accuracy": 0.39655172228813174, "step": 110475 }, { "epoch": 0.1112766293898016, "grad_norm": 9.750261506284803, "learning_rate": 4.953964215419829e-05, "loss": 2.2408, "mean_token_accuracy": 0.4689655125141144, "step": 110480 }, { "epoch": 0.11128166544290576, "grad_norm": 10.527660368852109, "learning_rate": 4.9539566739544185e-05, "loss": 2.7538, "mean_token_accuracy": 0.36896551549434664, "step": 110485 }, { "epoch": 0.11128670149600993, "grad_norm": 10.506903661457997, "learning_rate": 4.953949131877732e-05, "loss": 2.0252, "mean_token_accuracy": 0.47931034564971925, "step": 110490 }, { "epoch": 0.1112917375491141, "grad_norm": 10.828540407467989, "learning_rate": 4.9539415891897724e-05, "loss": 2.2575, "mean_token_accuracy": 0.48275862336158754, "step": 110495 }, { "epoch": 0.11129677360221828, "grad_norm": 11.35562333335125, "learning_rate": 4.953934045890541e-05, "loss": 2.5985, "mean_token_accuracy": 0.37586206793785093, "step": 110500 }, { "epoch": 0.11130180965532245, "grad_norm": 9.725937323099147, "learning_rate": 4.9539265019800404e-05, "loss": 2.3709, "mean_token_accuracy": 0.4620689690113068, "step": 110505 }, { "epoch": 0.11130684570842662, "grad_norm": 10.884393705252405, "learning_rate": 4.953918957458273e-05, "loss": 2.2995, "mean_token_accuracy": 0.48154870271682737, "step": 110510 }, { "epoch": 0.1113118817615308, "grad_norm": 12.848763980197587, "learning_rate": 4.95391141232524e-05, "loss": 2.5165, "mean_token_accuracy": 0.45547489523887635, "step": 110515 }, { "epoch": 0.11131691781463497, "grad_norm": 10.762092124487754, "learning_rate": 4.953903866580945e-05, "loss": 2.3617, "mean_token_accuracy": 0.42413792610168455, "step": 110520 }, { "epoch": 0.11132195386773915, "grad_norm": 12.958528396881144, "learning_rate": 4.9538963202253877e-05, "loss": 2.5745, "mean_token_accuracy": 0.4, "step": 110525 }, { "epoch": 0.11132698992084332, "grad_norm": 11.03736355806719, "learning_rate": 4.953888773258572e-05, "loss": 2.5607, "mean_token_accuracy": 0.37586206793785093, "step": 110530 }, { "epoch": 0.1113320259739475, "grad_norm": 12.355525708990191, "learning_rate": 4.953881225680499e-05, "loss": 2.6048, "mean_token_accuracy": 0.3931034475564957, "step": 110535 }, { "epoch": 0.11133706202705167, "grad_norm": 10.32782352270584, "learning_rate": 4.953873677491172e-05, "loss": 2.3601, "mean_token_accuracy": 0.42068964838981626, "step": 110540 }, { "epoch": 0.11134209808015584, "grad_norm": 9.827078563739821, "learning_rate": 4.9538661286905916e-05, "loss": 2.4995, "mean_token_accuracy": 0.43448275327682495, "step": 110545 }, { "epoch": 0.11134713413326001, "grad_norm": 10.612580609890855, "learning_rate": 4.9538585792787615e-05, "loss": 2.3123, "mean_token_accuracy": 0.3931034505367279, "step": 110550 }, { "epoch": 0.11135217018636417, "grad_norm": 10.907721166149548, "learning_rate": 4.9538510292556826e-05, "loss": 2.6297, "mean_token_accuracy": 0.4034482777118683, "step": 110555 }, { "epoch": 0.11135720623946835, "grad_norm": 10.490573046916055, "learning_rate": 4.9538434786213576e-05, "loss": 2.3859, "mean_token_accuracy": 0.42758620381355283, "step": 110560 }, { "epoch": 0.11136224229257252, "grad_norm": 9.34342491229045, "learning_rate": 4.953835927375789e-05, "loss": 2.4234, "mean_token_accuracy": 0.4206896424293518, "step": 110565 }, { "epoch": 0.1113672783456767, "grad_norm": 10.296674005922966, "learning_rate": 4.9538283755189774e-05, "loss": 2.6413, "mean_token_accuracy": 0.4068965494632721, "step": 110570 }, { "epoch": 0.11137231439878087, "grad_norm": 12.215545999974788, "learning_rate": 4.953820823050926e-05, "loss": 2.6204, "mean_token_accuracy": 0.4137930929660797, "step": 110575 }, { "epoch": 0.11137735045188504, "grad_norm": 9.997158353579897, "learning_rate": 4.953813269971637e-05, "loss": 1.9829, "mean_token_accuracy": 0.46412582993507384, "step": 110580 }, { "epoch": 0.11138238650498922, "grad_norm": 11.591238824863009, "learning_rate": 4.953805716281112e-05, "loss": 2.1891, "mean_token_accuracy": 0.4413793087005615, "step": 110585 }, { "epoch": 0.11138742255809339, "grad_norm": 10.285217000722298, "learning_rate": 4.953798161979353e-05, "loss": 2.2364, "mean_token_accuracy": 0.4344827592372894, "step": 110590 }, { "epoch": 0.11139245861119756, "grad_norm": 11.157793969430854, "learning_rate": 4.9537906070663624e-05, "loss": 2.8121, "mean_token_accuracy": 0.37586206793785093, "step": 110595 }, { "epoch": 0.11139749466430174, "grad_norm": 10.35494370593346, "learning_rate": 4.9537830515421426e-05, "loss": 2.7957, "mean_token_accuracy": 0.37241379022598264, "step": 110600 }, { "epoch": 0.11140253071740591, "grad_norm": 12.853773983244253, "learning_rate": 4.953775495406695e-05, "loss": 2.4785, "mean_token_accuracy": 0.3931034505367279, "step": 110605 }, { "epoch": 0.11140756677051009, "grad_norm": 11.210287908247357, "learning_rate": 4.953767938660022e-05, "loss": 2.4828, "mean_token_accuracy": 0.42413792610168455, "step": 110610 }, { "epoch": 0.11141260282361426, "grad_norm": 9.498600452489391, "learning_rate": 4.953760381302126e-05, "loss": 2.367, "mean_token_accuracy": 0.3827586233615875, "step": 110615 }, { "epoch": 0.11141763887671843, "grad_norm": 11.819527715681321, "learning_rate": 4.953752823333008e-05, "loss": 2.2712, "mean_token_accuracy": 0.46781609058380125, "step": 110620 }, { "epoch": 0.1114226749298226, "grad_norm": 13.00013439027541, "learning_rate": 4.9537452647526724e-05, "loss": 2.7481, "mean_token_accuracy": 0.4206896543502808, "step": 110625 }, { "epoch": 0.11142771098292677, "grad_norm": 10.87984326479111, "learning_rate": 4.953737705561119e-05, "loss": 2.2068, "mean_token_accuracy": 0.47931033968925474, "step": 110630 }, { "epoch": 0.11143274703603094, "grad_norm": 9.772128858780778, "learning_rate": 4.953730145758351e-05, "loss": 2.2872, "mean_token_accuracy": 0.4551724076271057, "step": 110635 }, { "epoch": 0.11143778308913511, "grad_norm": 10.115109721506451, "learning_rate": 4.953722585344369e-05, "loss": 2.7062, "mean_token_accuracy": 0.3482758581638336, "step": 110640 }, { "epoch": 0.11144281914223929, "grad_norm": 11.443729566491506, "learning_rate": 4.953715024319178e-05, "loss": 2.6285, "mean_token_accuracy": 0.3862068921327591, "step": 110645 }, { "epoch": 0.11144785519534346, "grad_norm": 11.755669373232548, "learning_rate": 4.953707462682778e-05, "loss": 2.6414, "mean_token_accuracy": 0.4379310429096222, "step": 110650 }, { "epoch": 0.11145289124844764, "grad_norm": 11.07940779943921, "learning_rate": 4.953699900435172e-05, "loss": 2.3766, "mean_token_accuracy": 0.3999999940395355, "step": 110655 }, { "epoch": 0.11145792730155181, "grad_norm": 13.284320189212751, "learning_rate": 4.95369233757636e-05, "loss": 2.5046, "mean_token_accuracy": 0.4206896543502808, "step": 110660 }, { "epoch": 0.11146296335465598, "grad_norm": 9.548899674063628, "learning_rate": 4.9536847741063466e-05, "loss": 2.109, "mean_token_accuracy": 0.4793103516101837, "step": 110665 }, { "epoch": 0.11146799940776016, "grad_norm": 13.762526908063501, "learning_rate": 4.953677210025133e-05, "loss": 2.4152, "mean_token_accuracy": 0.4206896543502808, "step": 110670 }, { "epoch": 0.11147303546086433, "grad_norm": 10.376260966025225, "learning_rate": 4.9536696453327214e-05, "loss": 1.9672, "mean_token_accuracy": 0.4979064047336578, "step": 110675 }, { "epoch": 0.1114780715139685, "grad_norm": 11.30098161996951, "learning_rate": 4.953662080029113e-05, "loss": 2.3488, "mean_token_accuracy": 0.493103438615799, "step": 110680 }, { "epoch": 0.11148310756707268, "grad_norm": 13.23386022898983, "learning_rate": 4.953654514114312e-05, "loss": 2.3514, "mean_token_accuracy": 0.41724138259887694, "step": 110685 }, { "epoch": 0.11148814362017685, "grad_norm": 10.60955388038811, "learning_rate": 4.953646947588319e-05, "loss": 2.4547, "mean_token_accuracy": 0.4379310369491577, "step": 110690 }, { "epoch": 0.11149317967328101, "grad_norm": 10.292455483662872, "learning_rate": 4.953639380451136e-05, "loss": 2.5023, "mean_token_accuracy": 0.43448275327682495, "step": 110695 }, { "epoch": 0.11149821572638519, "grad_norm": 10.716393784483344, "learning_rate": 4.953631812702765e-05, "loss": 2.4966, "mean_token_accuracy": 0.4068965554237366, "step": 110700 }, { "epoch": 0.11150325177948936, "grad_norm": 12.164293124710401, "learning_rate": 4.953624244343209e-05, "loss": 2.4054, "mean_token_accuracy": 0.4154264986515045, "step": 110705 }, { "epoch": 0.11150828783259353, "grad_norm": 10.710341024233369, "learning_rate": 4.9536166753724694e-05, "loss": 2.5788, "mean_token_accuracy": 0.41379310488700866, "step": 110710 }, { "epoch": 0.11151332388569771, "grad_norm": 11.588803029077017, "learning_rate": 4.953609105790549e-05, "loss": 2.4546, "mean_token_accuracy": 0.4068965494632721, "step": 110715 }, { "epoch": 0.11151835993880188, "grad_norm": 9.482871540570281, "learning_rate": 4.953601535597448e-05, "loss": 2.3732, "mean_token_accuracy": 0.44482758045196535, "step": 110720 }, { "epoch": 0.11152339599190605, "grad_norm": 10.663535739391383, "learning_rate": 4.9535939647931715e-05, "loss": 2.661, "mean_token_accuracy": 0.3896551787853241, "step": 110725 }, { "epoch": 0.11152843204501023, "grad_norm": 10.059427643298658, "learning_rate": 4.953586393377719e-05, "loss": 2.2777, "mean_token_accuracy": 0.4604355812072754, "step": 110730 }, { "epoch": 0.1115334680981144, "grad_norm": 11.343786322693434, "learning_rate": 4.953578821351094e-05, "loss": 2.2654, "mean_token_accuracy": 0.4620689570903778, "step": 110735 }, { "epoch": 0.11153850415121858, "grad_norm": 11.656704348393465, "learning_rate": 4.953571248713297e-05, "loss": 3.1517, "mean_token_accuracy": 0.3310344755649567, "step": 110740 }, { "epoch": 0.11154354020432275, "grad_norm": 10.279025073617573, "learning_rate": 4.9535636754643336e-05, "loss": 2.7121, "mean_token_accuracy": 0.4, "step": 110745 }, { "epoch": 0.11154857625742692, "grad_norm": 9.535866612954388, "learning_rate": 4.953556101604202e-05, "loss": 2.3325, "mean_token_accuracy": 0.4413793087005615, "step": 110750 }, { "epoch": 0.1115536123105311, "grad_norm": 9.502575982822695, "learning_rate": 4.953548527132906e-05, "loss": 2.5756, "mean_token_accuracy": 0.3965517282485962, "step": 110755 }, { "epoch": 0.11155864836363527, "grad_norm": 9.339453241139926, "learning_rate": 4.953540952050448e-05, "loss": 2.7921, "mean_token_accuracy": 0.4000000059604645, "step": 110760 }, { "epoch": 0.11156368441673943, "grad_norm": 13.484861169499599, "learning_rate": 4.9535333763568296e-05, "loss": 2.7356, "mean_token_accuracy": 0.4413793087005615, "step": 110765 }, { "epoch": 0.1115687204698436, "grad_norm": 11.06732019133782, "learning_rate": 4.953525800052053e-05, "loss": 2.9855, "mean_token_accuracy": 0.38965516686439516, "step": 110770 }, { "epoch": 0.11157375652294778, "grad_norm": 9.616676088453826, "learning_rate": 4.9535182231361205e-05, "loss": 2.3516, "mean_token_accuracy": 0.4068965494632721, "step": 110775 }, { "epoch": 0.11157879257605195, "grad_norm": 11.352184240263004, "learning_rate": 4.9535106456090334e-05, "loss": 2.4339, "mean_token_accuracy": 0.3944343626499176, "step": 110780 }, { "epoch": 0.11158382862915613, "grad_norm": 12.396612748659251, "learning_rate": 4.953503067470795e-05, "loss": 2.707, "mean_token_accuracy": 0.40514216423034666, "step": 110785 }, { "epoch": 0.1115888646822603, "grad_norm": 9.779578194676493, "learning_rate": 4.953495488721407e-05, "loss": 2.478, "mean_token_accuracy": 0.42758620977401735, "step": 110790 }, { "epoch": 0.11159390073536447, "grad_norm": 11.795191925124541, "learning_rate": 4.95348790936087e-05, "loss": 2.8242, "mean_token_accuracy": 0.34482758641242983, "step": 110795 }, { "epoch": 0.11159893678846865, "grad_norm": 9.15154141335474, "learning_rate": 4.9534803293891885e-05, "loss": 2.4325, "mean_token_accuracy": 0.458620685338974, "step": 110800 }, { "epoch": 0.11160397284157282, "grad_norm": 10.246749224216927, "learning_rate": 4.9534727488063636e-05, "loss": 2.6053, "mean_token_accuracy": 0.41034482419490814, "step": 110805 }, { "epoch": 0.111609008894677, "grad_norm": 10.1329068323462, "learning_rate": 4.953465167612397e-05, "loss": 2.2484, "mean_token_accuracy": 0.47931033968925474, "step": 110810 }, { "epoch": 0.11161404494778117, "grad_norm": 9.905672545290285, "learning_rate": 4.953457585807291e-05, "loss": 2.5303, "mean_token_accuracy": 0.4344827651977539, "step": 110815 }, { "epoch": 0.11161908100088534, "grad_norm": 10.765856681195736, "learning_rate": 4.9534500033910474e-05, "loss": 2.3704, "mean_token_accuracy": 0.42413793206214906, "step": 110820 }, { "epoch": 0.11162411705398952, "grad_norm": 11.593184941508369, "learning_rate": 4.953442420363669e-05, "loss": 2.3821, "mean_token_accuracy": 0.4344827592372894, "step": 110825 }, { "epoch": 0.11162915310709369, "grad_norm": 14.602525630328351, "learning_rate": 4.953434836725158e-05, "loss": 2.6927, "mean_token_accuracy": 0.358620685338974, "step": 110830 }, { "epoch": 0.11163418916019785, "grad_norm": 8.303154935895021, "learning_rate": 4.953427252475517e-05, "loss": 2.1447, "mean_token_accuracy": 0.46206897497177124, "step": 110835 }, { "epoch": 0.11163922521330202, "grad_norm": 9.626989284073598, "learning_rate": 4.953419667614745e-05, "loss": 2.2338, "mean_token_accuracy": 0.4379310429096222, "step": 110840 }, { "epoch": 0.1116442612664062, "grad_norm": 9.507409702526015, "learning_rate": 4.953412082142848e-05, "loss": 2.0527, "mean_token_accuracy": 0.49999999403953554, "step": 110845 }, { "epoch": 0.11164929731951037, "grad_norm": 11.49961083813994, "learning_rate": 4.9534044960598264e-05, "loss": 2.5957, "mean_token_accuracy": 0.3965517282485962, "step": 110850 }, { "epoch": 0.11165433337261454, "grad_norm": 9.7436923832067, "learning_rate": 4.9533969093656825e-05, "loss": 2.2096, "mean_token_accuracy": 0.4206896543502808, "step": 110855 }, { "epoch": 0.11165936942571872, "grad_norm": 10.648083208190558, "learning_rate": 4.953389322060417e-05, "loss": 2.5918, "mean_token_accuracy": 0.4206896543502808, "step": 110860 }, { "epoch": 0.11166440547882289, "grad_norm": 10.816188705176922, "learning_rate": 4.953381734144034e-05, "loss": 2.6074, "mean_token_accuracy": 0.42413793206214906, "step": 110865 }, { "epoch": 0.11166944153192707, "grad_norm": 10.177074200698323, "learning_rate": 4.953374145616535e-05, "loss": 2.7441, "mean_token_accuracy": 0.3999999940395355, "step": 110870 }, { "epoch": 0.11167447758503124, "grad_norm": 13.473949425816114, "learning_rate": 4.9533665564779215e-05, "loss": 2.6249, "mean_token_accuracy": 0.3896551787853241, "step": 110875 }, { "epoch": 0.11167951363813541, "grad_norm": 9.925723353855735, "learning_rate": 4.9533589667281964e-05, "loss": 2.2559, "mean_token_accuracy": 0.47931034564971925, "step": 110880 }, { "epoch": 0.11168454969123959, "grad_norm": 11.997162726550556, "learning_rate": 4.9533513763673614e-05, "loss": 2.7263, "mean_token_accuracy": 0.3896551698446274, "step": 110885 }, { "epoch": 0.11168958574434376, "grad_norm": 9.295672855691638, "learning_rate": 4.9533437853954186e-05, "loss": 2.0838, "mean_token_accuracy": 0.5068965554237366, "step": 110890 }, { "epoch": 0.11169462179744793, "grad_norm": 10.298976124105616, "learning_rate": 4.95333619381237e-05, "loss": 2.6653, "mean_token_accuracy": 0.39310343861579894, "step": 110895 }, { "epoch": 0.11169965785055211, "grad_norm": 10.337588599310013, "learning_rate": 4.953328601618219e-05, "loss": 2.3744, "mean_token_accuracy": 0.43103447556495667, "step": 110900 }, { "epoch": 0.11170469390365627, "grad_norm": 10.603923910155563, "learning_rate": 4.9533210088129654e-05, "loss": 2.5229, "mean_token_accuracy": 0.41724138259887694, "step": 110905 }, { "epoch": 0.11170972995676044, "grad_norm": 11.612979925316814, "learning_rate": 4.953313415396613e-05, "loss": 2.1222, "mean_token_accuracy": 0.46551724672317507, "step": 110910 }, { "epoch": 0.11171476600986462, "grad_norm": 9.193943927243676, "learning_rate": 4.953305821369163e-05, "loss": 1.8503, "mean_token_accuracy": 0.5344827532768249, "step": 110915 }, { "epoch": 0.11171980206296879, "grad_norm": 10.12334670770148, "learning_rate": 4.953298226730618e-05, "loss": 2.4957, "mean_token_accuracy": 0.4275861978530884, "step": 110920 }, { "epoch": 0.11172483811607296, "grad_norm": 9.054474149665763, "learning_rate": 4.953290631480981e-05, "loss": 1.8832, "mean_token_accuracy": 0.5206896603107453, "step": 110925 }, { "epoch": 0.11172987416917714, "grad_norm": 10.053945288436957, "learning_rate": 4.953283035620251e-05, "loss": 2.1417, "mean_token_accuracy": 0.4517241418361664, "step": 110930 }, { "epoch": 0.11173491022228131, "grad_norm": 9.412402485151269, "learning_rate": 4.9532754391484346e-05, "loss": 2.1324, "mean_token_accuracy": 0.4862069010734558, "step": 110935 }, { "epoch": 0.11173994627538548, "grad_norm": 9.430807208054931, "learning_rate": 4.95326784206553e-05, "loss": 2.3637, "mean_token_accuracy": 0.45517241954803467, "step": 110940 }, { "epoch": 0.11174498232848966, "grad_norm": 10.463232152204137, "learning_rate": 4.953260244371542e-05, "loss": 2.4481, "mean_token_accuracy": 0.4068965494632721, "step": 110945 }, { "epoch": 0.11175001838159383, "grad_norm": 9.247274572045685, "learning_rate": 4.95325264606647e-05, "loss": 2.2117, "mean_token_accuracy": 0.42758620381355283, "step": 110950 }, { "epoch": 0.111755054434698, "grad_norm": 10.650369682632245, "learning_rate": 4.953245047150319e-05, "loss": 2.3998, "mean_token_accuracy": 0.39655172228813174, "step": 110955 }, { "epoch": 0.11176009048780218, "grad_norm": 12.732139309490627, "learning_rate": 4.953237447623089e-05, "loss": 2.5877, "mean_token_accuracy": 0.4, "step": 110960 }, { "epoch": 0.11176512654090635, "grad_norm": 10.861681915362963, "learning_rate": 4.953229847484783e-05, "loss": 2.3366, "mean_token_accuracy": 0.3896551728248596, "step": 110965 }, { "epoch": 0.11177016259401053, "grad_norm": 9.858798211628004, "learning_rate": 4.9532222467354034e-05, "loss": 2.3234, "mean_token_accuracy": 0.4982456147670746, "step": 110970 }, { "epoch": 0.11177519864711469, "grad_norm": 10.124148828320575, "learning_rate": 4.953214645374952e-05, "loss": 2.4675, "mean_token_accuracy": 0.458620685338974, "step": 110975 }, { "epoch": 0.11178023470021886, "grad_norm": 16.524433944756108, "learning_rate": 4.95320704340343e-05, "loss": 3.1711, "mean_token_accuracy": 0.358620685338974, "step": 110980 }, { "epoch": 0.11178527075332303, "grad_norm": 11.320927483490227, "learning_rate": 4.95319944082084e-05, "loss": 2.2631, "mean_token_accuracy": 0.42758620977401735, "step": 110985 }, { "epoch": 0.11179030680642721, "grad_norm": 12.663338965840186, "learning_rate": 4.953191837627186e-05, "loss": 2.5605, "mean_token_accuracy": 0.4689655065536499, "step": 110990 }, { "epoch": 0.11179534285953138, "grad_norm": 13.396691761271915, "learning_rate": 4.9531842338224675e-05, "loss": 2.2233, "mean_token_accuracy": 0.4310344815254211, "step": 110995 }, { "epoch": 0.11180037891263556, "grad_norm": 12.472089474532494, "learning_rate": 4.953176629406688e-05, "loss": 2.1972, "mean_token_accuracy": 0.4689655125141144, "step": 111000 }, { "epoch": 0.11180541496573973, "grad_norm": 14.046998961968201, "learning_rate": 4.953169024379849e-05, "loss": 2.3181, "mean_token_accuracy": 0.42758620977401735, "step": 111005 }, { "epoch": 0.1118104510188439, "grad_norm": 9.540763109644704, "learning_rate": 4.953161418741953e-05, "loss": 2.1762, "mean_token_accuracy": 0.4931034505367279, "step": 111010 }, { "epoch": 0.11181548707194808, "grad_norm": 13.756471299461124, "learning_rate": 4.9531538124930017e-05, "loss": 2.8185, "mean_token_accuracy": 0.3896551728248596, "step": 111015 }, { "epoch": 0.11182052312505225, "grad_norm": 8.693958263619736, "learning_rate": 4.9531462056329976e-05, "loss": 2.5323, "mean_token_accuracy": 0.4103448212146759, "step": 111020 }, { "epoch": 0.11182555917815643, "grad_norm": 10.713517853928321, "learning_rate": 4.9531385981619426e-05, "loss": 2.0694, "mean_token_accuracy": 0.4931034445762634, "step": 111025 }, { "epoch": 0.1118305952312606, "grad_norm": 9.69548130710903, "learning_rate": 4.9531309900798386e-05, "loss": 2.3841, "mean_token_accuracy": 0.4689655065536499, "step": 111030 }, { "epoch": 0.11183563128436477, "grad_norm": 12.798650720802252, "learning_rate": 4.953123381386689e-05, "loss": 2.7263, "mean_token_accuracy": 0.4034482777118683, "step": 111035 }, { "epoch": 0.11184066733746895, "grad_norm": 11.525139206890879, "learning_rate": 4.953115772082495e-05, "loss": 2.8028, "mean_token_accuracy": 0.39310344457626345, "step": 111040 }, { "epoch": 0.1118457033905731, "grad_norm": 10.865228083289118, "learning_rate": 4.953108162167257e-05, "loss": 2.3897, "mean_token_accuracy": 0.441379314661026, "step": 111045 }, { "epoch": 0.11185073944367728, "grad_norm": 10.565659199273556, "learning_rate": 4.95310055164098e-05, "loss": 2.4671, "mean_token_accuracy": 0.4310344815254211, "step": 111050 }, { "epoch": 0.11185577549678145, "grad_norm": 10.237238183303273, "learning_rate": 4.9530929405036644e-05, "loss": 2.8823, "mean_token_accuracy": 0.3448275804519653, "step": 111055 }, { "epoch": 0.11186081154988563, "grad_norm": 9.951644754213525, "learning_rate": 4.953085328755313e-05, "loss": 2.7408, "mean_token_accuracy": 0.37241379022598264, "step": 111060 }, { "epoch": 0.1118658476029898, "grad_norm": 10.484451819199595, "learning_rate": 4.953077716395928e-05, "loss": 2.1717, "mean_token_accuracy": 0.441379314661026, "step": 111065 }, { "epoch": 0.11187088365609398, "grad_norm": 12.531515707193178, "learning_rate": 4.95307010342551e-05, "loss": 2.3904, "mean_token_accuracy": 0.38620689511299133, "step": 111070 }, { "epoch": 0.11187591970919815, "grad_norm": 9.875491658205487, "learning_rate": 4.953062489844064e-05, "loss": 2.3522, "mean_token_accuracy": 0.4586206912994385, "step": 111075 }, { "epoch": 0.11188095576230232, "grad_norm": 10.313897918031554, "learning_rate": 4.953054875651589e-05, "loss": 2.5532, "mean_token_accuracy": 0.4275862157344818, "step": 111080 }, { "epoch": 0.1118859918154065, "grad_norm": 10.97537884778486, "learning_rate": 4.953047260848089e-05, "loss": 2.5606, "mean_token_accuracy": 0.4172413766384125, "step": 111085 }, { "epoch": 0.11189102786851067, "grad_norm": 10.810061069684265, "learning_rate": 4.953039645433565e-05, "loss": 2.478, "mean_token_accuracy": 0.4000000059604645, "step": 111090 }, { "epoch": 0.11189606392161484, "grad_norm": 9.310833566438879, "learning_rate": 4.953032029408021e-05, "loss": 2.3566, "mean_token_accuracy": 0.46896551847457885, "step": 111095 }, { "epoch": 0.11190109997471902, "grad_norm": 9.468411684501064, "learning_rate": 4.9530244127714567e-05, "loss": 2.4165, "mean_token_accuracy": 0.4344827592372894, "step": 111100 }, { "epoch": 0.11190613602782319, "grad_norm": 8.842924622038577, "learning_rate": 4.953016795523876e-05, "loss": 2.2643, "mean_token_accuracy": 0.5049261093139649, "step": 111105 }, { "epoch": 0.11191117208092737, "grad_norm": 10.918748783857758, "learning_rate": 4.95300917766528e-05, "loss": 2.1434, "mean_token_accuracy": 0.42758620381355283, "step": 111110 }, { "epoch": 0.11191620813403153, "grad_norm": 13.525772695657489, "learning_rate": 4.9530015591956717e-05, "loss": 2.4318, "mean_token_accuracy": 0.4605565667152405, "step": 111115 }, { "epoch": 0.1119212441871357, "grad_norm": 9.816509222091243, "learning_rate": 4.952993940115052e-05, "loss": 2.4423, "mean_token_accuracy": 0.44482757449150084, "step": 111120 }, { "epoch": 0.11192628024023987, "grad_norm": 9.712713442780796, "learning_rate": 4.952986320423424e-05, "loss": 2.2117, "mean_token_accuracy": 0.4586206912994385, "step": 111125 }, { "epoch": 0.11193131629334405, "grad_norm": 12.10666252517279, "learning_rate": 4.9529787001207895e-05, "loss": 2.4886, "mean_token_accuracy": 0.39655172228813174, "step": 111130 }, { "epoch": 0.11193635234644822, "grad_norm": 12.167263075450759, "learning_rate": 4.95297107920715e-05, "loss": 2.5754, "mean_token_accuracy": 0.3758620619773865, "step": 111135 }, { "epoch": 0.1119413883995524, "grad_norm": 14.332584530072541, "learning_rate": 4.95296345768251e-05, "loss": 2.9891, "mean_token_accuracy": 0.4413793087005615, "step": 111140 }, { "epoch": 0.11194642445265657, "grad_norm": 9.670487177999423, "learning_rate": 4.9529558355468685e-05, "loss": 2.6526, "mean_token_accuracy": 0.38275861740112305, "step": 111145 }, { "epoch": 0.11195146050576074, "grad_norm": 9.62758780723168, "learning_rate": 4.9529482128002296e-05, "loss": 2.0316, "mean_token_accuracy": 0.5194192349910736, "step": 111150 }, { "epoch": 0.11195649655886492, "grad_norm": 11.294622904054343, "learning_rate": 4.9529405894425946e-05, "loss": 2.5927, "mean_token_accuracy": 0.42583181858062746, "step": 111155 }, { "epoch": 0.11196153261196909, "grad_norm": 9.049024622146542, "learning_rate": 4.9529329654739656e-05, "loss": 2.3028, "mean_token_accuracy": 0.41379311084747317, "step": 111160 }, { "epoch": 0.11196656866507326, "grad_norm": 7.460371526434573, "learning_rate": 4.952925340894345e-05, "loss": 2.272, "mean_token_accuracy": 0.45517241954803467, "step": 111165 }, { "epoch": 0.11197160471817744, "grad_norm": 10.561710427335155, "learning_rate": 4.952917715703735e-05, "loss": 2.6899, "mean_token_accuracy": 0.3275861978530884, "step": 111170 }, { "epoch": 0.11197664077128161, "grad_norm": 8.509174651008847, "learning_rate": 4.9529100899021376e-05, "loss": 2.1767, "mean_token_accuracy": 0.4517241418361664, "step": 111175 }, { "epoch": 0.11198167682438578, "grad_norm": 9.766128160097967, "learning_rate": 4.952902463489555e-05, "loss": 2.2074, "mean_token_accuracy": 0.482758629322052, "step": 111180 }, { "epoch": 0.11198671287748994, "grad_norm": 11.084506233577857, "learning_rate": 4.9528948364659885e-05, "loss": 2.6478, "mean_token_accuracy": 0.42885662317276, "step": 111185 }, { "epoch": 0.11199174893059412, "grad_norm": 11.296836032036177, "learning_rate": 4.952887208831441e-05, "loss": 2.285, "mean_token_accuracy": 0.45862069725990295, "step": 111190 }, { "epoch": 0.11199678498369829, "grad_norm": 10.208591807688657, "learning_rate": 4.952879580585915e-05, "loss": 2.7127, "mean_token_accuracy": 0.36896551251411436, "step": 111195 }, { "epoch": 0.11200182103680247, "grad_norm": 12.768241008486962, "learning_rate": 4.9528719517294125e-05, "loss": 2.5838, "mean_token_accuracy": 0.42758620381355283, "step": 111200 }, { "epoch": 0.11200685708990664, "grad_norm": 11.832199326700236, "learning_rate": 4.952864322261935e-05, "loss": 2.5365, "mean_token_accuracy": 0.3862068921327591, "step": 111205 }, { "epoch": 0.11201189314301081, "grad_norm": 10.47701256959548, "learning_rate": 4.952856692183483e-05, "loss": 2.5727, "mean_token_accuracy": 0.42758620977401735, "step": 111210 }, { "epoch": 0.11201692919611499, "grad_norm": 11.959750929542492, "learning_rate": 4.952849061494063e-05, "loss": 2.0576, "mean_token_accuracy": 0.5399878978729248, "step": 111215 }, { "epoch": 0.11202196524921916, "grad_norm": 14.215202551521886, "learning_rate": 4.9528414301936736e-05, "loss": 2.2206, "mean_token_accuracy": 0.5229280173778534, "step": 111220 }, { "epoch": 0.11202700130232333, "grad_norm": 11.877617653268583, "learning_rate": 4.952833798282318e-05, "loss": 2.1696, "mean_token_accuracy": 0.4482758641242981, "step": 111225 }, { "epoch": 0.11203203735542751, "grad_norm": 12.249088500671558, "learning_rate": 4.9528261657599976e-05, "loss": 2.7504, "mean_token_accuracy": 0.38275861740112305, "step": 111230 }, { "epoch": 0.11203707340853168, "grad_norm": 10.589151339813798, "learning_rate": 4.952818532626716e-05, "loss": 2.4574, "mean_token_accuracy": 0.4551724135875702, "step": 111235 }, { "epoch": 0.11204210946163586, "grad_norm": 9.036791724909689, "learning_rate": 4.9528108988824744e-05, "loss": 2.4017, "mean_token_accuracy": 0.4880822837352753, "step": 111240 }, { "epoch": 0.11204714551474003, "grad_norm": 9.974213918441478, "learning_rate": 4.952803264527274e-05, "loss": 2.0507, "mean_token_accuracy": 0.4758620738983154, "step": 111245 }, { "epoch": 0.1120521815678442, "grad_norm": 9.420207511631451, "learning_rate": 4.952795629561119e-05, "loss": 2.3749, "mean_token_accuracy": 0.4103448212146759, "step": 111250 }, { "epoch": 0.11205721762094836, "grad_norm": 13.017623951879994, "learning_rate": 4.9527879939840104e-05, "loss": 2.7838, "mean_token_accuracy": 0.4517241418361664, "step": 111255 }, { "epoch": 0.11206225367405254, "grad_norm": 10.082903539599533, "learning_rate": 4.95278035779595e-05, "loss": 2.2282, "mean_token_accuracy": 0.4344827592372894, "step": 111260 }, { "epoch": 0.11206728972715671, "grad_norm": 10.479371746569397, "learning_rate": 4.95277272099694e-05, "loss": 2.328, "mean_token_accuracy": 0.4506352126598358, "step": 111265 }, { "epoch": 0.11207232578026088, "grad_norm": 9.717997432650758, "learning_rate": 4.9527650835869826e-05, "loss": 2.6857, "mean_token_accuracy": 0.39655172228813174, "step": 111270 }, { "epoch": 0.11207736183336506, "grad_norm": 9.907502702399379, "learning_rate": 4.952757445566081e-05, "loss": 2.3944, "mean_token_accuracy": 0.42413793206214906, "step": 111275 }, { "epoch": 0.11208239788646923, "grad_norm": 9.84157711374118, "learning_rate": 4.9527498069342357e-05, "loss": 2.2078, "mean_token_accuracy": 0.458620685338974, "step": 111280 }, { "epoch": 0.1120874339395734, "grad_norm": 11.632289960422431, "learning_rate": 4.9527421676914505e-05, "loss": 2.5868, "mean_token_accuracy": 0.42758620977401735, "step": 111285 }, { "epoch": 0.11209246999267758, "grad_norm": 11.210209146752737, "learning_rate": 4.9527345278377255e-05, "loss": 2.4733, "mean_token_accuracy": 0.4068965494632721, "step": 111290 }, { "epoch": 0.11209750604578175, "grad_norm": 10.436801095705171, "learning_rate": 4.952726887373064e-05, "loss": 2.5441, "mean_token_accuracy": 0.36551724672317504, "step": 111295 }, { "epoch": 0.11210254209888593, "grad_norm": 11.892442080040759, "learning_rate": 4.9527192462974684e-05, "loss": 2.6294, "mean_token_accuracy": 0.3655172407627106, "step": 111300 }, { "epoch": 0.1121075781519901, "grad_norm": 11.308370950501159, "learning_rate": 4.9527116046109397e-05, "loss": 2.3855, "mean_token_accuracy": 0.4, "step": 111305 }, { "epoch": 0.11211261420509427, "grad_norm": 9.276320636204971, "learning_rate": 4.952703962313481e-05, "loss": 2.063, "mean_token_accuracy": 0.5086509466171265, "step": 111310 }, { "epoch": 0.11211765025819845, "grad_norm": 10.219062619352373, "learning_rate": 4.952696319405094e-05, "loss": 2.2618, "mean_token_accuracy": 0.4310344785451889, "step": 111315 }, { "epoch": 0.11212268631130262, "grad_norm": 12.749503576301349, "learning_rate": 4.952688675885782e-05, "loss": 2.3858, "mean_token_accuracy": 0.43297035694122316, "step": 111320 }, { "epoch": 0.11212772236440678, "grad_norm": 11.997728127878338, "learning_rate": 4.952681031755545e-05, "loss": 2.278, "mean_token_accuracy": 0.42068964838981626, "step": 111325 }, { "epoch": 0.11213275841751096, "grad_norm": 9.185447867605896, "learning_rate": 4.9526733870143874e-05, "loss": 2.4808, "mean_token_accuracy": 0.45517240166664125, "step": 111330 }, { "epoch": 0.11213779447061513, "grad_norm": 10.181894623507537, "learning_rate": 4.9526657416623086e-05, "loss": 2.3951, "mean_token_accuracy": 0.4, "step": 111335 }, { "epoch": 0.1121428305237193, "grad_norm": 12.153119787161627, "learning_rate": 4.952658095699313e-05, "loss": 2.7726, "mean_token_accuracy": 0.37586206793785093, "step": 111340 }, { "epoch": 0.11214786657682348, "grad_norm": 11.543220030583123, "learning_rate": 4.9526504491254015e-05, "loss": 2.2505, "mean_token_accuracy": 0.4620689630508423, "step": 111345 }, { "epoch": 0.11215290262992765, "grad_norm": 8.450532948394486, "learning_rate": 4.952642801940577e-05, "loss": 2.1657, "mean_token_accuracy": 0.4551724076271057, "step": 111350 }, { "epoch": 0.11215793868303182, "grad_norm": 10.854961635333785, "learning_rate": 4.952635154144842e-05, "loss": 2.1853, "mean_token_accuracy": 0.4448275864124298, "step": 111355 }, { "epoch": 0.112162974736136, "grad_norm": 12.176724733024207, "learning_rate": 4.952627505738197e-05, "loss": 2.0843, "mean_token_accuracy": 0.47586206793785096, "step": 111360 }, { "epoch": 0.11216801078924017, "grad_norm": 10.641635157863735, "learning_rate": 4.952619856720645e-05, "loss": 2.4683, "mean_token_accuracy": 0.42232305407524107, "step": 111365 }, { "epoch": 0.11217304684234435, "grad_norm": 9.728677723855975, "learning_rate": 4.9526122070921886e-05, "loss": 2.0068, "mean_token_accuracy": 0.48275861144065857, "step": 111370 }, { "epoch": 0.11217808289544852, "grad_norm": 11.115419596188751, "learning_rate": 4.9526045568528294e-05, "loss": 2.3474, "mean_token_accuracy": 0.4604355812072754, "step": 111375 }, { "epoch": 0.1121831189485527, "grad_norm": 12.548180784201522, "learning_rate": 4.952596906002569e-05, "loss": 2.2223, "mean_token_accuracy": 0.4551724135875702, "step": 111380 }, { "epoch": 0.11218815500165687, "grad_norm": 11.537445129690736, "learning_rate": 4.952589254541411e-05, "loss": 2.4014, "mean_token_accuracy": 0.49039408564567566, "step": 111385 }, { "epoch": 0.11219319105476104, "grad_norm": 9.990629746328988, "learning_rate": 4.952581602469357e-05, "loss": 2.3455, "mean_token_accuracy": 0.4463054180145264, "step": 111390 }, { "epoch": 0.1121982271078652, "grad_norm": 10.539457799127904, "learning_rate": 4.952573949786407e-05, "loss": 2.6802, "mean_token_accuracy": 0.42413793206214906, "step": 111395 }, { "epoch": 0.11220326316096937, "grad_norm": 12.640962539544208, "learning_rate": 4.9525662964925665e-05, "loss": 2.3339, "mean_token_accuracy": 0.44827587008476255, "step": 111400 }, { "epoch": 0.11220829921407355, "grad_norm": 13.685443646724211, "learning_rate": 4.9525586425878355e-05, "loss": 2.7182, "mean_token_accuracy": 0.4068965494632721, "step": 111405 }, { "epoch": 0.11221333526717772, "grad_norm": 10.523390347403765, "learning_rate": 4.952550988072217e-05, "loss": 2.2881, "mean_token_accuracy": 0.4275861978530884, "step": 111410 }, { "epoch": 0.1122183713202819, "grad_norm": 16.752881544991617, "learning_rate": 4.952543332945712e-05, "loss": 2.4511, "mean_token_accuracy": 0.39310344457626345, "step": 111415 }, { "epoch": 0.11222340737338607, "grad_norm": 17.69398408263333, "learning_rate": 4.9525356772083235e-05, "loss": 2.4886, "mean_token_accuracy": 0.4758620738983154, "step": 111420 }, { "epoch": 0.11222844342649024, "grad_norm": 10.968546883937739, "learning_rate": 4.9525280208600536e-05, "loss": 2.4074, "mean_token_accuracy": 0.4724137902259827, "step": 111425 }, { "epoch": 0.11223347947959442, "grad_norm": 15.899308460762535, "learning_rate": 4.9525203639009055e-05, "loss": 2.3983, "mean_token_accuracy": 0.4344827592372894, "step": 111430 }, { "epoch": 0.11223851553269859, "grad_norm": 11.667906683856852, "learning_rate": 4.952512706330879e-05, "loss": 2.4178, "mean_token_accuracy": 0.4275861978530884, "step": 111435 }, { "epoch": 0.11224355158580276, "grad_norm": 9.524507232600618, "learning_rate": 4.9525050481499776e-05, "loss": 2.3471, "mean_token_accuracy": 0.4482758641242981, "step": 111440 }, { "epoch": 0.11224858763890694, "grad_norm": 10.8551219848192, "learning_rate": 4.952497389358203e-05, "loss": 2.4519, "mean_token_accuracy": 0.3896551728248596, "step": 111445 }, { "epoch": 0.11225362369201111, "grad_norm": 9.501695722841893, "learning_rate": 4.9524897299555576e-05, "loss": 2.36, "mean_token_accuracy": 0.41034482717514037, "step": 111450 }, { "epoch": 0.11225865974511529, "grad_norm": 10.847067942215253, "learning_rate": 4.952482069942044e-05, "loss": 2.9669, "mean_token_accuracy": 0.35644282698631286, "step": 111455 }, { "epoch": 0.11226369579821946, "grad_norm": 11.573546131623743, "learning_rate": 4.952474409317664e-05, "loss": 2.2444, "mean_token_accuracy": 0.44482758045196535, "step": 111460 }, { "epoch": 0.11226873185132362, "grad_norm": 10.969126447925802, "learning_rate": 4.952466748082418e-05, "loss": 2.6441, "mean_token_accuracy": 0.38433151245117186, "step": 111465 }, { "epoch": 0.1122737679044278, "grad_norm": 10.991123847201068, "learning_rate": 4.952459086236311e-05, "loss": 2.0642, "mean_token_accuracy": 0.5226860284805298, "step": 111470 }, { "epoch": 0.11227880395753197, "grad_norm": 12.886517572434629, "learning_rate": 4.952451423779343e-05, "loss": 2.7396, "mean_token_accuracy": 0.41004234552383423, "step": 111475 }, { "epoch": 0.11228384001063614, "grad_norm": 10.090199717238045, "learning_rate": 4.952443760711518e-05, "loss": 2.2984, "mean_token_accuracy": 0.4206896543502808, "step": 111480 }, { "epoch": 0.11228887606374031, "grad_norm": 12.082189845388879, "learning_rate": 4.952436097032835e-05, "loss": 2.3999, "mean_token_accuracy": 0.36896551251411436, "step": 111485 }, { "epoch": 0.11229391211684449, "grad_norm": 9.400050088194, "learning_rate": 4.9524284327433e-05, "loss": 2.3374, "mean_token_accuracy": 0.4448275864124298, "step": 111490 }, { "epoch": 0.11229894816994866, "grad_norm": 11.23501525366294, "learning_rate": 4.952420767842912e-05, "loss": 2.5844, "mean_token_accuracy": 0.36551723480224607, "step": 111495 }, { "epoch": 0.11230398422305284, "grad_norm": 9.646472311888301, "learning_rate": 4.952413102331675e-05, "loss": 2.2016, "mean_token_accuracy": 0.44700543880462645, "step": 111500 }, { "epoch": 0.11230902027615701, "grad_norm": 10.563971354164007, "learning_rate": 4.952405436209591e-05, "loss": 2.3543, "mean_token_accuracy": 0.46896551847457885, "step": 111505 }, { "epoch": 0.11231405632926118, "grad_norm": 18.897577827448874, "learning_rate": 4.952397769476661e-05, "loss": 2.7902, "mean_token_accuracy": 0.38965516686439516, "step": 111510 }, { "epoch": 0.11231909238236536, "grad_norm": 10.7458985397591, "learning_rate": 4.952390102132888e-05, "loss": 2.3902, "mean_token_accuracy": 0.41379310488700866, "step": 111515 }, { "epoch": 0.11232412843546953, "grad_norm": 10.344108476463264, "learning_rate": 4.9523824341782735e-05, "loss": 2.3228, "mean_token_accuracy": 0.4551724135875702, "step": 111520 }, { "epoch": 0.1123291644885737, "grad_norm": 10.856748075871668, "learning_rate": 4.9523747656128204e-05, "loss": 2.2985, "mean_token_accuracy": 0.46896551847457885, "step": 111525 }, { "epoch": 0.11233420054167788, "grad_norm": 11.77775214849814, "learning_rate": 4.9523670964365306e-05, "loss": 2.4635, "mean_token_accuracy": 0.4344827592372894, "step": 111530 }, { "epoch": 0.11233923659478204, "grad_norm": 9.605010599188414, "learning_rate": 4.952359426649405e-05, "loss": 2.0447, "mean_token_accuracy": 0.4655172348022461, "step": 111535 }, { "epoch": 0.11234427264788621, "grad_norm": 25.123472258204288, "learning_rate": 4.952351756251448e-05, "loss": 3.3984, "mean_token_accuracy": 0.3448275923728943, "step": 111540 }, { "epoch": 0.11234930870099039, "grad_norm": 10.951970184018228, "learning_rate": 4.95234408524266e-05, "loss": 2.5563, "mean_token_accuracy": 0.3931034505367279, "step": 111545 }, { "epoch": 0.11235434475409456, "grad_norm": 12.167939679656278, "learning_rate": 4.952336413623044e-05, "loss": 2.4186, "mean_token_accuracy": 0.38965516686439516, "step": 111550 }, { "epoch": 0.11235938080719873, "grad_norm": 10.804955590322109, "learning_rate": 4.952328741392601e-05, "loss": 2.8534, "mean_token_accuracy": 0.4172413766384125, "step": 111555 }, { "epoch": 0.11236441686030291, "grad_norm": 9.437564171340664, "learning_rate": 4.952321068551335e-05, "loss": 2.1959, "mean_token_accuracy": 0.46551724672317507, "step": 111560 }, { "epoch": 0.11236945291340708, "grad_norm": 8.240952102744117, "learning_rate": 4.952313395099246e-05, "loss": 2.1984, "mean_token_accuracy": 0.4413793087005615, "step": 111565 }, { "epoch": 0.11237448896651125, "grad_norm": 14.237742846783584, "learning_rate": 4.952305721036338e-05, "loss": 2.3903, "mean_token_accuracy": 0.4053236573934555, "step": 111570 }, { "epoch": 0.11237952501961543, "grad_norm": 11.417725487580878, "learning_rate": 4.952298046362612e-05, "loss": 2.077, "mean_token_accuracy": 0.4965517222881317, "step": 111575 }, { "epoch": 0.1123845610727196, "grad_norm": 11.426746032620931, "learning_rate": 4.95229037107807e-05, "loss": 2.2463, "mean_token_accuracy": 0.441379314661026, "step": 111580 }, { "epoch": 0.11238959712582378, "grad_norm": 11.645258676917305, "learning_rate": 4.9522826951827156e-05, "loss": 2.51, "mean_token_accuracy": 0.4103448331356049, "step": 111585 }, { "epoch": 0.11239463317892795, "grad_norm": 9.81325088724214, "learning_rate": 4.952275018676549e-05, "loss": 2.7684, "mean_token_accuracy": 0.42068964838981626, "step": 111590 }, { "epoch": 0.11239966923203212, "grad_norm": 11.137167333862763, "learning_rate": 4.952267341559573e-05, "loss": 2.4601, "mean_token_accuracy": 0.4, "step": 111595 }, { "epoch": 0.1124047052851363, "grad_norm": 10.623380760707912, "learning_rate": 4.9522596638317905e-05, "loss": 2.5053, "mean_token_accuracy": 0.4, "step": 111600 }, { "epoch": 0.11240974133824046, "grad_norm": 9.607679715333383, "learning_rate": 4.952251985493203e-05, "loss": 2.6569, "mean_token_accuracy": 0.403448274731636, "step": 111605 }, { "epoch": 0.11241477739134463, "grad_norm": 11.798647927684359, "learning_rate": 4.952244306543813e-05, "loss": 2.4386, "mean_token_accuracy": 0.38620689511299133, "step": 111610 }, { "epoch": 0.1124198134444488, "grad_norm": 10.75262759235998, "learning_rate": 4.9522366269836215e-05, "loss": 2.7987, "mean_token_accuracy": 0.4068965494632721, "step": 111615 }, { "epoch": 0.11242484949755298, "grad_norm": 10.558491694008719, "learning_rate": 4.952228946812631e-05, "loss": 2.8253, "mean_token_accuracy": 0.36206896007061007, "step": 111620 }, { "epoch": 0.11242988555065715, "grad_norm": 8.685177177048237, "learning_rate": 4.952221266030846e-05, "loss": 2.2789, "mean_token_accuracy": 0.4103448212146759, "step": 111625 }, { "epoch": 0.11243492160376133, "grad_norm": 8.81007218401979, "learning_rate": 4.952213584638265e-05, "loss": 2.2682, "mean_token_accuracy": 0.43793103098869324, "step": 111630 }, { "epoch": 0.1124399576568655, "grad_norm": 8.547261276375915, "learning_rate": 4.9522059026348924e-05, "loss": 2.295, "mean_token_accuracy": 0.46551724076271056, "step": 111635 }, { "epoch": 0.11244499370996967, "grad_norm": 12.66313043718746, "learning_rate": 4.9521982200207305e-05, "loss": 2.4051, "mean_token_accuracy": 0.4724137902259827, "step": 111640 }, { "epoch": 0.11245002976307385, "grad_norm": 9.126060731176104, "learning_rate": 4.95219053679578e-05, "loss": 2.2014, "mean_token_accuracy": 0.45172412395477296, "step": 111645 }, { "epoch": 0.11245506581617802, "grad_norm": 10.78781175057242, "learning_rate": 4.952182852960044e-05, "loss": 2.5019, "mean_token_accuracy": 0.42758620977401735, "step": 111650 }, { "epoch": 0.1124601018692822, "grad_norm": 11.503963056929237, "learning_rate": 4.9521751685135234e-05, "loss": 2.6663, "mean_token_accuracy": 0.4068965494632721, "step": 111655 }, { "epoch": 0.11246513792238637, "grad_norm": 9.379835630985818, "learning_rate": 4.952167483456223e-05, "loss": 2.4182, "mean_token_accuracy": 0.4482758641242981, "step": 111660 }, { "epoch": 0.11247017397549054, "grad_norm": 9.436200834845094, "learning_rate": 4.952159797788142e-05, "loss": 2.4973, "mean_token_accuracy": 0.3999999940395355, "step": 111665 }, { "epoch": 0.11247521002859472, "grad_norm": 10.27729235818234, "learning_rate": 4.9521521115092846e-05, "loss": 2.6232, "mean_token_accuracy": 0.43103448748588563, "step": 111670 }, { "epoch": 0.11248024608169888, "grad_norm": 9.30369414209824, "learning_rate": 4.952144424619651e-05, "loss": 2.4661, "mean_token_accuracy": 0.4103448301553726, "step": 111675 }, { "epoch": 0.11248528213480305, "grad_norm": 10.263524921083793, "learning_rate": 4.952136737119245e-05, "loss": 2.2314, "mean_token_accuracy": 0.4379310369491577, "step": 111680 }, { "epoch": 0.11249031818790722, "grad_norm": 9.987483150347861, "learning_rate": 4.952129049008068e-05, "loss": 2.414, "mean_token_accuracy": 0.4, "step": 111685 }, { "epoch": 0.1124953542410114, "grad_norm": 12.314383439974607, "learning_rate": 4.952121360286123e-05, "loss": 2.284, "mean_token_accuracy": 0.43448275327682495, "step": 111690 }, { "epoch": 0.11250039029411557, "grad_norm": 14.527667997174206, "learning_rate": 4.952113670953411e-05, "loss": 2.5226, "mean_token_accuracy": 0.4137930989265442, "step": 111695 }, { "epoch": 0.11250542634721974, "grad_norm": 10.308531318547193, "learning_rate": 4.9521059810099347e-05, "loss": 2.275, "mean_token_accuracy": 0.4850574791431427, "step": 111700 }, { "epoch": 0.11251046240032392, "grad_norm": 12.085283609470673, "learning_rate": 4.952098290455696e-05, "loss": 2.3342, "mean_token_accuracy": 0.4344827473163605, "step": 111705 }, { "epoch": 0.11251549845342809, "grad_norm": 10.152063430710427, "learning_rate": 4.952090599290697e-05, "loss": 2.8539, "mean_token_accuracy": 0.37241379618644715, "step": 111710 }, { "epoch": 0.11252053450653227, "grad_norm": 10.46692517274618, "learning_rate": 4.95208290751494e-05, "loss": 2.407, "mean_token_accuracy": 0.4413793087005615, "step": 111715 }, { "epoch": 0.11252557055963644, "grad_norm": 11.351004995776478, "learning_rate": 4.952075215128427e-05, "loss": 2.2432, "mean_token_accuracy": 0.44482758045196535, "step": 111720 }, { "epoch": 0.11253060661274061, "grad_norm": 11.844090882535875, "learning_rate": 4.9520675221311596e-05, "loss": 2.7212, "mean_token_accuracy": 0.4310344934463501, "step": 111725 }, { "epoch": 0.11253564266584479, "grad_norm": 9.56817327362832, "learning_rate": 4.9520598285231416e-05, "loss": 2.283, "mean_token_accuracy": 0.4068965494632721, "step": 111730 }, { "epoch": 0.11254067871894896, "grad_norm": 10.643896297863472, "learning_rate": 4.952052134304374e-05, "loss": 2.3846, "mean_token_accuracy": 0.4103448331356049, "step": 111735 }, { "epoch": 0.11254571477205313, "grad_norm": 9.618497514387489, "learning_rate": 4.952044439474858e-05, "loss": 2.6347, "mean_token_accuracy": 0.39310344457626345, "step": 111740 }, { "epoch": 0.1125507508251573, "grad_norm": 21.129664013225085, "learning_rate": 4.952036744034598e-05, "loss": 2.2762, "mean_token_accuracy": 0.47241378426551817, "step": 111745 }, { "epoch": 0.11255578687826147, "grad_norm": 12.46011851105571, "learning_rate": 4.9520290479835945e-05, "loss": 2.6727, "mean_token_accuracy": 0.4172413766384125, "step": 111750 }, { "epoch": 0.11256082293136564, "grad_norm": 10.375489734873824, "learning_rate": 4.95202135132185e-05, "loss": 2.5563, "mean_token_accuracy": 0.4, "step": 111755 }, { "epoch": 0.11256585898446982, "grad_norm": 12.222679199375383, "learning_rate": 4.952013654049367e-05, "loss": 2.568, "mean_token_accuracy": 0.39310344457626345, "step": 111760 }, { "epoch": 0.11257089503757399, "grad_norm": 19.109422920181366, "learning_rate": 4.9520059561661464e-05, "loss": 2.6066, "mean_token_accuracy": 0.4172413766384125, "step": 111765 }, { "epoch": 0.11257593109067816, "grad_norm": 11.38413408592637, "learning_rate": 4.9519982576721916e-05, "loss": 2.3885, "mean_token_accuracy": 0.4586206912994385, "step": 111770 }, { "epoch": 0.11258096714378234, "grad_norm": 10.814878962772442, "learning_rate": 4.951990558567505e-05, "loss": 2.6322, "mean_token_accuracy": 0.37241379022598264, "step": 111775 }, { "epoch": 0.11258600319688651, "grad_norm": 10.028460252741345, "learning_rate": 4.951982858852087e-05, "loss": 2.4866, "mean_token_accuracy": 0.3965517282485962, "step": 111780 }, { "epoch": 0.11259103924999068, "grad_norm": 9.565090671659185, "learning_rate": 4.951975158525941e-05, "loss": 2.1258, "mean_token_accuracy": 0.4517241358757019, "step": 111785 }, { "epoch": 0.11259607530309486, "grad_norm": 10.568144969376164, "learning_rate": 4.95196745758907e-05, "loss": 2.6386, "mean_token_accuracy": 0.39310344457626345, "step": 111790 }, { "epoch": 0.11260111135619903, "grad_norm": 8.861342015210543, "learning_rate": 4.951959756041475e-05, "loss": 2.1312, "mean_token_accuracy": 0.512401682138443, "step": 111795 }, { "epoch": 0.1126061474093032, "grad_norm": 18.477909269401962, "learning_rate": 4.9519520538831573e-05, "loss": 2.6257, "mean_token_accuracy": 0.44137930274009707, "step": 111800 }, { "epoch": 0.11261118346240738, "grad_norm": 9.666251068916313, "learning_rate": 4.951944351114121e-05, "loss": 2.558, "mean_token_accuracy": 0.4344827592372894, "step": 111805 }, { "epoch": 0.11261621951551155, "grad_norm": 9.711545888476971, "learning_rate": 4.951936647734366e-05, "loss": 2.3615, "mean_token_accuracy": 0.42413793206214906, "step": 111810 }, { "epoch": 0.11262125556861571, "grad_norm": 9.849719905149824, "learning_rate": 4.9519289437438966e-05, "loss": 2.0468, "mean_token_accuracy": 0.458620685338974, "step": 111815 }, { "epoch": 0.11262629162171989, "grad_norm": 11.402520882318544, "learning_rate": 4.9519212391427137e-05, "loss": 2.4914, "mean_token_accuracy": 0.3896551728248596, "step": 111820 }, { "epoch": 0.11263132767482406, "grad_norm": 10.796561043821034, "learning_rate": 4.95191353393082e-05, "loss": 2.3916, "mean_token_accuracy": 0.491379314661026, "step": 111825 }, { "epoch": 0.11263636372792823, "grad_norm": 9.584189099628968, "learning_rate": 4.951905828108217e-05, "loss": 2.5697, "mean_token_accuracy": 0.3806412577629089, "step": 111830 }, { "epoch": 0.11264139978103241, "grad_norm": 10.340988010763109, "learning_rate": 4.951898121674907e-05, "loss": 2.7361, "mean_token_accuracy": 0.4034482777118683, "step": 111835 }, { "epoch": 0.11264643583413658, "grad_norm": 10.007496824739125, "learning_rate": 4.951890414630893e-05, "loss": 2.6144, "mean_token_accuracy": 0.4068965494632721, "step": 111840 }, { "epoch": 0.11265147188724076, "grad_norm": 10.643485027265461, "learning_rate": 4.951882706976176e-05, "loss": 2.2278, "mean_token_accuracy": 0.4482758641242981, "step": 111845 }, { "epoch": 0.11265650794034493, "grad_norm": 8.10109905816921, "learning_rate": 4.951874998710759e-05, "loss": 2.601, "mean_token_accuracy": 0.47586206793785096, "step": 111850 }, { "epoch": 0.1126615439934491, "grad_norm": 10.173892208865098, "learning_rate": 4.9518672898346434e-05, "loss": 2.8477, "mean_token_accuracy": 0.39655172228813174, "step": 111855 }, { "epoch": 0.11266658004655328, "grad_norm": 10.112922739474547, "learning_rate": 4.951859580347832e-05, "loss": 2.2828, "mean_token_accuracy": 0.46896551847457885, "step": 111860 }, { "epoch": 0.11267161609965745, "grad_norm": 11.99106759352344, "learning_rate": 4.951851870250326e-05, "loss": 2.5196, "mean_token_accuracy": 0.3827586203813553, "step": 111865 }, { "epoch": 0.11267665215276162, "grad_norm": 9.537426146310862, "learning_rate": 4.951844159542129e-05, "loss": 2.6249, "mean_token_accuracy": 0.3896551787853241, "step": 111870 }, { "epoch": 0.1126816882058658, "grad_norm": 8.538851622621495, "learning_rate": 4.9518364482232416e-05, "loss": 2.3502, "mean_token_accuracy": 0.42758620381355283, "step": 111875 }, { "epoch": 0.11268672425896997, "grad_norm": 10.163664557694991, "learning_rate": 4.9518287362936665e-05, "loss": 2.3424, "mean_token_accuracy": 0.4413793087005615, "step": 111880 }, { "epoch": 0.11269176031207413, "grad_norm": 11.36191656978102, "learning_rate": 4.9518210237534077e-05, "loss": 2.4425, "mean_token_accuracy": 0.4068965524435043, "step": 111885 }, { "epoch": 0.1126967963651783, "grad_norm": 10.886909854495062, "learning_rate": 4.951813310602463e-05, "loss": 2.2276, "mean_token_accuracy": 0.4275861978530884, "step": 111890 }, { "epoch": 0.11270183241828248, "grad_norm": 10.924110545016852, "learning_rate": 4.951805596840839e-05, "loss": 2.2889, "mean_token_accuracy": 0.41379311084747317, "step": 111895 }, { "epoch": 0.11270686847138665, "grad_norm": 33.09264553973042, "learning_rate": 4.951797882468536e-05, "loss": 2.9556, "mean_token_accuracy": 0.3827586233615875, "step": 111900 }, { "epoch": 0.11271190452449083, "grad_norm": 14.067818432535168, "learning_rate": 4.9517901674855556e-05, "loss": 2.5971, "mean_token_accuracy": 0.4379310369491577, "step": 111905 }, { "epoch": 0.112716940577595, "grad_norm": 9.457722548284556, "learning_rate": 4.9517824518919e-05, "loss": 1.9271, "mean_token_accuracy": 0.5125226736068725, "step": 111910 }, { "epoch": 0.11272197663069917, "grad_norm": 18.477518320215772, "learning_rate": 4.9517747356875733e-05, "loss": 2.7695, "mean_token_accuracy": 0.42758620977401735, "step": 111915 }, { "epoch": 0.11272701268380335, "grad_norm": 10.219748128361198, "learning_rate": 4.951767018872575e-05, "loss": 2.4082, "mean_token_accuracy": 0.4517241418361664, "step": 111920 }, { "epoch": 0.11273204873690752, "grad_norm": 11.068663896903583, "learning_rate": 4.951759301446909e-05, "loss": 2.732, "mean_token_accuracy": 0.38275861740112305, "step": 111925 }, { "epoch": 0.1127370847900117, "grad_norm": 10.43735099678545, "learning_rate": 4.9517515834105757e-05, "loss": 2.2349, "mean_token_accuracy": 0.4379310429096222, "step": 111930 }, { "epoch": 0.11274212084311587, "grad_norm": 9.635107309744868, "learning_rate": 4.951743864763579e-05, "loss": 2.0681, "mean_token_accuracy": 0.4746521532535553, "step": 111935 }, { "epoch": 0.11274715689622004, "grad_norm": 13.786354329463379, "learning_rate": 4.951736145505921e-05, "loss": 2.0381, "mean_token_accuracy": 0.4862069010734558, "step": 111940 }, { "epoch": 0.11275219294932422, "grad_norm": 9.843596638262246, "learning_rate": 4.951728425637603e-05, "loss": 2.3695, "mean_token_accuracy": 0.4517241418361664, "step": 111945 }, { "epoch": 0.11275722900242839, "grad_norm": 9.712538126988122, "learning_rate": 4.951720705158627e-05, "loss": 2.2845, "mean_token_accuracy": 0.4517241418361664, "step": 111950 }, { "epoch": 0.11276226505553255, "grad_norm": 14.041250514925462, "learning_rate": 4.9517129840689965e-05, "loss": 2.5721, "mean_token_accuracy": 0.37586206793785093, "step": 111955 }, { "epoch": 0.11276730110863672, "grad_norm": 12.515475796807888, "learning_rate": 4.951705262368711e-05, "loss": 2.044, "mean_token_accuracy": 0.46551724672317507, "step": 111960 }, { "epoch": 0.1127723371617409, "grad_norm": 11.426288377859997, "learning_rate": 4.9516975400577756e-05, "loss": 2.5846, "mean_token_accuracy": 0.35862069129943847, "step": 111965 }, { "epoch": 0.11277737321484507, "grad_norm": 9.625575267367854, "learning_rate": 4.9516898171361916e-05, "loss": 2.0719, "mean_token_accuracy": 0.4620689570903778, "step": 111970 }, { "epoch": 0.11278240926794925, "grad_norm": 9.796840021384469, "learning_rate": 4.95168209360396e-05, "loss": 2.302, "mean_token_accuracy": 0.47241379618644713, "step": 111975 }, { "epoch": 0.11278744532105342, "grad_norm": 10.247770792589984, "learning_rate": 4.9516743694610836e-05, "loss": 2.3907, "mean_token_accuracy": 0.4379310369491577, "step": 111980 }, { "epoch": 0.1127924813741576, "grad_norm": 9.626352838635478, "learning_rate": 4.9516666447075645e-05, "loss": 2.3265, "mean_token_accuracy": 0.4206896543502808, "step": 111985 }, { "epoch": 0.11279751742726177, "grad_norm": 11.510714745508748, "learning_rate": 4.951658919343406e-05, "loss": 2.5235, "mean_token_accuracy": 0.4, "step": 111990 }, { "epoch": 0.11280255348036594, "grad_norm": 10.708162001686013, "learning_rate": 4.951651193368608e-05, "loss": 2.6105, "mean_token_accuracy": 0.32758620381355286, "step": 111995 }, { "epoch": 0.11280758953347012, "grad_norm": 8.929597227548431, "learning_rate": 4.9516434667831735e-05, "loss": 2.5534, "mean_token_accuracy": 0.42196006774902345, "step": 112000 }, { "epoch": 0.11281262558657429, "grad_norm": 16.823140738009453, "learning_rate": 4.951635739587106e-05, "loss": 2.7911, "mean_token_accuracy": 0.3655172407627106, "step": 112005 }, { "epoch": 0.11281766163967846, "grad_norm": 11.409889263961729, "learning_rate": 4.951628011780406e-05, "loss": 2.9153, "mean_token_accuracy": 0.37586206793785093, "step": 112010 }, { "epoch": 0.11282269769278264, "grad_norm": 18.697553057474412, "learning_rate": 4.951620283363077e-05, "loss": 3.0374, "mean_token_accuracy": 0.38620689511299133, "step": 112015 }, { "epoch": 0.11282773374588681, "grad_norm": 10.593040641906047, "learning_rate": 4.95161255433512e-05, "loss": 2.4457, "mean_token_accuracy": 0.4000000089406967, "step": 112020 }, { "epoch": 0.11283276979899097, "grad_norm": 11.098326081548795, "learning_rate": 4.9516048246965374e-05, "loss": 2.1724, "mean_token_accuracy": 0.43103447556495667, "step": 112025 }, { "epoch": 0.11283780585209514, "grad_norm": 11.685250317188183, "learning_rate": 4.9515970944473316e-05, "loss": 2.3764, "mean_token_accuracy": 0.46896552443504336, "step": 112030 }, { "epoch": 0.11284284190519932, "grad_norm": 9.707788891878984, "learning_rate": 4.9515893635875046e-05, "loss": 2.5649, "mean_token_accuracy": 0.4103448301553726, "step": 112035 }, { "epoch": 0.11284787795830349, "grad_norm": 13.091141906105934, "learning_rate": 4.951581632117058e-05, "loss": 2.6138, "mean_token_accuracy": 0.37586206793785093, "step": 112040 }, { "epoch": 0.11285291401140767, "grad_norm": 10.37130532949413, "learning_rate": 4.951573900035995e-05, "loss": 2.3476, "mean_token_accuracy": 0.4811857283115387, "step": 112045 }, { "epoch": 0.11285795006451184, "grad_norm": 10.561388418740135, "learning_rate": 4.951566167344318e-05, "loss": 2.2696, "mean_token_accuracy": 0.4620689570903778, "step": 112050 }, { "epoch": 0.11286298611761601, "grad_norm": 8.702935097513361, "learning_rate": 4.951558434042027e-05, "loss": 2.0795, "mean_token_accuracy": 0.482758617401123, "step": 112055 }, { "epoch": 0.11286802217072019, "grad_norm": 10.060070189179497, "learning_rate": 4.9515507001291274e-05, "loss": 2.0212, "mean_token_accuracy": 0.4620689690113068, "step": 112060 }, { "epoch": 0.11287305822382436, "grad_norm": 10.007695369763864, "learning_rate": 4.951542965605618e-05, "loss": 2.9708, "mean_token_accuracy": 0.39485783576965333, "step": 112065 }, { "epoch": 0.11287809427692853, "grad_norm": 10.204914770179567, "learning_rate": 4.9515352304715026e-05, "loss": 2.2973, "mean_token_accuracy": 0.4379310250282288, "step": 112070 }, { "epoch": 0.11288313033003271, "grad_norm": 13.838557556687348, "learning_rate": 4.951527494726784e-05, "loss": 2.9558, "mean_token_accuracy": 0.3655172437429428, "step": 112075 }, { "epoch": 0.11288816638313688, "grad_norm": 10.54915127093984, "learning_rate": 4.951519758371463e-05, "loss": 2.4616, "mean_token_accuracy": 0.3965517282485962, "step": 112080 }, { "epoch": 0.11289320243624106, "grad_norm": 12.08559040022823, "learning_rate": 4.9515120214055415e-05, "loss": 2.7843, "mean_token_accuracy": 0.3620689660310745, "step": 112085 }, { "epoch": 0.11289823848934523, "grad_norm": 11.987381198976127, "learning_rate": 4.951504283829024e-05, "loss": 2.6091, "mean_token_accuracy": 0.39310344457626345, "step": 112090 }, { "epoch": 0.11290327454244939, "grad_norm": 11.083705451838046, "learning_rate": 4.9514965456419096e-05, "loss": 2.3758, "mean_token_accuracy": 0.42413792610168455, "step": 112095 }, { "epoch": 0.11290831059555356, "grad_norm": 10.123408378246427, "learning_rate": 4.951488806844203e-05, "loss": 2.371, "mean_token_accuracy": 0.4569268047809601, "step": 112100 }, { "epoch": 0.11291334664865774, "grad_norm": 11.114590718880807, "learning_rate": 4.951481067435905e-05, "loss": 2.7544, "mean_token_accuracy": 0.42952207922935487, "step": 112105 }, { "epoch": 0.11291838270176191, "grad_norm": 10.614712722050163, "learning_rate": 4.951473327417017e-05, "loss": 2.4094, "mean_token_accuracy": 0.41724138259887694, "step": 112110 }, { "epoch": 0.11292341875486608, "grad_norm": 9.853735122918833, "learning_rate": 4.951465586787544e-05, "loss": 2.4062, "mean_token_accuracy": 0.42413792610168455, "step": 112115 }, { "epoch": 0.11292845480797026, "grad_norm": 9.738702403117605, "learning_rate": 4.9514578455474846e-05, "loss": 2.5223, "mean_token_accuracy": 0.3793103337287903, "step": 112120 }, { "epoch": 0.11293349086107443, "grad_norm": 14.684717154250205, "learning_rate": 4.951450103696844e-05, "loss": 2.6053, "mean_token_accuracy": 0.4280788242816925, "step": 112125 }, { "epoch": 0.1129385269141786, "grad_norm": 8.460870341075733, "learning_rate": 4.951442361235622e-05, "loss": 2.3291, "mean_token_accuracy": 0.4620689690113068, "step": 112130 }, { "epoch": 0.11294356296728278, "grad_norm": 9.934954491826542, "learning_rate": 4.951434618163822e-05, "loss": 2.5354, "mean_token_accuracy": 0.42413793206214906, "step": 112135 }, { "epoch": 0.11294859902038695, "grad_norm": 9.598687975005795, "learning_rate": 4.951426874481446e-05, "loss": 2.1552, "mean_token_accuracy": 0.4448275864124298, "step": 112140 }, { "epoch": 0.11295363507349113, "grad_norm": 12.13814884438668, "learning_rate": 4.9514191301884965e-05, "loss": 2.8591, "mean_token_accuracy": 0.3689655244350433, "step": 112145 }, { "epoch": 0.1129586711265953, "grad_norm": 11.824384033807242, "learning_rate": 4.951411385284974e-05, "loss": 2.4284, "mean_token_accuracy": 0.42413792610168455, "step": 112150 }, { "epoch": 0.11296370717969947, "grad_norm": 11.985515489600779, "learning_rate": 4.9514036397708836e-05, "loss": 2.0137, "mean_token_accuracy": 0.48965516686439514, "step": 112155 }, { "epoch": 0.11296874323280365, "grad_norm": 10.026855947297554, "learning_rate": 4.9513958936462244e-05, "loss": 2.2569, "mean_token_accuracy": 0.4689655125141144, "step": 112160 }, { "epoch": 0.11297377928590781, "grad_norm": 9.869882921703228, "learning_rate": 4.951388146911e-05, "loss": 2.2764, "mean_token_accuracy": 0.44827585816383364, "step": 112165 }, { "epoch": 0.11297881533901198, "grad_norm": 10.335936383874522, "learning_rate": 4.9513803995652123e-05, "loss": 2.3098, "mean_token_accuracy": 0.42413792610168455, "step": 112170 }, { "epoch": 0.11298385139211616, "grad_norm": 8.558601014769206, "learning_rate": 4.9513726516088634e-05, "loss": 2.352, "mean_token_accuracy": 0.42068966031074523, "step": 112175 }, { "epoch": 0.11298888744522033, "grad_norm": 9.74611985971413, "learning_rate": 4.951364903041957e-05, "loss": 2.0931, "mean_token_accuracy": 0.43103448748588563, "step": 112180 }, { "epoch": 0.1129939234983245, "grad_norm": 11.781720208734127, "learning_rate": 4.951357153864493e-05, "loss": 2.7263, "mean_token_accuracy": 0.43103447556495667, "step": 112185 }, { "epoch": 0.11299895955142868, "grad_norm": 10.168224510456232, "learning_rate": 4.951349404076474e-05, "loss": 2.1716, "mean_token_accuracy": 0.4655172288417816, "step": 112190 }, { "epoch": 0.11300399560453285, "grad_norm": 9.383615654781536, "learning_rate": 4.9513416536779026e-05, "loss": 2.615, "mean_token_accuracy": 0.3827586233615875, "step": 112195 }, { "epoch": 0.11300903165763702, "grad_norm": 8.837247309828369, "learning_rate": 4.9513339026687805e-05, "loss": 2.1011, "mean_token_accuracy": 0.44137930274009707, "step": 112200 }, { "epoch": 0.1130140677107412, "grad_norm": 8.780647521165315, "learning_rate": 4.951326151049111e-05, "loss": 2.2176, "mean_token_accuracy": 0.47586206793785096, "step": 112205 }, { "epoch": 0.11301910376384537, "grad_norm": 9.210831455293675, "learning_rate": 4.9513183988188954e-05, "loss": 2.6068, "mean_token_accuracy": 0.3896551728248596, "step": 112210 }, { "epoch": 0.11302413981694955, "grad_norm": 11.415318432500177, "learning_rate": 4.951310645978135e-05, "loss": 2.23, "mean_token_accuracy": 0.4931034505367279, "step": 112215 }, { "epoch": 0.11302917587005372, "grad_norm": 13.522460028780852, "learning_rate": 4.951302892526834e-05, "loss": 2.1977, "mean_token_accuracy": 0.49848759174346924, "step": 112220 }, { "epoch": 0.11303421192315789, "grad_norm": 10.634091185403916, "learning_rate": 4.951295138464993e-05, "loss": 2.4248, "mean_token_accuracy": 0.3876588046550751, "step": 112225 }, { "epoch": 0.11303924797626207, "grad_norm": 16.452895581389484, "learning_rate": 4.951287383792614e-05, "loss": 2.9035, "mean_token_accuracy": 0.3482758581638336, "step": 112230 }, { "epoch": 0.11304428402936623, "grad_norm": 9.840390173606698, "learning_rate": 4.9512796285097006e-05, "loss": 2.3514, "mean_token_accuracy": 0.42413793206214906, "step": 112235 }, { "epoch": 0.1130493200824704, "grad_norm": 10.361210623654326, "learning_rate": 4.951271872616254e-05, "loss": 2.6183, "mean_token_accuracy": 0.4034482717514038, "step": 112240 }, { "epoch": 0.11305435613557457, "grad_norm": 10.172913874332453, "learning_rate": 4.9512641161122766e-05, "loss": 2.7148, "mean_token_accuracy": 0.39655172228813174, "step": 112245 }, { "epoch": 0.11305939218867875, "grad_norm": 11.726342984073534, "learning_rate": 4.95125635899777e-05, "loss": 2.1831, "mean_token_accuracy": 0.44482759237289426, "step": 112250 }, { "epoch": 0.11306442824178292, "grad_norm": 9.167725604210977, "learning_rate": 4.951248601272737e-05, "loss": 2.5631, "mean_token_accuracy": 0.4344827592372894, "step": 112255 }, { "epoch": 0.1130694642948871, "grad_norm": 10.136103673162948, "learning_rate": 4.951240842937179e-05, "loss": 2.3461, "mean_token_accuracy": 0.4482758641242981, "step": 112260 }, { "epoch": 0.11307450034799127, "grad_norm": 10.152051075292167, "learning_rate": 4.951233083991099e-05, "loss": 2.4725, "mean_token_accuracy": 0.4, "step": 112265 }, { "epoch": 0.11307953640109544, "grad_norm": 11.169857360158185, "learning_rate": 4.9512253244344985e-05, "loss": 2.2612, "mean_token_accuracy": 0.4344827592372894, "step": 112270 }, { "epoch": 0.11308457245419962, "grad_norm": 9.798676541615995, "learning_rate": 4.9512175642673796e-05, "loss": 2.3035, "mean_token_accuracy": 0.42413792610168455, "step": 112275 }, { "epoch": 0.11308960850730379, "grad_norm": 13.604119379206551, "learning_rate": 4.951209803489746e-05, "loss": 2.2973, "mean_token_accuracy": 0.47931033968925474, "step": 112280 }, { "epoch": 0.11309464456040796, "grad_norm": 11.814696111357394, "learning_rate": 4.9512020421015976e-05, "loss": 3.0832, "mean_token_accuracy": 0.37586206793785093, "step": 112285 }, { "epoch": 0.11309968061351214, "grad_norm": 18.551617540978377, "learning_rate": 4.951194280102938e-05, "loss": 2.3041, "mean_token_accuracy": 0.4068965494632721, "step": 112290 }, { "epoch": 0.11310471666661631, "grad_norm": 10.32830745736162, "learning_rate": 4.951186517493769e-05, "loss": 2.6332, "mean_token_accuracy": 0.4034482717514038, "step": 112295 }, { "epoch": 0.11310975271972049, "grad_norm": 10.25450270484326, "learning_rate": 4.951178754274093e-05, "loss": 2.3259, "mean_token_accuracy": 0.43448275327682495, "step": 112300 }, { "epoch": 0.11311478877282465, "grad_norm": 12.868099106526067, "learning_rate": 4.951170990443911e-05, "loss": 2.407, "mean_token_accuracy": 0.4068965494632721, "step": 112305 }, { "epoch": 0.11311982482592882, "grad_norm": 12.423213574924635, "learning_rate": 4.951163226003227e-05, "loss": 2.7118, "mean_token_accuracy": 0.3999999940395355, "step": 112310 }, { "epoch": 0.11312486087903299, "grad_norm": 12.528918792941527, "learning_rate": 4.951155460952042e-05, "loss": 2.9389, "mean_token_accuracy": 0.39310344457626345, "step": 112315 }, { "epoch": 0.11312989693213717, "grad_norm": 9.661359959574826, "learning_rate": 4.951147695290357e-05, "loss": 2.3118, "mean_token_accuracy": 0.4517241358757019, "step": 112320 }, { "epoch": 0.11313493298524134, "grad_norm": 11.26934291117187, "learning_rate": 4.951139929018177e-05, "loss": 2.162, "mean_token_accuracy": 0.4620689630508423, "step": 112325 }, { "epoch": 0.11313996903834551, "grad_norm": 14.58188098207035, "learning_rate": 4.951132162135502e-05, "loss": 1.9054, "mean_token_accuracy": 0.5108287930488586, "step": 112330 }, { "epoch": 0.11314500509144969, "grad_norm": 10.933531568971459, "learning_rate": 4.9511243946423345e-05, "loss": 2.2488, "mean_token_accuracy": 0.42758620381355283, "step": 112335 }, { "epoch": 0.11315004114455386, "grad_norm": 10.33322509381467, "learning_rate": 4.9511166265386774e-05, "loss": 2.7507, "mean_token_accuracy": 0.4221415638923645, "step": 112340 }, { "epoch": 0.11315507719765804, "grad_norm": 12.94608655515518, "learning_rate": 4.9511088578245325e-05, "loss": 2.5327, "mean_token_accuracy": 0.3999999940395355, "step": 112345 }, { "epoch": 0.11316011325076221, "grad_norm": 11.593199381456072, "learning_rate": 4.951101088499902e-05, "loss": 2.7042, "mean_token_accuracy": 0.37586206793785093, "step": 112350 }, { "epoch": 0.11316514930386638, "grad_norm": 14.356117380316872, "learning_rate": 4.9510933185647874e-05, "loss": 2.4616, "mean_token_accuracy": 0.4, "step": 112355 }, { "epoch": 0.11317018535697056, "grad_norm": 11.117393676043172, "learning_rate": 4.951085548019191e-05, "loss": 2.6159, "mean_token_accuracy": 0.4206896543502808, "step": 112360 }, { "epoch": 0.11317522141007473, "grad_norm": 10.166844273646618, "learning_rate": 4.951077776863116e-05, "loss": 2.2713, "mean_token_accuracy": 0.44827585816383364, "step": 112365 }, { "epoch": 0.1131802574631789, "grad_norm": 12.211920564853251, "learning_rate": 4.951070005096564e-05, "loss": 2.351, "mean_token_accuracy": 0.45517241954803467, "step": 112370 }, { "epoch": 0.11318529351628306, "grad_norm": 11.287092608852857, "learning_rate": 4.951062232719536e-05, "loss": 2.4213, "mean_token_accuracy": 0.4344827592372894, "step": 112375 }, { "epoch": 0.11319032956938724, "grad_norm": 10.886003405238254, "learning_rate": 4.951054459732037e-05, "loss": 2.4819, "mean_token_accuracy": 0.41724138259887694, "step": 112380 }, { "epoch": 0.11319536562249141, "grad_norm": 10.711937260092208, "learning_rate": 4.951046686134066e-05, "loss": 2.5082, "mean_token_accuracy": 0.4068965494632721, "step": 112385 }, { "epoch": 0.11320040167559559, "grad_norm": 14.781412246581398, "learning_rate": 4.9510389119256265e-05, "loss": 2.8194, "mean_token_accuracy": 0.3793103456497192, "step": 112390 }, { "epoch": 0.11320543772869976, "grad_norm": 11.100164450716207, "learning_rate": 4.951031137106721e-05, "loss": 2.4426, "mean_token_accuracy": 0.45862067937850953, "step": 112395 }, { "epoch": 0.11321047378180393, "grad_norm": 11.393728140951234, "learning_rate": 4.9510233616773515e-05, "loss": 2.2806, "mean_token_accuracy": 0.39655172228813174, "step": 112400 }, { "epoch": 0.1132155098349081, "grad_norm": 14.274189415511296, "learning_rate": 4.9510155856375195e-05, "loss": 2.6659, "mean_token_accuracy": 0.42413793206214906, "step": 112405 }, { "epoch": 0.11322054588801228, "grad_norm": 10.758011177961524, "learning_rate": 4.951007808987228e-05, "loss": 2.3328, "mean_token_accuracy": 0.4, "step": 112410 }, { "epoch": 0.11322558194111645, "grad_norm": 8.837145034889286, "learning_rate": 4.951000031726478e-05, "loss": 2.0194, "mean_token_accuracy": 0.4965517222881317, "step": 112415 }, { "epoch": 0.11323061799422063, "grad_norm": 18.80916709207696, "learning_rate": 4.9509922538552735e-05, "loss": 2.396, "mean_token_accuracy": 0.4034482777118683, "step": 112420 }, { "epoch": 0.1132356540473248, "grad_norm": 10.216712361886339, "learning_rate": 4.950984475373614e-05, "loss": 2.7103, "mean_token_accuracy": 0.36551724672317504, "step": 112425 }, { "epoch": 0.11324069010042898, "grad_norm": 12.161941207908773, "learning_rate": 4.950976696281505e-05, "loss": 2.6317, "mean_token_accuracy": 0.4, "step": 112430 }, { "epoch": 0.11324572615353315, "grad_norm": 10.63732096438779, "learning_rate": 4.950968916578946e-05, "loss": 2.5106, "mean_token_accuracy": 0.41379310488700866, "step": 112435 }, { "epoch": 0.11325076220663732, "grad_norm": 7.68988123791223, "learning_rate": 4.950961136265941e-05, "loss": 1.9188, "mean_token_accuracy": 0.4468844473361969, "step": 112440 }, { "epoch": 0.11325579825974148, "grad_norm": 10.239054108598628, "learning_rate": 4.95095335534249e-05, "loss": 2.2205, "mean_token_accuracy": 0.43986691236495973, "step": 112445 }, { "epoch": 0.11326083431284566, "grad_norm": 10.600008138134777, "learning_rate": 4.950945573808597e-05, "loss": 2.4271, "mean_token_accuracy": 0.46551724076271056, "step": 112450 }, { "epoch": 0.11326587036594983, "grad_norm": 13.560729837005418, "learning_rate": 4.950937791664264e-05, "loss": 2.6895, "mean_token_accuracy": 0.3793103456497192, "step": 112455 }, { "epoch": 0.113270906419054, "grad_norm": 10.76779913253055, "learning_rate": 4.950930008909492e-05, "loss": 2.3881, "mean_token_accuracy": 0.42413792610168455, "step": 112460 }, { "epoch": 0.11327594247215818, "grad_norm": 11.29523549352523, "learning_rate": 4.950922225544283e-05, "loss": 2.6198, "mean_token_accuracy": 0.46896551847457885, "step": 112465 }, { "epoch": 0.11328097852526235, "grad_norm": 10.27067347910389, "learning_rate": 4.950914441568642e-05, "loss": 2.5696, "mean_token_accuracy": 0.41379310488700866, "step": 112470 }, { "epoch": 0.11328601457836653, "grad_norm": 13.064380407176515, "learning_rate": 4.950906656982567e-05, "loss": 3.3672, "mean_token_accuracy": 0.31034482419490816, "step": 112475 }, { "epoch": 0.1132910506314707, "grad_norm": 10.430120548001469, "learning_rate": 4.9508988717860637e-05, "loss": 2.5065, "mean_token_accuracy": 0.4413793087005615, "step": 112480 }, { "epoch": 0.11329608668457487, "grad_norm": 8.271612612404052, "learning_rate": 4.950891085979133e-05, "loss": 2.2725, "mean_token_accuracy": 0.45051422119140627, "step": 112485 }, { "epoch": 0.11330112273767905, "grad_norm": 11.642432113317138, "learning_rate": 4.950883299561776e-05, "loss": 2.3184, "mean_token_accuracy": 0.4398669093847275, "step": 112490 }, { "epoch": 0.11330615879078322, "grad_norm": 10.632275282403485, "learning_rate": 4.9508755125339964e-05, "loss": 2.6096, "mean_token_accuracy": 0.3793103337287903, "step": 112495 }, { "epoch": 0.1133111948438874, "grad_norm": 10.387993284894442, "learning_rate": 4.9508677248957965e-05, "loss": 2.1457, "mean_token_accuracy": 0.4949788272380829, "step": 112500 }, { "epoch": 0.11331623089699157, "grad_norm": 12.583606904666755, "learning_rate": 4.950859936647177e-05, "loss": 2.4628, "mean_token_accuracy": 0.3793103456497192, "step": 112505 }, { "epoch": 0.11332126695009574, "grad_norm": 8.9784507628001, "learning_rate": 4.9508521477881406e-05, "loss": 2.3244, "mean_token_accuracy": 0.42413793206214906, "step": 112510 }, { "epoch": 0.1133263030031999, "grad_norm": 14.054294401755884, "learning_rate": 4.95084435831869e-05, "loss": 2.3109, "mean_token_accuracy": 0.4620689690113068, "step": 112515 }, { "epoch": 0.11333133905630408, "grad_norm": 10.86300268767509, "learning_rate": 4.950836568238827e-05, "loss": 2.4358, "mean_token_accuracy": 0.47241378426551817, "step": 112520 }, { "epoch": 0.11333637510940825, "grad_norm": 10.021149861681003, "learning_rate": 4.950828777548553e-05, "loss": 2.4272, "mean_token_accuracy": 0.46061705946922304, "step": 112525 }, { "epoch": 0.11334141116251242, "grad_norm": 10.860990583187526, "learning_rate": 4.9508209862478714e-05, "loss": 2.3804, "mean_token_accuracy": 0.44137930274009707, "step": 112530 }, { "epoch": 0.1133464472156166, "grad_norm": 27.751713056182442, "learning_rate": 4.950813194336784e-05, "loss": 2.8249, "mean_token_accuracy": 0.4257713258266449, "step": 112535 }, { "epoch": 0.11335148326872077, "grad_norm": 9.798477220317345, "learning_rate": 4.9508054018152935e-05, "loss": 2.371, "mean_token_accuracy": 0.4482758641242981, "step": 112540 }, { "epoch": 0.11335651932182494, "grad_norm": 12.974638441815156, "learning_rate": 4.9507976086834005e-05, "loss": 2.2347, "mean_token_accuracy": 0.46896551847457885, "step": 112545 }, { "epoch": 0.11336155537492912, "grad_norm": 8.890279946733163, "learning_rate": 4.9507898149411084e-05, "loss": 2.1002, "mean_token_accuracy": 0.4344827651977539, "step": 112550 }, { "epoch": 0.11336659142803329, "grad_norm": 12.400856437531234, "learning_rate": 4.9507820205884194e-05, "loss": 2.6807, "mean_token_accuracy": 0.4344827473163605, "step": 112555 }, { "epoch": 0.11337162748113747, "grad_norm": 10.51500033595627, "learning_rate": 4.950774225625334e-05, "loss": 2.2955, "mean_token_accuracy": 0.4430127084255219, "step": 112560 }, { "epoch": 0.11337666353424164, "grad_norm": 12.908551875679096, "learning_rate": 4.950766430051857e-05, "loss": 2.61, "mean_token_accuracy": 0.3758620649576187, "step": 112565 }, { "epoch": 0.11338169958734581, "grad_norm": 10.114325260277047, "learning_rate": 4.950758633867989e-05, "loss": 2.3728, "mean_token_accuracy": 0.41034482717514037, "step": 112570 }, { "epoch": 0.11338673564044999, "grad_norm": 12.65785854263634, "learning_rate": 4.950750837073732e-05, "loss": 2.5889, "mean_token_accuracy": 0.4052026689052582, "step": 112575 }, { "epoch": 0.11339177169355416, "grad_norm": 9.5696482370971, "learning_rate": 4.950743039669089e-05, "loss": 2.4666, "mean_token_accuracy": 0.37743496894836426, "step": 112580 }, { "epoch": 0.11339680774665832, "grad_norm": 12.110643775014921, "learning_rate": 4.950735241654061e-05, "loss": 2.592, "mean_token_accuracy": 0.41034482717514037, "step": 112585 }, { "epoch": 0.1134018437997625, "grad_norm": 13.612028782413036, "learning_rate": 4.950727443028651e-05, "loss": 2.5999, "mean_token_accuracy": 0.39655172228813174, "step": 112590 }, { "epoch": 0.11340687985286667, "grad_norm": 10.21409142588348, "learning_rate": 4.950719643792862e-05, "loss": 2.6029, "mean_token_accuracy": 0.4206896543502808, "step": 112595 }, { "epoch": 0.11341191590597084, "grad_norm": 10.318700033312874, "learning_rate": 4.950711843946694e-05, "loss": 2.432, "mean_token_accuracy": 0.4482758641242981, "step": 112600 }, { "epoch": 0.11341695195907502, "grad_norm": 9.78717945823098, "learning_rate": 4.9507040434901506e-05, "loss": 2.3255, "mean_token_accuracy": 0.41379311084747317, "step": 112605 }, { "epoch": 0.11342198801217919, "grad_norm": 14.162182263606587, "learning_rate": 4.950696242423235e-05, "loss": 2.319, "mean_token_accuracy": 0.41034482717514037, "step": 112610 }, { "epoch": 0.11342702406528336, "grad_norm": 13.33732163138306, "learning_rate": 4.950688440745946e-05, "loss": 2.7267, "mean_token_accuracy": 0.3827586233615875, "step": 112615 }, { "epoch": 0.11343206011838754, "grad_norm": 13.831559515206177, "learning_rate": 4.950680638458289e-05, "loss": 2.0875, "mean_token_accuracy": 0.458620685338974, "step": 112620 }, { "epoch": 0.11343709617149171, "grad_norm": 10.164092618482316, "learning_rate": 4.950672835560265e-05, "loss": 2.194, "mean_token_accuracy": 0.4620689630508423, "step": 112625 }, { "epoch": 0.11344213222459588, "grad_norm": 9.485509818800969, "learning_rate": 4.950665032051876e-05, "loss": 2.1772, "mean_token_accuracy": 0.4931034445762634, "step": 112630 }, { "epoch": 0.11344716827770006, "grad_norm": 9.462468287577686, "learning_rate": 4.950657227933125e-05, "loss": 2.2047, "mean_token_accuracy": 0.46394434571266174, "step": 112635 }, { "epoch": 0.11345220433080423, "grad_norm": 9.82952668679054, "learning_rate": 4.950649423204013e-05, "loss": 2.3954, "mean_token_accuracy": 0.41034482717514037, "step": 112640 }, { "epoch": 0.1134572403839084, "grad_norm": 15.486847673804846, "learning_rate": 4.950641617864542e-05, "loss": 2.6007, "mean_token_accuracy": 0.43103448748588563, "step": 112645 }, { "epoch": 0.11346227643701258, "grad_norm": 10.980521361238507, "learning_rate": 4.950633811914715e-05, "loss": 2.5132, "mean_token_accuracy": 0.4758620738983154, "step": 112650 }, { "epoch": 0.11346731249011674, "grad_norm": 13.808154705068338, "learning_rate": 4.950626005354535e-05, "loss": 2.5863, "mean_token_accuracy": 0.40508167147636415, "step": 112655 }, { "epoch": 0.11347234854322091, "grad_norm": 9.834409151075814, "learning_rate": 4.950618198184003e-05, "loss": 2.1222, "mean_token_accuracy": 0.4620689630508423, "step": 112660 }, { "epoch": 0.11347738459632509, "grad_norm": 9.129435969139413, "learning_rate": 4.950610390403121e-05, "loss": 2.3897, "mean_token_accuracy": 0.4275862157344818, "step": 112665 }, { "epoch": 0.11348242064942926, "grad_norm": 10.802864718112625, "learning_rate": 4.9506025820118914e-05, "loss": 2.2116, "mean_token_accuracy": 0.4482758641242981, "step": 112670 }, { "epoch": 0.11348745670253343, "grad_norm": 8.534373747527995, "learning_rate": 4.9505947730103164e-05, "loss": 2.543, "mean_token_accuracy": 0.4034482717514038, "step": 112675 }, { "epoch": 0.11349249275563761, "grad_norm": 12.626695417522242, "learning_rate": 4.950586963398398e-05, "loss": 2.419, "mean_token_accuracy": 0.42068966031074523, "step": 112680 }, { "epoch": 0.11349752880874178, "grad_norm": 8.772732762734016, "learning_rate": 4.9505791531761394e-05, "loss": 2.1669, "mean_token_accuracy": 0.4137930989265442, "step": 112685 }, { "epoch": 0.11350256486184596, "grad_norm": 10.631695648456004, "learning_rate": 4.9505713423435414e-05, "loss": 2.5723, "mean_token_accuracy": 0.4, "step": 112690 }, { "epoch": 0.11350760091495013, "grad_norm": 13.291338143216759, "learning_rate": 4.9505635309006074e-05, "loss": 2.2889, "mean_token_accuracy": 0.45722927451133727, "step": 112695 }, { "epoch": 0.1135126369680543, "grad_norm": 12.065328072716197, "learning_rate": 4.950555718847338e-05, "loss": 2.5799, "mean_token_accuracy": 0.38275861740112305, "step": 112700 }, { "epoch": 0.11351767302115848, "grad_norm": 11.007597806646965, "learning_rate": 4.950547906183737e-05, "loss": 2.453, "mean_token_accuracy": 0.39655172228813174, "step": 112705 }, { "epoch": 0.11352270907426265, "grad_norm": 9.330024105780648, "learning_rate": 4.950540092909805e-05, "loss": 2.4688, "mean_token_accuracy": 0.4517241418361664, "step": 112710 }, { "epoch": 0.11352774512736682, "grad_norm": 13.155215818721846, "learning_rate": 4.950532279025546e-05, "loss": 3.4271, "mean_token_accuracy": 0.3862069010734558, "step": 112715 }, { "epoch": 0.113532781180471, "grad_norm": 9.951792348916053, "learning_rate": 4.950524464530959e-05, "loss": 2.3433, "mean_token_accuracy": 0.43968542814254763, "step": 112720 }, { "epoch": 0.11353781723357516, "grad_norm": 11.179116986339334, "learning_rate": 4.9505166494260505e-05, "loss": 2.5908, "mean_token_accuracy": 0.3896551728248596, "step": 112725 }, { "epoch": 0.11354285328667933, "grad_norm": 9.822797362745447, "learning_rate": 4.9505088337108205e-05, "loss": 2.4251, "mean_token_accuracy": 0.4482758641242981, "step": 112730 }, { "epoch": 0.1135478893397835, "grad_norm": 11.091482619629184, "learning_rate": 4.95050101738527e-05, "loss": 2.5196, "mean_token_accuracy": 0.4310344934463501, "step": 112735 }, { "epoch": 0.11355292539288768, "grad_norm": 9.990303092562069, "learning_rate": 4.9504932004494033e-05, "loss": 2.3834, "mean_token_accuracy": 0.4137930989265442, "step": 112740 }, { "epoch": 0.11355796144599185, "grad_norm": 9.791429906243796, "learning_rate": 4.950485382903221e-05, "loss": 2.4259, "mean_token_accuracy": 0.4296430766582489, "step": 112745 }, { "epoch": 0.11356299749909603, "grad_norm": 10.26657727767445, "learning_rate": 4.950477564746727e-05, "loss": 2.4806, "mean_token_accuracy": 0.4517241358757019, "step": 112750 }, { "epoch": 0.1135680335522002, "grad_norm": 9.503826251956095, "learning_rate": 4.9504697459799203e-05, "loss": 2.3452, "mean_token_accuracy": 0.4206896543502808, "step": 112755 }, { "epoch": 0.11357306960530437, "grad_norm": 8.27102891811764, "learning_rate": 4.950461926602807e-05, "loss": 2.1934, "mean_token_accuracy": 0.44827585816383364, "step": 112760 }, { "epoch": 0.11357810565840855, "grad_norm": 12.78409129981293, "learning_rate": 4.950454106615386e-05, "loss": 2.6662, "mean_token_accuracy": 0.35172413289546967, "step": 112765 }, { "epoch": 0.11358314171151272, "grad_norm": 10.086959067495929, "learning_rate": 4.950446286017662e-05, "loss": 2.6014, "mean_token_accuracy": 0.4, "step": 112770 }, { "epoch": 0.1135881777646169, "grad_norm": 13.320189148045081, "learning_rate": 4.950438464809635e-05, "loss": 2.4212, "mean_token_accuracy": 0.44482758045196535, "step": 112775 }, { "epoch": 0.11359321381772107, "grad_norm": 12.34315559533147, "learning_rate": 4.9504306429913086e-05, "loss": 2.36, "mean_token_accuracy": 0.45862067937850953, "step": 112780 }, { "epoch": 0.11359824987082524, "grad_norm": 15.218955362267408, "learning_rate": 4.9504228205626846e-05, "loss": 2.846, "mean_token_accuracy": 0.34482758641242983, "step": 112785 }, { "epoch": 0.11360328592392942, "grad_norm": 11.538281053886553, "learning_rate": 4.950414997523765e-05, "loss": 2.4983, "mean_token_accuracy": 0.3896551728248596, "step": 112790 }, { "epoch": 0.11360832197703358, "grad_norm": 34.391844066076786, "learning_rate": 4.9504071738745524e-05, "loss": 2.8062, "mean_token_accuracy": 0.37586206793785093, "step": 112795 }, { "epoch": 0.11361335803013775, "grad_norm": 9.412878639740066, "learning_rate": 4.950399349615049e-05, "loss": 2.3222, "mean_token_accuracy": 0.4068965494632721, "step": 112800 }, { "epoch": 0.11361839408324192, "grad_norm": 11.358788539751473, "learning_rate": 4.950391524745256e-05, "loss": 2.349, "mean_token_accuracy": 0.4152450144290924, "step": 112805 }, { "epoch": 0.1136234301363461, "grad_norm": 10.196096069373818, "learning_rate": 4.950383699265176e-05, "loss": 2.1797, "mean_token_accuracy": 0.4, "step": 112810 }, { "epoch": 0.11362846618945027, "grad_norm": 11.84958197422626, "learning_rate": 4.9503758731748114e-05, "loss": 2.5803, "mean_token_accuracy": 0.4068965494632721, "step": 112815 }, { "epoch": 0.11363350224255445, "grad_norm": 9.918889629749781, "learning_rate": 4.950368046474165e-05, "loss": 2.5376, "mean_token_accuracy": 0.38965516686439516, "step": 112820 }, { "epoch": 0.11363853829565862, "grad_norm": 10.59079363378361, "learning_rate": 4.950360219163238e-05, "loss": 2.3285, "mean_token_accuracy": 0.47931034564971925, "step": 112825 }, { "epoch": 0.1136435743487628, "grad_norm": 11.738348529134498, "learning_rate": 4.950352391242033e-05, "loss": 2.2935, "mean_token_accuracy": 0.44137931168079375, "step": 112830 }, { "epoch": 0.11364861040186697, "grad_norm": 11.721461380802934, "learning_rate": 4.950344562710552e-05, "loss": 2.4273, "mean_token_accuracy": 0.4344827592372894, "step": 112835 }, { "epoch": 0.11365364645497114, "grad_norm": 9.079775859432178, "learning_rate": 4.9503367335687974e-05, "loss": 2.3564, "mean_token_accuracy": 0.44827585816383364, "step": 112840 }, { "epoch": 0.11365868250807531, "grad_norm": 10.516943776892882, "learning_rate": 4.9503289038167705e-05, "loss": 2.6198, "mean_token_accuracy": 0.3999999940395355, "step": 112845 }, { "epoch": 0.11366371856117949, "grad_norm": 10.669957973110739, "learning_rate": 4.950321073454475e-05, "loss": 2.2477, "mean_token_accuracy": 0.4655172288417816, "step": 112850 }, { "epoch": 0.11366875461428366, "grad_norm": 11.41794169458884, "learning_rate": 4.950313242481911e-05, "loss": 2.6348, "mean_token_accuracy": 0.358620685338974, "step": 112855 }, { "epoch": 0.11367379066738784, "grad_norm": 10.12704674619292, "learning_rate": 4.950305410899083e-05, "loss": 2.6235, "mean_token_accuracy": 0.4, "step": 112860 }, { "epoch": 0.113678826720492, "grad_norm": 10.572714846039661, "learning_rate": 4.950297578705992e-05, "loss": 2.6728, "mean_token_accuracy": 0.37241379022598264, "step": 112865 }, { "epoch": 0.11368386277359617, "grad_norm": 12.658584169158898, "learning_rate": 4.95028974590264e-05, "loss": 2.6985, "mean_token_accuracy": 0.3827586233615875, "step": 112870 }, { "epoch": 0.11368889882670034, "grad_norm": 15.314121025482617, "learning_rate": 4.9502819124890294e-05, "loss": 2.5923, "mean_token_accuracy": 0.4206896543502808, "step": 112875 }, { "epoch": 0.11369393487980452, "grad_norm": 10.867150235515789, "learning_rate": 4.950274078465163e-05, "loss": 2.4636, "mean_token_accuracy": 0.4206896543502808, "step": 112880 }, { "epoch": 0.11369897093290869, "grad_norm": 9.092189141555469, "learning_rate": 4.950266243831042e-05, "loss": 2.3646, "mean_token_accuracy": 0.4206896543502808, "step": 112885 }, { "epoch": 0.11370400698601286, "grad_norm": 9.462584613121015, "learning_rate": 4.950258408586668e-05, "loss": 2.3151, "mean_token_accuracy": 0.42068964838981626, "step": 112890 }, { "epoch": 0.11370904303911704, "grad_norm": 11.490035366236722, "learning_rate": 4.9502505727320456e-05, "loss": 2.2452, "mean_token_accuracy": 0.44482758045196535, "step": 112895 }, { "epoch": 0.11371407909222121, "grad_norm": 12.448376649907267, "learning_rate": 4.950242736267175e-05, "loss": 2.4434, "mean_token_accuracy": 0.3965517282485962, "step": 112900 }, { "epoch": 0.11371911514532539, "grad_norm": 11.593818648544124, "learning_rate": 4.950234899192058e-05, "loss": 2.1065, "mean_token_accuracy": 0.46896551847457885, "step": 112905 }, { "epoch": 0.11372415119842956, "grad_norm": 15.143760686034854, "learning_rate": 4.950227061506699e-05, "loss": 2.3473, "mean_token_accuracy": 0.4620689630508423, "step": 112910 }, { "epoch": 0.11372918725153373, "grad_norm": 9.673757838364011, "learning_rate": 4.950219223211098e-05, "loss": 2.1377, "mean_token_accuracy": 0.5028044164180756, "step": 112915 }, { "epoch": 0.11373422330463791, "grad_norm": 11.27581860778706, "learning_rate": 4.950211384305258e-05, "loss": 2.2512, "mean_token_accuracy": 0.46551724672317507, "step": 112920 }, { "epoch": 0.11373925935774208, "grad_norm": 10.413519342874757, "learning_rate": 4.950203544789182e-05, "loss": 2.2228, "mean_token_accuracy": 0.4206896543502808, "step": 112925 }, { "epoch": 0.11374429541084626, "grad_norm": 11.104010138660158, "learning_rate": 4.95019570466287e-05, "loss": 2.3453, "mean_token_accuracy": 0.43793103098869324, "step": 112930 }, { "epoch": 0.11374933146395041, "grad_norm": 9.292504800398415, "learning_rate": 4.950187863926327e-05, "loss": 2.1692, "mean_token_accuracy": 0.45517241954803467, "step": 112935 }, { "epoch": 0.11375436751705459, "grad_norm": 9.35564041040178, "learning_rate": 4.950180022579553e-05, "loss": 2.3855, "mean_token_accuracy": 0.4465819776058197, "step": 112940 }, { "epoch": 0.11375940357015876, "grad_norm": 11.528256574261754, "learning_rate": 4.95017218062255e-05, "loss": 2.5506, "mean_token_accuracy": 0.41034482717514037, "step": 112945 }, { "epoch": 0.11376443962326294, "grad_norm": 11.464581648542369, "learning_rate": 4.950164338055322e-05, "loss": 2.5332, "mean_token_accuracy": 0.39310344457626345, "step": 112950 }, { "epoch": 0.11376947567636711, "grad_norm": 10.898727030019352, "learning_rate": 4.95015649487787e-05, "loss": 2.3679, "mean_token_accuracy": 0.45329703092575074, "step": 112955 }, { "epoch": 0.11377451172947128, "grad_norm": 10.450689479248306, "learning_rate": 4.9501486510901965e-05, "loss": 2.7554, "mean_token_accuracy": 0.35172412991523744, "step": 112960 }, { "epoch": 0.11377954778257546, "grad_norm": 10.208772550248536, "learning_rate": 4.950140806692303e-05, "loss": 2.3792, "mean_token_accuracy": 0.42758620381355283, "step": 112965 }, { "epoch": 0.11378458383567963, "grad_norm": 10.977935427287786, "learning_rate": 4.950132961684193e-05, "loss": 2.3115, "mean_token_accuracy": 0.4655172348022461, "step": 112970 }, { "epoch": 0.1137896198887838, "grad_norm": 11.462108564871988, "learning_rate": 4.9501251160658676e-05, "loss": 2.2867, "mean_token_accuracy": 0.44482759237289426, "step": 112975 }, { "epoch": 0.11379465594188798, "grad_norm": 8.66631317198689, "learning_rate": 4.9501172698373295e-05, "loss": 2.5056, "mean_token_accuracy": 0.4172413766384125, "step": 112980 }, { "epoch": 0.11379969199499215, "grad_norm": 10.63965215620542, "learning_rate": 4.9501094229985806e-05, "loss": 2.2019, "mean_token_accuracy": 0.4620689690113068, "step": 112985 }, { "epoch": 0.11380472804809633, "grad_norm": 10.667001787606706, "learning_rate": 4.9501015755496225e-05, "loss": 2.3473, "mean_token_accuracy": 0.4206896543502808, "step": 112990 }, { "epoch": 0.1138097641012005, "grad_norm": 10.985345749358471, "learning_rate": 4.9500937274904584e-05, "loss": 2.2199, "mean_token_accuracy": 0.441379314661026, "step": 112995 }, { "epoch": 0.11381480015430467, "grad_norm": 12.966664106527714, "learning_rate": 4.9500858788210904e-05, "loss": 2.4685, "mean_token_accuracy": 0.4310344815254211, "step": 113000 }, { "epoch": 0.11381983620740883, "grad_norm": 11.872130864182086, "learning_rate": 4.9500780295415206e-05, "loss": 2.3333, "mean_token_accuracy": 0.4517241358757019, "step": 113005 }, { "epoch": 0.11382487226051301, "grad_norm": 14.256529578119999, "learning_rate": 4.95007017965175e-05, "loss": 2.6981, "mean_token_accuracy": 0.43103447556495667, "step": 113010 }, { "epoch": 0.11382990831361718, "grad_norm": 12.122172025402355, "learning_rate": 4.950062329151783e-05, "loss": 2.189, "mean_token_accuracy": 0.4847549915313721, "step": 113015 }, { "epoch": 0.11383494436672136, "grad_norm": 11.811552295943956, "learning_rate": 4.950054478041619e-05, "loss": 2.3214, "mean_token_accuracy": 0.43103447556495667, "step": 113020 }, { "epoch": 0.11383998041982553, "grad_norm": 14.985012060033208, "learning_rate": 4.9500466263212634e-05, "loss": 2.6502, "mean_token_accuracy": 0.41034482717514037, "step": 113025 }, { "epoch": 0.1138450164729297, "grad_norm": 8.848924416683507, "learning_rate": 4.9500387739907155e-05, "loss": 2.1618, "mean_token_accuracy": 0.46079854369163514, "step": 113030 }, { "epoch": 0.11385005252603388, "grad_norm": 13.43749839772969, "learning_rate": 4.950030921049979e-05, "loss": 2.63, "mean_token_accuracy": 0.43103448748588563, "step": 113035 }, { "epoch": 0.11385508857913805, "grad_norm": 10.644979632040803, "learning_rate": 4.9500230674990553e-05, "loss": 2.6792, "mean_token_accuracy": 0.3965517163276672, "step": 113040 }, { "epoch": 0.11386012463224222, "grad_norm": 8.77887395422998, "learning_rate": 4.950015213337947e-05, "loss": 2.4625, "mean_token_accuracy": 0.3896551728248596, "step": 113045 }, { "epoch": 0.1138651606853464, "grad_norm": 9.594743290726822, "learning_rate": 4.950007358566657e-05, "loss": 2.3294, "mean_token_accuracy": 0.4724137902259827, "step": 113050 }, { "epoch": 0.11387019673845057, "grad_norm": 9.311390483127402, "learning_rate": 4.949999503185186e-05, "loss": 2.4759, "mean_token_accuracy": 0.4172413766384125, "step": 113055 }, { "epoch": 0.11387523279155475, "grad_norm": 12.503288392959382, "learning_rate": 4.949991647193538e-05, "loss": 2.8729, "mean_token_accuracy": 0.41724138259887694, "step": 113060 }, { "epoch": 0.11388026884465892, "grad_norm": 9.931197784672545, "learning_rate": 4.949983790591713e-05, "loss": 2.398, "mean_token_accuracy": 0.4517241358757019, "step": 113065 }, { "epoch": 0.11388530489776309, "grad_norm": 10.788719159143596, "learning_rate": 4.949975933379715e-05, "loss": 2.5342, "mean_token_accuracy": 0.3896551787853241, "step": 113070 }, { "epoch": 0.11389034095086725, "grad_norm": 11.045821825500827, "learning_rate": 4.949968075557545e-05, "loss": 2.5056, "mean_token_accuracy": 0.3896551787853241, "step": 113075 }, { "epoch": 0.11389537700397143, "grad_norm": 10.842479058349177, "learning_rate": 4.949960217125205e-05, "loss": 2.3472, "mean_token_accuracy": 0.42413793206214906, "step": 113080 }, { "epoch": 0.1139004130570756, "grad_norm": 11.785158897818905, "learning_rate": 4.949952358082699e-05, "loss": 2.7876, "mean_token_accuracy": 0.42068964838981626, "step": 113085 }, { "epoch": 0.11390544911017977, "grad_norm": 8.418180242209662, "learning_rate": 4.949944498430028e-05, "loss": 2.041, "mean_token_accuracy": 0.5021173477172851, "step": 113090 }, { "epoch": 0.11391048516328395, "grad_norm": 7.59588563388341, "learning_rate": 4.949936638167194e-05, "loss": 2.0517, "mean_token_accuracy": 0.46551724672317507, "step": 113095 }, { "epoch": 0.11391552121638812, "grad_norm": 10.082629439074063, "learning_rate": 4.949928777294198e-05, "loss": 2.4002, "mean_token_accuracy": 0.44482759237289426, "step": 113100 }, { "epoch": 0.1139205572694923, "grad_norm": 9.56419024566006, "learning_rate": 4.949920915811045e-05, "loss": 2.4172, "mean_token_accuracy": 0.4172413766384125, "step": 113105 }, { "epoch": 0.11392559332259647, "grad_norm": 11.583427611281936, "learning_rate": 4.9499130537177355e-05, "loss": 2.7297, "mean_token_accuracy": 0.3918935328722, "step": 113110 }, { "epoch": 0.11393062937570064, "grad_norm": 10.667556518165586, "learning_rate": 4.9499051910142724e-05, "loss": 2.5091, "mean_token_accuracy": 0.42068966031074523, "step": 113115 }, { "epoch": 0.11393566542880482, "grad_norm": 10.481446910962806, "learning_rate": 4.949897327700657e-05, "loss": 2.8785, "mean_token_accuracy": 0.36896551847457887, "step": 113120 }, { "epoch": 0.11394070148190899, "grad_norm": 9.550723010260254, "learning_rate": 4.94988946377689e-05, "loss": 2.1406, "mean_token_accuracy": 0.44827585816383364, "step": 113125 }, { "epoch": 0.11394573753501316, "grad_norm": 34.810218784031846, "learning_rate": 4.949881599242978e-05, "loss": 2.8493, "mean_token_accuracy": 0.3931034505367279, "step": 113130 }, { "epoch": 0.11395077358811734, "grad_norm": 12.314575408284757, "learning_rate": 4.94987373409892e-05, "loss": 2.7444, "mean_token_accuracy": 0.4137930989265442, "step": 113135 }, { "epoch": 0.11395580964122151, "grad_norm": 9.397865391359787, "learning_rate": 4.949865868344719e-05, "loss": 2.4864, "mean_token_accuracy": 0.40689654350280763, "step": 113140 }, { "epoch": 0.11396084569432567, "grad_norm": 10.221560714454467, "learning_rate": 4.949858001980376e-05, "loss": 3.0097, "mean_token_accuracy": 0.29715668559074404, "step": 113145 }, { "epoch": 0.11396588174742985, "grad_norm": 11.451542010614418, "learning_rate": 4.9498501350058954e-05, "loss": 2.4813, "mean_token_accuracy": 0.4172413766384125, "step": 113150 }, { "epoch": 0.11397091780053402, "grad_norm": 9.172662428553256, "learning_rate": 4.949842267421277e-05, "loss": 2.4894, "mean_token_accuracy": 0.43793103396892546, "step": 113155 }, { "epoch": 0.11397595385363819, "grad_norm": 9.969517984225336, "learning_rate": 4.949834399226525e-05, "loss": 2.2152, "mean_token_accuracy": 0.458620685338974, "step": 113160 }, { "epoch": 0.11398098990674237, "grad_norm": 10.647808643521731, "learning_rate": 4.9498265304216394e-05, "loss": 2.1946, "mean_token_accuracy": 0.4137930989265442, "step": 113165 }, { "epoch": 0.11398602595984654, "grad_norm": 10.64814818584678, "learning_rate": 4.949818661006625e-05, "loss": 2.43, "mean_token_accuracy": 0.42758620381355283, "step": 113170 }, { "epoch": 0.11399106201295071, "grad_norm": 11.474775204938512, "learning_rate": 4.9498107909814825e-05, "loss": 2.4596, "mean_token_accuracy": 0.4172413766384125, "step": 113175 }, { "epoch": 0.11399609806605489, "grad_norm": 10.125856101884013, "learning_rate": 4.949802920346214e-05, "loss": 2.3479, "mean_token_accuracy": 0.42068966031074523, "step": 113180 }, { "epoch": 0.11400113411915906, "grad_norm": 11.191507978650462, "learning_rate": 4.9497950491008224e-05, "loss": 2.5476, "mean_token_accuracy": 0.42413792610168455, "step": 113185 }, { "epoch": 0.11400617017226324, "grad_norm": 11.33454535843002, "learning_rate": 4.949787177245309e-05, "loss": 2.3468, "mean_token_accuracy": 0.49458128213882446, "step": 113190 }, { "epoch": 0.11401120622536741, "grad_norm": 12.087548133594487, "learning_rate": 4.949779304779676e-05, "loss": 2.5392, "mean_token_accuracy": 0.3965517282485962, "step": 113195 }, { "epoch": 0.11401624227847158, "grad_norm": 9.960898151943242, "learning_rate": 4.949771431703927e-05, "loss": 2.278, "mean_token_accuracy": 0.41034482717514037, "step": 113200 }, { "epoch": 0.11402127833157576, "grad_norm": 12.06523950578498, "learning_rate": 4.949763558018063e-05, "loss": 2.1652, "mean_token_accuracy": 0.44827585220336913, "step": 113205 }, { "epoch": 0.11402631438467993, "grad_norm": 9.831053572903501, "learning_rate": 4.949755683722086e-05, "loss": 2.5092, "mean_token_accuracy": 0.4241379380226135, "step": 113210 }, { "epoch": 0.11403135043778409, "grad_norm": 10.775848668041025, "learning_rate": 4.949747808815999e-05, "loss": 2.5701, "mean_token_accuracy": 0.4344827592372894, "step": 113215 }, { "epoch": 0.11403638649088826, "grad_norm": 14.695883011614601, "learning_rate": 4.9497399332998036e-05, "loss": 2.307, "mean_token_accuracy": 0.42758620977401735, "step": 113220 }, { "epoch": 0.11404142254399244, "grad_norm": 8.710291487652412, "learning_rate": 4.9497320571735014e-05, "loss": 2.1022, "mean_token_accuracy": 0.4655172348022461, "step": 113225 }, { "epoch": 0.11404645859709661, "grad_norm": 11.419332765046402, "learning_rate": 4.949724180437096e-05, "loss": 2.3351, "mean_token_accuracy": 0.4344827592372894, "step": 113230 }, { "epoch": 0.11405149465020079, "grad_norm": 10.20398804765159, "learning_rate": 4.949716303090589e-05, "loss": 2.4702, "mean_token_accuracy": 0.42413792610168455, "step": 113235 }, { "epoch": 0.11405653070330496, "grad_norm": 10.845057161348851, "learning_rate": 4.949708425133982e-05, "loss": 2.1939, "mean_token_accuracy": 0.44482758045196535, "step": 113240 }, { "epoch": 0.11406156675640913, "grad_norm": 10.049798920749064, "learning_rate": 4.949700546567278e-05, "loss": 2.5361, "mean_token_accuracy": 0.42068966031074523, "step": 113245 }, { "epoch": 0.1140666028095133, "grad_norm": 11.378965183847846, "learning_rate": 4.949692667390478e-05, "loss": 2.5501, "mean_token_accuracy": 0.4000000059604645, "step": 113250 }, { "epoch": 0.11407163886261748, "grad_norm": 9.294428055582308, "learning_rate": 4.9496847876035856e-05, "loss": 2.973, "mean_token_accuracy": 0.3620689660310745, "step": 113255 }, { "epoch": 0.11407667491572165, "grad_norm": 13.975864531693889, "learning_rate": 4.949676907206603e-05, "loss": 2.7928, "mean_token_accuracy": 0.37586207389831544, "step": 113260 }, { "epoch": 0.11408171096882583, "grad_norm": 13.660362790115851, "learning_rate": 4.9496690261995316e-05, "loss": 2.6571, "mean_token_accuracy": 0.41379310488700866, "step": 113265 }, { "epoch": 0.11408674702193, "grad_norm": 11.148633100257763, "learning_rate": 4.949661144582374e-05, "loss": 2.3314, "mean_token_accuracy": 0.458620685338974, "step": 113270 }, { "epoch": 0.11409178307503418, "grad_norm": 12.064414064142063, "learning_rate": 4.949653262355131e-05, "loss": 2.2437, "mean_token_accuracy": 0.432667875289917, "step": 113275 }, { "epoch": 0.11409681912813835, "grad_norm": 10.688852440708585, "learning_rate": 4.949645379517807e-05, "loss": 1.8782, "mean_token_accuracy": 0.541379302740097, "step": 113280 }, { "epoch": 0.11410185518124251, "grad_norm": 11.218316108767265, "learning_rate": 4.949637496070403e-05, "loss": 2.2959, "mean_token_accuracy": 0.4379310369491577, "step": 113285 }, { "epoch": 0.11410689123434668, "grad_norm": 14.816725548606522, "learning_rate": 4.949629612012921e-05, "loss": 2.4734, "mean_token_accuracy": 0.44827587008476255, "step": 113290 }, { "epoch": 0.11411192728745086, "grad_norm": 11.061974015416226, "learning_rate": 4.949621727345364e-05, "loss": 2.7431, "mean_token_accuracy": 0.42413792610168455, "step": 113295 }, { "epoch": 0.11411696334055503, "grad_norm": 9.964756555006183, "learning_rate": 4.949613842067733e-05, "loss": 2.1772, "mean_token_accuracy": 0.4896551609039307, "step": 113300 }, { "epoch": 0.1141219993936592, "grad_norm": 12.390570868900841, "learning_rate": 4.949605956180031e-05, "loss": 2.3921, "mean_token_accuracy": 0.41724138259887694, "step": 113305 }, { "epoch": 0.11412703544676338, "grad_norm": 14.845901343009714, "learning_rate": 4.949598069682261e-05, "loss": 2.803, "mean_token_accuracy": 0.3793103516101837, "step": 113310 }, { "epoch": 0.11413207149986755, "grad_norm": 10.67618062191245, "learning_rate": 4.9495901825744235e-05, "loss": 2.1307, "mean_token_accuracy": 0.43793103098869324, "step": 113315 }, { "epoch": 0.11413710755297173, "grad_norm": 10.031834764163156, "learning_rate": 4.949582294856521e-05, "loss": 2.2445, "mean_token_accuracy": 0.48275861144065857, "step": 113320 }, { "epoch": 0.1141421436060759, "grad_norm": 15.736364692493241, "learning_rate": 4.9495744065285574e-05, "loss": 2.5228, "mean_token_accuracy": 0.38275861740112305, "step": 113325 }, { "epoch": 0.11414717965918007, "grad_norm": 9.364293880169328, "learning_rate": 4.949566517590532e-05, "loss": 2.0923, "mean_token_accuracy": 0.44482758045196535, "step": 113330 }, { "epoch": 0.11415221571228425, "grad_norm": 8.819982790443811, "learning_rate": 4.9495586280424504e-05, "loss": 2.4813, "mean_token_accuracy": 0.4068965494632721, "step": 113335 }, { "epoch": 0.11415725176538842, "grad_norm": 11.601371405048335, "learning_rate": 4.949550737884312e-05, "loss": 2.4397, "mean_token_accuracy": 0.40544463992118834, "step": 113340 }, { "epoch": 0.1141622878184926, "grad_norm": 13.0197702469481, "learning_rate": 4.9495428471161207e-05, "loss": 2.3181, "mean_token_accuracy": 0.4620689630508423, "step": 113345 }, { "epoch": 0.11416732387159677, "grad_norm": 12.50222061393604, "learning_rate": 4.949534955737877e-05, "loss": 2.6322, "mean_token_accuracy": 0.417241370677948, "step": 113350 }, { "epoch": 0.11417235992470093, "grad_norm": 10.474822116968944, "learning_rate": 4.949527063749585e-05, "loss": 2.3376, "mean_token_accuracy": 0.4620689630508423, "step": 113355 }, { "epoch": 0.1141773959778051, "grad_norm": 14.458068592006503, "learning_rate": 4.9495191711512455e-05, "loss": 2.3975, "mean_token_accuracy": 0.4551724135875702, "step": 113360 }, { "epoch": 0.11418243203090928, "grad_norm": 11.361647917559504, "learning_rate": 4.949511277942861e-05, "loss": 2.572, "mean_token_accuracy": 0.4034482777118683, "step": 113365 }, { "epoch": 0.11418746808401345, "grad_norm": 12.932486338295037, "learning_rate": 4.949503384124434e-05, "loss": 2.6922, "mean_token_accuracy": 0.3862069010734558, "step": 113370 }, { "epoch": 0.11419250413711762, "grad_norm": 9.67422420348851, "learning_rate": 4.9494954896959665e-05, "loss": 2.7554, "mean_token_accuracy": 0.4206896543502808, "step": 113375 }, { "epoch": 0.1141975401902218, "grad_norm": 7.887708908293583, "learning_rate": 4.949487594657461e-05, "loss": 2.4777, "mean_token_accuracy": 0.41034482717514037, "step": 113380 }, { "epoch": 0.11420257624332597, "grad_norm": 9.680144030373048, "learning_rate": 4.949479699008919e-05, "loss": 2.0863, "mean_token_accuracy": 0.47773745059967043, "step": 113385 }, { "epoch": 0.11420761229643014, "grad_norm": 8.883907198980403, "learning_rate": 4.949471802750344e-05, "loss": 2.3385, "mean_token_accuracy": 0.4310344815254211, "step": 113390 }, { "epoch": 0.11421264834953432, "grad_norm": 11.62154901442218, "learning_rate": 4.9494639058817364e-05, "loss": 2.2665, "mean_token_accuracy": 0.47586206793785096, "step": 113395 }, { "epoch": 0.11421768440263849, "grad_norm": 10.840802169478733, "learning_rate": 4.9494560084031e-05, "loss": 2.1543, "mean_token_accuracy": 0.4419950723648071, "step": 113400 }, { "epoch": 0.11422272045574267, "grad_norm": 10.73399636419925, "learning_rate": 4.949448110314435e-05, "loss": 2.7492, "mean_token_accuracy": 0.39310344457626345, "step": 113405 }, { "epoch": 0.11422775650884684, "grad_norm": 10.139801059311369, "learning_rate": 4.9494402116157454e-05, "loss": 2.2088, "mean_token_accuracy": 0.4689655125141144, "step": 113410 }, { "epoch": 0.11423279256195101, "grad_norm": 11.13104416938885, "learning_rate": 4.949432312307034e-05, "loss": 2.3649, "mean_token_accuracy": 0.4359951615333557, "step": 113415 }, { "epoch": 0.11423782861505519, "grad_norm": 14.536714460285188, "learning_rate": 4.9494244123883005e-05, "loss": 2.5675, "mean_token_accuracy": 0.41034482717514037, "step": 113420 }, { "epoch": 0.11424286466815935, "grad_norm": 13.429721080359569, "learning_rate": 4.9494165118595484e-05, "loss": 2.6397, "mean_token_accuracy": 0.3758620619773865, "step": 113425 }, { "epoch": 0.11424790072126352, "grad_norm": 10.29286802052455, "learning_rate": 4.949408610720781e-05, "loss": 2.4443, "mean_token_accuracy": 0.4137930989265442, "step": 113430 }, { "epoch": 0.1142529367743677, "grad_norm": 12.176194479251862, "learning_rate": 4.949400708971998e-05, "loss": 2.4235, "mean_token_accuracy": 0.47241380214691164, "step": 113435 }, { "epoch": 0.11425797282747187, "grad_norm": 11.04352862540111, "learning_rate": 4.9493928066132044e-05, "loss": 2.131, "mean_token_accuracy": 0.4448275864124298, "step": 113440 }, { "epoch": 0.11426300888057604, "grad_norm": 8.749721719723174, "learning_rate": 4.9493849036444004e-05, "loss": 2.3874, "mean_token_accuracy": 0.44482759237289426, "step": 113445 }, { "epoch": 0.11426804493368022, "grad_norm": 11.753717979389771, "learning_rate": 4.949377000065589e-05, "loss": 2.3063, "mean_token_accuracy": 0.4517241418361664, "step": 113450 }, { "epoch": 0.11427308098678439, "grad_norm": 11.952751825182776, "learning_rate": 4.949369095876771e-05, "loss": 2.4727, "mean_token_accuracy": 0.35862068831920624, "step": 113455 }, { "epoch": 0.11427811703988856, "grad_norm": 12.615017839166764, "learning_rate": 4.949361191077951e-05, "loss": 2.517, "mean_token_accuracy": 0.39655172228813174, "step": 113460 }, { "epoch": 0.11428315309299274, "grad_norm": 8.643812854011056, "learning_rate": 4.94935328566913e-05, "loss": 2.1651, "mean_token_accuracy": 0.4623109459877014, "step": 113465 }, { "epoch": 0.11428818914609691, "grad_norm": 14.701537562191152, "learning_rate": 4.9493453796503096e-05, "loss": 2.4089, "mean_token_accuracy": 0.45069570541381837, "step": 113470 }, { "epoch": 0.11429322519920108, "grad_norm": 14.544497270939637, "learning_rate": 4.9493374730214925e-05, "loss": 2.4453, "mean_token_accuracy": 0.4379310369491577, "step": 113475 }, { "epoch": 0.11429826125230526, "grad_norm": 10.722636481132666, "learning_rate": 4.949329565782682e-05, "loss": 2.3277, "mean_token_accuracy": 0.4517241358757019, "step": 113480 }, { "epoch": 0.11430329730540943, "grad_norm": 10.245423331537594, "learning_rate": 4.949321657933878e-05, "loss": 2.7559, "mean_token_accuracy": 0.3793103456497192, "step": 113485 }, { "epoch": 0.1143083333585136, "grad_norm": 9.015153978871965, "learning_rate": 4.9493137494750843e-05, "loss": 2.2583, "mean_token_accuracy": 0.46551724076271056, "step": 113490 }, { "epoch": 0.11431336941161777, "grad_norm": 13.955637579080173, "learning_rate": 4.9493058404063034e-05, "loss": 2.3894, "mean_token_accuracy": 0.4676346004009247, "step": 113495 }, { "epoch": 0.11431840546472194, "grad_norm": 10.164242267721278, "learning_rate": 4.949297930727537e-05, "loss": 2.4805, "mean_token_accuracy": 0.4396249264478683, "step": 113500 }, { "epoch": 0.11432344151782611, "grad_norm": 10.701808374759569, "learning_rate": 4.949290020438785e-05, "loss": 2.274, "mean_token_accuracy": 0.3999999940395355, "step": 113505 }, { "epoch": 0.11432847757093029, "grad_norm": 10.056245059931017, "learning_rate": 4.9492821095400534e-05, "loss": 2.7455, "mean_token_accuracy": 0.41379311084747317, "step": 113510 }, { "epoch": 0.11433351362403446, "grad_norm": 10.379616270184927, "learning_rate": 4.9492741980313426e-05, "loss": 2.75, "mean_token_accuracy": 0.43103447556495667, "step": 113515 }, { "epoch": 0.11433854967713863, "grad_norm": 10.128488584249718, "learning_rate": 4.949266285912655e-05, "loss": 2.1725, "mean_token_accuracy": 0.44827585816383364, "step": 113520 }, { "epoch": 0.11434358573024281, "grad_norm": 10.028530631862171, "learning_rate": 4.949258373183992e-05, "loss": 2.7574, "mean_token_accuracy": 0.37931033968925476, "step": 113525 }, { "epoch": 0.11434862178334698, "grad_norm": 11.029166188529539, "learning_rate": 4.949250459845357e-05, "loss": 2.2862, "mean_token_accuracy": 0.39655171930789945, "step": 113530 }, { "epoch": 0.11435365783645116, "grad_norm": 8.639796434258983, "learning_rate": 4.9492425458967515e-05, "loss": 2.232, "mean_token_accuracy": 0.4620689630508423, "step": 113535 }, { "epoch": 0.11435869388955533, "grad_norm": 10.308297789321932, "learning_rate": 4.949234631338178e-05, "loss": 2.4251, "mean_token_accuracy": 0.4482758641242981, "step": 113540 }, { "epoch": 0.1143637299426595, "grad_norm": 15.08414463091528, "learning_rate": 4.9492267161696384e-05, "loss": 2.846, "mean_token_accuracy": 0.34482758641242983, "step": 113545 }, { "epoch": 0.11436876599576368, "grad_norm": 11.177042411524832, "learning_rate": 4.949218800391135e-05, "loss": 2.1939, "mean_token_accuracy": 0.4965517222881317, "step": 113550 }, { "epoch": 0.11437380204886785, "grad_norm": 10.12394115917201, "learning_rate": 4.94921088400267e-05, "loss": 2.4855, "mean_token_accuracy": 0.441379314661026, "step": 113555 }, { "epoch": 0.11437883810197202, "grad_norm": 11.461125092714033, "learning_rate": 4.9492029670042464e-05, "loss": 2.5777, "mean_token_accuracy": 0.39655172228813174, "step": 113560 }, { "epoch": 0.11438387415507618, "grad_norm": 12.145113342961128, "learning_rate": 4.949195049395866e-05, "loss": 2.6422, "mean_token_accuracy": 0.3931034475564957, "step": 113565 }, { "epoch": 0.11438891020818036, "grad_norm": 11.039025934611505, "learning_rate": 4.949187131177529e-05, "loss": 2.4975, "mean_token_accuracy": 0.4068965494632721, "step": 113570 }, { "epoch": 0.11439394626128453, "grad_norm": 11.917544606537819, "learning_rate": 4.94917921234924e-05, "loss": 2.6254, "mean_token_accuracy": 0.37241379618644715, "step": 113575 }, { "epoch": 0.1143989823143887, "grad_norm": 10.91747988373674, "learning_rate": 4.9491712929110005e-05, "loss": 2.1965, "mean_token_accuracy": 0.482758617401123, "step": 113580 }, { "epoch": 0.11440401836749288, "grad_norm": 9.607019364874029, "learning_rate": 4.9491633728628125e-05, "loss": 2.2214, "mean_token_accuracy": 0.4620689630508423, "step": 113585 }, { "epoch": 0.11440905442059705, "grad_norm": 11.946990671274653, "learning_rate": 4.949155452204678e-05, "loss": 2.1696, "mean_token_accuracy": 0.4344827592372894, "step": 113590 }, { "epoch": 0.11441409047370123, "grad_norm": 10.656620084598478, "learning_rate": 4.949147530936601e-05, "loss": 2.6961, "mean_token_accuracy": 0.4068965375423431, "step": 113595 }, { "epoch": 0.1144191265268054, "grad_norm": 10.741289191887997, "learning_rate": 4.949139609058581e-05, "loss": 2.6364, "mean_token_accuracy": 0.4228675127029419, "step": 113600 }, { "epoch": 0.11442416257990957, "grad_norm": 17.391155622455074, "learning_rate": 4.949131686570622e-05, "loss": 2.4523, "mean_token_accuracy": 0.41379310488700866, "step": 113605 }, { "epoch": 0.11442919863301375, "grad_norm": 15.482805296608134, "learning_rate": 4.949123763472725e-05, "loss": 2.3816, "mean_token_accuracy": 0.4620689690113068, "step": 113610 }, { "epoch": 0.11443423468611792, "grad_norm": 8.62625679001273, "learning_rate": 4.949115839764894e-05, "loss": 2.4148, "mean_token_accuracy": 0.4862069010734558, "step": 113615 }, { "epoch": 0.1144392707392221, "grad_norm": 11.702000404910105, "learning_rate": 4.9491079154471285e-05, "loss": 2.5361, "mean_token_accuracy": 0.43793103098869324, "step": 113620 }, { "epoch": 0.11444430679232627, "grad_norm": 10.346494056259406, "learning_rate": 4.9490999905194324e-05, "loss": 2.3197, "mean_token_accuracy": 0.4344827592372894, "step": 113625 }, { "epoch": 0.11444934284543043, "grad_norm": 12.908847049773321, "learning_rate": 4.949092064981808e-05, "loss": 2.4551, "mean_token_accuracy": 0.42413793206214906, "step": 113630 }, { "epoch": 0.1144543788985346, "grad_norm": 11.779814119931922, "learning_rate": 4.949084138834258e-05, "loss": 2.3351, "mean_token_accuracy": 0.4137930989265442, "step": 113635 }, { "epoch": 0.11445941495163878, "grad_norm": 11.125115793043847, "learning_rate": 4.949076212076782e-05, "loss": 2.3582, "mean_token_accuracy": 0.4310344815254211, "step": 113640 }, { "epoch": 0.11446445100474295, "grad_norm": 10.960252828186526, "learning_rate": 4.9490682847093853e-05, "loss": 2.1299, "mean_token_accuracy": 0.482758629322052, "step": 113645 }, { "epoch": 0.11446948705784712, "grad_norm": 12.008041702282949, "learning_rate": 4.949060356732069e-05, "loss": 2.1779, "mean_token_accuracy": 0.4482758641242981, "step": 113650 }, { "epoch": 0.1144745231109513, "grad_norm": 10.583236393470349, "learning_rate": 4.949052428144835e-05, "loss": 2.4378, "mean_token_accuracy": 0.4154869973659515, "step": 113655 }, { "epoch": 0.11447955916405547, "grad_norm": 9.57775670723433, "learning_rate": 4.9490444989476855e-05, "loss": 1.8755, "mean_token_accuracy": 0.536721122264862, "step": 113660 }, { "epoch": 0.11448459521715965, "grad_norm": 9.15819841022125, "learning_rate": 4.949036569140623e-05, "loss": 2.5678, "mean_token_accuracy": 0.39310343861579894, "step": 113665 }, { "epoch": 0.11448963127026382, "grad_norm": 12.42287228126611, "learning_rate": 4.949028638723649e-05, "loss": 2.3551, "mean_token_accuracy": 0.4517241358757019, "step": 113670 }, { "epoch": 0.114494667323368, "grad_norm": 14.653872043611278, "learning_rate": 4.9490207076967656e-05, "loss": 2.6872, "mean_token_accuracy": 0.4310344815254211, "step": 113675 }, { "epoch": 0.11449970337647217, "grad_norm": 11.294341596083138, "learning_rate": 4.949012776059977e-05, "loss": 2.3009, "mean_token_accuracy": 0.4068965494632721, "step": 113680 }, { "epoch": 0.11450473942957634, "grad_norm": 11.918013344220562, "learning_rate": 4.9490048438132834e-05, "loss": 2.1516, "mean_token_accuracy": 0.458620685338974, "step": 113685 }, { "epoch": 0.11450977548268051, "grad_norm": 9.69246667312578, "learning_rate": 4.9489969109566874e-05, "loss": 2.2894, "mean_token_accuracy": 0.4776164650917053, "step": 113690 }, { "epoch": 0.11451481153578469, "grad_norm": 11.139233038301702, "learning_rate": 4.9489889774901924e-05, "loss": 2.4423, "mean_token_accuracy": 0.43103447556495667, "step": 113695 }, { "epoch": 0.11451984758888885, "grad_norm": 9.183912868197861, "learning_rate": 4.948981043413798e-05, "loss": 2.5323, "mean_token_accuracy": 0.39655172228813174, "step": 113700 }, { "epoch": 0.11452488364199302, "grad_norm": 12.614316695091405, "learning_rate": 4.948973108727509e-05, "loss": 2.6711, "mean_token_accuracy": 0.4034482777118683, "step": 113705 }, { "epoch": 0.1145299196950972, "grad_norm": 9.632480915156544, "learning_rate": 4.9489651734313266e-05, "loss": 2.2178, "mean_token_accuracy": 0.4172413766384125, "step": 113710 }, { "epoch": 0.11453495574820137, "grad_norm": 8.296316241171496, "learning_rate": 4.948957237525253e-05, "loss": 2.4047, "mean_token_accuracy": 0.4551724076271057, "step": 113715 }, { "epoch": 0.11453999180130554, "grad_norm": 12.759721194862824, "learning_rate": 4.94894930100929e-05, "loss": 2.7462, "mean_token_accuracy": 0.38620689511299133, "step": 113720 }, { "epoch": 0.11454502785440972, "grad_norm": 10.036537827581931, "learning_rate": 4.948941363883441e-05, "loss": 2.8933, "mean_token_accuracy": 0.4034482777118683, "step": 113725 }, { "epoch": 0.11455006390751389, "grad_norm": 12.447410641465243, "learning_rate": 4.9489334261477065e-05, "loss": 2.8365, "mean_token_accuracy": 0.3655172407627106, "step": 113730 }, { "epoch": 0.11455509996061806, "grad_norm": 11.953243997680993, "learning_rate": 4.9489254878020904e-05, "loss": 2.0314, "mean_token_accuracy": 0.49655171632766726, "step": 113735 }, { "epoch": 0.11456013601372224, "grad_norm": 11.78610799620108, "learning_rate": 4.9489175488465936e-05, "loss": 2.7733, "mean_token_accuracy": 0.35862069129943847, "step": 113740 }, { "epoch": 0.11456517206682641, "grad_norm": 9.406421298271498, "learning_rate": 4.948909609281218e-05, "loss": 1.997, "mean_token_accuracy": 0.5103448331356049, "step": 113745 }, { "epoch": 0.11457020811993059, "grad_norm": 10.095220806974822, "learning_rate": 4.9489016691059674e-05, "loss": 2.5905, "mean_token_accuracy": 0.42758620381355283, "step": 113750 }, { "epoch": 0.11457524417303476, "grad_norm": 13.388223965868127, "learning_rate": 4.948893728320844e-05, "loss": 2.6073, "mean_token_accuracy": 0.4275862157344818, "step": 113755 }, { "epoch": 0.11458028022613893, "grad_norm": 9.471261397869029, "learning_rate": 4.9488857869258484e-05, "loss": 2.3186, "mean_token_accuracy": 0.4310344815254211, "step": 113760 }, { "epoch": 0.11458531627924311, "grad_norm": 12.643587075811629, "learning_rate": 4.9488778449209835e-05, "loss": 2.1844, "mean_token_accuracy": 0.4811857283115387, "step": 113765 }, { "epoch": 0.11459035233234727, "grad_norm": 11.020482074224862, "learning_rate": 4.9488699023062515e-05, "loss": 2.1993, "mean_token_accuracy": 0.45759225487709043, "step": 113770 }, { "epoch": 0.11459538838545144, "grad_norm": 9.02576152182453, "learning_rate": 4.948861959081656e-05, "loss": 2.4543, "mean_token_accuracy": 0.40689654350280763, "step": 113775 }, { "epoch": 0.11460042443855561, "grad_norm": 11.790217320470852, "learning_rate": 4.948854015247197e-05, "loss": 2.5383, "mean_token_accuracy": 0.42068964838981626, "step": 113780 }, { "epoch": 0.11460546049165979, "grad_norm": 10.833820992894111, "learning_rate": 4.948846070802878e-05, "loss": 2.4236, "mean_token_accuracy": 0.3931034475564957, "step": 113785 }, { "epoch": 0.11461049654476396, "grad_norm": 9.15191295838091, "learning_rate": 4.9488381257487e-05, "loss": 2.168, "mean_token_accuracy": 0.4551724076271057, "step": 113790 }, { "epoch": 0.11461553259786814, "grad_norm": 12.64919693295713, "learning_rate": 4.948830180084667e-05, "loss": 2.4248, "mean_token_accuracy": 0.42413793206214906, "step": 113795 }, { "epoch": 0.11462056865097231, "grad_norm": 12.098311546097689, "learning_rate": 4.9488222338107795e-05, "loss": 2.2487, "mean_token_accuracy": 0.4517241299152374, "step": 113800 }, { "epoch": 0.11462560470407648, "grad_norm": 11.590061303108335, "learning_rate": 4.948814286927041e-05, "loss": 2.3527, "mean_token_accuracy": 0.3862069010734558, "step": 113805 }, { "epoch": 0.11463064075718066, "grad_norm": 11.089201212737738, "learning_rate": 4.9488063394334525e-05, "loss": 1.9798, "mean_token_accuracy": 0.4931034505367279, "step": 113810 }, { "epoch": 0.11463567681028483, "grad_norm": 9.799307407646957, "learning_rate": 4.9487983913300175e-05, "loss": 2.0101, "mean_token_accuracy": 0.5344827592372894, "step": 113815 }, { "epoch": 0.114640712863389, "grad_norm": 11.72433125646831, "learning_rate": 4.948790442616738e-05, "loss": 2.6755, "mean_token_accuracy": 0.37931033968925476, "step": 113820 }, { "epoch": 0.11464574891649318, "grad_norm": 11.495957315083603, "learning_rate": 4.9487824932936145e-05, "loss": 2.5161, "mean_token_accuracy": 0.4448275864124298, "step": 113825 }, { "epoch": 0.11465078496959735, "grad_norm": 9.615827817054202, "learning_rate": 4.9487745433606507e-05, "loss": 2.5414, "mean_token_accuracy": 0.441379314661026, "step": 113830 }, { "epoch": 0.11465582102270153, "grad_norm": 11.397016871850619, "learning_rate": 4.9487665928178495e-05, "loss": 2.5587, "mean_token_accuracy": 0.4137930989265442, "step": 113835 }, { "epoch": 0.11466085707580569, "grad_norm": 10.278638141718346, "learning_rate": 4.948758641665212e-05, "loss": 2.2742, "mean_token_accuracy": 0.42413793206214906, "step": 113840 }, { "epoch": 0.11466589312890986, "grad_norm": 11.024709624385842, "learning_rate": 4.94875068990274e-05, "loss": 2.4534, "mean_token_accuracy": 0.4068965494632721, "step": 113845 }, { "epoch": 0.11467092918201403, "grad_norm": 10.649946148412724, "learning_rate": 4.9487427375304365e-05, "loss": 2.5059, "mean_token_accuracy": 0.4137930989265442, "step": 113850 }, { "epoch": 0.11467596523511821, "grad_norm": 9.277995238077608, "learning_rate": 4.948734784548303e-05, "loss": 2.5528, "mean_token_accuracy": 0.4517241358757019, "step": 113855 }, { "epoch": 0.11468100128822238, "grad_norm": 10.86562718460318, "learning_rate": 4.9487268309563426e-05, "loss": 2.3333, "mean_token_accuracy": 0.46551724076271056, "step": 113860 }, { "epoch": 0.11468603734132655, "grad_norm": 11.57718861134086, "learning_rate": 4.9487188767545576e-05, "loss": 2.4861, "mean_token_accuracy": 0.3862068891525269, "step": 113865 }, { "epoch": 0.11469107339443073, "grad_norm": 10.33927655828899, "learning_rate": 4.94871092194295e-05, "loss": 2.542, "mean_token_accuracy": 0.38965516686439516, "step": 113870 }, { "epoch": 0.1146961094475349, "grad_norm": 12.595536968842474, "learning_rate": 4.9487029665215206e-05, "loss": 2.646, "mean_token_accuracy": 0.3827586233615875, "step": 113875 }, { "epoch": 0.11470114550063908, "grad_norm": 10.606889630928459, "learning_rate": 4.948695010490273e-05, "loss": 2.5376, "mean_token_accuracy": 0.4068965554237366, "step": 113880 }, { "epoch": 0.11470618155374325, "grad_norm": 11.796607424015194, "learning_rate": 4.9486870538492096e-05, "loss": 2.4371, "mean_token_accuracy": 0.43448275327682495, "step": 113885 }, { "epoch": 0.11471121760684742, "grad_norm": 14.262373274703593, "learning_rate": 4.9486790965983324e-05, "loss": 2.4766, "mean_token_accuracy": 0.4931034505367279, "step": 113890 }, { "epoch": 0.1147162536599516, "grad_norm": 10.473058102726982, "learning_rate": 4.948671138737643e-05, "loss": 2.4986, "mean_token_accuracy": 0.4310344815254211, "step": 113895 }, { "epoch": 0.11472128971305577, "grad_norm": 14.285388653870314, "learning_rate": 4.948663180267143e-05, "loss": 2.3645, "mean_token_accuracy": 0.4620689690113068, "step": 113900 }, { "epoch": 0.11472632576615995, "grad_norm": 11.24305873388817, "learning_rate": 4.948655221186836e-05, "loss": 2.7835, "mean_token_accuracy": 0.3517241358757019, "step": 113905 }, { "epoch": 0.1147313618192641, "grad_norm": 11.77452984959016, "learning_rate": 4.948647261496725e-05, "loss": 2.4833, "mean_token_accuracy": 0.426013308763504, "step": 113910 }, { "epoch": 0.11473639787236828, "grad_norm": 9.78259016733333, "learning_rate": 4.94863930119681e-05, "loss": 2.3666, "mean_token_accuracy": 0.42413793206214906, "step": 113915 }, { "epoch": 0.11474143392547245, "grad_norm": 11.90363737609743, "learning_rate": 4.948631340287095e-05, "loss": 2.5529, "mean_token_accuracy": 0.41724138259887694, "step": 113920 }, { "epoch": 0.11474646997857663, "grad_norm": 10.411634503372966, "learning_rate": 4.94862337876758e-05, "loss": 2.7654, "mean_token_accuracy": 0.40907440185546873, "step": 113925 }, { "epoch": 0.1147515060316808, "grad_norm": 12.423891223336954, "learning_rate": 4.948615416638269e-05, "loss": 2.2315, "mean_token_accuracy": 0.4620689690113068, "step": 113930 }, { "epoch": 0.11475654208478497, "grad_norm": 12.018425686425598, "learning_rate": 4.9486074538991645e-05, "loss": 2.0233, "mean_token_accuracy": 0.4482758641242981, "step": 113935 }, { "epoch": 0.11476157813788915, "grad_norm": 10.912654916593338, "learning_rate": 4.948599490550268e-05, "loss": 2.4244, "mean_token_accuracy": 0.4172413766384125, "step": 113940 }, { "epoch": 0.11476661419099332, "grad_norm": 12.06295380848458, "learning_rate": 4.948591526591581e-05, "loss": 2.647, "mean_token_accuracy": 0.39655172228813174, "step": 113945 }, { "epoch": 0.1147716502440975, "grad_norm": 9.18634312006316, "learning_rate": 4.948583562023106e-05, "loss": 2.3188, "mean_token_accuracy": 0.4551724076271057, "step": 113950 }, { "epoch": 0.11477668629720167, "grad_norm": 13.814035827151605, "learning_rate": 4.9485755968448476e-05, "loss": 2.5358, "mean_token_accuracy": 0.4310344815254211, "step": 113955 }, { "epoch": 0.11478172235030584, "grad_norm": 10.84685704303101, "learning_rate": 4.9485676310568046e-05, "loss": 2.4725, "mean_token_accuracy": 0.4241379380226135, "step": 113960 }, { "epoch": 0.11478675840341002, "grad_norm": 10.54386286725506, "learning_rate": 4.948559664658981e-05, "loss": 2.0323, "mean_token_accuracy": 0.4448275864124298, "step": 113965 }, { "epoch": 0.11479179445651419, "grad_norm": 11.51650669111011, "learning_rate": 4.948551697651379e-05, "loss": 2.398, "mean_token_accuracy": 0.4137930989265442, "step": 113970 }, { "epoch": 0.11479683050961836, "grad_norm": 11.172986342949743, "learning_rate": 4.948543730034e-05, "loss": 2.3643, "mean_token_accuracy": 0.44137930274009707, "step": 113975 }, { "epoch": 0.11480186656272252, "grad_norm": 10.282606205618496, "learning_rate": 4.948535761806847e-05, "loss": 2.5042, "mean_token_accuracy": 0.39655172228813174, "step": 113980 }, { "epoch": 0.1148069026158267, "grad_norm": 8.888747477147502, "learning_rate": 4.948527792969921e-05, "loss": 2.6922, "mean_token_accuracy": 0.4206896543502808, "step": 113985 }, { "epoch": 0.11481193866893087, "grad_norm": 8.525829562270047, "learning_rate": 4.9485198235232264e-05, "loss": 2.3016, "mean_token_accuracy": 0.3827586233615875, "step": 113990 }, { "epoch": 0.11481697472203505, "grad_norm": 13.810846568031577, "learning_rate": 4.9485118534667634e-05, "loss": 2.6228, "mean_token_accuracy": 0.33103448450565337, "step": 113995 }, { "epoch": 0.11482201077513922, "grad_norm": 10.375553307280166, "learning_rate": 4.948503882800536e-05, "loss": 2.3311, "mean_token_accuracy": 0.4137930989265442, "step": 114000 }, { "epoch": 0.11482704682824339, "grad_norm": 9.001013142581225, "learning_rate": 4.948495911524544e-05, "loss": 2.1663, "mean_token_accuracy": 0.4551724135875702, "step": 114005 }, { "epoch": 0.11483208288134757, "grad_norm": 14.235429279709575, "learning_rate": 4.948487939638791e-05, "loss": 3.1092, "mean_token_accuracy": 0.34137930274009703, "step": 114010 }, { "epoch": 0.11483711893445174, "grad_norm": 11.371773245033062, "learning_rate": 4.94847996714328e-05, "loss": 2.1699, "mean_token_accuracy": 0.47586206793785096, "step": 114015 }, { "epoch": 0.11484215498755591, "grad_norm": 9.155351957488874, "learning_rate": 4.948471994038012e-05, "loss": 2.2043, "mean_token_accuracy": 0.4448275864124298, "step": 114020 }, { "epoch": 0.11484719104066009, "grad_norm": 10.16519489865712, "learning_rate": 4.9484640203229896e-05, "loss": 2.236, "mean_token_accuracy": 0.43103448748588563, "step": 114025 }, { "epoch": 0.11485222709376426, "grad_norm": 10.50597979725388, "learning_rate": 4.948456045998215e-05, "loss": 2.3956, "mean_token_accuracy": 0.4034482717514038, "step": 114030 }, { "epoch": 0.11485726314686844, "grad_norm": 12.404880570502268, "learning_rate": 4.948448071063691e-05, "loss": 2.4476, "mean_token_accuracy": 0.3931034505367279, "step": 114035 }, { "epoch": 0.11486229919997261, "grad_norm": 13.37793461164307, "learning_rate": 4.948440095519418e-05, "loss": 2.5674, "mean_token_accuracy": 0.39104658365249634, "step": 114040 }, { "epoch": 0.11486733525307678, "grad_norm": 11.847185817857563, "learning_rate": 4.9484321193654e-05, "loss": 2.363, "mean_token_accuracy": 0.441379314661026, "step": 114045 }, { "epoch": 0.11487237130618094, "grad_norm": 12.545202002162336, "learning_rate": 4.948424142601639e-05, "loss": 2.6762, "mean_token_accuracy": 0.39310344457626345, "step": 114050 }, { "epoch": 0.11487740735928512, "grad_norm": 10.991761816530065, "learning_rate": 4.948416165228136e-05, "loss": 2.5085, "mean_token_accuracy": 0.38275861740112305, "step": 114055 }, { "epoch": 0.11488244341238929, "grad_norm": 10.769079708819692, "learning_rate": 4.948408187244895e-05, "loss": 2.7193, "mean_token_accuracy": 0.41379311084747317, "step": 114060 }, { "epoch": 0.11488747946549346, "grad_norm": 9.905533680041689, "learning_rate": 4.9484002086519165e-05, "loss": 2.5577, "mean_token_accuracy": 0.42413793206214906, "step": 114065 }, { "epoch": 0.11489251551859764, "grad_norm": 9.861688952243156, "learning_rate": 4.948392229449204e-05, "loss": 2.045, "mean_token_accuracy": 0.46896551847457885, "step": 114070 }, { "epoch": 0.11489755157170181, "grad_norm": 15.9746567750674, "learning_rate": 4.948384249636759e-05, "loss": 2.6828, "mean_token_accuracy": 0.4000000059604645, "step": 114075 }, { "epoch": 0.11490258762480599, "grad_norm": 12.009074934802989, "learning_rate": 4.948376269214584e-05, "loss": 2.6382, "mean_token_accuracy": 0.42068964838981626, "step": 114080 }, { "epoch": 0.11490762367791016, "grad_norm": 13.64660522941905, "learning_rate": 4.9483682881826824e-05, "loss": 2.0331, "mean_token_accuracy": 0.4896551728248596, "step": 114085 }, { "epoch": 0.11491265973101433, "grad_norm": 9.280593708131645, "learning_rate": 4.948360306541053e-05, "loss": 2.3967, "mean_token_accuracy": 0.42413792610168455, "step": 114090 }, { "epoch": 0.1149176957841185, "grad_norm": 8.396777275562203, "learning_rate": 4.948352324289701e-05, "loss": 2.6186, "mean_token_accuracy": 0.417241370677948, "step": 114095 }, { "epoch": 0.11492273183722268, "grad_norm": 12.026471867673358, "learning_rate": 4.948344341428628e-05, "loss": 2.5018, "mean_token_accuracy": 0.4068965554237366, "step": 114100 }, { "epoch": 0.11492776789032685, "grad_norm": 10.564590504863084, "learning_rate": 4.9483363579578364e-05, "loss": 2.3974, "mean_token_accuracy": 0.4738656938076019, "step": 114105 }, { "epoch": 0.11493280394343103, "grad_norm": 12.253368627038808, "learning_rate": 4.948328373877327e-05, "loss": 2.4085, "mean_token_accuracy": 0.42546883821487425, "step": 114110 }, { "epoch": 0.1149378399965352, "grad_norm": 10.777913057139038, "learning_rate": 4.948320389187104e-05, "loss": 2.5384, "mean_token_accuracy": 0.41034482717514037, "step": 114115 }, { "epoch": 0.11494287604963936, "grad_norm": 11.006712039320778, "learning_rate": 4.948312403887168e-05, "loss": 2.3672, "mean_token_accuracy": 0.44827585816383364, "step": 114120 }, { "epoch": 0.11494791210274354, "grad_norm": 10.826713771697541, "learning_rate": 4.948304417977522e-05, "loss": 2.4728, "mean_token_accuracy": 0.42758620977401735, "step": 114125 }, { "epoch": 0.11495294815584771, "grad_norm": 11.741902115103413, "learning_rate": 4.948296431458168e-05, "loss": 2.2491, "mean_token_accuracy": 0.4758620738983154, "step": 114130 }, { "epoch": 0.11495798420895188, "grad_norm": 9.423203753511673, "learning_rate": 4.9482884443291084e-05, "loss": 2.4385, "mean_token_accuracy": 0.4499697506427765, "step": 114135 }, { "epoch": 0.11496302026205606, "grad_norm": 10.127323478544906, "learning_rate": 4.9482804565903455e-05, "loss": 2.6661, "mean_token_accuracy": 0.4068965554237366, "step": 114140 }, { "epoch": 0.11496805631516023, "grad_norm": 10.717278265783976, "learning_rate": 4.948272468241881e-05, "loss": 2.5826, "mean_token_accuracy": 0.4, "step": 114145 }, { "epoch": 0.1149730923682644, "grad_norm": 11.829556268726973, "learning_rate": 4.948264479283718e-05, "loss": 2.4729, "mean_token_accuracy": 0.4413793087005615, "step": 114150 }, { "epoch": 0.11497812842136858, "grad_norm": 10.015714812563699, "learning_rate": 4.9482564897158576e-05, "loss": 2.2554, "mean_token_accuracy": 0.4482758641242981, "step": 114155 }, { "epoch": 0.11498316447447275, "grad_norm": 11.024550062713494, "learning_rate": 4.9482484995383025e-05, "loss": 2.4351, "mean_token_accuracy": 0.39655172228813174, "step": 114160 }, { "epoch": 0.11498820052757693, "grad_norm": 12.413669346970101, "learning_rate": 4.948240508751055e-05, "loss": 2.6537, "mean_token_accuracy": 0.45862069725990295, "step": 114165 }, { "epoch": 0.1149932365806811, "grad_norm": 11.993218426553433, "learning_rate": 4.9482325173541175e-05, "loss": 2.5589, "mean_token_accuracy": 0.4310344815254211, "step": 114170 }, { "epoch": 0.11499827263378527, "grad_norm": 11.715414005312931, "learning_rate": 4.9482245253474916e-05, "loss": 2.7837, "mean_token_accuracy": 0.36896551847457887, "step": 114175 }, { "epoch": 0.11500330868688945, "grad_norm": 10.1591967873332, "learning_rate": 4.948216532731181e-05, "loss": 2.4536, "mean_token_accuracy": 0.4310344815254211, "step": 114180 }, { "epoch": 0.11500834473999362, "grad_norm": 9.471177610560758, "learning_rate": 4.948208539505186e-05, "loss": 2.2309, "mean_token_accuracy": 0.4540834903717041, "step": 114185 }, { "epoch": 0.11501338079309778, "grad_norm": 9.983318819801264, "learning_rate": 4.948200545669509e-05, "loss": 2.1922, "mean_token_accuracy": 0.42758620977401735, "step": 114190 }, { "epoch": 0.11501841684620195, "grad_norm": 17.922162131705186, "learning_rate": 4.948192551224154e-05, "loss": 2.33, "mean_token_accuracy": 0.5127041757106781, "step": 114195 }, { "epoch": 0.11502345289930613, "grad_norm": 11.167165349571508, "learning_rate": 4.948184556169122e-05, "loss": 2.4666, "mean_token_accuracy": 0.4241379380226135, "step": 114200 }, { "epoch": 0.1150284889524103, "grad_norm": 9.807124593451773, "learning_rate": 4.948176560504415e-05, "loss": 2.3143, "mean_token_accuracy": 0.4103448212146759, "step": 114205 }, { "epoch": 0.11503352500551448, "grad_norm": 9.639815815686905, "learning_rate": 4.948168564230036e-05, "loss": 2.6257, "mean_token_accuracy": 0.36896551251411436, "step": 114210 }, { "epoch": 0.11503856105861865, "grad_norm": 12.720133604106408, "learning_rate": 4.9481605673459855e-05, "loss": 2.1358, "mean_token_accuracy": 0.4344827592372894, "step": 114215 }, { "epoch": 0.11504359711172282, "grad_norm": 10.955302307421924, "learning_rate": 4.9481525698522687e-05, "loss": 2.4368, "mean_token_accuracy": 0.41724138259887694, "step": 114220 }, { "epoch": 0.115048633164827, "grad_norm": 13.360167559560479, "learning_rate": 4.9481445717488846e-05, "loss": 2.6689, "mean_token_accuracy": 0.37586206793785093, "step": 114225 }, { "epoch": 0.11505366921793117, "grad_norm": 12.308827627719717, "learning_rate": 4.9481365730358375e-05, "loss": 2.5291, "mean_token_accuracy": 0.41379310488700866, "step": 114230 }, { "epoch": 0.11505870527103534, "grad_norm": 10.971639254032365, "learning_rate": 4.9481285737131285e-05, "loss": 2.2121, "mean_token_accuracy": 0.4620689690113068, "step": 114235 }, { "epoch": 0.11506374132413952, "grad_norm": 12.218376112623474, "learning_rate": 4.948120573780761e-05, "loss": 2.3167, "mean_token_accuracy": 0.453901994228363, "step": 114240 }, { "epoch": 0.11506877737724369, "grad_norm": 9.461375627072245, "learning_rate": 4.948112573238737e-05, "loss": 2.7059, "mean_token_accuracy": 0.3793103516101837, "step": 114245 }, { "epoch": 0.11507381343034787, "grad_norm": 10.086943376776528, "learning_rate": 4.9481045720870574e-05, "loss": 2.4821, "mean_token_accuracy": 0.41724138557910917, "step": 114250 }, { "epoch": 0.11507884948345204, "grad_norm": 11.246074287412615, "learning_rate": 4.948096570325726e-05, "loss": 2.4058, "mean_token_accuracy": 0.42068964838981626, "step": 114255 }, { "epoch": 0.1150838855365562, "grad_norm": 14.01658130832733, "learning_rate": 4.948088567954744e-05, "loss": 2.5915, "mean_token_accuracy": 0.3999999940395355, "step": 114260 }, { "epoch": 0.11508892158966037, "grad_norm": 10.665053961858103, "learning_rate": 4.9480805649741135e-05, "loss": 2.3465, "mean_token_accuracy": 0.4655172348022461, "step": 114265 }, { "epoch": 0.11509395764276455, "grad_norm": 11.915422266993902, "learning_rate": 4.948072561383838e-05, "loss": 2.1998, "mean_token_accuracy": 0.4620689690113068, "step": 114270 }, { "epoch": 0.11509899369586872, "grad_norm": 11.951763763137516, "learning_rate": 4.948064557183918e-05, "loss": 2.592, "mean_token_accuracy": 0.36551723480224607, "step": 114275 }, { "epoch": 0.1151040297489729, "grad_norm": 11.06375173601289, "learning_rate": 4.9480565523743564e-05, "loss": 2.2002, "mean_token_accuracy": 0.458620685338974, "step": 114280 }, { "epoch": 0.11510906580207707, "grad_norm": 10.857945632590365, "learning_rate": 4.948048546955156e-05, "loss": 2.5112, "mean_token_accuracy": 0.4482758641242981, "step": 114285 }, { "epoch": 0.11511410185518124, "grad_norm": 8.79550657736542, "learning_rate": 4.9480405409263195e-05, "loss": 2.5516, "mean_token_accuracy": 0.42758620381355283, "step": 114290 }, { "epoch": 0.11511913790828542, "grad_norm": 7.3495390667440414, "learning_rate": 4.948032534287847e-05, "loss": 2.3559, "mean_token_accuracy": 0.41584996581077577, "step": 114295 }, { "epoch": 0.11512417396138959, "grad_norm": 11.67739099793903, "learning_rate": 4.9480245270397424e-05, "loss": 2.4519, "mean_token_accuracy": 0.39310344457626345, "step": 114300 }, { "epoch": 0.11512921001449376, "grad_norm": 9.329102045504136, "learning_rate": 4.948016519182008e-05, "loss": 2.3697, "mean_token_accuracy": 0.4517241418361664, "step": 114305 }, { "epoch": 0.11513424606759794, "grad_norm": 10.41458916255205, "learning_rate": 4.948008510714645e-05, "loss": 2.4249, "mean_token_accuracy": 0.41034482717514037, "step": 114310 }, { "epoch": 0.11513928212070211, "grad_norm": 8.793076330776032, "learning_rate": 4.948000501637657e-05, "loss": 1.9952, "mean_token_accuracy": 0.4862068951129913, "step": 114315 }, { "epoch": 0.11514431817380628, "grad_norm": 15.360007649938716, "learning_rate": 4.947992491951044e-05, "loss": 2.6239, "mean_token_accuracy": 0.41034482717514037, "step": 114320 }, { "epoch": 0.11514935422691046, "grad_norm": 11.677199411865576, "learning_rate": 4.94798448165481e-05, "loss": 2.0911, "mean_token_accuracy": 0.4903940916061401, "step": 114325 }, { "epoch": 0.11515439028001462, "grad_norm": 12.196811723782464, "learning_rate": 4.947976470748958e-05, "loss": 2.2896, "mean_token_accuracy": 0.3896551728248596, "step": 114330 }, { "epoch": 0.11515942633311879, "grad_norm": 11.117467543743958, "learning_rate": 4.947968459233488e-05, "loss": 2.0601, "mean_token_accuracy": 0.4551724135875702, "step": 114335 }, { "epoch": 0.11516446238622297, "grad_norm": 10.42918928673411, "learning_rate": 4.947960447108403e-05, "loss": 2.3085, "mean_token_accuracy": 0.4034482717514038, "step": 114340 }, { "epoch": 0.11516949843932714, "grad_norm": 11.630209305619932, "learning_rate": 4.9479524343737054e-05, "loss": 3.0507, "mean_token_accuracy": 0.34482758343219755, "step": 114345 }, { "epoch": 0.11517453449243131, "grad_norm": 15.21471754844737, "learning_rate": 4.947944421029398e-05, "loss": 2.7611, "mean_token_accuracy": 0.43103447556495667, "step": 114350 }, { "epoch": 0.11517957054553549, "grad_norm": 10.59730367485689, "learning_rate": 4.9479364070754825e-05, "loss": 2.1756, "mean_token_accuracy": 0.43448275327682495, "step": 114355 }, { "epoch": 0.11518460659863966, "grad_norm": 10.275831841831076, "learning_rate": 4.947928392511961e-05, "loss": 2.4117, "mean_token_accuracy": 0.41379310488700866, "step": 114360 }, { "epoch": 0.11518964265174383, "grad_norm": 11.78705069720452, "learning_rate": 4.947920377338836e-05, "loss": 2.0692, "mean_token_accuracy": 0.5, "step": 114365 }, { "epoch": 0.11519467870484801, "grad_norm": 11.146134650809783, "learning_rate": 4.94791236155611e-05, "loss": 2.4884, "mean_token_accuracy": 0.4206896543502808, "step": 114370 }, { "epoch": 0.11519971475795218, "grad_norm": 12.941719514604799, "learning_rate": 4.9479043451637835e-05, "loss": 2.4966, "mean_token_accuracy": 0.4379310369491577, "step": 114375 }, { "epoch": 0.11520475081105636, "grad_norm": 13.595352655746929, "learning_rate": 4.947896328161861e-05, "loss": 2.0218, "mean_token_accuracy": 0.5024198353290558, "step": 114380 }, { "epoch": 0.11520978686416053, "grad_norm": 11.1919901749803, "learning_rate": 4.9478883105503434e-05, "loss": 2.5195, "mean_token_accuracy": 0.4172413766384125, "step": 114385 }, { "epoch": 0.1152148229172647, "grad_norm": 11.49490483273503, "learning_rate": 4.947880292329234e-05, "loss": 2.4846, "mean_token_accuracy": 0.4172413766384125, "step": 114390 }, { "epoch": 0.11521985897036888, "grad_norm": 11.641513033964513, "learning_rate": 4.947872273498533e-05, "loss": 2.1974, "mean_token_accuracy": 0.4517241358757019, "step": 114395 }, { "epoch": 0.11522489502347304, "grad_norm": 15.442837953448356, "learning_rate": 4.947864254058245e-05, "loss": 2.5486, "mean_token_accuracy": 0.4482758641242981, "step": 114400 }, { "epoch": 0.11522993107657721, "grad_norm": 11.156768581566247, "learning_rate": 4.947856234008371e-05, "loss": 2.2582, "mean_token_accuracy": 0.460496062040329, "step": 114405 }, { "epoch": 0.11523496712968138, "grad_norm": 8.461948292711309, "learning_rate": 4.9478482133489135e-05, "loss": 2.1503, "mean_token_accuracy": 0.4448275864124298, "step": 114410 }, { "epoch": 0.11524000318278556, "grad_norm": 11.672629027264625, "learning_rate": 4.947840192079874e-05, "loss": 2.1338, "mean_token_accuracy": 0.5019358813762664, "step": 114415 }, { "epoch": 0.11524503923588973, "grad_norm": 8.908164182296904, "learning_rate": 4.947832170201255e-05, "loss": 2.2706, "mean_token_accuracy": 0.47586206197738645, "step": 114420 }, { "epoch": 0.1152500752889939, "grad_norm": 10.042469449380288, "learning_rate": 4.94782414771306e-05, "loss": 2.0327, "mean_token_accuracy": 0.4896551609039307, "step": 114425 }, { "epoch": 0.11525511134209808, "grad_norm": 11.020448341863673, "learning_rate": 4.94781612461529e-05, "loss": 2.8366, "mean_token_accuracy": 0.32413792312145234, "step": 114430 }, { "epoch": 0.11526014739520225, "grad_norm": 10.912518933696612, "learning_rate": 4.947808100907947e-05, "loss": 2.5729, "mean_token_accuracy": 0.3965517282485962, "step": 114435 }, { "epoch": 0.11526518344830643, "grad_norm": 10.143547787815832, "learning_rate": 4.947800076591034e-05, "loss": 2.2699, "mean_token_accuracy": 0.4379310369491577, "step": 114440 }, { "epoch": 0.1152702195014106, "grad_norm": 9.002677320809923, "learning_rate": 4.947792051664554e-05, "loss": 1.9524, "mean_token_accuracy": 0.4862068951129913, "step": 114445 }, { "epoch": 0.11527525555451477, "grad_norm": 11.54999764020228, "learning_rate": 4.947784026128507e-05, "loss": 2.4735, "mean_token_accuracy": 0.46896551847457885, "step": 114450 }, { "epoch": 0.11528029160761895, "grad_norm": 12.515416508491509, "learning_rate": 4.947775999982897e-05, "loss": 2.4628, "mean_token_accuracy": 0.39655172526836396, "step": 114455 }, { "epoch": 0.11528532766072312, "grad_norm": 10.968169503793305, "learning_rate": 4.9477679732277254e-05, "loss": 3.013, "mean_token_accuracy": 0.37241379022598264, "step": 114460 }, { "epoch": 0.1152903637138273, "grad_norm": 10.349380549278491, "learning_rate": 4.947759945862994e-05, "loss": 2.1009, "mean_token_accuracy": 0.49534180760383606, "step": 114465 }, { "epoch": 0.11529539976693146, "grad_norm": 10.95108320875787, "learning_rate": 4.9477519178887064e-05, "loss": 2.398, "mean_token_accuracy": 0.4517241418361664, "step": 114470 }, { "epoch": 0.11530043582003563, "grad_norm": 9.61673932105015, "learning_rate": 4.947743889304864e-05, "loss": 2.3768, "mean_token_accuracy": 0.3965517282485962, "step": 114475 }, { "epoch": 0.1153054718731398, "grad_norm": 14.447229623405963, "learning_rate": 4.94773586011147e-05, "loss": 2.6607, "mean_token_accuracy": 0.4310344815254211, "step": 114480 }, { "epoch": 0.11531050792624398, "grad_norm": 10.146372300552864, "learning_rate": 4.947727830308524e-05, "loss": 2.4017, "mean_token_accuracy": 0.4413793087005615, "step": 114485 }, { "epoch": 0.11531554397934815, "grad_norm": 8.930959553256985, "learning_rate": 4.9477197998960314e-05, "loss": 2.3884, "mean_token_accuracy": 0.3950998157262802, "step": 114490 }, { "epoch": 0.11532058003245232, "grad_norm": 16.218202745912052, "learning_rate": 4.947711768873992e-05, "loss": 2.6574, "mean_token_accuracy": 0.4517241299152374, "step": 114495 }, { "epoch": 0.1153256160855565, "grad_norm": 9.89118439570471, "learning_rate": 4.94770373724241e-05, "loss": 2.4402, "mean_token_accuracy": 0.36206896901130675, "step": 114500 }, { "epoch": 0.11533065213866067, "grad_norm": 9.691230429043946, "learning_rate": 4.9476957050012854e-05, "loss": 2.4374, "mean_token_accuracy": 0.44343618154525755, "step": 114505 }, { "epoch": 0.11533568819176485, "grad_norm": 11.510495907441681, "learning_rate": 4.947687672150623e-05, "loss": 2.1139, "mean_token_accuracy": 0.4379310250282288, "step": 114510 }, { "epoch": 0.11534072424486902, "grad_norm": 8.964848206860545, "learning_rate": 4.947679638690423e-05, "loss": 2.053, "mean_token_accuracy": 0.4601935803890228, "step": 114515 }, { "epoch": 0.1153457602979732, "grad_norm": 10.682343061870995, "learning_rate": 4.9476716046206885e-05, "loss": 2.4019, "mean_token_accuracy": 0.4379310369491577, "step": 114520 }, { "epoch": 0.11535079635107737, "grad_norm": 9.618198043858225, "learning_rate": 4.9476635699414216e-05, "loss": 2.3808, "mean_token_accuracy": 0.44137930274009707, "step": 114525 }, { "epoch": 0.11535583240418154, "grad_norm": 8.806456485991411, "learning_rate": 4.947655534652625e-05, "loss": 1.991, "mean_token_accuracy": 0.5156684815883636, "step": 114530 }, { "epoch": 0.11536086845728571, "grad_norm": 9.785695649715745, "learning_rate": 4.947647498754299e-05, "loss": 2.2655, "mean_token_accuracy": 0.4517241358757019, "step": 114535 }, { "epoch": 0.11536590451038987, "grad_norm": 10.563434986961429, "learning_rate": 4.947639462246449e-05, "loss": 2.7464, "mean_token_accuracy": 0.38620689511299133, "step": 114540 }, { "epoch": 0.11537094056349405, "grad_norm": 10.008861512931677, "learning_rate": 4.9476314251290746e-05, "loss": 2.6529, "mean_token_accuracy": 0.38753780722618103, "step": 114545 }, { "epoch": 0.11537597661659822, "grad_norm": 11.674184383647587, "learning_rate": 4.9476233874021785e-05, "loss": 2.7439, "mean_token_accuracy": 0.3620689630508423, "step": 114550 }, { "epoch": 0.1153810126697024, "grad_norm": 11.682666554977915, "learning_rate": 4.947615349065764e-05, "loss": 2.4392, "mean_token_accuracy": 0.3862068891525269, "step": 114555 }, { "epoch": 0.11538604872280657, "grad_norm": 14.130804620692176, "learning_rate": 4.947607310119833e-05, "loss": 2.4434, "mean_token_accuracy": 0.40344826579093934, "step": 114560 }, { "epoch": 0.11539108477591074, "grad_norm": 11.545471822807887, "learning_rate": 4.9475992705643867e-05, "loss": 2.308, "mean_token_accuracy": 0.493103438615799, "step": 114565 }, { "epoch": 0.11539612082901492, "grad_norm": 9.137790354232546, "learning_rate": 4.947591230399429e-05, "loss": 2.154, "mean_token_accuracy": 0.4517241418361664, "step": 114570 }, { "epoch": 0.11540115688211909, "grad_norm": 10.369850857638635, "learning_rate": 4.947583189624961e-05, "loss": 2.3209, "mean_token_accuracy": 0.4344827592372894, "step": 114575 }, { "epoch": 0.11540619293522326, "grad_norm": 11.400631360061109, "learning_rate": 4.947575148240984e-05, "loss": 2.4818, "mean_token_accuracy": 0.4206896543502808, "step": 114580 }, { "epoch": 0.11541122898832744, "grad_norm": 10.777133922965536, "learning_rate": 4.947567106247502e-05, "loss": 2.2518, "mean_token_accuracy": 0.4379310429096222, "step": 114585 }, { "epoch": 0.11541626504143161, "grad_norm": 9.033371463432728, "learning_rate": 4.9475590636445165e-05, "loss": 2.5923, "mean_token_accuracy": 0.43793103098869324, "step": 114590 }, { "epoch": 0.11542130109453579, "grad_norm": 10.609364242310876, "learning_rate": 4.94755102043203e-05, "loss": 2.4796, "mean_token_accuracy": 0.3931034505367279, "step": 114595 }, { "epoch": 0.11542633714763996, "grad_norm": 10.80417511154219, "learning_rate": 4.9475429766100437e-05, "loss": 2.8225, "mean_token_accuracy": 0.3758620619773865, "step": 114600 }, { "epoch": 0.11543137320074413, "grad_norm": 10.556849137695398, "learning_rate": 4.947534932178562e-05, "loss": 2.3881, "mean_token_accuracy": 0.4517241418361664, "step": 114605 }, { "epoch": 0.1154364092538483, "grad_norm": 8.778695121711372, "learning_rate": 4.947526887137585e-05, "loss": 2.3347, "mean_token_accuracy": 0.4517241299152374, "step": 114610 }, { "epoch": 0.11544144530695247, "grad_norm": 14.78203657973234, "learning_rate": 4.947518841487116e-05, "loss": 2.2739, "mean_token_accuracy": 0.47931034564971925, "step": 114615 }, { "epoch": 0.11544648136005664, "grad_norm": 11.10012417945893, "learning_rate": 4.947510795227156e-05, "loss": 2.1409, "mean_token_accuracy": 0.47241379618644713, "step": 114620 }, { "epoch": 0.11545151741316081, "grad_norm": 8.722344282943816, "learning_rate": 4.94750274835771e-05, "loss": 2.0505, "mean_token_accuracy": 0.4732758581638336, "step": 114625 }, { "epoch": 0.11545655346626499, "grad_norm": 8.160706046546673, "learning_rate": 4.947494700878777e-05, "loss": 2.1153, "mean_token_accuracy": 0.482758617401123, "step": 114630 }, { "epoch": 0.11546158951936916, "grad_norm": 11.082409385852314, "learning_rate": 4.9474866527903604e-05, "loss": 2.5476, "mean_token_accuracy": 0.4482758641242981, "step": 114635 }, { "epoch": 0.11546662557247334, "grad_norm": 9.689819310465234, "learning_rate": 4.9474786040924645e-05, "loss": 2.1592, "mean_token_accuracy": 0.42758620977401735, "step": 114640 }, { "epoch": 0.11547166162557751, "grad_norm": 10.02147786714863, "learning_rate": 4.947470554785088e-05, "loss": 2.2084, "mean_token_accuracy": 0.4413793087005615, "step": 114645 }, { "epoch": 0.11547669767868168, "grad_norm": 10.179254784402412, "learning_rate": 4.9474625048682355e-05, "loss": 2.367, "mean_token_accuracy": 0.441379314661026, "step": 114650 }, { "epoch": 0.11548173373178586, "grad_norm": 10.90891773673151, "learning_rate": 4.9474544543419086e-05, "loss": 2.9072, "mean_token_accuracy": 0.3837870478630066, "step": 114655 }, { "epoch": 0.11548676978489003, "grad_norm": 12.081832697686217, "learning_rate": 4.947446403206109e-05, "loss": 2.9445, "mean_token_accuracy": 0.36896551847457887, "step": 114660 }, { "epoch": 0.1154918058379942, "grad_norm": 9.856248224185135, "learning_rate": 4.94743835146084e-05, "loss": 2.1448, "mean_token_accuracy": 0.4862068831920624, "step": 114665 }, { "epoch": 0.11549684189109838, "grad_norm": 10.526721312771814, "learning_rate": 4.947430299106103e-05, "loss": 2.8332, "mean_token_accuracy": 0.3793103456497192, "step": 114670 }, { "epoch": 0.11550187794420255, "grad_norm": 10.40351915739031, "learning_rate": 4.947422246141901e-05, "loss": 2.6716, "mean_token_accuracy": 0.4034482717514038, "step": 114675 }, { "epoch": 0.11550691399730671, "grad_norm": 9.859906992833814, "learning_rate": 4.947414192568235e-05, "loss": 1.9472, "mean_token_accuracy": 0.482758617401123, "step": 114680 }, { "epoch": 0.11551195005041089, "grad_norm": 10.731568345069613, "learning_rate": 4.947406138385108e-05, "loss": 2.2379, "mean_token_accuracy": 0.42256503105163573, "step": 114685 }, { "epoch": 0.11551698610351506, "grad_norm": 12.068249906238519, "learning_rate": 4.947398083592523e-05, "loss": 2.3637, "mean_token_accuracy": 0.4448275864124298, "step": 114690 }, { "epoch": 0.11552202215661923, "grad_norm": 10.369265759046108, "learning_rate": 4.9473900281904804e-05, "loss": 2.26, "mean_token_accuracy": 0.4724137902259827, "step": 114695 }, { "epoch": 0.11552705820972341, "grad_norm": 12.092107678789406, "learning_rate": 4.947381972178984e-05, "loss": 2.3578, "mean_token_accuracy": 0.4137930989265442, "step": 114700 }, { "epoch": 0.11553209426282758, "grad_norm": 9.981810315288632, "learning_rate": 4.947373915558036e-05, "loss": 2.9392, "mean_token_accuracy": 0.4034482717514038, "step": 114705 }, { "epoch": 0.11553713031593175, "grad_norm": 8.897771367701681, "learning_rate": 4.947365858327638e-05, "loss": 2.156, "mean_token_accuracy": 0.4758620738983154, "step": 114710 }, { "epoch": 0.11554216636903593, "grad_norm": 14.925776088147948, "learning_rate": 4.947357800487792e-05, "loss": 2.5908, "mean_token_accuracy": 0.4379310250282288, "step": 114715 }, { "epoch": 0.1155472024221401, "grad_norm": 10.433349035700555, "learning_rate": 4.9473497420385e-05, "loss": 2.3255, "mean_token_accuracy": 0.420689657330513, "step": 114720 }, { "epoch": 0.11555223847524428, "grad_norm": 12.290843267835486, "learning_rate": 4.947341682979765e-05, "loss": 2.4987, "mean_token_accuracy": 0.41379310488700866, "step": 114725 }, { "epoch": 0.11555727452834845, "grad_norm": 9.367805269568239, "learning_rate": 4.9473336233115906e-05, "loss": 2.675, "mean_token_accuracy": 0.417241370677948, "step": 114730 }, { "epoch": 0.11556231058145262, "grad_norm": 9.307414056999395, "learning_rate": 4.9473255630339764e-05, "loss": 2.8124, "mean_token_accuracy": 0.4103448331356049, "step": 114735 }, { "epoch": 0.1155673466345568, "grad_norm": 9.033892750017497, "learning_rate": 4.947317502146926e-05, "loss": 2.0454, "mean_token_accuracy": 0.48965516686439514, "step": 114740 }, { "epoch": 0.11557238268766097, "grad_norm": 11.683163522492425, "learning_rate": 4.947309440650441e-05, "loss": 2.4118, "mean_token_accuracy": 0.41724138259887694, "step": 114745 }, { "epoch": 0.11557741874076513, "grad_norm": 8.234765699547703, "learning_rate": 4.947301378544525e-05, "loss": 2.007, "mean_token_accuracy": 0.5436781585216522, "step": 114750 }, { "epoch": 0.1155824547938693, "grad_norm": 11.987082736154791, "learning_rate": 4.947293315829178e-05, "loss": 2.2801, "mean_token_accuracy": 0.4598911166191101, "step": 114755 }, { "epoch": 0.11558749084697348, "grad_norm": 9.134477323977766, "learning_rate": 4.947285252504404e-05, "loss": 2.5214, "mean_token_accuracy": 0.3931034505367279, "step": 114760 }, { "epoch": 0.11559252690007765, "grad_norm": 10.96523923994237, "learning_rate": 4.9472771885702054e-05, "loss": 2.2035, "mean_token_accuracy": 0.38275861740112305, "step": 114765 }, { "epoch": 0.11559756295318183, "grad_norm": 10.794217381560557, "learning_rate": 4.947269124026583e-05, "loss": 2.427, "mean_token_accuracy": 0.43448275327682495, "step": 114770 }, { "epoch": 0.115602599006286, "grad_norm": 11.50113186908765, "learning_rate": 4.94726105887354e-05, "loss": 2.6883, "mean_token_accuracy": 0.4083484590053558, "step": 114775 }, { "epoch": 0.11560763505939017, "grad_norm": 10.102937652965693, "learning_rate": 4.947252993111078e-05, "loss": 2.6221, "mean_token_accuracy": 0.39655172228813174, "step": 114780 }, { "epoch": 0.11561267111249435, "grad_norm": 13.240633773539358, "learning_rate": 4.947244926739201e-05, "loss": 2.3388, "mean_token_accuracy": 0.4620689690113068, "step": 114785 }, { "epoch": 0.11561770716559852, "grad_norm": 12.086622987868527, "learning_rate": 4.947236859757909e-05, "loss": 2.3593, "mean_token_accuracy": 0.42068966031074523, "step": 114790 }, { "epoch": 0.1156227432187027, "grad_norm": 34.74912174700879, "learning_rate": 4.947228792167205e-05, "loss": 2.958, "mean_token_accuracy": 0.42413792908191683, "step": 114795 }, { "epoch": 0.11562777927180687, "grad_norm": 8.655295521346384, "learning_rate": 4.9472207239670916e-05, "loss": 2.3412, "mean_token_accuracy": 0.4361161470413208, "step": 114800 }, { "epoch": 0.11563281532491104, "grad_norm": 10.327715624395896, "learning_rate": 4.947212655157571e-05, "loss": 2.3336, "mean_token_accuracy": 0.42758620977401735, "step": 114805 }, { "epoch": 0.11563785137801522, "grad_norm": 10.001276555685791, "learning_rate": 4.947204585738645e-05, "loss": 1.8739, "mean_token_accuracy": 0.5159709513187408, "step": 114810 }, { "epoch": 0.11564288743111939, "grad_norm": 10.655800717031573, "learning_rate": 4.947196515710316e-05, "loss": 2.656, "mean_token_accuracy": 0.3931034505367279, "step": 114815 }, { "epoch": 0.11564792348422355, "grad_norm": 9.787162743551132, "learning_rate": 4.947188445072587e-05, "loss": 2.7723, "mean_token_accuracy": 0.4206896424293518, "step": 114820 }, { "epoch": 0.11565295953732772, "grad_norm": 15.857705156140165, "learning_rate": 4.9471803738254595e-05, "loss": 2.3892, "mean_token_accuracy": 0.42413793206214906, "step": 114825 }, { "epoch": 0.1156579955904319, "grad_norm": 9.104147080791181, "learning_rate": 4.9471723019689354e-05, "loss": 2.5101, "mean_token_accuracy": 0.3793103456497192, "step": 114830 }, { "epoch": 0.11566303164353607, "grad_norm": 10.824231104091838, "learning_rate": 4.947164229503017e-05, "loss": 2.2565, "mean_token_accuracy": 0.4413793087005615, "step": 114835 }, { "epoch": 0.11566806769664024, "grad_norm": 10.108348420436315, "learning_rate": 4.9471561564277075e-05, "loss": 2.4726, "mean_token_accuracy": 0.4068965494632721, "step": 114840 }, { "epoch": 0.11567310374974442, "grad_norm": 10.209690995148106, "learning_rate": 4.9471480827430085e-05, "loss": 2.5131, "mean_token_accuracy": 0.42413792610168455, "step": 114845 }, { "epoch": 0.11567813980284859, "grad_norm": 12.259702109827607, "learning_rate": 4.947140008448922e-05, "loss": 2.4142, "mean_token_accuracy": 0.3965517282485962, "step": 114850 }, { "epoch": 0.11568317585595277, "grad_norm": 9.608136061964927, "learning_rate": 4.947131933545451e-05, "loss": 2.5046, "mean_token_accuracy": 0.4172413766384125, "step": 114855 }, { "epoch": 0.11568821190905694, "grad_norm": 10.668309501766641, "learning_rate": 4.947123858032597e-05, "loss": 2.2662, "mean_token_accuracy": 0.41724137365818026, "step": 114860 }, { "epoch": 0.11569324796216111, "grad_norm": 10.617106029159755, "learning_rate": 4.947115781910363e-05, "loss": 2.2898, "mean_token_accuracy": 0.4586206912994385, "step": 114865 }, { "epoch": 0.11569828401526529, "grad_norm": 9.878663582166649, "learning_rate": 4.94710770517875e-05, "loss": 2.6638, "mean_token_accuracy": 0.4344827592372894, "step": 114870 }, { "epoch": 0.11570332006836946, "grad_norm": 10.096062908600633, "learning_rate": 4.947099627837761e-05, "loss": 2.4537, "mean_token_accuracy": 0.4, "step": 114875 }, { "epoch": 0.11570835612147364, "grad_norm": 10.109360426562704, "learning_rate": 4.947091549887399e-05, "loss": 2.4242, "mean_token_accuracy": 0.41034482717514037, "step": 114880 }, { "epoch": 0.11571339217457781, "grad_norm": 11.870642270027597, "learning_rate": 4.947083471327665e-05, "loss": 3.1018, "mean_token_accuracy": 0.3517241358757019, "step": 114885 }, { "epoch": 0.11571842822768197, "grad_norm": 13.896298910578459, "learning_rate": 4.947075392158561e-05, "loss": 2.8808, "mean_token_accuracy": 0.39655172526836396, "step": 114890 }, { "epoch": 0.11572346428078614, "grad_norm": 15.534732701661298, "learning_rate": 4.9470673123800906e-05, "loss": 3.1769, "mean_token_accuracy": 0.3517241358757019, "step": 114895 }, { "epoch": 0.11572850033389032, "grad_norm": 12.422139315432718, "learning_rate": 4.947059231992256e-05, "loss": 2.4334, "mean_token_accuracy": 0.4275861978530884, "step": 114900 }, { "epoch": 0.11573353638699449, "grad_norm": 11.289629133522578, "learning_rate": 4.947051150995057e-05, "loss": 2.3586, "mean_token_accuracy": 0.458620685338974, "step": 114905 }, { "epoch": 0.11573857244009866, "grad_norm": 12.187650052563535, "learning_rate": 4.9470430693884995e-05, "loss": 2.3475, "mean_token_accuracy": 0.42413793206214906, "step": 114910 }, { "epoch": 0.11574360849320284, "grad_norm": 12.530995904598273, "learning_rate": 4.947034987172583e-05, "loss": 2.3189, "mean_token_accuracy": 0.39310343861579894, "step": 114915 }, { "epoch": 0.11574864454630701, "grad_norm": 8.385574642579826, "learning_rate": 4.947026904347311e-05, "loss": 2.4921, "mean_token_accuracy": 0.4344827622175217, "step": 114920 }, { "epoch": 0.11575368059941119, "grad_norm": 10.98471409710384, "learning_rate": 4.947018820912685e-05, "loss": 2.5151, "mean_token_accuracy": 0.37586206793785093, "step": 114925 }, { "epoch": 0.11575871665251536, "grad_norm": 12.494020152037292, "learning_rate": 4.947010736868709e-05, "loss": 2.6162, "mean_token_accuracy": 0.37241379022598264, "step": 114930 }, { "epoch": 0.11576375270561953, "grad_norm": 9.994918790582947, "learning_rate": 4.947002652215382e-05, "loss": 2.1625, "mean_token_accuracy": 0.44137930274009707, "step": 114935 }, { "epoch": 0.1157687887587237, "grad_norm": 9.349052996898859, "learning_rate": 4.9469945669527085e-05, "loss": 2.1796, "mean_token_accuracy": 0.4689655125141144, "step": 114940 }, { "epoch": 0.11577382481182788, "grad_norm": 11.128489157331199, "learning_rate": 4.946986481080691e-05, "loss": 2.501, "mean_token_accuracy": 0.42921960949897764, "step": 114945 }, { "epoch": 0.11577886086493205, "grad_norm": 11.22896045822213, "learning_rate": 4.9469783945993316e-05, "loss": 2.6193, "mean_token_accuracy": 0.43103447556495667, "step": 114950 }, { "epoch": 0.11578389691803623, "grad_norm": 34.76976321431227, "learning_rate": 4.946970307508631e-05, "loss": 3.1037, "mean_token_accuracy": 0.37241379022598264, "step": 114955 }, { "epoch": 0.11578893297114039, "grad_norm": 9.948230086104374, "learning_rate": 4.9469622198085924e-05, "loss": 2.5616, "mean_token_accuracy": 0.3999999940395355, "step": 114960 }, { "epoch": 0.11579396902424456, "grad_norm": 10.366545295723576, "learning_rate": 4.946954131499219e-05, "loss": 2.0939, "mean_token_accuracy": 0.45246305465698244, "step": 114965 }, { "epoch": 0.11579900507734874, "grad_norm": 9.506954420751157, "learning_rate": 4.946946042580511e-05, "loss": 2.3933, "mean_token_accuracy": 0.4620689630508423, "step": 114970 }, { "epoch": 0.11580404113045291, "grad_norm": 10.374595171185781, "learning_rate": 4.946937953052473e-05, "loss": 2.1646, "mean_token_accuracy": 0.4655172348022461, "step": 114975 }, { "epoch": 0.11580907718355708, "grad_norm": 11.622030695526126, "learning_rate": 4.9469298629151054e-05, "loss": 2.1895, "mean_token_accuracy": 0.4016333997249603, "step": 114980 }, { "epoch": 0.11581411323666126, "grad_norm": 9.645352718793392, "learning_rate": 4.9469217721684116e-05, "loss": 2.1136, "mean_token_accuracy": 0.47931033968925474, "step": 114985 }, { "epoch": 0.11581914928976543, "grad_norm": 13.168528483976742, "learning_rate": 4.946913680812393e-05, "loss": 2.5168, "mean_token_accuracy": 0.4068965494632721, "step": 114990 }, { "epoch": 0.1158241853428696, "grad_norm": 11.922130695032525, "learning_rate": 4.9469055888470525e-05, "loss": 2.644, "mean_token_accuracy": 0.3620689630508423, "step": 114995 }, { "epoch": 0.11582922139597378, "grad_norm": 12.839271749319044, "learning_rate": 4.946897496272392e-05, "loss": 2.5436, "mean_token_accuracy": 0.39655172228813174, "step": 115000 }, { "epoch": 0.11583425744907795, "grad_norm": 10.1785852050693, "learning_rate": 4.9468894030884136e-05, "loss": 2.6784, "mean_token_accuracy": 0.38275861740112305, "step": 115005 }, { "epoch": 0.11583929350218213, "grad_norm": 10.980216295560634, "learning_rate": 4.946881309295119e-05, "loss": 2.757, "mean_token_accuracy": 0.3896551728248596, "step": 115010 }, { "epoch": 0.1158443295552863, "grad_norm": 10.226675146828429, "learning_rate": 4.9468732148925127e-05, "loss": 2.1552, "mean_token_accuracy": 0.4620689630508423, "step": 115015 }, { "epoch": 0.11584936560839047, "grad_norm": 14.026351250308208, "learning_rate": 4.946865119880594e-05, "loss": 2.5151, "mean_token_accuracy": 0.38620689511299133, "step": 115020 }, { "epoch": 0.11585440166149465, "grad_norm": 12.45389663486251, "learning_rate": 4.946857024259368e-05, "loss": 2.733, "mean_token_accuracy": 0.4206896543502808, "step": 115025 }, { "epoch": 0.1158594377145988, "grad_norm": 9.442534667269339, "learning_rate": 4.946848928028834e-05, "loss": 2.5857, "mean_token_accuracy": 0.3827586233615875, "step": 115030 }, { "epoch": 0.11586447376770298, "grad_norm": 9.083497313303834, "learning_rate": 4.946840831188996e-05, "loss": 2.5459, "mean_token_accuracy": 0.4034482777118683, "step": 115035 }, { "epoch": 0.11586950982080715, "grad_norm": 9.124452232725604, "learning_rate": 4.9468327337398575e-05, "loss": 2.3336, "mean_token_accuracy": 0.4620689630508423, "step": 115040 }, { "epoch": 0.11587454587391133, "grad_norm": 12.295081768215281, "learning_rate": 4.9468246356814175e-05, "loss": 2.2935, "mean_token_accuracy": 0.4379310369491577, "step": 115045 }, { "epoch": 0.1158795819270155, "grad_norm": 8.93200602151638, "learning_rate": 4.946816537013681e-05, "loss": 2.1948, "mean_token_accuracy": 0.41379310488700866, "step": 115050 }, { "epoch": 0.11588461798011968, "grad_norm": 10.026883581061169, "learning_rate": 4.9468084377366484e-05, "loss": 2.2993, "mean_token_accuracy": 0.4551724135875702, "step": 115055 }, { "epoch": 0.11588965403322385, "grad_norm": 12.689090336341405, "learning_rate": 4.9468003378503235e-05, "loss": 2.9292, "mean_token_accuracy": 0.36896551251411436, "step": 115060 }, { "epoch": 0.11589469008632802, "grad_norm": 9.541070340947421, "learning_rate": 4.9467922373547074e-05, "loss": 2.0635, "mean_token_accuracy": 0.44482759237289426, "step": 115065 }, { "epoch": 0.1158997261394322, "grad_norm": 9.35622604691093, "learning_rate": 4.9467841362498036e-05, "loss": 2.4309, "mean_token_accuracy": 0.4379310369491577, "step": 115070 }, { "epoch": 0.11590476219253637, "grad_norm": 10.691137842513843, "learning_rate": 4.946776034535612e-05, "loss": 2.4491, "mean_token_accuracy": 0.42413793206214906, "step": 115075 }, { "epoch": 0.11590979824564054, "grad_norm": 9.829862440545714, "learning_rate": 4.946767932212139e-05, "loss": 2.5544, "mean_token_accuracy": 0.4, "step": 115080 }, { "epoch": 0.11591483429874472, "grad_norm": 9.145867329156193, "learning_rate": 4.946759829279381e-05, "loss": 2.4485, "mean_token_accuracy": 0.4034482777118683, "step": 115085 }, { "epoch": 0.11591987035184889, "grad_norm": 7.6897273959218975, "learning_rate": 4.9467517257373455e-05, "loss": 2.3298, "mean_token_accuracy": 0.4551724076271057, "step": 115090 }, { "epoch": 0.11592490640495307, "grad_norm": 10.741907085339285, "learning_rate": 4.946743621586032e-05, "loss": 2.6791, "mean_token_accuracy": 0.39183302521705626, "step": 115095 }, { "epoch": 0.11592994245805723, "grad_norm": 9.84487328448593, "learning_rate": 4.946735516825444e-05, "loss": 2.4177, "mean_token_accuracy": 0.4379310369491577, "step": 115100 }, { "epoch": 0.1159349785111614, "grad_norm": 10.878252234993955, "learning_rate": 4.946727411455583e-05, "loss": 2.2905, "mean_token_accuracy": 0.4379310250282288, "step": 115105 }, { "epoch": 0.11594001456426557, "grad_norm": 11.409014987691457, "learning_rate": 4.946719305476451e-05, "loss": 2.3298, "mean_token_accuracy": 0.4793103337287903, "step": 115110 }, { "epoch": 0.11594505061736975, "grad_norm": 11.28540886150633, "learning_rate": 4.946711198888051e-05, "loss": 2.4118, "mean_token_accuracy": 0.3862068891525269, "step": 115115 }, { "epoch": 0.11595008667047392, "grad_norm": 11.082297894509734, "learning_rate": 4.946703091690385e-05, "loss": 2.9201, "mean_token_accuracy": 0.42583181858062746, "step": 115120 }, { "epoch": 0.1159551227235781, "grad_norm": 9.514869502614246, "learning_rate": 4.9466949838834556e-05, "loss": 3.0083, "mean_token_accuracy": 0.4, "step": 115125 }, { "epoch": 0.11596015877668227, "grad_norm": 10.541619586285215, "learning_rate": 4.946686875467264e-05, "loss": 2.2433, "mean_token_accuracy": 0.4655172348022461, "step": 115130 }, { "epoch": 0.11596519482978644, "grad_norm": 10.34254713848287, "learning_rate": 4.946678766441813e-05, "loss": 2.4655, "mean_token_accuracy": 0.38275861740112305, "step": 115135 }, { "epoch": 0.11597023088289062, "grad_norm": 12.565393072279965, "learning_rate": 4.946670656807105e-05, "loss": 2.5526, "mean_token_accuracy": 0.43103447556495667, "step": 115140 }, { "epoch": 0.11597526693599479, "grad_norm": 9.367707549708788, "learning_rate": 4.9466625465631426e-05, "loss": 2.6294, "mean_token_accuracy": 0.35862069129943847, "step": 115145 }, { "epoch": 0.11598030298909896, "grad_norm": 9.053484530661889, "learning_rate": 4.946654435709928e-05, "loss": 2.0614, "mean_token_accuracy": 0.4724137902259827, "step": 115150 }, { "epoch": 0.11598533904220314, "grad_norm": 10.290453044422614, "learning_rate": 4.946646324247462e-05, "loss": 2.4449, "mean_token_accuracy": 0.4379310369491577, "step": 115155 }, { "epoch": 0.11599037509530731, "grad_norm": 10.530823031091657, "learning_rate": 4.946638212175749e-05, "loss": 2.2961, "mean_token_accuracy": 0.4482758641242981, "step": 115160 }, { "epoch": 0.11599541114841148, "grad_norm": 13.568082956372091, "learning_rate": 4.9466300994947896e-05, "loss": 2.9254, "mean_token_accuracy": 0.3551724135875702, "step": 115165 }, { "epoch": 0.11600044720151564, "grad_norm": 12.82807777098334, "learning_rate": 4.946621986204587e-05, "loss": 2.3822, "mean_token_accuracy": 0.441379314661026, "step": 115170 }, { "epoch": 0.11600548325461982, "grad_norm": 14.738920314987807, "learning_rate": 4.946613872305143e-05, "loss": 2.6168, "mean_token_accuracy": 0.38965516686439516, "step": 115175 }, { "epoch": 0.11601051930772399, "grad_norm": 8.541220965909346, "learning_rate": 4.9466057577964596e-05, "loss": 2.1948, "mean_token_accuracy": 0.4931034505367279, "step": 115180 }, { "epoch": 0.11601555536082817, "grad_norm": 9.648440456121552, "learning_rate": 4.9465976426785395e-05, "loss": 2.5241, "mean_token_accuracy": 0.4034482717514038, "step": 115185 }, { "epoch": 0.11602059141393234, "grad_norm": 10.92054789620148, "learning_rate": 4.946589526951386e-05, "loss": 2.0495, "mean_token_accuracy": 0.4965517342090607, "step": 115190 }, { "epoch": 0.11602562746703651, "grad_norm": 11.155178262817554, "learning_rate": 4.946581410614999e-05, "loss": 2.609, "mean_token_accuracy": 0.38275861740112305, "step": 115195 }, { "epoch": 0.11603066352014069, "grad_norm": 23.46945464633818, "learning_rate": 4.946573293669382e-05, "loss": 2.6794, "mean_token_accuracy": 0.39310343861579894, "step": 115200 }, { "epoch": 0.11603569957324486, "grad_norm": 9.472567503498016, "learning_rate": 4.946565176114537e-05, "loss": 2.4833, "mean_token_accuracy": 0.4344827592372894, "step": 115205 }, { "epoch": 0.11604073562634903, "grad_norm": 9.744768122377197, "learning_rate": 4.9465570579504666e-05, "loss": 2.4249, "mean_token_accuracy": 0.413793095946312, "step": 115210 }, { "epoch": 0.11604577167945321, "grad_norm": 10.369140622678696, "learning_rate": 4.946548939177174e-05, "loss": 2.7524, "mean_token_accuracy": 0.3862069010734558, "step": 115215 }, { "epoch": 0.11605080773255738, "grad_norm": 11.05828993811328, "learning_rate": 4.9465408197946596e-05, "loss": 2.447, "mean_token_accuracy": 0.43272837400436404, "step": 115220 }, { "epoch": 0.11605584378566156, "grad_norm": 19.4444165493529, "learning_rate": 4.946532699802926e-05, "loss": 2.5705, "mean_token_accuracy": 0.4221415638923645, "step": 115225 }, { "epoch": 0.11606087983876573, "grad_norm": 10.784234333579297, "learning_rate": 4.946524579201976e-05, "loss": 2.2905, "mean_token_accuracy": 0.4517241299152374, "step": 115230 }, { "epoch": 0.1160659158918699, "grad_norm": 12.178963322160445, "learning_rate": 4.9465164579918116e-05, "loss": 2.7118, "mean_token_accuracy": 0.4172413766384125, "step": 115235 }, { "epoch": 0.11607095194497406, "grad_norm": 10.549543156394241, "learning_rate": 4.946508336172436e-05, "loss": 1.9286, "mean_token_accuracy": 0.4915305495262146, "step": 115240 }, { "epoch": 0.11607598799807824, "grad_norm": 10.319450282173989, "learning_rate": 4.9465002137438496e-05, "loss": 2.1127, "mean_token_accuracy": 0.4172413766384125, "step": 115245 }, { "epoch": 0.11608102405118241, "grad_norm": 10.528320450768115, "learning_rate": 4.946492090706057e-05, "loss": 2.3161, "mean_token_accuracy": 0.48275862336158754, "step": 115250 }, { "epoch": 0.11608606010428658, "grad_norm": 14.61231232918794, "learning_rate": 4.946483967059057e-05, "loss": 2.5036, "mean_token_accuracy": 0.45172413289546964, "step": 115255 }, { "epoch": 0.11609109615739076, "grad_norm": 9.441541989977045, "learning_rate": 4.946475842802856e-05, "loss": 2.1945, "mean_token_accuracy": 0.4689655125141144, "step": 115260 }, { "epoch": 0.11609613221049493, "grad_norm": 10.398852625841755, "learning_rate": 4.946467717937454e-05, "loss": 2.2905, "mean_token_accuracy": 0.4793103516101837, "step": 115265 }, { "epoch": 0.1161011682635991, "grad_norm": 11.306175116284312, "learning_rate": 4.946459592462852e-05, "loss": 2.4169, "mean_token_accuracy": 0.37931033968925476, "step": 115270 }, { "epoch": 0.11610620431670328, "grad_norm": 12.044385339473058, "learning_rate": 4.946451466379055e-05, "loss": 2.4445, "mean_token_accuracy": 0.4517241358757019, "step": 115275 }, { "epoch": 0.11611124036980745, "grad_norm": 9.809964191586662, "learning_rate": 4.9464433396860635e-05, "loss": 2.5434, "mean_token_accuracy": 0.40689656138420105, "step": 115280 }, { "epoch": 0.11611627642291163, "grad_norm": 11.045390611267464, "learning_rate": 4.94643521238388e-05, "loss": 2.2195, "mean_token_accuracy": 0.44827587008476255, "step": 115285 }, { "epoch": 0.1161213124760158, "grad_norm": 11.545014940079167, "learning_rate": 4.946427084472508e-05, "loss": 2.7855, "mean_token_accuracy": 0.38620689511299133, "step": 115290 }, { "epoch": 0.11612634852911997, "grad_norm": 10.423737383038201, "learning_rate": 4.946418955951949e-05, "loss": 2.3702, "mean_token_accuracy": 0.441379314661026, "step": 115295 }, { "epoch": 0.11613138458222415, "grad_norm": 11.287713678219394, "learning_rate": 4.946410826822204e-05, "loss": 2.6133, "mean_token_accuracy": 0.4344827651977539, "step": 115300 }, { "epoch": 0.11613642063532832, "grad_norm": 11.538449547221573, "learning_rate": 4.946402697083276e-05, "loss": 2.3911, "mean_token_accuracy": 0.4241379380226135, "step": 115305 }, { "epoch": 0.11614145668843248, "grad_norm": 11.15145853769805, "learning_rate": 4.946394566735168e-05, "loss": 2.5075, "mean_token_accuracy": 0.4206896543502808, "step": 115310 }, { "epoch": 0.11614649274153666, "grad_norm": 12.004308088461892, "learning_rate": 4.9463864357778817e-05, "loss": 2.1845, "mean_token_accuracy": 0.47931033968925474, "step": 115315 }, { "epoch": 0.11615152879464083, "grad_norm": 10.836842826315328, "learning_rate": 4.946378304211419e-05, "loss": 2.3953, "mean_token_accuracy": 0.44827585816383364, "step": 115320 }, { "epoch": 0.116156564847745, "grad_norm": 12.454193753984839, "learning_rate": 4.946370172035783e-05, "loss": 2.0635, "mean_token_accuracy": 0.47931033968925474, "step": 115325 }, { "epoch": 0.11616160090084918, "grad_norm": 9.981426029421588, "learning_rate": 4.946362039250976e-05, "loss": 2.1319, "mean_token_accuracy": 0.48275862336158754, "step": 115330 }, { "epoch": 0.11616663695395335, "grad_norm": 9.968584080082733, "learning_rate": 4.946353905856999e-05, "loss": 2.3514, "mean_token_accuracy": 0.38965516686439516, "step": 115335 }, { "epoch": 0.11617167300705752, "grad_norm": 12.891631154553258, "learning_rate": 4.9463457718538565e-05, "loss": 2.3522, "mean_token_accuracy": 0.4241379380226135, "step": 115340 }, { "epoch": 0.1161767090601617, "grad_norm": 12.53300376786482, "learning_rate": 4.9463376372415476e-05, "loss": 2.9471, "mean_token_accuracy": 0.33103448152542114, "step": 115345 }, { "epoch": 0.11618174511326587, "grad_norm": 7.86987531860855, "learning_rate": 4.946329502020077e-05, "loss": 2.647, "mean_token_accuracy": 0.4379310369491577, "step": 115350 }, { "epoch": 0.11618678116637005, "grad_norm": 10.889868247753515, "learning_rate": 4.9463213661894455e-05, "loss": 2.3054, "mean_token_accuracy": 0.4551724135875702, "step": 115355 }, { "epoch": 0.11619181721947422, "grad_norm": 9.311672817113127, "learning_rate": 4.946313229749657e-05, "loss": 2.4792, "mean_token_accuracy": 0.4241379380226135, "step": 115360 }, { "epoch": 0.1161968532725784, "grad_norm": 8.51992237007899, "learning_rate": 4.946305092700713e-05, "loss": 2.2338, "mean_token_accuracy": 0.44670296311378477, "step": 115365 }, { "epoch": 0.11620188932568257, "grad_norm": 11.49364806349511, "learning_rate": 4.9462969550426145e-05, "loss": 2.681, "mean_token_accuracy": 0.42758620977401735, "step": 115370 }, { "epoch": 0.11620692537878674, "grad_norm": 9.994100106065499, "learning_rate": 4.946288816775366e-05, "loss": 2.2989, "mean_token_accuracy": 0.42413792610168455, "step": 115375 }, { "epoch": 0.1162119614318909, "grad_norm": 10.836512539711663, "learning_rate": 4.946280677898968e-05, "loss": 2.7114, "mean_token_accuracy": 0.4000000059604645, "step": 115380 }, { "epoch": 0.11621699748499507, "grad_norm": 9.85786582514832, "learning_rate": 4.946272538413424e-05, "loss": 2.5756, "mean_token_accuracy": 0.4413793087005615, "step": 115385 }, { "epoch": 0.11622203353809925, "grad_norm": 9.700186948577992, "learning_rate": 4.946264398318735e-05, "loss": 2.5969, "mean_token_accuracy": 0.4, "step": 115390 }, { "epoch": 0.11622706959120342, "grad_norm": 12.330438731043174, "learning_rate": 4.946256257614904e-05, "loss": 2.2164, "mean_token_accuracy": 0.4000000059604645, "step": 115395 }, { "epoch": 0.1162321056443076, "grad_norm": 9.762070619396953, "learning_rate": 4.946248116301933e-05, "loss": 2.0498, "mean_token_accuracy": 0.5172413766384125, "step": 115400 }, { "epoch": 0.11623714169741177, "grad_norm": 11.941247389040686, "learning_rate": 4.946239974379824e-05, "loss": 2.4527, "mean_token_accuracy": 0.40471869707107544, "step": 115405 }, { "epoch": 0.11624217775051594, "grad_norm": 13.135468872935595, "learning_rate": 4.946231831848581e-05, "loss": 2.6166, "mean_token_accuracy": 0.3896551728248596, "step": 115410 }, { "epoch": 0.11624721380362012, "grad_norm": 10.318823286145287, "learning_rate": 4.9462236887082034e-05, "loss": 2.2466, "mean_token_accuracy": 0.4569872975349426, "step": 115415 }, { "epoch": 0.11625224985672429, "grad_norm": 11.915735426719996, "learning_rate": 4.9462155449586954e-05, "loss": 2.4182, "mean_token_accuracy": 0.3724137872457504, "step": 115420 }, { "epoch": 0.11625728590982846, "grad_norm": 14.011478759940081, "learning_rate": 4.946207400600059e-05, "loss": 2.5114, "mean_token_accuracy": 0.4103448331356049, "step": 115425 }, { "epoch": 0.11626232196293264, "grad_norm": 10.498706248395369, "learning_rate": 4.946199255632296e-05, "loss": 2.4155, "mean_token_accuracy": 0.4068965554237366, "step": 115430 }, { "epoch": 0.11626735801603681, "grad_norm": 9.147964912638376, "learning_rate": 4.946191110055409e-05, "loss": 3.1358, "mean_token_accuracy": 0.35692680180072783, "step": 115435 }, { "epoch": 0.11627239406914099, "grad_norm": 11.739749886125201, "learning_rate": 4.9461829638694014e-05, "loss": 2.5006, "mean_token_accuracy": 0.42413793206214906, "step": 115440 }, { "epoch": 0.11627743012224516, "grad_norm": 10.159315269181814, "learning_rate": 4.9461748170742725e-05, "loss": 2.4216, "mean_token_accuracy": 0.4620689690113068, "step": 115445 }, { "epoch": 0.11628246617534932, "grad_norm": 9.585271059352436, "learning_rate": 4.9461666696700274e-05, "loss": 2.193, "mean_token_accuracy": 0.48275862336158754, "step": 115450 }, { "epoch": 0.1162875022284535, "grad_norm": 11.422969018296149, "learning_rate": 4.946158521656667e-05, "loss": 2.581, "mean_token_accuracy": 0.37586206793785093, "step": 115455 }, { "epoch": 0.11629253828155767, "grad_norm": 12.990905244431511, "learning_rate": 4.946150373034194e-05, "loss": 2.3948, "mean_token_accuracy": 0.43103448748588563, "step": 115460 }, { "epoch": 0.11629757433466184, "grad_norm": 9.992660761729882, "learning_rate": 4.94614222380261e-05, "loss": 1.8641, "mean_token_accuracy": 0.5261343002319336, "step": 115465 }, { "epoch": 0.11630261038776601, "grad_norm": 9.603350942751467, "learning_rate": 4.9461340739619184e-05, "loss": 2.9338, "mean_token_accuracy": 0.4206896543502808, "step": 115470 }, { "epoch": 0.11630764644087019, "grad_norm": 11.694132810592606, "learning_rate": 4.946125923512121e-05, "loss": 2.4882, "mean_token_accuracy": 0.3896551728248596, "step": 115475 }, { "epoch": 0.11631268249397436, "grad_norm": 9.080885273015994, "learning_rate": 4.946117772453219e-05, "loss": 2.4411, "mean_token_accuracy": 0.42758620381355283, "step": 115480 }, { "epoch": 0.11631771854707854, "grad_norm": 10.452452824649617, "learning_rate": 4.946109620785216e-05, "loss": 2.8492, "mean_token_accuracy": 0.39999998807907106, "step": 115485 }, { "epoch": 0.11632275460018271, "grad_norm": 11.34839882166613, "learning_rate": 4.946101468508114e-05, "loss": 2.2454, "mean_token_accuracy": 0.4586206912994385, "step": 115490 }, { "epoch": 0.11632779065328688, "grad_norm": 8.657593349849884, "learning_rate": 4.946093315621915e-05, "loss": 2.2743, "mean_token_accuracy": 0.43448275327682495, "step": 115495 }, { "epoch": 0.11633282670639106, "grad_norm": 9.247033890336532, "learning_rate": 4.946085162126621e-05, "loss": 2.258, "mean_token_accuracy": 0.42413792610168455, "step": 115500 }, { "epoch": 0.11633786275949523, "grad_norm": 12.083708733895131, "learning_rate": 4.946077008022234e-05, "loss": 2.1065, "mean_token_accuracy": 0.5052631616592407, "step": 115505 }, { "epoch": 0.1163428988125994, "grad_norm": 11.385202146190863, "learning_rate": 4.946068853308758e-05, "loss": 2.3868, "mean_token_accuracy": 0.4275861978530884, "step": 115510 }, { "epoch": 0.11634793486570358, "grad_norm": 11.251132100476541, "learning_rate": 4.946060697986193e-05, "loss": 2.5211, "mean_token_accuracy": 0.38620689511299133, "step": 115515 }, { "epoch": 0.11635297091880774, "grad_norm": 9.929897116976612, "learning_rate": 4.946052542054544e-05, "loss": 2.4848, "mean_token_accuracy": 0.43793103098869324, "step": 115520 }, { "epoch": 0.11635800697191191, "grad_norm": 12.82217655212544, "learning_rate": 4.9460443855138107e-05, "loss": 2.8325, "mean_token_accuracy": 0.3862068891525269, "step": 115525 }, { "epoch": 0.11636304302501609, "grad_norm": 11.12107264207888, "learning_rate": 4.946036228363996e-05, "loss": 2.4833, "mean_token_accuracy": 0.3965517163276672, "step": 115530 }, { "epoch": 0.11636807907812026, "grad_norm": 9.190720710973812, "learning_rate": 4.9460280706051025e-05, "loss": 2.4454, "mean_token_accuracy": 0.4304900109767914, "step": 115535 }, { "epoch": 0.11637311513122443, "grad_norm": 11.576107026425028, "learning_rate": 4.946019912237133e-05, "loss": 2.5712, "mean_token_accuracy": 0.4275862067937851, "step": 115540 }, { "epoch": 0.11637815118432861, "grad_norm": 12.33257781253891, "learning_rate": 4.946011753260089e-05, "loss": 2.5197, "mean_token_accuracy": 0.403448274731636, "step": 115545 }, { "epoch": 0.11638318723743278, "grad_norm": 10.178551352018037, "learning_rate": 4.9460035936739726e-05, "loss": 2.5318, "mean_token_accuracy": 0.42758620381355283, "step": 115550 }, { "epoch": 0.11638822329053695, "grad_norm": 8.431372736083754, "learning_rate": 4.945995433478787e-05, "loss": 2.4723, "mean_token_accuracy": 0.43103448748588563, "step": 115555 }, { "epoch": 0.11639325934364113, "grad_norm": 10.924898393733491, "learning_rate": 4.945987272674534e-05, "loss": 2.144, "mean_token_accuracy": 0.4379310369491577, "step": 115560 }, { "epoch": 0.1163982953967453, "grad_norm": 12.096936335973517, "learning_rate": 4.945979111261215e-05, "loss": 2.3004, "mean_token_accuracy": 0.4482758641242981, "step": 115565 }, { "epoch": 0.11640333144984948, "grad_norm": 10.105374753845368, "learning_rate": 4.9459709492388324e-05, "loss": 2.4953, "mean_token_accuracy": 0.41379310488700866, "step": 115570 }, { "epoch": 0.11640836750295365, "grad_norm": 11.453520102704662, "learning_rate": 4.9459627866073906e-05, "loss": 2.6879, "mean_token_accuracy": 0.36709014177322385, "step": 115575 }, { "epoch": 0.11641340355605782, "grad_norm": 10.354748191361663, "learning_rate": 4.945954623366889e-05, "loss": 2.3209, "mean_token_accuracy": 0.441379314661026, "step": 115580 }, { "epoch": 0.116418439609162, "grad_norm": 6.707900197516155, "learning_rate": 4.945946459517332e-05, "loss": 2.2203, "mean_token_accuracy": 0.47241380214691164, "step": 115585 }, { "epoch": 0.11642347566226616, "grad_norm": 11.404918589719339, "learning_rate": 4.945938295058721e-05, "loss": 1.9056, "mean_token_accuracy": 0.49655171632766726, "step": 115590 }, { "epoch": 0.11642851171537033, "grad_norm": 9.888018090877482, "learning_rate": 4.945930129991059e-05, "loss": 2.2519, "mean_token_accuracy": 0.4586206912994385, "step": 115595 }, { "epoch": 0.1164335477684745, "grad_norm": 9.483823781135168, "learning_rate": 4.945921964314346e-05, "loss": 2.2268, "mean_token_accuracy": 0.42758620381355283, "step": 115600 }, { "epoch": 0.11643858382157868, "grad_norm": 9.930794360241201, "learning_rate": 4.945913798028587e-05, "loss": 2.2112, "mean_token_accuracy": 0.4813067078590393, "step": 115605 }, { "epoch": 0.11644361987468285, "grad_norm": 11.083984495867913, "learning_rate": 4.9459056311337824e-05, "loss": 2.4094, "mean_token_accuracy": 0.42413793206214906, "step": 115610 }, { "epoch": 0.11644865592778703, "grad_norm": 9.934613486360456, "learning_rate": 4.9458974636299353e-05, "loss": 2.4664, "mean_token_accuracy": 0.42413793206214906, "step": 115615 }, { "epoch": 0.1164536919808912, "grad_norm": 10.699417389567897, "learning_rate": 4.9458892955170486e-05, "loss": 2.2118, "mean_token_accuracy": 0.4813672125339508, "step": 115620 }, { "epoch": 0.11645872803399537, "grad_norm": 10.542226740099181, "learning_rate": 4.945881126795124e-05, "loss": 2.766, "mean_token_accuracy": 0.4222625494003296, "step": 115625 }, { "epoch": 0.11646376408709955, "grad_norm": 15.216529993859794, "learning_rate": 4.945872957464162e-05, "loss": 2.768, "mean_token_accuracy": 0.37241379022598264, "step": 115630 }, { "epoch": 0.11646880014020372, "grad_norm": 9.670434014276903, "learning_rate": 4.9458647875241675e-05, "loss": 2.3946, "mean_token_accuracy": 0.441379314661026, "step": 115635 }, { "epoch": 0.1164738361933079, "grad_norm": 12.776683054562769, "learning_rate": 4.9458566169751415e-05, "loss": 2.6635, "mean_token_accuracy": 0.39655172228813174, "step": 115640 }, { "epoch": 0.11647887224641207, "grad_norm": 13.176645976549592, "learning_rate": 4.9458484458170865e-05, "loss": 2.399, "mean_token_accuracy": 0.42413793206214906, "step": 115645 }, { "epoch": 0.11648390829951624, "grad_norm": 8.316682104635019, "learning_rate": 4.945840274050005e-05, "loss": 2.2724, "mean_token_accuracy": 0.48620688915252686, "step": 115650 }, { "epoch": 0.11648894435262042, "grad_norm": 10.106033476581214, "learning_rate": 4.945832101673898e-05, "loss": 2.2859, "mean_token_accuracy": 0.46896551847457885, "step": 115655 }, { "epoch": 0.11649398040572458, "grad_norm": 8.671452747323778, "learning_rate": 4.94582392868877e-05, "loss": 2.1955, "mean_token_accuracy": 0.475862056016922, "step": 115660 }, { "epoch": 0.11649901645882875, "grad_norm": 12.048126220549499, "learning_rate": 4.9458157550946213e-05, "loss": 2.6396, "mean_token_accuracy": 0.42413793206214906, "step": 115665 }, { "epoch": 0.11650405251193292, "grad_norm": 10.216266725546491, "learning_rate": 4.945807580891456e-05, "loss": 2.5016, "mean_token_accuracy": 0.42758620977401735, "step": 115670 }, { "epoch": 0.1165090885650371, "grad_norm": 9.176206072741564, "learning_rate": 4.9457994060792744e-05, "loss": 2.2731, "mean_token_accuracy": 0.42413792610168455, "step": 115675 }, { "epoch": 0.11651412461814127, "grad_norm": 8.96944403741141, "learning_rate": 4.9457912306580796e-05, "loss": 2.1557, "mean_token_accuracy": 0.4172413766384125, "step": 115680 }, { "epoch": 0.11651916067124544, "grad_norm": 18.97427530744383, "learning_rate": 4.9457830546278736e-05, "loss": 2.7075, "mean_token_accuracy": 0.37241379022598264, "step": 115685 }, { "epoch": 0.11652419672434962, "grad_norm": 10.410101995569148, "learning_rate": 4.9457748779886596e-05, "loss": 2.5704, "mean_token_accuracy": 0.39310344457626345, "step": 115690 }, { "epoch": 0.11652923277745379, "grad_norm": 10.027134632980392, "learning_rate": 4.945766700740439e-05, "loss": 2.1855, "mean_token_accuracy": 0.4310344815254211, "step": 115695 }, { "epoch": 0.11653426883055797, "grad_norm": 8.591661744075036, "learning_rate": 4.945758522883215e-05, "loss": 2.1383, "mean_token_accuracy": 0.46551724076271056, "step": 115700 }, { "epoch": 0.11653930488366214, "grad_norm": 9.923454356588389, "learning_rate": 4.9457503444169875e-05, "loss": 2.4699, "mean_token_accuracy": 0.4344827651977539, "step": 115705 }, { "epoch": 0.11654434093676631, "grad_norm": 9.345009795333038, "learning_rate": 4.945742165341762e-05, "loss": 1.7995, "mean_token_accuracy": 0.5241379380226135, "step": 115710 }, { "epoch": 0.11654937698987049, "grad_norm": 10.789109957377846, "learning_rate": 4.945733985657539e-05, "loss": 2.3099, "mean_token_accuracy": 0.4517241418361664, "step": 115715 }, { "epoch": 0.11655441304297466, "grad_norm": 9.785067243177775, "learning_rate": 4.945725805364321e-05, "loss": 2.4444, "mean_token_accuracy": 0.42758620977401735, "step": 115720 }, { "epoch": 0.11655944909607883, "grad_norm": 8.163219949610674, "learning_rate": 4.945717624462109e-05, "loss": 2.0798, "mean_token_accuracy": 0.4747731387615204, "step": 115725 }, { "epoch": 0.116564485149183, "grad_norm": 13.167250983399748, "learning_rate": 4.945709442950908e-05, "loss": 2.4144, "mean_token_accuracy": 0.4241379380226135, "step": 115730 }, { "epoch": 0.11656952120228717, "grad_norm": 11.350491317908327, "learning_rate": 4.945701260830718e-05, "loss": 2.1833, "mean_token_accuracy": 0.4310344815254211, "step": 115735 }, { "epoch": 0.11657455725539134, "grad_norm": 10.45625902204729, "learning_rate": 4.945693078101542e-05, "loss": 2.4926, "mean_token_accuracy": 0.4117362439632416, "step": 115740 }, { "epoch": 0.11657959330849552, "grad_norm": 10.596470722984007, "learning_rate": 4.9456848947633826e-05, "loss": 2.5133, "mean_token_accuracy": 0.37241379022598264, "step": 115745 }, { "epoch": 0.11658462936159969, "grad_norm": 12.026924708159525, "learning_rate": 4.945676710816242e-05, "loss": 2.6786, "mean_token_accuracy": 0.42758620381355283, "step": 115750 }, { "epoch": 0.11658966541470386, "grad_norm": 10.180229389458578, "learning_rate": 4.945668526260122e-05, "loss": 2.4549, "mean_token_accuracy": 0.4517241299152374, "step": 115755 }, { "epoch": 0.11659470146780804, "grad_norm": 10.909569803329962, "learning_rate": 4.945660341095026e-05, "loss": 2.3922, "mean_token_accuracy": 0.4655172348022461, "step": 115760 }, { "epoch": 0.11659973752091221, "grad_norm": 9.356114418358679, "learning_rate": 4.9456521553209545e-05, "loss": 2.2703, "mean_token_accuracy": 0.47241379618644713, "step": 115765 }, { "epoch": 0.11660477357401638, "grad_norm": 18.25112101732361, "learning_rate": 4.945643968937911e-05, "loss": 2.7905, "mean_token_accuracy": 0.4068965524435043, "step": 115770 }, { "epoch": 0.11660980962712056, "grad_norm": 11.722689611003101, "learning_rate": 4.945635781945897e-05, "loss": 2.6798, "mean_token_accuracy": 0.4034482777118683, "step": 115775 }, { "epoch": 0.11661484568022473, "grad_norm": 9.051407925823062, "learning_rate": 4.945627594344916e-05, "loss": 2.4164, "mean_token_accuracy": 0.42068966031074523, "step": 115780 }, { "epoch": 0.1166198817333289, "grad_norm": 14.681051897934866, "learning_rate": 4.945619406134969e-05, "loss": 2.4927, "mean_token_accuracy": 0.42758620381355283, "step": 115785 }, { "epoch": 0.11662491778643308, "grad_norm": 11.922758460906103, "learning_rate": 4.945611217316059e-05, "loss": 2.14, "mean_token_accuracy": 0.4517241418361664, "step": 115790 }, { "epoch": 0.11662995383953725, "grad_norm": 11.468697062484884, "learning_rate": 4.9456030278881875e-05, "loss": 2.3373, "mean_token_accuracy": 0.4517241358757019, "step": 115795 }, { "epoch": 0.11663498989264141, "grad_norm": 10.450657000528347, "learning_rate": 4.945594837851358e-05, "loss": 2.2488, "mean_token_accuracy": 0.441379314661026, "step": 115800 }, { "epoch": 0.11664002594574559, "grad_norm": 11.42373818839804, "learning_rate": 4.945586647205571e-05, "loss": 2.6074, "mean_token_accuracy": 0.39655172228813174, "step": 115805 }, { "epoch": 0.11664506199884976, "grad_norm": 10.016274459032315, "learning_rate": 4.945578455950831e-05, "loss": 2.0037, "mean_token_accuracy": 0.4758620738983154, "step": 115810 }, { "epoch": 0.11665009805195393, "grad_norm": 11.579618902275154, "learning_rate": 4.945570264087139e-05, "loss": 2.2197, "mean_token_accuracy": 0.43103447556495667, "step": 115815 }, { "epoch": 0.11665513410505811, "grad_norm": 12.47962708618119, "learning_rate": 4.9455620716144974e-05, "loss": 2.4978, "mean_token_accuracy": 0.3620689630508423, "step": 115820 }, { "epoch": 0.11666017015816228, "grad_norm": 11.375758142208007, "learning_rate": 4.945553878532908e-05, "loss": 2.6813, "mean_token_accuracy": 0.39836660623550413, "step": 115825 }, { "epoch": 0.11666520621126646, "grad_norm": 11.643283017714696, "learning_rate": 4.9455456848423735e-05, "loss": 2.4689, "mean_token_accuracy": 0.35862069129943847, "step": 115830 }, { "epoch": 0.11667024226437063, "grad_norm": 13.874526331426166, "learning_rate": 4.945537490542897e-05, "loss": 2.2901, "mean_token_accuracy": 0.4413793087005615, "step": 115835 }, { "epoch": 0.1166752783174748, "grad_norm": 10.737342865805761, "learning_rate": 4.945529295634479e-05, "loss": 2.5921, "mean_token_accuracy": 0.37241379022598264, "step": 115840 }, { "epoch": 0.11668031437057898, "grad_norm": 11.20551990751828, "learning_rate": 4.9455211001171234e-05, "loss": 2.6037, "mean_token_accuracy": 0.38275861740112305, "step": 115845 }, { "epoch": 0.11668535042368315, "grad_norm": 11.744746618391442, "learning_rate": 4.9455129039908316e-05, "loss": 2.3853, "mean_token_accuracy": 0.38620689511299133, "step": 115850 }, { "epoch": 0.11669038647678733, "grad_norm": 28.02986587397832, "learning_rate": 4.945504707255606e-05, "loss": 2.743, "mean_token_accuracy": 0.4034482777118683, "step": 115855 }, { "epoch": 0.1166954225298915, "grad_norm": 10.984878470524434, "learning_rate": 4.9454965099114494e-05, "loss": 2.2535, "mean_token_accuracy": 0.4068965494632721, "step": 115860 }, { "epoch": 0.11670045858299567, "grad_norm": 9.60325715319281, "learning_rate": 4.945488311958364e-05, "loss": 2.2033, "mean_token_accuracy": 0.4862068831920624, "step": 115865 }, { "epoch": 0.11670549463609983, "grad_norm": 10.512226715408875, "learning_rate": 4.94548011339635e-05, "loss": 2.3801, "mean_token_accuracy": 0.4206896543502808, "step": 115870 }, { "epoch": 0.116710530689204, "grad_norm": 12.691413563055008, "learning_rate": 4.945471914225413e-05, "loss": 2.4049, "mean_token_accuracy": 0.4000000059604645, "step": 115875 }, { "epoch": 0.11671556674230818, "grad_norm": 9.86549905467702, "learning_rate": 4.945463714445553e-05, "loss": 2.3183, "mean_token_accuracy": 0.44482759237289426, "step": 115880 }, { "epoch": 0.11672060279541235, "grad_norm": 11.167283535735207, "learning_rate": 4.945455514056773e-05, "loss": 2.5128, "mean_token_accuracy": 0.4117362380027771, "step": 115885 }, { "epoch": 0.11672563884851653, "grad_norm": 13.239678191295784, "learning_rate": 4.945447313059075e-05, "loss": 2.4661, "mean_token_accuracy": 0.39310345649719236, "step": 115890 }, { "epoch": 0.1167306749016207, "grad_norm": 13.349667705713188, "learning_rate": 4.9454391114524615e-05, "loss": 2.4421, "mean_token_accuracy": 0.4206896543502808, "step": 115895 }, { "epoch": 0.11673571095472488, "grad_norm": 7.313308269894281, "learning_rate": 4.945430909236936e-05, "loss": 1.9481, "mean_token_accuracy": 0.49999999403953554, "step": 115900 }, { "epoch": 0.11674074700782905, "grad_norm": 13.024430158237502, "learning_rate": 4.945422706412499e-05, "loss": 3.0536, "mean_token_accuracy": 0.3849969744682312, "step": 115905 }, { "epoch": 0.11674578306093322, "grad_norm": 13.575749639406023, "learning_rate": 4.9454145029791526e-05, "loss": 2.5119, "mean_token_accuracy": 0.4379310369491577, "step": 115910 }, { "epoch": 0.1167508191140374, "grad_norm": 13.48809560332994, "learning_rate": 4.9454062989369e-05, "loss": 2.824, "mean_token_accuracy": 0.353780996799469, "step": 115915 }, { "epoch": 0.11675585516714157, "grad_norm": 10.725665005337913, "learning_rate": 4.9453980942857435e-05, "loss": 2.4539, "mean_token_accuracy": 0.44137930274009707, "step": 115920 }, { "epoch": 0.11676089122024574, "grad_norm": 13.271258282021748, "learning_rate": 4.9453898890256854e-05, "loss": 2.9137, "mean_token_accuracy": 0.4344827592372894, "step": 115925 }, { "epoch": 0.11676592727334992, "grad_norm": 11.570208199295593, "learning_rate": 4.945381683156727e-05, "loss": 2.4912, "mean_token_accuracy": 0.34482758641242983, "step": 115930 }, { "epoch": 0.11677096332645409, "grad_norm": 11.486074291200067, "learning_rate": 4.9453734766788715e-05, "loss": 2.2736, "mean_token_accuracy": 0.417241370677948, "step": 115935 }, { "epoch": 0.11677599937955825, "grad_norm": 12.672968983714828, "learning_rate": 4.945365269592122e-05, "loss": 2.2165, "mean_token_accuracy": 0.42068964838981626, "step": 115940 }, { "epoch": 0.11678103543266243, "grad_norm": 10.465695061299856, "learning_rate": 4.945357061896478e-05, "loss": 2.6173, "mean_token_accuracy": 0.43103448748588563, "step": 115945 }, { "epoch": 0.1167860714857666, "grad_norm": 10.520903876469106, "learning_rate": 4.945348853591946e-05, "loss": 2.1098, "mean_token_accuracy": 0.5000000059604645, "step": 115950 }, { "epoch": 0.11679110753887077, "grad_norm": 11.154555243478825, "learning_rate": 4.9453406446785234e-05, "loss": 2.7848, "mean_token_accuracy": 0.38620689511299133, "step": 115955 }, { "epoch": 0.11679614359197495, "grad_norm": 9.618912115411533, "learning_rate": 4.945332435156215e-05, "loss": 2.2893, "mean_token_accuracy": 0.43297035694122316, "step": 115960 }, { "epoch": 0.11680117964507912, "grad_norm": 9.948535689632957, "learning_rate": 4.945324225025025e-05, "loss": 2.1097, "mean_token_accuracy": 0.46896552443504336, "step": 115965 }, { "epoch": 0.1168062156981833, "grad_norm": 8.899589222745341, "learning_rate": 4.945316014284952e-05, "loss": 1.9782, "mean_token_accuracy": 0.4896551728248596, "step": 115970 }, { "epoch": 0.11681125175128747, "grad_norm": 12.360207828206821, "learning_rate": 4.945307802936e-05, "loss": 2.4752, "mean_token_accuracy": 0.39310344457626345, "step": 115975 }, { "epoch": 0.11681628780439164, "grad_norm": 10.672202597674643, "learning_rate": 4.945299590978172e-05, "loss": 2.3938, "mean_token_accuracy": 0.4620689690113068, "step": 115980 }, { "epoch": 0.11682132385749582, "grad_norm": 10.185671768148804, "learning_rate": 4.945291378411469e-05, "loss": 2.2032, "mean_token_accuracy": 0.49165154099464414, "step": 115985 }, { "epoch": 0.11682635991059999, "grad_norm": 10.105402541411513, "learning_rate": 4.945283165235893e-05, "loss": 2.2887, "mean_token_accuracy": 0.4206896543502808, "step": 115990 }, { "epoch": 0.11683139596370416, "grad_norm": 10.593841295959065, "learning_rate": 4.9452749514514486e-05, "loss": 2.3434, "mean_token_accuracy": 0.4724137902259827, "step": 115995 }, { "epoch": 0.11683643201680834, "grad_norm": 11.053590106659037, "learning_rate": 4.945266737058135e-05, "loss": 2.6298, "mean_token_accuracy": 0.4103448212146759, "step": 116000 }, { "epoch": 0.11684146806991251, "grad_norm": 11.681979189068993, "learning_rate": 4.945258522055957e-05, "loss": 2.4986, "mean_token_accuracy": 0.4448275864124298, "step": 116005 }, { "epoch": 0.11684650412301667, "grad_norm": 9.559378969716334, "learning_rate": 4.9452503064449154e-05, "loss": 2.2271, "mean_token_accuracy": 0.44827587008476255, "step": 116010 }, { "epoch": 0.11685154017612084, "grad_norm": 10.755371691121825, "learning_rate": 4.9452420902250126e-05, "loss": 2.4322, "mean_token_accuracy": 0.42758620977401735, "step": 116015 }, { "epoch": 0.11685657622922502, "grad_norm": 10.7181291881355, "learning_rate": 4.945233873396252e-05, "loss": 2.076, "mean_token_accuracy": 0.510344821214676, "step": 116020 }, { "epoch": 0.11686161228232919, "grad_norm": 9.338306548513417, "learning_rate": 4.945225655958635e-05, "loss": 2.5562, "mean_token_accuracy": 0.4310344696044922, "step": 116025 }, { "epoch": 0.11686664833543337, "grad_norm": 10.759471829250641, "learning_rate": 4.945217437912163e-05, "loss": 2.4652, "mean_token_accuracy": 0.38965516686439516, "step": 116030 }, { "epoch": 0.11687168438853754, "grad_norm": 10.963118704539715, "learning_rate": 4.94520921925684e-05, "loss": 2.3406, "mean_token_accuracy": 0.4137930989265442, "step": 116035 }, { "epoch": 0.11687672044164171, "grad_norm": 12.000032890893863, "learning_rate": 4.9452009999926675e-05, "loss": 2.644, "mean_token_accuracy": 0.41724138855934145, "step": 116040 }, { "epoch": 0.11688175649474589, "grad_norm": 9.52798022326986, "learning_rate": 4.945192780119648e-05, "loss": 2.0728, "mean_token_accuracy": 0.44730792939662933, "step": 116045 }, { "epoch": 0.11688679254785006, "grad_norm": 11.530358754824915, "learning_rate": 4.9451845596377836e-05, "loss": 2.546, "mean_token_accuracy": 0.41554749608039854, "step": 116050 }, { "epoch": 0.11689182860095423, "grad_norm": 9.943993307005176, "learning_rate": 4.945176338547076e-05, "loss": 2.5577, "mean_token_accuracy": 0.4379310369491577, "step": 116055 }, { "epoch": 0.11689686465405841, "grad_norm": 12.576407512955814, "learning_rate": 4.945168116847529e-05, "loss": 2.5042, "mean_token_accuracy": 0.42068964838981626, "step": 116060 }, { "epoch": 0.11690190070716258, "grad_norm": 10.263905879947119, "learning_rate": 4.945159894539142e-05, "loss": 2.3021, "mean_token_accuracy": 0.42758620977401735, "step": 116065 }, { "epoch": 0.11690693676026676, "grad_norm": 12.035905441718675, "learning_rate": 4.9451516716219215e-05, "loss": 2.7625, "mean_token_accuracy": 0.4344827592372894, "step": 116070 }, { "epoch": 0.11691197281337093, "grad_norm": 11.323575787580229, "learning_rate": 4.945143448095866e-05, "loss": 2.4595, "mean_token_accuracy": 0.44137930274009707, "step": 116075 }, { "epoch": 0.11691700886647509, "grad_norm": 10.026153568842135, "learning_rate": 4.9451352239609806e-05, "loss": 2.3371, "mean_token_accuracy": 0.4206896543502808, "step": 116080 }, { "epoch": 0.11692204491957926, "grad_norm": 10.440528336379558, "learning_rate": 4.9451269992172646e-05, "loss": 2.6709, "mean_token_accuracy": 0.42758620381355283, "step": 116085 }, { "epoch": 0.11692708097268344, "grad_norm": 10.734080592238982, "learning_rate": 4.945118773864723e-05, "loss": 2.4214, "mean_token_accuracy": 0.42758620381355283, "step": 116090 }, { "epoch": 0.11693211702578761, "grad_norm": 10.27241678816356, "learning_rate": 4.945110547903357e-05, "loss": 2.7337, "mean_token_accuracy": 0.35862069129943847, "step": 116095 }, { "epoch": 0.11693715307889178, "grad_norm": 9.808677096690635, "learning_rate": 4.9451023213331686e-05, "loss": 2.268, "mean_token_accuracy": 0.44482759237289426, "step": 116100 }, { "epoch": 0.11694218913199596, "grad_norm": 14.870292691563819, "learning_rate": 4.9450940941541604e-05, "loss": 2.4544, "mean_token_accuracy": 0.4482758641242981, "step": 116105 }, { "epoch": 0.11694722518510013, "grad_norm": 8.867048741685338, "learning_rate": 4.9450858663663355e-05, "loss": 2.1487, "mean_token_accuracy": 0.4862069010734558, "step": 116110 }, { "epoch": 0.1169522612382043, "grad_norm": 9.519703216701137, "learning_rate": 4.9450776379696944e-05, "loss": 2.2684, "mean_token_accuracy": 0.44827585816383364, "step": 116115 }, { "epoch": 0.11695729729130848, "grad_norm": 10.869691303841805, "learning_rate": 4.9450694089642406e-05, "loss": 2.2337, "mean_token_accuracy": 0.458620685338974, "step": 116120 }, { "epoch": 0.11696233334441265, "grad_norm": 11.765809329502606, "learning_rate": 4.9450611793499754e-05, "loss": 2.208, "mean_token_accuracy": 0.40810647010803225, "step": 116125 }, { "epoch": 0.11696736939751683, "grad_norm": 15.498511979054731, "learning_rate": 4.945052949126903e-05, "loss": 2.6611, "mean_token_accuracy": 0.4517241299152374, "step": 116130 }, { "epoch": 0.116972405450621, "grad_norm": 10.976151391005148, "learning_rate": 4.945044718295024e-05, "loss": 2.1054, "mean_token_accuracy": 0.4896551787853241, "step": 116135 }, { "epoch": 0.11697744150372517, "grad_norm": 12.076626805056296, "learning_rate": 4.9450364868543415e-05, "loss": 2.2408, "mean_token_accuracy": 0.46551724672317507, "step": 116140 }, { "epoch": 0.11698247755682935, "grad_norm": 15.360905749949245, "learning_rate": 4.9450282548048566e-05, "loss": 2.3533, "mean_token_accuracy": 0.46551724672317507, "step": 116145 }, { "epoch": 0.11698751360993351, "grad_norm": 13.149931175719512, "learning_rate": 4.945020022146573e-05, "loss": 2.6351, "mean_token_accuracy": 0.403448274731636, "step": 116150 }, { "epoch": 0.11699254966303768, "grad_norm": 9.83808949999587, "learning_rate": 4.945011788879492e-05, "loss": 2.4157, "mean_token_accuracy": 0.41724138259887694, "step": 116155 }, { "epoch": 0.11699758571614186, "grad_norm": 10.78087795454106, "learning_rate": 4.945003555003617e-05, "loss": 2.2923, "mean_token_accuracy": 0.4379310369491577, "step": 116160 }, { "epoch": 0.11700262176924603, "grad_norm": 12.384688224747618, "learning_rate": 4.944995320518949e-05, "loss": 2.3945, "mean_token_accuracy": 0.42758620381355283, "step": 116165 }, { "epoch": 0.1170076578223502, "grad_norm": 11.947188904828211, "learning_rate": 4.9449870854254906e-05, "loss": 2.0403, "mean_token_accuracy": 0.48275862336158754, "step": 116170 }, { "epoch": 0.11701269387545438, "grad_norm": 11.194985142313664, "learning_rate": 4.944978849723245e-05, "loss": 2.488, "mean_token_accuracy": 0.4641863167285919, "step": 116175 }, { "epoch": 0.11701772992855855, "grad_norm": 9.875553562382589, "learning_rate": 4.9449706134122136e-05, "loss": 2.3149, "mean_token_accuracy": 0.4862069010734558, "step": 116180 }, { "epoch": 0.11702276598166272, "grad_norm": 11.698830232013327, "learning_rate": 4.944962376492399e-05, "loss": 2.3978, "mean_token_accuracy": 0.4206896543502808, "step": 116185 }, { "epoch": 0.1170278020347669, "grad_norm": 9.022639123999628, "learning_rate": 4.944954138963803e-05, "loss": 2.3598, "mean_token_accuracy": 0.4068965494632721, "step": 116190 }, { "epoch": 0.11703283808787107, "grad_norm": 10.105603856025704, "learning_rate": 4.9449459008264286e-05, "loss": 2.3072, "mean_token_accuracy": 0.44827587008476255, "step": 116195 }, { "epoch": 0.11703787414097525, "grad_norm": 13.839522445052427, "learning_rate": 4.9449376620802775e-05, "loss": 2.8676, "mean_token_accuracy": 0.36896551847457887, "step": 116200 }, { "epoch": 0.11704291019407942, "grad_norm": 12.252244272753927, "learning_rate": 4.944929422725353e-05, "loss": 2.4315, "mean_token_accuracy": 0.4000000059604645, "step": 116205 }, { "epoch": 0.11704794624718359, "grad_norm": 13.399723841664212, "learning_rate": 4.944921182761656e-05, "loss": 2.6766, "mean_token_accuracy": 0.4068965554237366, "step": 116210 }, { "epoch": 0.11705298230028777, "grad_norm": 10.358785604996916, "learning_rate": 4.944912942189189e-05, "loss": 2.1308, "mean_token_accuracy": 0.5103448271751404, "step": 116215 }, { "epoch": 0.11705801835339193, "grad_norm": 10.699659395828462, "learning_rate": 4.944904701007956e-05, "loss": 2.5629, "mean_token_accuracy": 0.3827586114406586, "step": 116220 }, { "epoch": 0.1170630544064961, "grad_norm": 8.28110972307634, "learning_rate": 4.944896459217957e-05, "loss": 2.3129, "mean_token_accuracy": 0.43448275327682495, "step": 116225 }, { "epoch": 0.11706809045960027, "grad_norm": 11.220316342260155, "learning_rate": 4.9448882168191955e-05, "loss": 2.6096, "mean_token_accuracy": 0.42068966627120974, "step": 116230 }, { "epoch": 0.11707312651270445, "grad_norm": 11.924521016067692, "learning_rate": 4.944879973811674e-05, "loss": 2.3065, "mean_token_accuracy": 0.4, "step": 116235 }, { "epoch": 0.11707816256580862, "grad_norm": 11.030082688305972, "learning_rate": 4.944871730195394e-05, "loss": 2.4011, "mean_token_accuracy": 0.41379310488700866, "step": 116240 }, { "epoch": 0.1170831986189128, "grad_norm": 9.670925299909234, "learning_rate": 4.944863485970358e-05, "loss": 2.0029, "mean_token_accuracy": 0.5103448331356049, "step": 116245 }, { "epoch": 0.11708823467201697, "grad_norm": 8.693764624460925, "learning_rate": 4.9448552411365686e-05, "loss": 2.13, "mean_token_accuracy": 0.42758620977401735, "step": 116250 }, { "epoch": 0.11709327072512114, "grad_norm": 10.138147437805983, "learning_rate": 4.944846995694027e-05, "loss": 2.5662, "mean_token_accuracy": 0.38965516686439516, "step": 116255 }, { "epoch": 0.11709830677822532, "grad_norm": 12.428578656213613, "learning_rate": 4.944838749642738e-05, "loss": 2.1952, "mean_token_accuracy": 0.44827585816383364, "step": 116260 }, { "epoch": 0.11710334283132949, "grad_norm": 12.939888660894074, "learning_rate": 4.944830502982701e-05, "loss": 2.4612, "mean_token_accuracy": 0.42413792610168455, "step": 116265 }, { "epoch": 0.11710837888443366, "grad_norm": 10.034471661828219, "learning_rate": 4.94482225571392e-05, "loss": 2.7153, "mean_token_accuracy": 0.37931033968925476, "step": 116270 }, { "epoch": 0.11711341493753784, "grad_norm": 11.213729992659832, "learning_rate": 4.944814007836397e-05, "loss": 2.057, "mean_token_accuracy": 0.44482758045196535, "step": 116275 }, { "epoch": 0.11711845099064201, "grad_norm": 7.821901811106521, "learning_rate": 4.9448057593501343e-05, "loss": 2.4318, "mean_token_accuracy": 0.39509981870651245, "step": 116280 }, { "epoch": 0.11712348704374619, "grad_norm": 13.529311609703427, "learning_rate": 4.944797510255134e-05, "loss": 2.7912, "mean_token_accuracy": 0.3482758641242981, "step": 116285 }, { "epoch": 0.11712852309685035, "grad_norm": 13.182477130985573, "learning_rate": 4.944789260551398e-05, "loss": 2.6032, "mean_token_accuracy": 0.3655172437429428, "step": 116290 }, { "epoch": 0.11713355914995452, "grad_norm": 9.372159615134265, "learning_rate": 4.9447810102389294e-05, "loss": 2.1468, "mean_token_accuracy": 0.5090744018554687, "step": 116295 }, { "epoch": 0.11713859520305869, "grad_norm": 9.787342582369819, "learning_rate": 4.94477275931773e-05, "loss": 2.1813, "mean_token_accuracy": 0.4431941986083984, "step": 116300 }, { "epoch": 0.11714363125616287, "grad_norm": 9.58095525109613, "learning_rate": 4.944764507787802e-05, "loss": 1.9698, "mean_token_accuracy": 0.46551724076271056, "step": 116305 }, { "epoch": 0.11714866730926704, "grad_norm": 12.768645829501542, "learning_rate": 4.944756255649148e-05, "loss": 2.9265, "mean_token_accuracy": 0.38275861740112305, "step": 116310 }, { "epoch": 0.11715370336237121, "grad_norm": 9.881912939677031, "learning_rate": 4.94474800290177e-05, "loss": 2.3946, "mean_token_accuracy": 0.4000000059604645, "step": 116315 }, { "epoch": 0.11715873941547539, "grad_norm": 13.848996644139467, "learning_rate": 4.9447397495456705e-05, "loss": 2.3912, "mean_token_accuracy": 0.3999999940395355, "step": 116320 }, { "epoch": 0.11716377546857956, "grad_norm": 10.713600679017972, "learning_rate": 4.944731495580852e-05, "loss": 2.3341, "mean_token_accuracy": 0.4034482717514038, "step": 116325 }, { "epoch": 0.11716881152168374, "grad_norm": 10.595057932568483, "learning_rate": 4.9447232410073164e-05, "loss": 2.4671, "mean_token_accuracy": 0.46963096857070924, "step": 116330 }, { "epoch": 0.11717384757478791, "grad_norm": 11.199938322306465, "learning_rate": 4.944714985825066e-05, "loss": 2.3577, "mean_token_accuracy": 0.42413792610168455, "step": 116335 }, { "epoch": 0.11717888362789208, "grad_norm": 12.241815179422435, "learning_rate": 4.944706730034104e-05, "loss": 2.2017, "mean_token_accuracy": 0.4396249234676361, "step": 116340 }, { "epoch": 0.11718391968099626, "grad_norm": 10.908901025234922, "learning_rate": 4.944698473634431e-05, "loss": 2.3606, "mean_token_accuracy": 0.41034482717514037, "step": 116345 }, { "epoch": 0.11718895573410043, "grad_norm": 11.218768136939213, "learning_rate": 4.94469021662605e-05, "loss": 2.6828, "mean_token_accuracy": 0.38620689511299133, "step": 116350 }, { "epoch": 0.1171939917872046, "grad_norm": 11.210801240564177, "learning_rate": 4.944681959008964e-05, "loss": 2.2302, "mean_token_accuracy": 0.48820326924324037, "step": 116355 }, { "epoch": 0.11719902784030876, "grad_norm": 8.064764567798221, "learning_rate": 4.944673700783175e-05, "loss": 2.25, "mean_token_accuracy": 0.47586207985877993, "step": 116360 }, { "epoch": 0.11720406389341294, "grad_norm": 11.448064656910768, "learning_rate": 4.944665441948684e-05, "loss": 2.6454, "mean_token_accuracy": 0.36896551251411436, "step": 116365 }, { "epoch": 0.11720909994651711, "grad_norm": 10.418409702827855, "learning_rate": 4.944657182505495e-05, "loss": 2.5253, "mean_token_accuracy": 0.44827585220336913, "step": 116370 }, { "epoch": 0.11721413599962129, "grad_norm": 11.40192024260826, "learning_rate": 4.94464892245361e-05, "loss": 2.2117, "mean_token_accuracy": 0.4517241299152374, "step": 116375 }, { "epoch": 0.11721917205272546, "grad_norm": 12.788808913068152, "learning_rate": 4.9446406617930306e-05, "loss": 2.7074, "mean_token_accuracy": 0.3517241358757019, "step": 116380 }, { "epoch": 0.11722420810582963, "grad_norm": 8.897991442329726, "learning_rate": 4.94463240052376e-05, "loss": 2.0492, "mean_token_accuracy": 0.4517241358757019, "step": 116385 }, { "epoch": 0.1172292441589338, "grad_norm": 11.43191693620906, "learning_rate": 4.944624138645799e-05, "loss": 2.2601, "mean_token_accuracy": 0.46908867359161377, "step": 116390 }, { "epoch": 0.11723428021203798, "grad_norm": 11.563658486798383, "learning_rate": 4.9446158761591514e-05, "loss": 2.2644, "mean_token_accuracy": 0.41584996581077577, "step": 116395 }, { "epoch": 0.11723931626514215, "grad_norm": 21.739944846020986, "learning_rate": 4.9446076130638185e-05, "loss": 3.0435, "mean_token_accuracy": 0.3896551728248596, "step": 116400 }, { "epoch": 0.11724435231824633, "grad_norm": 10.567984691030203, "learning_rate": 4.944599349359803e-05, "loss": 2.7275, "mean_token_accuracy": 0.43103447556495667, "step": 116405 }, { "epoch": 0.1172493883713505, "grad_norm": 12.491885799705903, "learning_rate": 4.944591085047108e-05, "loss": 2.4462, "mean_token_accuracy": 0.443315190076828, "step": 116410 }, { "epoch": 0.11725442442445468, "grad_norm": 11.112751849615425, "learning_rate": 4.944582820125734e-05, "loss": 2.6486, "mean_token_accuracy": 0.3793103456497192, "step": 116415 }, { "epoch": 0.11725946047755885, "grad_norm": 13.281479164905049, "learning_rate": 4.944574554595685e-05, "loss": 2.3952, "mean_token_accuracy": 0.44482758045196535, "step": 116420 }, { "epoch": 0.11726449653066302, "grad_norm": 16.47375911351208, "learning_rate": 4.944566288456962e-05, "loss": 2.6882, "mean_token_accuracy": 0.38275861740112305, "step": 116425 }, { "epoch": 0.11726953258376718, "grad_norm": 10.59301812283459, "learning_rate": 4.944558021709568e-05, "loss": 2.6771, "mean_token_accuracy": 0.36896551251411436, "step": 116430 }, { "epoch": 0.11727456863687136, "grad_norm": 9.041556915038434, "learning_rate": 4.944549754353506e-05, "loss": 2.7566, "mean_token_accuracy": 0.41034482717514037, "step": 116435 }, { "epoch": 0.11727960468997553, "grad_norm": 15.713302346805873, "learning_rate": 4.9445414863887755e-05, "loss": 2.4818, "mean_token_accuracy": 0.42758620381355283, "step": 116440 }, { "epoch": 0.1172846407430797, "grad_norm": 8.74296217736429, "learning_rate": 4.9445332178153824e-05, "loss": 2.0144, "mean_token_accuracy": 0.44482758045196535, "step": 116445 }, { "epoch": 0.11728967679618388, "grad_norm": 9.159036261888977, "learning_rate": 4.944524948633327e-05, "loss": 2.0715, "mean_token_accuracy": 0.47931034564971925, "step": 116450 }, { "epoch": 0.11729471284928805, "grad_norm": 9.915648138392548, "learning_rate": 4.944516678842611e-05, "loss": 2.2438, "mean_token_accuracy": 0.4219600737094879, "step": 116455 }, { "epoch": 0.11729974890239223, "grad_norm": 9.037395980035706, "learning_rate": 4.9445084084432395e-05, "loss": 2.6635, "mean_token_accuracy": 0.3896551728248596, "step": 116460 }, { "epoch": 0.1173047849554964, "grad_norm": 9.669724716283866, "learning_rate": 4.9445001374352114e-05, "loss": 2.0881, "mean_token_accuracy": 0.4586206912994385, "step": 116465 }, { "epoch": 0.11730982100860057, "grad_norm": 14.121449160891514, "learning_rate": 4.94449186581853e-05, "loss": 3.1865, "mean_token_accuracy": 0.35172412991523744, "step": 116470 }, { "epoch": 0.11731485706170475, "grad_norm": 10.16106422488777, "learning_rate": 4.9444835935931996e-05, "loss": 2.8576, "mean_token_accuracy": 0.37586206793785093, "step": 116475 }, { "epoch": 0.11731989311480892, "grad_norm": 14.003053310333689, "learning_rate": 4.9444753207592206e-05, "loss": 2.6765, "mean_token_accuracy": 0.4000000059604645, "step": 116480 }, { "epoch": 0.1173249291679131, "grad_norm": 9.659221885652371, "learning_rate": 4.944467047316595e-05, "loss": 2.0901, "mean_token_accuracy": 0.4551724135875702, "step": 116485 }, { "epoch": 0.11732996522101727, "grad_norm": 8.91509779253529, "learning_rate": 4.9444587732653266e-05, "loss": 2.4517, "mean_token_accuracy": 0.4517241418361664, "step": 116490 }, { "epoch": 0.11733500127412144, "grad_norm": 20.58438867076416, "learning_rate": 4.9444504986054156e-05, "loss": 2.551, "mean_token_accuracy": 0.39655172228813174, "step": 116495 }, { "epoch": 0.1173400373272256, "grad_norm": 13.882471124924116, "learning_rate": 4.944442223336867e-05, "loss": 2.2383, "mean_token_accuracy": 0.4310344815254211, "step": 116500 }, { "epoch": 0.11734507338032978, "grad_norm": 10.622615345071779, "learning_rate": 4.944433947459681e-05, "loss": 2.5263, "mean_token_accuracy": 0.36551723480224607, "step": 116505 }, { "epoch": 0.11735010943343395, "grad_norm": 13.143523544054066, "learning_rate": 4.94442567097386e-05, "loss": 2.6672, "mean_token_accuracy": 0.41724138259887694, "step": 116510 }, { "epoch": 0.11735514548653812, "grad_norm": 9.70575261608229, "learning_rate": 4.944417393879408e-05, "loss": 2.3677, "mean_token_accuracy": 0.4379310250282288, "step": 116515 }, { "epoch": 0.1173601815396423, "grad_norm": 10.95023956229262, "learning_rate": 4.9444091161763254e-05, "loss": 2.2053, "mean_token_accuracy": 0.48275862336158754, "step": 116520 }, { "epoch": 0.11736521759274647, "grad_norm": 9.224374933290502, "learning_rate": 4.9444008378646155e-05, "loss": 2.4481, "mean_token_accuracy": 0.4241379380226135, "step": 116525 }, { "epoch": 0.11737025364585064, "grad_norm": 12.865728933357365, "learning_rate": 4.9443925589442806e-05, "loss": 2.4315, "mean_token_accuracy": 0.4689655125141144, "step": 116530 }, { "epoch": 0.11737528969895482, "grad_norm": 11.155486633358318, "learning_rate": 4.9443842794153227e-05, "loss": 1.961, "mean_token_accuracy": 0.5052026689052582, "step": 116535 }, { "epoch": 0.11738032575205899, "grad_norm": 11.38635000140854, "learning_rate": 4.944375999277743e-05, "loss": 2.3602, "mean_token_accuracy": 0.45972906351089476, "step": 116540 }, { "epoch": 0.11738536180516317, "grad_norm": 11.078220699198326, "learning_rate": 4.944367718531546e-05, "loss": 3.0634, "mean_token_accuracy": 0.4206896543502808, "step": 116545 }, { "epoch": 0.11739039785826734, "grad_norm": 10.766764198575192, "learning_rate": 4.9443594371767333e-05, "loss": 2.0095, "mean_token_accuracy": 0.5103448331356049, "step": 116550 }, { "epoch": 0.11739543391137151, "grad_norm": 11.078317938177184, "learning_rate": 4.9443511552133065e-05, "loss": 2.5383, "mean_token_accuracy": 0.38620689511299133, "step": 116555 }, { "epoch": 0.11740046996447569, "grad_norm": 13.535648154996771, "learning_rate": 4.9443428726412675e-05, "loss": 2.1587, "mean_token_accuracy": 0.4344827592372894, "step": 116560 }, { "epoch": 0.11740550601757986, "grad_norm": 9.799503329525264, "learning_rate": 4.94433458946062e-05, "loss": 3.0303, "mean_token_accuracy": 0.36206896901130675, "step": 116565 }, { "epoch": 0.11741054207068402, "grad_norm": 10.916761000971757, "learning_rate": 4.9443263056713654e-05, "loss": 2.2982, "mean_token_accuracy": 0.4310344815254211, "step": 116570 }, { "epoch": 0.1174155781237882, "grad_norm": 12.05348692217127, "learning_rate": 4.944318021273506e-05, "loss": 2.2204, "mean_token_accuracy": 0.45015124678611756, "step": 116575 }, { "epoch": 0.11742061417689237, "grad_norm": 13.874551247473775, "learning_rate": 4.944309736267045e-05, "loss": 2.6612, "mean_token_accuracy": 0.43793103098869324, "step": 116580 }, { "epoch": 0.11742565022999654, "grad_norm": 15.31809047807715, "learning_rate": 4.944301450651983e-05, "loss": 2.7279, "mean_token_accuracy": 0.38620689511299133, "step": 116585 }, { "epoch": 0.11743068628310072, "grad_norm": 8.023310194004289, "learning_rate": 4.944293164428324e-05, "loss": 2.6278, "mean_token_accuracy": 0.3793103456497192, "step": 116590 }, { "epoch": 0.11743572233620489, "grad_norm": 10.891732738447331, "learning_rate": 4.94428487759607e-05, "loss": 2.206, "mean_token_accuracy": 0.48620688915252686, "step": 116595 }, { "epoch": 0.11744075838930906, "grad_norm": 10.954299021734673, "learning_rate": 4.944276590155222e-05, "loss": 2.1167, "mean_token_accuracy": 0.46551724672317507, "step": 116600 }, { "epoch": 0.11744579444241324, "grad_norm": 11.97448311872732, "learning_rate": 4.9442683021057836e-05, "loss": 2.4581, "mean_token_accuracy": 0.42758620977401735, "step": 116605 }, { "epoch": 0.11745083049551741, "grad_norm": 12.149972506785112, "learning_rate": 4.9442600134477566e-05, "loss": 2.7285, "mean_token_accuracy": 0.36896551847457887, "step": 116610 }, { "epoch": 0.11745586654862158, "grad_norm": 8.21473441148617, "learning_rate": 4.944251724181144e-05, "loss": 2.389, "mean_token_accuracy": 0.4206896543502808, "step": 116615 }, { "epoch": 0.11746090260172576, "grad_norm": 10.914441827126826, "learning_rate": 4.944243434305947e-05, "loss": 2.5348, "mean_token_accuracy": 0.4068965494632721, "step": 116620 }, { "epoch": 0.11746593865482993, "grad_norm": 10.157873643056975, "learning_rate": 4.9442351438221676e-05, "loss": 2.2388, "mean_token_accuracy": 0.4517241358757019, "step": 116625 }, { "epoch": 0.1174709747079341, "grad_norm": 9.81894238236103, "learning_rate": 4.94422685272981e-05, "loss": 2.1986, "mean_token_accuracy": 0.4344827592372894, "step": 116630 }, { "epoch": 0.11747601076103828, "grad_norm": 11.247209605334845, "learning_rate": 4.9442185610288754e-05, "loss": 2.5353, "mean_token_accuracy": 0.4034482717514038, "step": 116635 }, { "epoch": 0.11748104681414244, "grad_norm": 8.861290669821091, "learning_rate": 4.9442102687193654e-05, "loss": 2.5188, "mean_token_accuracy": 0.42413792610168455, "step": 116640 }, { "epoch": 0.11748608286724661, "grad_norm": 9.831821329559252, "learning_rate": 4.944201975801284e-05, "loss": 2.3204, "mean_token_accuracy": 0.4, "step": 116645 }, { "epoch": 0.11749111892035079, "grad_norm": 10.197027567787785, "learning_rate": 4.9441936822746314e-05, "loss": 2.2504, "mean_token_accuracy": 0.4744101524353027, "step": 116650 }, { "epoch": 0.11749615497345496, "grad_norm": 10.09618015134596, "learning_rate": 4.944185388139411e-05, "loss": 2.316, "mean_token_accuracy": 0.47007388472557066, "step": 116655 }, { "epoch": 0.11750119102655913, "grad_norm": 12.26876481277808, "learning_rate": 4.9441770933956264e-05, "loss": 2.4975, "mean_token_accuracy": 0.4103448331356049, "step": 116660 }, { "epoch": 0.11750622707966331, "grad_norm": 8.819438306540784, "learning_rate": 4.944168798043277e-05, "loss": 2.1074, "mean_token_accuracy": 0.4279556632041931, "step": 116665 }, { "epoch": 0.11751126313276748, "grad_norm": 11.377646003569401, "learning_rate": 4.944160502082368e-05, "loss": 2.5739, "mean_token_accuracy": 0.38620689511299133, "step": 116670 }, { "epoch": 0.11751629918587166, "grad_norm": 9.965306354696677, "learning_rate": 4.9441522055128995e-05, "loss": 2.2015, "mean_token_accuracy": 0.46896552443504336, "step": 116675 }, { "epoch": 0.11752133523897583, "grad_norm": 8.746778140833316, "learning_rate": 4.944143908334875e-05, "loss": 2.2589, "mean_token_accuracy": 0.4068965554237366, "step": 116680 }, { "epoch": 0.11752637129208, "grad_norm": 10.380934340358964, "learning_rate": 4.944135610548297e-05, "loss": 2.122, "mean_token_accuracy": 0.4517241358757019, "step": 116685 }, { "epoch": 0.11753140734518418, "grad_norm": 11.553585604385464, "learning_rate": 4.944127312153167e-05, "loss": 2.0439, "mean_token_accuracy": 0.4758620738983154, "step": 116690 }, { "epoch": 0.11753644339828835, "grad_norm": 8.502544671889812, "learning_rate": 4.944119013149487e-05, "loss": 2.0998, "mean_token_accuracy": 0.4931034505367279, "step": 116695 }, { "epoch": 0.11754147945139252, "grad_norm": 12.520537765873877, "learning_rate": 4.9441107135372605e-05, "loss": 2.604, "mean_token_accuracy": 0.42413793206214906, "step": 116700 }, { "epoch": 0.1175465155044967, "grad_norm": 9.74614436118029, "learning_rate": 4.944102413316489e-05, "loss": 2.3471, "mean_token_accuracy": 0.4448275834321976, "step": 116705 }, { "epoch": 0.11755155155760086, "grad_norm": 10.250991341635984, "learning_rate": 4.9440941124871754e-05, "loss": 2.3137, "mean_token_accuracy": 0.4586206912994385, "step": 116710 }, { "epoch": 0.11755658761070503, "grad_norm": 11.448313482341357, "learning_rate": 4.944085811049321e-05, "loss": 2.5401, "mean_token_accuracy": 0.4137930989265442, "step": 116715 }, { "epoch": 0.1175616236638092, "grad_norm": 11.042596152989422, "learning_rate": 4.944077509002929e-05, "loss": 2.3346, "mean_token_accuracy": 0.4551724135875702, "step": 116720 }, { "epoch": 0.11756665971691338, "grad_norm": 12.343340410772536, "learning_rate": 4.944069206348002e-05, "loss": 2.6679, "mean_token_accuracy": 0.4068965554237366, "step": 116725 }, { "epoch": 0.11757169577001755, "grad_norm": 10.807518279498833, "learning_rate": 4.944060903084541e-05, "loss": 2.3591, "mean_token_accuracy": 0.40689654350280763, "step": 116730 }, { "epoch": 0.11757673182312173, "grad_norm": 10.609633143030225, "learning_rate": 4.944052599212549e-05, "loss": 2.313, "mean_token_accuracy": 0.4379310369491577, "step": 116735 }, { "epoch": 0.1175817678762259, "grad_norm": 10.04981050905842, "learning_rate": 4.944044294732028e-05, "loss": 2.2392, "mean_token_accuracy": 0.44482758045196535, "step": 116740 }, { "epoch": 0.11758680392933007, "grad_norm": 13.130474002526809, "learning_rate": 4.944035989642981e-05, "loss": 2.5623, "mean_token_accuracy": 0.41034482717514037, "step": 116745 }, { "epoch": 0.11759183998243425, "grad_norm": 9.840445667958704, "learning_rate": 4.9440276839454106e-05, "loss": 2.2261, "mean_token_accuracy": 0.4931034445762634, "step": 116750 }, { "epoch": 0.11759687603553842, "grad_norm": 10.62719303654517, "learning_rate": 4.944019377639319e-05, "loss": 2.399, "mean_token_accuracy": 0.45517241954803467, "step": 116755 }, { "epoch": 0.1176019120886426, "grad_norm": 11.567061175900406, "learning_rate": 4.944011070724706e-05, "loss": 3.0578, "mean_token_accuracy": 0.37586206793785093, "step": 116760 }, { "epoch": 0.11760694814174677, "grad_norm": 12.958909484683227, "learning_rate": 4.944002763201577e-05, "loss": 2.5405, "mean_token_accuracy": 0.40689654648303986, "step": 116765 }, { "epoch": 0.11761198419485094, "grad_norm": 9.29276585877999, "learning_rate": 4.943994455069932e-05, "loss": 2.1623, "mean_token_accuracy": 0.482758617401123, "step": 116770 }, { "epoch": 0.11761702024795512, "grad_norm": 10.16086692536918, "learning_rate": 4.9439861463297756e-05, "loss": 2.3733, "mean_token_accuracy": 0.4241379380226135, "step": 116775 }, { "epoch": 0.11762205630105928, "grad_norm": 11.966958024976213, "learning_rate": 4.943977836981109e-05, "loss": 3.0058, "mean_token_accuracy": 0.36551723480224607, "step": 116780 }, { "epoch": 0.11762709235416345, "grad_norm": 10.14445576897235, "learning_rate": 4.9439695270239336e-05, "loss": 2.0251, "mean_token_accuracy": 0.47931034564971925, "step": 116785 }, { "epoch": 0.11763212840726762, "grad_norm": 10.99127178405551, "learning_rate": 4.9439612164582526e-05, "loss": 2.4016, "mean_token_accuracy": 0.4206896543502808, "step": 116790 }, { "epoch": 0.1176371644603718, "grad_norm": 13.946121313919505, "learning_rate": 4.9439529052840686e-05, "loss": 2.5516, "mean_token_accuracy": 0.45862067937850953, "step": 116795 }, { "epoch": 0.11764220051347597, "grad_norm": 10.770480312769632, "learning_rate": 4.943944593501384e-05, "loss": 2.4444, "mean_token_accuracy": 0.42413792610168455, "step": 116800 }, { "epoch": 0.11764723656658015, "grad_norm": 9.747945798982782, "learning_rate": 4.943936281110201e-05, "loss": 2.5046, "mean_token_accuracy": 0.40443349480628965, "step": 116805 }, { "epoch": 0.11765227261968432, "grad_norm": 8.63774571230082, "learning_rate": 4.943927968110521e-05, "loss": 2.2843, "mean_token_accuracy": 0.42413793206214906, "step": 116810 }, { "epoch": 0.1176573086727885, "grad_norm": 14.275075114798376, "learning_rate": 4.943919654502347e-05, "loss": 2.7727, "mean_token_accuracy": 0.4103448152542114, "step": 116815 }, { "epoch": 0.11766234472589267, "grad_norm": 10.896224554118199, "learning_rate": 4.9439113402856806e-05, "loss": 2.2958, "mean_token_accuracy": 0.4448275864124298, "step": 116820 }, { "epoch": 0.11766738077899684, "grad_norm": 10.215495976486038, "learning_rate": 4.943903025460526e-05, "loss": 2.4377, "mean_token_accuracy": 0.4482758641242981, "step": 116825 }, { "epoch": 0.11767241683210101, "grad_norm": 11.09979579138643, "learning_rate": 4.943894710026883e-05, "loss": 2.638, "mean_token_accuracy": 0.38620689511299133, "step": 116830 }, { "epoch": 0.11767745288520519, "grad_norm": 6.843723155900329, "learning_rate": 4.943886393984755e-05, "loss": 2.2339, "mean_token_accuracy": 0.4985221564769745, "step": 116835 }, { "epoch": 0.11768248893830936, "grad_norm": 12.085533955454624, "learning_rate": 4.943878077334146e-05, "loss": 2.5308, "mean_token_accuracy": 0.3965517282485962, "step": 116840 }, { "epoch": 0.11768752499141354, "grad_norm": 10.032777712656817, "learning_rate": 4.943869760075055e-05, "loss": 2.0214, "mean_token_accuracy": 0.4965517222881317, "step": 116845 }, { "epoch": 0.1176925610445177, "grad_norm": 10.75534712392264, "learning_rate": 4.943861442207486e-05, "loss": 2.7358, "mean_token_accuracy": 0.3999999940395355, "step": 116850 }, { "epoch": 0.11769759709762187, "grad_norm": 11.894288871575446, "learning_rate": 4.943853123731443e-05, "loss": 2.298, "mean_token_accuracy": 0.4206896543502808, "step": 116855 }, { "epoch": 0.11770263315072604, "grad_norm": 10.914205775975462, "learning_rate": 4.943844804646925e-05, "loss": 2.2869, "mean_token_accuracy": 0.44827585816383364, "step": 116860 }, { "epoch": 0.11770766920383022, "grad_norm": 12.210442613010017, "learning_rate": 4.943836484953937e-05, "loss": 1.9752, "mean_token_accuracy": 0.5295825660228729, "step": 116865 }, { "epoch": 0.11771270525693439, "grad_norm": 9.530681257972288, "learning_rate": 4.94382816465248e-05, "loss": 2.1066, "mean_token_accuracy": 0.4620689630508423, "step": 116870 }, { "epoch": 0.11771774131003857, "grad_norm": 11.82857876694248, "learning_rate": 4.943819843742557e-05, "loss": 2.2201, "mean_token_accuracy": 0.4517241358757019, "step": 116875 }, { "epoch": 0.11772277736314274, "grad_norm": 11.550901837025753, "learning_rate": 4.943811522224169e-05, "loss": 2.4304, "mean_token_accuracy": 0.39655172228813174, "step": 116880 }, { "epoch": 0.11772781341624691, "grad_norm": 11.051989524457058, "learning_rate": 4.94380320009732e-05, "loss": 2.3391, "mean_token_accuracy": 0.4255293428897858, "step": 116885 }, { "epoch": 0.11773284946935109, "grad_norm": 8.099628719990488, "learning_rate": 4.943794877362011e-05, "loss": 2.1933, "mean_token_accuracy": 0.46896551847457885, "step": 116890 }, { "epoch": 0.11773788552245526, "grad_norm": 11.378584371753458, "learning_rate": 4.943786554018245e-05, "loss": 2.2902, "mean_token_accuracy": 0.45862067937850953, "step": 116895 }, { "epoch": 0.11774292157555943, "grad_norm": 10.332208158184434, "learning_rate": 4.9437782300660234e-05, "loss": 2.254, "mean_token_accuracy": 0.47931033968925474, "step": 116900 }, { "epoch": 0.11774795762866361, "grad_norm": 13.736697877725698, "learning_rate": 4.94376990550535e-05, "loss": 2.9497, "mean_token_accuracy": 0.3586206942796707, "step": 116905 }, { "epoch": 0.11775299368176778, "grad_norm": 10.67860134953222, "learning_rate": 4.943761580336226e-05, "loss": 2.746, "mean_token_accuracy": 0.3517241418361664, "step": 116910 }, { "epoch": 0.11775802973487196, "grad_norm": 12.495174976682623, "learning_rate": 4.943753254558655e-05, "loss": 2.5648, "mean_token_accuracy": 0.37931033968925476, "step": 116915 }, { "epoch": 0.11776306578797612, "grad_norm": 7.76158216386975, "learning_rate": 4.943744928172637e-05, "loss": 2.1637, "mean_token_accuracy": 0.4884452521800995, "step": 116920 }, { "epoch": 0.11776810184108029, "grad_norm": 9.517951481718757, "learning_rate": 4.943736601178176e-05, "loss": 2.2113, "mean_token_accuracy": 0.4434361755847931, "step": 116925 }, { "epoch": 0.11777313789418446, "grad_norm": 13.937153001581242, "learning_rate": 4.9437282735752746e-05, "loss": 2.9494, "mean_token_accuracy": 0.3310344785451889, "step": 116930 }, { "epoch": 0.11777817394728864, "grad_norm": 12.714448270026521, "learning_rate": 4.9437199453639336e-05, "loss": 2.1383, "mean_token_accuracy": 0.4620689630508423, "step": 116935 }, { "epoch": 0.11778321000039281, "grad_norm": 10.614352109131126, "learning_rate": 4.9437116165441566e-05, "loss": 2.4785, "mean_token_accuracy": 0.4000000059604645, "step": 116940 }, { "epoch": 0.11778824605349698, "grad_norm": 11.56157409273577, "learning_rate": 4.943703287115946e-05, "loss": 2.6419, "mean_token_accuracy": 0.4068965554237366, "step": 116945 }, { "epoch": 0.11779328210660116, "grad_norm": 10.751747902612573, "learning_rate": 4.943694957079303e-05, "loss": 2.1691, "mean_token_accuracy": 0.47586206793785096, "step": 116950 }, { "epoch": 0.11779831815970533, "grad_norm": 9.273145195229677, "learning_rate": 4.943686626434231e-05, "loss": 2.2331, "mean_token_accuracy": 0.43611615896224976, "step": 116955 }, { "epoch": 0.1178033542128095, "grad_norm": 11.099075430529211, "learning_rate": 4.9436782951807316e-05, "loss": 2.2157, "mean_token_accuracy": 0.43103447556495667, "step": 116960 }, { "epoch": 0.11780839026591368, "grad_norm": 11.020795811539765, "learning_rate": 4.9436699633188066e-05, "loss": 2.3982, "mean_token_accuracy": 0.4206896543502808, "step": 116965 }, { "epoch": 0.11781342631901785, "grad_norm": 11.039962977281919, "learning_rate": 4.94366163084846e-05, "loss": 2.6009, "mean_token_accuracy": 0.37586206793785093, "step": 116970 }, { "epoch": 0.11781846237212203, "grad_norm": 9.696212176677967, "learning_rate": 4.943653297769693e-05, "loss": 2.0978, "mean_token_accuracy": 0.4689655125141144, "step": 116975 }, { "epoch": 0.1178234984252262, "grad_norm": 11.797569378567681, "learning_rate": 4.943644964082508e-05, "loss": 2.5671, "mean_token_accuracy": 0.41379310488700866, "step": 116980 }, { "epoch": 0.11782853447833037, "grad_norm": 10.673206046986072, "learning_rate": 4.943636629786907e-05, "loss": 2.7213, "mean_token_accuracy": 0.3655172407627106, "step": 116985 }, { "epoch": 0.11783357053143453, "grad_norm": 10.94125678942213, "learning_rate": 4.943628294882893e-05, "loss": 3.3002, "mean_token_accuracy": 0.3931034475564957, "step": 116990 }, { "epoch": 0.11783860658453871, "grad_norm": 11.685919606684527, "learning_rate": 4.943619959370469e-05, "loss": 2.4185, "mean_token_accuracy": 0.44827585816383364, "step": 116995 }, { "epoch": 0.11784364263764288, "grad_norm": 9.678789269292205, "learning_rate": 4.943611623249635e-05, "loss": 2.4696, "mean_token_accuracy": 0.42758620381355283, "step": 117000 }, { "epoch": 0.11784867869074706, "grad_norm": 8.748330517344156, "learning_rate": 4.9436032865203945e-05, "loss": 2.655, "mean_token_accuracy": 0.42758620381355283, "step": 117005 }, { "epoch": 0.11785371474385123, "grad_norm": 17.70168653896884, "learning_rate": 4.943594949182751e-05, "loss": 2.7592, "mean_token_accuracy": 0.4310344785451889, "step": 117010 }, { "epoch": 0.1178587507969554, "grad_norm": 10.709325792840621, "learning_rate": 4.9435866112367044e-05, "loss": 2.2808, "mean_token_accuracy": 0.45172412395477296, "step": 117015 }, { "epoch": 0.11786378685005958, "grad_norm": 10.410880206000954, "learning_rate": 4.9435782726822597e-05, "loss": 2.1926, "mean_token_accuracy": 0.4467029690742493, "step": 117020 }, { "epoch": 0.11786882290316375, "grad_norm": 10.399901059749748, "learning_rate": 4.9435699335194175e-05, "loss": 2.0198, "mean_token_accuracy": 0.4724137902259827, "step": 117025 }, { "epoch": 0.11787385895626792, "grad_norm": 9.679408712776091, "learning_rate": 4.94356159374818e-05, "loss": 2.2997, "mean_token_accuracy": 0.4034482717514038, "step": 117030 }, { "epoch": 0.1178788950093721, "grad_norm": 11.190594597219697, "learning_rate": 4.943553253368551e-05, "loss": 2.6156, "mean_token_accuracy": 0.39310344457626345, "step": 117035 }, { "epoch": 0.11788393106247627, "grad_norm": 12.247828846290636, "learning_rate": 4.94354491238053e-05, "loss": 2.4953, "mean_token_accuracy": 0.4034482777118683, "step": 117040 }, { "epoch": 0.11788896711558045, "grad_norm": 11.19305764561484, "learning_rate": 4.943536570784123e-05, "loss": 2.4112, "mean_token_accuracy": 0.4551724076271057, "step": 117045 }, { "epoch": 0.11789400316868462, "grad_norm": 5.6205515226935185, "learning_rate": 4.94352822857933e-05, "loss": 2.1552, "mean_token_accuracy": 0.47426108121871946, "step": 117050 }, { "epoch": 0.11789903922178879, "grad_norm": 10.693146134476525, "learning_rate": 4.9435198857661535e-05, "loss": 2.5307, "mean_token_accuracy": 0.3931034505367279, "step": 117055 }, { "epoch": 0.11790407527489295, "grad_norm": 11.972952121323944, "learning_rate": 4.943511542344596e-05, "loss": 2.1609, "mean_token_accuracy": 0.46551724076271056, "step": 117060 }, { "epoch": 0.11790911132799713, "grad_norm": 10.978410036891553, "learning_rate": 4.9435031983146603e-05, "loss": 2.4684, "mean_token_accuracy": 0.4259528160095215, "step": 117065 }, { "epoch": 0.1179141473811013, "grad_norm": 17.227658551889444, "learning_rate": 4.943494853676348e-05, "loss": 2.5962, "mean_token_accuracy": 0.37241379022598264, "step": 117070 }, { "epoch": 0.11791918343420547, "grad_norm": 8.61719860457293, "learning_rate": 4.943486508429661e-05, "loss": 2.2179, "mean_token_accuracy": 0.44984875321388246, "step": 117075 }, { "epoch": 0.11792421948730965, "grad_norm": 13.023991930899264, "learning_rate": 4.943478162574604e-05, "loss": 3.0853, "mean_token_accuracy": 0.34827586114406583, "step": 117080 }, { "epoch": 0.11792925554041382, "grad_norm": 10.756462401450111, "learning_rate": 4.943469816111176e-05, "loss": 2.1238, "mean_token_accuracy": 0.4517241418361664, "step": 117085 }, { "epoch": 0.117934291593518, "grad_norm": 10.580216514491761, "learning_rate": 4.9434614690393815e-05, "loss": 2.592, "mean_token_accuracy": 0.4634236454963684, "step": 117090 }, { "epoch": 0.11793932764662217, "grad_norm": 9.200668956704202, "learning_rate": 4.9434531213592226e-05, "loss": 2.357, "mean_token_accuracy": 0.4517241299152374, "step": 117095 }, { "epoch": 0.11794436369972634, "grad_norm": 11.751319282570229, "learning_rate": 4.943444773070701e-05, "loss": 2.3512, "mean_token_accuracy": 0.47586206793785096, "step": 117100 }, { "epoch": 0.11794939975283052, "grad_norm": 9.95946497456793, "learning_rate": 4.943436424173819e-05, "loss": 2.2289, "mean_token_accuracy": 0.4448275864124298, "step": 117105 }, { "epoch": 0.11795443580593469, "grad_norm": 14.116453248101966, "learning_rate": 4.943428074668581e-05, "loss": 2.6454, "mean_token_accuracy": 0.39310343861579894, "step": 117110 }, { "epoch": 0.11795947185903886, "grad_norm": 11.529620933914135, "learning_rate": 4.943419724554986e-05, "loss": 2.4068, "mean_token_accuracy": 0.46206897497177124, "step": 117115 }, { "epoch": 0.11796450791214304, "grad_norm": 11.08098099742401, "learning_rate": 4.9434113738330374e-05, "loss": 3.1405, "mean_token_accuracy": 0.4, "step": 117120 }, { "epoch": 0.11796954396524721, "grad_norm": 10.394389551154124, "learning_rate": 4.94340302250274e-05, "loss": 2.3689, "mean_token_accuracy": 0.4172413766384125, "step": 117125 }, { "epoch": 0.11797458001835137, "grad_norm": 10.161269927461255, "learning_rate": 4.9433946705640916e-05, "loss": 2.6113, "mean_token_accuracy": 0.4068965494632721, "step": 117130 }, { "epoch": 0.11797961607145555, "grad_norm": 10.075927651983134, "learning_rate": 4.943386318017099e-05, "loss": 2.1861, "mean_token_accuracy": 0.47931033968925474, "step": 117135 }, { "epoch": 0.11798465212455972, "grad_norm": 10.388338409910235, "learning_rate": 4.943377964861761e-05, "loss": 2.2907, "mean_token_accuracy": 0.4413793087005615, "step": 117140 }, { "epoch": 0.11798968817766389, "grad_norm": 10.3047240369664, "learning_rate": 4.943369611098082e-05, "loss": 2.6871, "mean_token_accuracy": 0.41034482717514037, "step": 117145 }, { "epoch": 0.11799472423076807, "grad_norm": 9.922034938102792, "learning_rate": 4.943361256726064e-05, "loss": 2.5867, "mean_token_accuracy": 0.4034482777118683, "step": 117150 }, { "epoch": 0.11799976028387224, "grad_norm": 9.57198006769421, "learning_rate": 4.9433529017457084e-05, "loss": 2.6722, "mean_token_accuracy": 0.3655172407627106, "step": 117155 }, { "epoch": 0.11800479633697641, "grad_norm": 12.418429027404768, "learning_rate": 4.943344546157019e-05, "loss": 2.5296, "mean_token_accuracy": 0.45172414779663084, "step": 117160 }, { "epoch": 0.11800983239008059, "grad_norm": 11.285319776402508, "learning_rate": 4.9433361899599975e-05, "loss": 2.5, "mean_token_accuracy": 0.41724138259887694, "step": 117165 }, { "epoch": 0.11801486844318476, "grad_norm": 10.226897098447497, "learning_rate": 4.9433278331546456e-05, "loss": 2.1124, "mean_token_accuracy": 0.4931034445762634, "step": 117170 }, { "epoch": 0.11801990449628894, "grad_norm": 12.192806240284487, "learning_rate": 4.9433194757409655e-05, "loss": 2.1581, "mean_token_accuracy": 0.5241379320621491, "step": 117175 }, { "epoch": 0.11802494054939311, "grad_norm": 8.217158210169593, "learning_rate": 4.943311117718961e-05, "loss": 2.4065, "mean_token_accuracy": 0.4103448331356049, "step": 117180 }, { "epoch": 0.11802997660249728, "grad_norm": 10.021513594986747, "learning_rate": 4.9433027590886335e-05, "loss": 2.4971, "mean_token_accuracy": 0.44827585816383364, "step": 117185 }, { "epoch": 0.11803501265560146, "grad_norm": 10.032821865296821, "learning_rate": 4.9432943998499836e-05, "loss": 2.7352, "mean_token_accuracy": 0.4364791214466095, "step": 117190 }, { "epoch": 0.11804004870870563, "grad_norm": 10.687139729635792, "learning_rate": 4.943286040003017e-05, "loss": 2.3678, "mean_token_accuracy": 0.43793103098869324, "step": 117195 }, { "epoch": 0.11804508476180979, "grad_norm": 10.806512773438895, "learning_rate": 4.9432776795477345e-05, "loss": 2.2012, "mean_token_accuracy": 0.45862069725990295, "step": 117200 }, { "epoch": 0.11805012081491396, "grad_norm": 10.987334124490186, "learning_rate": 4.943269318484137e-05, "loss": 2.7004, "mean_token_accuracy": 0.4034482777118683, "step": 117205 }, { "epoch": 0.11805515686801814, "grad_norm": 10.861353474319626, "learning_rate": 4.943260956812229e-05, "loss": 2.7197, "mean_token_accuracy": 0.35172412991523744, "step": 117210 }, { "epoch": 0.11806019292112231, "grad_norm": 9.601023839327263, "learning_rate": 4.9432525945320116e-05, "loss": 2.7831, "mean_token_accuracy": 0.3758620619773865, "step": 117215 }, { "epoch": 0.11806522897422649, "grad_norm": 10.853734434122517, "learning_rate": 4.943244231643487e-05, "loss": 2.5222, "mean_token_accuracy": 0.40514216423034666, "step": 117220 }, { "epoch": 0.11807026502733066, "grad_norm": 10.527451331740146, "learning_rate": 4.9432358681466585e-05, "loss": 2.5907, "mean_token_accuracy": 0.3482758581638336, "step": 117225 }, { "epoch": 0.11807530108043483, "grad_norm": 10.195285015841613, "learning_rate": 4.943227504041528e-05, "loss": 2.1806, "mean_token_accuracy": 0.4882637679576874, "step": 117230 }, { "epoch": 0.118080337133539, "grad_norm": 9.334626967617538, "learning_rate": 4.9432191393280975e-05, "loss": 2.0619, "mean_token_accuracy": 0.5022988557815552, "step": 117235 }, { "epoch": 0.11808537318664318, "grad_norm": 12.024272279862098, "learning_rate": 4.943210774006369e-05, "loss": 1.8735, "mean_token_accuracy": 0.5379310369491577, "step": 117240 }, { "epoch": 0.11809040923974735, "grad_norm": 10.52909907742804, "learning_rate": 4.9432024080763456e-05, "loss": 2.376, "mean_token_accuracy": 0.42068964838981626, "step": 117245 }, { "epoch": 0.11809544529285153, "grad_norm": 12.035014513484528, "learning_rate": 4.9431940415380295e-05, "loss": 2.1957, "mean_token_accuracy": 0.4413793087005615, "step": 117250 }, { "epoch": 0.1181004813459557, "grad_norm": 9.578910746855403, "learning_rate": 4.9431856743914224e-05, "loss": 2.2358, "mean_token_accuracy": 0.45517241954803467, "step": 117255 }, { "epoch": 0.11810551739905988, "grad_norm": 10.68387916337257, "learning_rate": 4.943177306636527e-05, "loss": 2.233, "mean_token_accuracy": 0.4366001129150391, "step": 117260 }, { "epoch": 0.11811055345216404, "grad_norm": 10.3284038067129, "learning_rate": 4.943168938273347e-05, "loss": 2.0098, "mean_token_accuracy": 0.47931034564971925, "step": 117265 }, { "epoch": 0.11811558950526821, "grad_norm": 10.892552848245662, "learning_rate": 4.943160569301882e-05, "loss": 2.449, "mean_token_accuracy": 0.42068966031074523, "step": 117270 }, { "epoch": 0.11812062555837238, "grad_norm": 10.6957681776383, "learning_rate": 4.943152199722137e-05, "loss": 2.693, "mean_token_accuracy": 0.40689654350280763, "step": 117275 }, { "epoch": 0.11812566161147656, "grad_norm": 11.610918461643395, "learning_rate": 4.9431438295341123e-05, "loss": 2.7006, "mean_token_accuracy": 0.3793103456497192, "step": 117280 }, { "epoch": 0.11813069766458073, "grad_norm": 13.23784821915778, "learning_rate": 4.9431354587378104e-05, "loss": 2.9336, "mean_token_accuracy": 0.37241379618644715, "step": 117285 }, { "epoch": 0.1181357337176849, "grad_norm": 11.482730018297387, "learning_rate": 4.943127087333235e-05, "loss": 3.1177, "mean_token_accuracy": 0.4085299432277679, "step": 117290 }, { "epoch": 0.11814076977078908, "grad_norm": 13.683177575608765, "learning_rate": 4.943118715320388e-05, "loss": 2.5691, "mean_token_accuracy": 0.42758620381355283, "step": 117295 }, { "epoch": 0.11814580582389325, "grad_norm": 10.762445831827403, "learning_rate": 4.94311034269927e-05, "loss": 2.3604, "mean_token_accuracy": 0.44827585816383364, "step": 117300 }, { "epoch": 0.11815084187699743, "grad_norm": 9.783928643717415, "learning_rate": 4.9431019694698856e-05, "loss": 2.4328, "mean_token_accuracy": 0.43448275327682495, "step": 117305 }, { "epoch": 0.1181558779301016, "grad_norm": 9.329378896819275, "learning_rate": 4.943093595632236e-05, "loss": 2.2028, "mean_token_accuracy": 0.458620685338974, "step": 117310 }, { "epoch": 0.11816091398320577, "grad_norm": 11.55787108820168, "learning_rate": 4.943085221186323e-05, "loss": 2.5845, "mean_token_accuracy": 0.4103448212146759, "step": 117315 }, { "epoch": 0.11816595003630995, "grad_norm": 11.236494829853308, "learning_rate": 4.943076846132151e-05, "loss": 2.642, "mean_token_accuracy": 0.4116757333278656, "step": 117320 }, { "epoch": 0.11817098608941412, "grad_norm": 10.767562652970813, "learning_rate": 4.94306847046972e-05, "loss": 2.4351, "mean_token_accuracy": 0.4068965554237366, "step": 117325 }, { "epoch": 0.1181760221425183, "grad_norm": 14.274857540631332, "learning_rate": 4.943060094199034e-05, "loss": 2.4518, "mean_token_accuracy": 0.4671506345272064, "step": 117330 }, { "epoch": 0.11818105819562245, "grad_norm": 10.392036178706148, "learning_rate": 4.943051717320094e-05, "loss": 2.2696, "mean_token_accuracy": 0.4413793087005615, "step": 117335 }, { "epoch": 0.11818609424872663, "grad_norm": 11.054529643493035, "learning_rate": 4.943043339832903e-05, "loss": 2.4521, "mean_token_accuracy": 0.35862069129943847, "step": 117340 }, { "epoch": 0.1181911303018308, "grad_norm": 10.887870657814348, "learning_rate": 4.9430349617374635e-05, "loss": 2.4558, "mean_token_accuracy": 0.41034482717514037, "step": 117345 }, { "epoch": 0.11819616635493498, "grad_norm": 11.916027275345636, "learning_rate": 4.9430265830337776e-05, "loss": 2.6069, "mean_token_accuracy": 0.417241370677948, "step": 117350 }, { "epoch": 0.11820120240803915, "grad_norm": 12.681660600658997, "learning_rate": 4.9430182037218476e-05, "loss": 2.1111, "mean_token_accuracy": 0.5194797456264496, "step": 117355 }, { "epoch": 0.11820623846114332, "grad_norm": 11.660412767454138, "learning_rate": 4.943009823801676e-05, "loss": 2.2791, "mean_token_accuracy": 0.4366606116294861, "step": 117360 }, { "epoch": 0.1182112745142475, "grad_norm": 12.597235078045887, "learning_rate": 4.9430014432732644e-05, "loss": 2.5437, "mean_token_accuracy": 0.4413793087005615, "step": 117365 }, { "epoch": 0.11821631056735167, "grad_norm": 10.116875910105714, "learning_rate": 4.942993062136615e-05, "loss": 2.8002, "mean_token_accuracy": 0.44137930274009707, "step": 117370 }, { "epoch": 0.11822134662045584, "grad_norm": 9.784215524689278, "learning_rate": 4.9429846803917315e-05, "loss": 2.3437, "mean_token_accuracy": 0.458620685338974, "step": 117375 }, { "epoch": 0.11822638267356002, "grad_norm": 11.574574742106151, "learning_rate": 4.942976298038616e-05, "loss": 2.4179, "mean_token_accuracy": 0.46551724076271056, "step": 117380 }, { "epoch": 0.11823141872666419, "grad_norm": 10.16596423409803, "learning_rate": 4.94296791507727e-05, "loss": 2.009, "mean_token_accuracy": 0.482758617401123, "step": 117385 }, { "epoch": 0.11823645477976837, "grad_norm": 11.13949715530086, "learning_rate": 4.942959531507696e-05, "loss": 2.5864, "mean_token_accuracy": 0.41724138259887694, "step": 117390 }, { "epoch": 0.11824149083287254, "grad_norm": 11.017517344902009, "learning_rate": 4.9429511473298973e-05, "loss": 3.1637, "mean_token_accuracy": 0.3758620709180832, "step": 117395 }, { "epoch": 0.11824652688597671, "grad_norm": 11.264072718340433, "learning_rate": 4.9429427625438746e-05, "loss": 2.3694, "mean_token_accuracy": 0.3827586233615875, "step": 117400 }, { "epoch": 0.11825156293908087, "grad_norm": 10.684505736246438, "learning_rate": 4.942934377149631e-05, "loss": 2.3108, "mean_token_accuracy": 0.44827585816383364, "step": 117405 }, { "epoch": 0.11825659899218505, "grad_norm": 8.699446510608043, "learning_rate": 4.942925991147169e-05, "loss": 2.4175, "mean_token_accuracy": 0.4413793087005615, "step": 117410 }, { "epoch": 0.11826163504528922, "grad_norm": 9.259399237925509, "learning_rate": 4.9429176045364915e-05, "loss": 2.0575, "mean_token_accuracy": 0.4620689570903778, "step": 117415 }, { "epoch": 0.1182666710983934, "grad_norm": 11.25169555740929, "learning_rate": 4.942909217317599e-05, "loss": 2.6031, "mean_token_accuracy": 0.3862069010734558, "step": 117420 }, { "epoch": 0.11827170715149757, "grad_norm": 8.369906799755208, "learning_rate": 4.942900829490496e-05, "loss": 2.3816, "mean_token_accuracy": 0.4620689690113068, "step": 117425 }, { "epoch": 0.11827674320460174, "grad_norm": 12.00864604398708, "learning_rate": 4.942892441055183e-05, "loss": 2.5364, "mean_token_accuracy": 0.3862068891525269, "step": 117430 }, { "epoch": 0.11828177925770592, "grad_norm": 13.424480548541231, "learning_rate": 4.942884052011663e-05, "loss": 2.7027, "mean_token_accuracy": 0.39310343861579894, "step": 117435 }, { "epoch": 0.11828681531081009, "grad_norm": 11.532548787149036, "learning_rate": 4.9428756623599394e-05, "loss": 2.7056, "mean_token_accuracy": 0.33448276221752166, "step": 117440 }, { "epoch": 0.11829185136391426, "grad_norm": 15.206613147743617, "learning_rate": 4.942867272100013e-05, "loss": 2.7865, "mean_token_accuracy": 0.40344826579093934, "step": 117445 }, { "epoch": 0.11829688741701844, "grad_norm": 9.009886833345593, "learning_rate": 4.9428588812318864e-05, "loss": 2.6155, "mean_token_accuracy": 0.4137930989265442, "step": 117450 }, { "epoch": 0.11830192347012261, "grad_norm": 9.085238104629758, "learning_rate": 4.942850489755563e-05, "loss": 2.1024, "mean_token_accuracy": 0.5275862038135528, "step": 117455 }, { "epoch": 0.11830695952322678, "grad_norm": 10.336552955561045, "learning_rate": 4.942842097671043e-05, "loss": 2.7042, "mean_token_accuracy": 0.3931034505367279, "step": 117460 }, { "epoch": 0.11831199557633096, "grad_norm": 10.179359474179078, "learning_rate": 4.942833704978331e-05, "loss": 2.4306, "mean_token_accuracy": 0.43793103098869324, "step": 117465 }, { "epoch": 0.11831703162943513, "grad_norm": 13.22413514023442, "learning_rate": 4.942825311677428e-05, "loss": 2.9233, "mean_token_accuracy": 0.3827586233615875, "step": 117470 }, { "epoch": 0.11832206768253929, "grad_norm": 9.05398544786203, "learning_rate": 4.942816917768337e-05, "loss": 2.3322, "mean_token_accuracy": 0.4344827592372894, "step": 117475 }, { "epoch": 0.11832710373564347, "grad_norm": 13.285777591973783, "learning_rate": 4.94280852325106e-05, "loss": 2.1749, "mean_token_accuracy": 0.43793103098869324, "step": 117480 }, { "epoch": 0.11833213978874764, "grad_norm": 11.856534996611979, "learning_rate": 4.9428001281255994e-05, "loss": 2.1179, "mean_token_accuracy": 0.4671182245016098, "step": 117485 }, { "epoch": 0.11833717584185181, "grad_norm": 7.868162655609711, "learning_rate": 4.9427917323919576e-05, "loss": 2.3885, "mean_token_accuracy": 0.4589231610298157, "step": 117490 }, { "epoch": 0.11834221189495599, "grad_norm": 9.978699211146864, "learning_rate": 4.942783336050136e-05, "loss": 2.2539, "mean_token_accuracy": 0.4206896424293518, "step": 117495 }, { "epoch": 0.11834724794806016, "grad_norm": 9.328487121688646, "learning_rate": 4.942774939100139e-05, "loss": 2.6718, "mean_token_accuracy": 0.36896551251411436, "step": 117500 }, { "epoch": 0.11835228400116433, "grad_norm": 10.846124902863966, "learning_rate": 4.942766541541967e-05, "loss": 2.1751, "mean_token_accuracy": 0.4241379380226135, "step": 117505 }, { "epoch": 0.11835732005426851, "grad_norm": 10.879567136373531, "learning_rate": 4.942758143375623e-05, "loss": 2.3471, "mean_token_accuracy": 0.4137930989265442, "step": 117510 }, { "epoch": 0.11836235610737268, "grad_norm": 9.427494804127166, "learning_rate": 4.94274974460111e-05, "loss": 2.2597, "mean_token_accuracy": 0.3999999940395355, "step": 117515 }, { "epoch": 0.11836739216047686, "grad_norm": 9.78666787103167, "learning_rate": 4.942741345218429e-05, "loss": 2.3376, "mean_token_accuracy": 0.42758620977401735, "step": 117520 }, { "epoch": 0.11837242821358103, "grad_norm": 10.318343540428916, "learning_rate": 4.942732945227583e-05, "loss": 2.4714, "mean_token_accuracy": 0.4034482777118683, "step": 117525 }, { "epoch": 0.1183774642666852, "grad_norm": 9.002988706579933, "learning_rate": 4.9427245446285746e-05, "loss": 2.3321, "mean_token_accuracy": 0.42068964838981626, "step": 117530 }, { "epoch": 0.11838250031978938, "grad_norm": 9.600980834146766, "learning_rate": 4.9427161434214066e-05, "loss": 2.2218, "mean_token_accuracy": 0.4551724076271057, "step": 117535 }, { "epoch": 0.11838753637289355, "grad_norm": 13.096164722424598, "learning_rate": 4.942707741606079e-05, "loss": 2.4372, "mean_token_accuracy": 0.4034482717514038, "step": 117540 }, { "epoch": 0.11839257242599771, "grad_norm": 10.407059675104497, "learning_rate": 4.942699339182596e-05, "loss": 2.534, "mean_token_accuracy": 0.4, "step": 117545 }, { "epoch": 0.11839760847910188, "grad_norm": 11.990428529160273, "learning_rate": 4.942690936150961e-05, "loss": 2.8298, "mean_token_accuracy": 0.37241379022598264, "step": 117550 }, { "epoch": 0.11840264453220606, "grad_norm": 10.56402549219765, "learning_rate": 4.9426825325111744e-05, "loss": 2.8074, "mean_token_accuracy": 0.3482758581638336, "step": 117555 }, { "epoch": 0.11840768058531023, "grad_norm": 10.793341755636009, "learning_rate": 4.942674128263239e-05, "loss": 2.5085, "mean_token_accuracy": 0.4448275864124298, "step": 117560 }, { "epoch": 0.1184127166384144, "grad_norm": 8.77722281746387, "learning_rate": 4.9426657234071566e-05, "loss": 2.4511, "mean_token_accuracy": 0.3896551787853241, "step": 117565 }, { "epoch": 0.11841775269151858, "grad_norm": 10.761729214711456, "learning_rate": 4.942657317942931e-05, "loss": 2.1193, "mean_token_accuracy": 0.43793103098869324, "step": 117570 }, { "epoch": 0.11842278874462275, "grad_norm": 20.087335454805636, "learning_rate": 4.942648911870563e-05, "loss": 2.4982, "mean_token_accuracy": 0.43266788125038147, "step": 117575 }, { "epoch": 0.11842782479772693, "grad_norm": 9.513108457658975, "learning_rate": 4.942640505190057e-05, "loss": 2.1642, "mean_token_accuracy": 0.42758620381355283, "step": 117580 }, { "epoch": 0.1184328608508311, "grad_norm": 13.829662132590203, "learning_rate": 4.942632097901413e-05, "loss": 2.4177, "mean_token_accuracy": 0.4068965524435043, "step": 117585 }, { "epoch": 0.11843789690393527, "grad_norm": 11.000197874428952, "learning_rate": 4.9426236900046344e-05, "loss": 2.404, "mean_token_accuracy": 0.4103448331356049, "step": 117590 }, { "epoch": 0.11844293295703945, "grad_norm": 12.16239773865491, "learning_rate": 4.9426152814997226e-05, "loss": 2.2412, "mean_token_accuracy": 0.4448275864124298, "step": 117595 }, { "epoch": 0.11844796901014362, "grad_norm": 12.171884762290903, "learning_rate": 4.942606872386681e-05, "loss": 2.8885, "mean_token_accuracy": 0.41034482717514037, "step": 117600 }, { "epoch": 0.1184530050632478, "grad_norm": 10.067866480557523, "learning_rate": 4.9425984626655134e-05, "loss": 2.5273, "mean_token_accuracy": 0.3827586233615875, "step": 117605 }, { "epoch": 0.11845804111635197, "grad_norm": 11.085957033618337, "learning_rate": 4.942590052336219e-05, "loss": 2.2385, "mean_token_accuracy": 0.4448275864124298, "step": 117610 }, { "epoch": 0.11846307716945613, "grad_norm": 12.088177247722781, "learning_rate": 4.942581641398802e-05, "loss": 2.6982, "mean_token_accuracy": 0.4137930989265442, "step": 117615 }, { "epoch": 0.1184681132225603, "grad_norm": 9.04376959477072, "learning_rate": 4.942573229853264e-05, "loss": 2.3519, "mean_token_accuracy": 0.441379314661026, "step": 117620 }, { "epoch": 0.11847314927566448, "grad_norm": 8.741414380852834, "learning_rate": 4.942564817699608e-05, "loss": 2.42, "mean_token_accuracy": 0.3827586203813553, "step": 117625 }, { "epoch": 0.11847818532876865, "grad_norm": 11.81567649432294, "learning_rate": 4.942556404937836e-05, "loss": 2.0234, "mean_token_accuracy": 0.4862068951129913, "step": 117630 }, { "epoch": 0.11848322138187282, "grad_norm": 12.698237896206757, "learning_rate": 4.94254799156795e-05, "loss": 2.447, "mean_token_accuracy": 0.4137930929660797, "step": 117635 }, { "epoch": 0.118488257434977, "grad_norm": 7.251000999991585, "learning_rate": 4.942539577589952e-05, "loss": 2.1174, "mean_token_accuracy": 0.47586206793785096, "step": 117640 }, { "epoch": 0.11849329348808117, "grad_norm": 13.115054723603714, "learning_rate": 4.942531163003846e-05, "loss": 2.4662, "mean_token_accuracy": 0.42758620977401735, "step": 117645 }, { "epoch": 0.11849832954118535, "grad_norm": 11.066373482963813, "learning_rate": 4.942522747809633e-05, "loss": 2.4682, "mean_token_accuracy": 0.4344827622175217, "step": 117650 }, { "epoch": 0.11850336559428952, "grad_norm": 11.485186188581066, "learning_rate": 4.9425143320073164e-05, "loss": 2.371, "mean_token_accuracy": 0.4344827592372894, "step": 117655 }, { "epoch": 0.1185084016473937, "grad_norm": 10.44216665598516, "learning_rate": 4.9425059155968964e-05, "loss": 2.3079, "mean_token_accuracy": 0.4640653312206268, "step": 117660 }, { "epoch": 0.11851343770049787, "grad_norm": 11.114178055563702, "learning_rate": 4.942497498578377e-05, "loss": 2.1642, "mean_token_accuracy": 0.4275861978530884, "step": 117665 }, { "epoch": 0.11851847375360204, "grad_norm": 10.757586622165972, "learning_rate": 4.94248908095176e-05, "loss": 2.4873, "mean_token_accuracy": 0.4172413766384125, "step": 117670 }, { "epoch": 0.11852350980670621, "grad_norm": 10.001279349511824, "learning_rate": 4.9424806627170494e-05, "loss": 2.103, "mean_token_accuracy": 0.5206896424293518, "step": 117675 }, { "epoch": 0.11852854585981039, "grad_norm": 11.860519748174326, "learning_rate": 4.9424722438742455e-05, "loss": 2.4628, "mean_token_accuracy": 0.3808832406997681, "step": 117680 }, { "epoch": 0.11853358191291455, "grad_norm": 13.110566053895788, "learning_rate": 4.942463824423351e-05, "loss": 2.5673, "mean_token_accuracy": 0.4344827592372894, "step": 117685 }, { "epoch": 0.11853861796601872, "grad_norm": 13.694202556480725, "learning_rate": 4.9424554043643686e-05, "loss": 2.8296, "mean_token_accuracy": 0.3655172407627106, "step": 117690 }, { "epoch": 0.1185436540191229, "grad_norm": 11.035358788701384, "learning_rate": 4.9424469836973004e-05, "loss": 2.5899, "mean_token_accuracy": 0.38275861740112305, "step": 117695 }, { "epoch": 0.11854869007222707, "grad_norm": 10.123509006224824, "learning_rate": 4.9424385624221486e-05, "loss": 2.0671, "mean_token_accuracy": 0.4896551787853241, "step": 117700 }, { "epoch": 0.11855372612533124, "grad_norm": 9.839207093390044, "learning_rate": 4.942430140538916e-05, "loss": 2.5789, "mean_token_accuracy": 0.4068965494632721, "step": 117705 }, { "epoch": 0.11855876217843542, "grad_norm": 11.423751810183264, "learning_rate": 4.942421718047605e-05, "loss": 2.2464, "mean_token_accuracy": 0.4310344815254211, "step": 117710 }, { "epoch": 0.11856379823153959, "grad_norm": 10.284298461641127, "learning_rate": 4.942413294948217e-05, "loss": 2.4385, "mean_token_accuracy": 0.3862069010734558, "step": 117715 }, { "epoch": 0.11856883428464376, "grad_norm": 12.764844554029859, "learning_rate": 4.9424048712407554e-05, "loss": 2.5734, "mean_token_accuracy": 0.3965517282485962, "step": 117720 }, { "epoch": 0.11857387033774794, "grad_norm": 12.938438140374915, "learning_rate": 4.942396446925222e-05, "loss": 2.4212, "mean_token_accuracy": 0.44137929677963256, "step": 117725 }, { "epoch": 0.11857890639085211, "grad_norm": 11.029249170841775, "learning_rate": 4.9423880220016196e-05, "loss": 2.6026, "mean_token_accuracy": 0.4, "step": 117730 }, { "epoch": 0.11858394244395629, "grad_norm": 9.138235091635813, "learning_rate": 4.94237959646995e-05, "loss": 2.5885, "mean_token_accuracy": 0.3310344785451889, "step": 117735 }, { "epoch": 0.11858897849706046, "grad_norm": 9.311674742558143, "learning_rate": 4.942371170330216e-05, "loss": 2.2073, "mean_token_accuracy": 0.4689655125141144, "step": 117740 }, { "epoch": 0.11859401455016463, "grad_norm": 10.121723998478167, "learning_rate": 4.942362743582419e-05, "loss": 2.5153, "mean_token_accuracy": 0.43635813891887665, "step": 117745 }, { "epoch": 0.11859905060326881, "grad_norm": 11.286087147679487, "learning_rate": 4.9423543162265624e-05, "loss": 2.3436, "mean_token_accuracy": 0.4620689630508423, "step": 117750 }, { "epoch": 0.11860408665637297, "grad_norm": 9.099147944692174, "learning_rate": 4.942345888262649e-05, "loss": 2.5682, "mean_token_accuracy": 0.3931034505367279, "step": 117755 }, { "epoch": 0.11860912270947714, "grad_norm": 10.335274921440492, "learning_rate": 4.942337459690679e-05, "loss": 1.9992, "mean_token_accuracy": 0.4862069010734558, "step": 117760 }, { "epoch": 0.11861415876258131, "grad_norm": 9.88447416569921, "learning_rate": 4.9423290305106564e-05, "loss": 1.995, "mean_token_accuracy": 0.5310344815254211, "step": 117765 }, { "epoch": 0.11861919481568549, "grad_norm": 10.34007925251923, "learning_rate": 4.942320600722583e-05, "loss": 2.1413, "mean_token_accuracy": 0.452873569726944, "step": 117770 }, { "epoch": 0.11862423086878966, "grad_norm": 10.102537843597744, "learning_rate": 4.942312170326461e-05, "loss": 2.4418, "mean_token_accuracy": 0.41034482419490814, "step": 117775 }, { "epoch": 0.11862926692189384, "grad_norm": 8.945151886829414, "learning_rate": 4.942303739322294e-05, "loss": 2.1897, "mean_token_accuracy": 0.4517241358757019, "step": 117780 }, { "epoch": 0.11863430297499801, "grad_norm": 8.971994918877126, "learning_rate": 4.942295307710083e-05, "loss": 2.2438, "mean_token_accuracy": 0.4931034505367279, "step": 117785 }, { "epoch": 0.11863933902810218, "grad_norm": 11.420219573034966, "learning_rate": 4.942286875489831e-05, "loss": 2.2393, "mean_token_accuracy": 0.49999999403953554, "step": 117790 }, { "epoch": 0.11864437508120636, "grad_norm": 11.171050385377344, "learning_rate": 4.94227844266154e-05, "loss": 2.7947, "mean_token_accuracy": 0.3738656997680664, "step": 117795 }, { "epoch": 0.11864941113431053, "grad_norm": 13.69795820936663, "learning_rate": 4.942270009225212e-05, "loss": 2.4582, "mean_token_accuracy": 0.3965517163276672, "step": 117800 }, { "epoch": 0.1186544471874147, "grad_norm": 11.448656436044338, "learning_rate": 4.94226157518085e-05, "loss": 2.1673, "mean_token_accuracy": 0.4482758641242981, "step": 117805 }, { "epoch": 0.11865948324051888, "grad_norm": 8.190796980509887, "learning_rate": 4.942253140528456e-05, "loss": 2.1994, "mean_token_accuracy": 0.4896551787853241, "step": 117810 }, { "epoch": 0.11866451929362305, "grad_norm": 11.821608159128315, "learning_rate": 4.942244705268032e-05, "loss": 2.7316, "mean_token_accuracy": 0.38082274198532107, "step": 117815 }, { "epoch": 0.11866955534672723, "grad_norm": 10.125852585041727, "learning_rate": 4.942236269399581e-05, "loss": 2.4308, "mean_token_accuracy": 0.39655172228813174, "step": 117820 }, { "epoch": 0.11867459139983139, "grad_norm": 10.849466719655965, "learning_rate": 4.942227832923105e-05, "loss": 2.6407, "mean_token_accuracy": 0.37241379618644715, "step": 117825 }, { "epoch": 0.11867962745293556, "grad_norm": 12.085639964009205, "learning_rate": 4.942219395838608e-05, "loss": 2.5136, "mean_token_accuracy": 0.4181034445762634, "step": 117830 }, { "epoch": 0.11868466350603973, "grad_norm": 11.125941227921238, "learning_rate": 4.942210958146089e-05, "loss": 2.4519, "mean_token_accuracy": 0.38965516686439516, "step": 117835 }, { "epoch": 0.11868969955914391, "grad_norm": 10.139745065097202, "learning_rate": 4.942202519845552e-05, "loss": 2.4342, "mean_token_accuracy": 0.42068966031074523, "step": 117840 }, { "epoch": 0.11869473561224808, "grad_norm": 11.923264426840769, "learning_rate": 4.942194080937e-05, "loss": 2.4822, "mean_token_accuracy": 0.39655172228813174, "step": 117845 }, { "epoch": 0.11869977166535225, "grad_norm": 11.146289536686972, "learning_rate": 4.942185641420435e-05, "loss": 2.1482, "mean_token_accuracy": 0.4655172526836395, "step": 117850 }, { "epoch": 0.11870480771845643, "grad_norm": 9.368933974324925, "learning_rate": 4.9421772012958586e-05, "loss": 2.7444, "mean_token_accuracy": 0.4034482777118683, "step": 117855 }, { "epoch": 0.1187098437715606, "grad_norm": 10.941368930278218, "learning_rate": 4.942168760563274e-05, "loss": 2.4808, "mean_token_accuracy": 0.42068966031074523, "step": 117860 }, { "epoch": 0.11871487982466478, "grad_norm": 12.32080165588529, "learning_rate": 4.942160319222683e-05, "loss": 2.5743, "mean_token_accuracy": 0.39310344457626345, "step": 117865 }, { "epoch": 0.11871991587776895, "grad_norm": 10.220284658427996, "learning_rate": 4.942151877274089e-05, "loss": 2.5447, "mean_token_accuracy": 0.3862068891525269, "step": 117870 }, { "epoch": 0.11872495193087312, "grad_norm": 9.61090673585351, "learning_rate": 4.942143434717492e-05, "loss": 1.889, "mean_token_accuracy": 0.5179802894592285, "step": 117875 }, { "epoch": 0.1187299879839773, "grad_norm": 10.631401272861194, "learning_rate": 4.942134991552897e-05, "loss": 2.0979, "mean_token_accuracy": 0.47586206197738645, "step": 117880 }, { "epoch": 0.11873502403708147, "grad_norm": 10.53666213770821, "learning_rate": 4.942126547780305e-05, "loss": 2.4646, "mean_token_accuracy": 0.39310345649719236, "step": 117885 }, { "epoch": 0.11874006009018565, "grad_norm": 16.50154761240891, "learning_rate": 4.942118103399719e-05, "loss": 2.3266, "mean_token_accuracy": 0.4620689630508423, "step": 117890 }, { "epoch": 0.1187450961432898, "grad_norm": 10.434859789789655, "learning_rate": 4.9421096584111396e-05, "loss": 2.5767, "mean_token_accuracy": 0.4068965554237366, "step": 117895 }, { "epoch": 0.11875013219639398, "grad_norm": 9.674231512248724, "learning_rate": 4.942101212814572e-05, "loss": 2.326, "mean_token_accuracy": 0.4413793087005615, "step": 117900 }, { "epoch": 0.11875516824949815, "grad_norm": 9.049887517878883, "learning_rate": 4.942092766610016e-05, "loss": 3.1305, "mean_token_accuracy": 0.4019963666796684, "step": 117905 }, { "epoch": 0.11876020430260233, "grad_norm": 9.117668473827552, "learning_rate": 4.9420843197974745e-05, "loss": 2.3043, "mean_token_accuracy": 0.42413792610168455, "step": 117910 }, { "epoch": 0.1187652403557065, "grad_norm": 17.52025506560329, "learning_rate": 4.9420758723769506e-05, "loss": 2.7826, "mean_token_accuracy": 0.42758620381355283, "step": 117915 }, { "epoch": 0.11877027640881067, "grad_norm": 9.853902032540526, "learning_rate": 4.942067424348447e-05, "loss": 2.7749, "mean_token_accuracy": 0.43103447556495667, "step": 117920 }, { "epoch": 0.11877531246191485, "grad_norm": 8.682800140539221, "learning_rate": 4.9420589757119637e-05, "loss": 3.1315, "mean_token_accuracy": 0.3551724135875702, "step": 117925 }, { "epoch": 0.11878034851501902, "grad_norm": 9.935166064346944, "learning_rate": 4.942050526467506e-05, "loss": 2.3773, "mean_token_accuracy": 0.42413792610168455, "step": 117930 }, { "epoch": 0.1187853845681232, "grad_norm": 9.531610144893307, "learning_rate": 4.942042076615074e-05, "loss": 2.2526, "mean_token_accuracy": 0.46551724076271056, "step": 117935 }, { "epoch": 0.11879042062122737, "grad_norm": 9.064302013788339, "learning_rate": 4.9420336261546715e-05, "loss": 2.2141, "mean_token_accuracy": 0.4586207032203674, "step": 117940 }, { "epoch": 0.11879545667433154, "grad_norm": 10.62548195728122, "learning_rate": 4.9420251750863006e-05, "loss": 2.4602, "mean_token_accuracy": 0.4620689690113068, "step": 117945 }, { "epoch": 0.11880049272743572, "grad_norm": 11.34577112182543, "learning_rate": 4.9420167234099627e-05, "loss": 2.6009, "mean_token_accuracy": 0.38275861740112305, "step": 117950 }, { "epoch": 0.11880552878053989, "grad_norm": 10.812351520362958, "learning_rate": 4.942008271125662e-05, "loss": 2.5879, "mean_token_accuracy": 0.4122806966304779, "step": 117955 }, { "epoch": 0.11881056483364406, "grad_norm": 10.109293415462771, "learning_rate": 4.9419998182333985e-05, "loss": 2.4105, "mean_token_accuracy": 0.4068965554237366, "step": 117960 }, { "epoch": 0.11881560088674822, "grad_norm": 10.850329048416622, "learning_rate": 4.941991364733176e-05, "loss": 2.538, "mean_token_accuracy": 0.3655172407627106, "step": 117965 }, { "epoch": 0.1188206369398524, "grad_norm": 10.164670770448964, "learning_rate": 4.941982910624996e-05, "loss": 2.1548, "mean_token_accuracy": 0.5379310369491577, "step": 117970 }, { "epoch": 0.11882567299295657, "grad_norm": 10.978882418264059, "learning_rate": 4.9419744559088614e-05, "loss": 2.5366, "mean_token_accuracy": 0.43448275327682495, "step": 117975 }, { "epoch": 0.11883070904606075, "grad_norm": 7.394035147359939, "learning_rate": 4.9419660005847754e-05, "loss": 1.9128, "mean_token_accuracy": 0.5013546824455262, "step": 117980 }, { "epoch": 0.11883574509916492, "grad_norm": 10.371970959692444, "learning_rate": 4.941957544652739e-05, "loss": 2.9192, "mean_token_accuracy": 0.29655171632766725, "step": 117985 }, { "epoch": 0.11884078115226909, "grad_norm": 10.143870509519564, "learning_rate": 4.941949088112755e-05, "loss": 2.4329, "mean_token_accuracy": 0.41034482717514037, "step": 117990 }, { "epoch": 0.11884581720537327, "grad_norm": 10.791012924625877, "learning_rate": 4.9419406309648255e-05, "loss": 2.5678, "mean_token_accuracy": 0.40344826579093934, "step": 117995 }, { "epoch": 0.11885085325847744, "grad_norm": 11.779664692385717, "learning_rate": 4.941932173208953e-05, "loss": 2.6919, "mean_token_accuracy": 0.3655172407627106, "step": 118000 }, { "epoch": 0.11885588931158161, "grad_norm": 10.210680346041746, "learning_rate": 4.9419237148451405e-05, "loss": 2.4755, "mean_token_accuracy": 0.38275861740112305, "step": 118005 }, { "epoch": 0.11886092536468579, "grad_norm": 11.861469167534205, "learning_rate": 4.94191525587339e-05, "loss": 2.5803, "mean_token_accuracy": 0.3827586233615875, "step": 118010 }, { "epoch": 0.11886596141778996, "grad_norm": 7.677105491402499, "learning_rate": 4.941906796293703e-05, "loss": 1.9268, "mean_token_accuracy": 0.49534180760383606, "step": 118015 }, { "epoch": 0.11887099747089414, "grad_norm": 10.084756005372782, "learning_rate": 4.941898336106083e-05, "loss": 2.146, "mean_token_accuracy": 0.4724137902259827, "step": 118020 }, { "epoch": 0.11887603352399831, "grad_norm": 8.677621099266126, "learning_rate": 4.941889875310531e-05, "loss": 2.0427, "mean_token_accuracy": 0.47586206793785096, "step": 118025 }, { "epoch": 0.11888106957710248, "grad_norm": 9.983090439914552, "learning_rate": 4.9418814139070504e-05, "loss": 2.3807, "mean_token_accuracy": 0.42758620381355283, "step": 118030 }, { "epoch": 0.11888610563020664, "grad_norm": 9.523396300499368, "learning_rate": 4.9418729518956434e-05, "loss": 2.5352, "mean_token_accuracy": 0.37586207389831544, "step": 118035 }, { "epoch": 0.11889114168331082, "grad_norm": 9.439288814639417, "learning_rate": 4.9418644892763126e-05, "loss": 2.3616, "mean_token_accuracy": 0.43254687786102297, "step": 118040 }, { "epoch": 0.11889617773641499, "grad_norm": 9.97860023703339, "learning_rate": 4.9418560260490596e-05, "loss": 2.5781, "mean_token_accuracy": 0.41379310488700866, "step": 118045 }, { "epoch": 0.11890121378951916, "grad_norm": 9.273839606992649, "learning_rate": 4.9418475622138876e-05, "loss": 2.0821, "mean_token_accuracy": 0.3793103456497192, "step": 118050 }, { "epoch": 0.11890624984262334, "grad_norm": 9.248461754615594, "learning_rate": 4.941839097770798e-05, "loss": 2.4006, "mean_token_accuracy": 0.3827586233615875, "step": 118055 }, { "epoch": 0.11891128589572751, "grad_norm": 10.561315303283921, "learning_rate": 4.9418306327197935e-05, "loss": 2.4693, "mean_token_accuracy": 0.41379310488700866, "step": 118060 }, { "epoch": 0.11891632194883169, "grad_norm": 10.537358804239489, "learning_rate": 4.9418221670608776e-05, "loss": 2.6695, "mean_token_accuracy": 0.37931033968925476, "step": 118065 }, { "epoch": 0.11892135800193586, "grad_norm": 11.036825697979381, "learning_rate": 4.941813700794051e-05, "loss": 2.2234, "mean_token_accuracy": 0.4482758641242981, "step": 118070 }, { "epoch": 0.11892639405504003, "grad_norm": 8.878387545852805, "learning_rate": 4.941805233919317e-05, "loss": 2.4538, "mean_token_accuracy": 0.4172413766384125, "step": 118075 }, { "epoch": 0.1189314301081442, "grad_norm": 9.625659053447226, "learning_rate": 4.941796766436677e-05, "loss": 2.6818, "mean_token_accuracy": 0.4103448212146759, "step": 118080 }, { "epoch": 0.11893646616124838, "grad_norm": 12.540810744088365, "learning_rate": 4.941788298346134e-05, "loss": 2.5207, "mean_token_accuracy": 0.41379311084747317, "step": 118085 }, { "epoch": 0.11894150221435255, "grad_norm": 11.855896783512279, "learning_rate": 4.9417798296476915e-05, "loss": 2.5088, "mean_token_accuracy": 0.41724137365818026, "step": 118090 }, { "epoch": 0.11894653826745673, "grad_norm": 12.714311260357253, "learning_rate": 4.941771360341349e-05, "loss": 2.9778, "mean_token_accuracy": 0.34137930870056155, "step": 118095 }, { "epoch": 0.1189515743205609, "grad_norm": 11.921585351966051, "learning_rate": 4.941762890427112e-05, "loss": 2.5337, "mean_token_accuracy": 0.358620685338974, "step": 118100 }, { "epoch": 0.11895661037366506, "grad_norm": 10.638178479169765, "learning_rate": 4.94175441990498e-05, "loss": 2.8544, "mean_token_accuracy": 0.4068965584039688, "step": 118105 }, { "epoch": 0.11896164642676924, "grad_norm": 9.211244094789771, "learning_rate": 4.941745948774958e-05, "loss": 2.2945, "mean_token_accuracy": 0.4379310369491577, "step": 118110 }, { "epoch": 0.11896668247987341, "grad_norm": 9.227315085861271, "learning_rate": 4.941737477037047e-05, "loss": 2.492, "mean_token_accuracy": 0.42758620381355283, "step": 118115 }, { "epoch": 0.11897171853297758, "grad_norm": 9.721460869628173, "learning_rate": 4.941729004691249e-05, "loss": 2.3486, "mean_token_accuracy": 0.4034482777118683, "step": 118120 }, { "epoch": 0.11897675458608176, "grad_norm": 11.817939790871243, "learning_rate": 4.941720531737566e-05, "loss": 2.2008, "mean_token_accuracy": 0.45722927451133727, "step": 118125 }, { "epoch": 0.11898179063918593, "grad_norm": 9.594371120965361, "learning_rate": 4.941712058176003e-05, "loss": 2.2015, "mean_token_accuracy": 0.4517241299152374, "step": 118130 }, { "epoch": 0.1189868266922901, "grad_norm": 10.57940980313601, "learning_rate": 4.941703584006559e-05, "loss": 2.4606, "mean_token_accuracy": 0.42915910482406616, "step": 118135 }, { "epoch": 0.11899186274539428, "grad_norm": 10.19369821741318, "learning_rate": 4.941695109229238e-05, "loss": 2.8426, "mean_token_accuracy": 0.358620685338974, "step": 118140 }, { "epoch": 0.11899689879849845, "grad_norm": 12.708986970676188, "learning_rate": 4.9416866338440435e-05, "loss": 2.3852, "mean_token_accuracy": 0.4435571670532227, "step": 118145 }, { "epoch": 0.11900193485160263, "grad_norm": 15.577878330836903, "learning_rate": 4.9416781578509746e-05, "loss": 2.5312, "mean_token_accuracy": 0.42068966031074523, "step": 118150 }, { "epoch": 0.1190069709047068, "grad_norm": 11.072975299928592, "learning_rate": 4.941669681250036e-05, "loss": 2.673, "mean_token_accuracy": 0.3517241418361664, "step": 118155 }, { "epoch": 0.11901200695781097, "grad_norm": 12.045619258627642, "learning_rate": 4.9416612040412304e-05, "loss": 2.6572, "mean_token_accuracy": 0.3793103456497192, "step": 118160 }, { "epoch": 0.11901704301091515, "grad_norm": 12.090685709215084, "learning_rate": 4.94165272622456e-05, "loss": 2.3707, "mean_token_accuracy": 0.4551724135875702, "step": 118165 }, { "epoch": 0.11902207906401932, "grad_norm": 10.277842844948426, "learning_rate": 4.9416442478000253e-05, "loss": 1.8944, "mean_token_accuracy": 0.5283251285552979, "step": 118170 }, { "epoch": 0.11902711511712348, "grad_norm": 9.381176932969831, "learning_rate": 4.94163576876763e-05, "loss": 2.2185, "mean_token_accuracy": 0.42413793206214906, "step": 118175 }, { "epoch": 0.11903215117022765, "grad_norm": 11.36013866152452, "learning_rate": 4.941627289127377e-05, "loss": 2.6036, "mean_token_accuracy": 0.4034482717514038, "step": 118180 }, { "epoch": 0.11903718722333183, "grad_norm": 11.077589817940073, "learning_rate": 4.941618808879268e-05, "loss": 2.4475, "mean_token_accuracy": 0.441379314661026, "step": 118185 }, { "epoch": 0.119042223276436, "grad_norm": 10.959590634225602, "learning_rate": 4.941610328023304e-05, "loss": 2.2719, "mean_token_accuracy": 0.42758620977401735, "step": 118190 }, { "epoch": 0.11904725932954018, "grad_norm": 9.667822891809388, "learning_rate": 4.94160184655949e-05, "loss": 2.3571, "mean_token_accuracy": 0.4482758641242981, "step": 118195 }, { "epoch": 0.11905229538264435, "grad_norm": 13.392846605407586, "learning_rate": 4.941593364487827e-05, "loss": 2.2951, "mean_token_accuracy": 0.4551724076271057, "step": 118200 }, { "epoch": 0.11905733143574852, "grad_norm": 10.827498649861308, "learning_rate": 4.9415848818083164e-05, "loss": 2.6113, "mean_token_accuracy": 0.3896551728248596, "step": 118205 }, { "epoch": 0.1190623674888527, "grad_norm": 11.830284374903114, "learning_rate": 4.941576398520962e-05, "loss": 2.3752, "mean_token_accuracy": 0.4448275864124298, "step": 118210 }, { "epoch": 0.11906740354195687, "grad_norm": 10.33120865007257, "learning_rate": 4.941567914625767e-05, "loss": 2.2492, "mean_token_accuracy": 0.44137930274009707, "step": 118215 }, { "epoch": 0.11907243959506104, "grad_norm": 14.411637735979626, "learning_rate": 4.941559430122731e-05, "loss": 2.5762, "mean_token_accuracy": 0.41724138259887694, "step": 118220 }, { "epoch": 0.11907747564816522, "grad_norm": 15.249387732309263, "learning_rate": 4.941550945011858e-05, "loss": 2.3438, "mean_token_accuracy": 0.4551724076271057, "step": 118225 }, { "epoch": 0.11908251170126939, "grad_norm": 10.272365709143001, "learning_rate": 4.94154245929315e-05, "loss": 2.1131, "mean_token_accuracy": 0.48457351326942444, "step": 118230 }, { "epoch": 0.11908754775437357, "grad_norm": 9.873290752968993, "learning_rate": 4.9415339729666105e-05, "loss": 2.5571, "mean_token_accuracy": 0.4310344815254211, "step": 118235 }, { "epoch": 0.11909258380747774, "grad_norm": 9.918590101541952, "learning_rate": 4.94152548603224e-05, "loss": 2.6277, "mean_token_accuracy": 0.4034482777118683, "step": 118240 }, { "epoch": 0.1190976198605819, "grad_norm": 9.343003725989234, "learning_rate": 4.9415169984900423e-05, "loss": 2.3822, "mean_token_accuracy": 0.4272837281227112, "step": 118245 }, { "epoch": 0.11910265591368607, "grad_norm": 9.381508685954094, "learning_rate": 4.941508510340019e-05, "loss": 2.6887, "mean_token_accuracy": 0.3999999940395355, "step": 118250 }, { "epoch": 0.11910769196679025, "grad_norm": 11.231169027324897, "learning_rate": 4.9415000215821725e-05, "loss": 2.4338, "mean_token_accuracy": 0.3896551728248596, "step": 118255 }, { "epoch": 0.11911272801989442, "grad_norm": 9.980491772279954, "learning_rate": 4.941491532216505e-05, "loss": 2.3145, "mean_token_accuracy": 0.4344827592372894, "step": 118260 }, { "epoch": 0.1191177640729986, "grad_norm": 9.940889878598394, "learning_rate": 4.94148304224302e-05, "loss": 2.0441, "mean_token_accuracy": 0.47586206793785096, "step": 118265 }, { "epoch": 0.11912280012610277, "grad_norm": 10.171271102125502, "learning_rate": 4.941474551661718e-05, "loss": 2.2372, "mean_token_accuracy": 0.458620685338974, "step": 118270 }, { "epoch": 0.11912783617920694, "grad_norm": 9.684442713136876, "learning_rate": 4.941466060472603e-05, "loss": 2.4832, "mean_token_accuracy": 0.4399273991584778, "step": 118275 }, { "epoch": 0.11913287223231112, "grad_norm": 11.457882577223433, "learning_rate": 4.941457568675677e-05, "loss": 2.479, "mean_token_accuracy": 0.3931034505367279, "step": 118280 }, { "epoch": 0.11913790828541529, "grad_norm": 9.625692035660764, "learning_rate": 4.941449076270941e-05, "loss": 2.3688, "mean_token_accuracy": 0.41034482717514037, "step": 118285 }, { "epoch": 0.11914294433851946, "grad_norm": 15.243137908281236, "learning_rate": 4.9414405832584e-05, "loss": 2.7049, "mean_token_accuracy": 0.4196007251739502, "step": 118290 }, { "epoch": 0.11914798039162364, "grad_norm": 11.12678717048002, "learning_rate": 4.941432089638054e-05, "loss": 2.7158, "mean_token_accuracy": 0.37241379618644715, "step": 118295 }, { "epoch": 0.11915301644472781, "grad_norm": 13.054541526482632, "learning_rate": 4.941423595409905e-05, "loss": 2.528, "mean_token_accuracy": 0.4, "step": 118300 }, { "epoch": 0.11915805249783198, "grad_norm": 10.794845815368852, "learning_rate": 4.941415100573958e-05, "loss": 1.9352, "mean_token_accuracy": 0.47434966564178466, "step": 118305 }, { "epoch": 0.11916308855093616, "grad_norm": 9.005065862407985, "learning_rate": 4.9414066051302135e-05, "loss": 2.8789, "mean_token_accuracy": 0.3724137872457504, "step": 118310 }, { "epoch": 0.11916812460404032, "grad_norm": 10.549657818791939, "learning_rate": 4.941398109078674e-05, "loss": 2.8535, "mean_token_accuracy": 0.36896551251411436, "step": 118315 }, { "epoch": 0.11917316065714449, "grad_norm": 13.709032557563544, "learning_rate": 4.941389612419342e-05, "loss": 2.8147, "mean_token_accuracy": 0.3758620619773865, "step": 118320 }, { "epoch": 0.11917819671024867, "grad_norm": 10.870110386109967, "learning_rate": 4.94138111515222e-05, "loss": 2.8652, "mean_token_accuracy": 0.39655172228813174, "step": 118325 }, { "epoch": 0.11918323276335284, "grad_norm": 11.334745620308894, "learning_rate": 4.94137261727731e-05, "loss": 2.669, "mean_token_accuracy": 0.33448274731636046, "step": 118330 }, { "epoch": 0.11918826881645701, "grad_norm": 13.40262837600971, "learning_rate": 4.941364118794615e-05, "loss": 2.2534, "mean_token_accuracy": 0.4275861978530884, "step": 118335 }, { "epoch": 0.11919330486956119, "grad_norm": 11.110444206783198, "learning_rate": 4.941355619704137e-05, "loss": 2.4351, "mean_token_accuracy": 0.42068964838981626, "step": 118340 }, { "epoch": 0.11919834092266536, "grad_norm": 12.597115681514273, "learning_rate": 4.9413471200058785e-05, "loss": 2.3296, "mean_token_accuracy": 0.4257713258266449, "step": 118345 }, { "epoch": 0.11920337697576953, "grad_norm": 9.62017124710925, "learning_rate": 4.9413386196998415e-05, "loss": 2.492, "mean_token_accuracy": 0.4, "step": 118350 }, { "epoch": 0.11920841302887371, "grad_norm": 11.268950610158683, "learning_rate": 4.9413301187860286e-05, "loss": 2.4912, "mean_token_accuracy": 0.40689654648303986, "step": 118355 }, { "epoch": 0.11921344908197788, "grad_norm": 10.656016022940861, "learning_rate": 4.941321617264442e-05, "loss": 2.538, "mean_token_accuracy": 0.3793103456497192, "step": 118360 }, { "epoch": 0.11921848513508206, "grad_norm": 12.183203114827036, "learning_rate": 4.9413131151350836e-05, "loss": 2.7715, "mean_token_accuracy": 0.4260738015174866, "step": 118365 }, { "epoch": 0.11922352118818623, "grad_norm": 9.321545495686983, "learning_rate": 4.9413046123979564e-05, "loss": 2.346, "mean_token_accuracy": 0.4448275864124298, "step": 118370 }, { "epoch": 0.1192285572412904, "grad_norm": 29.997487817204984, "learning_rate": 4.9412961090530633e-05, "loss": 3.2, "mean_token_accuracy": 0.4310344815254211, "step": 118375 }, { "epoch": 0.11923359329439458, "grad_norm": 8.918160088236391, "learning_rate": 4.9412876051004065e-05, "loss": 2.5311, "mean_token_accuracy": 0.4172413766384125, "step": 118380 }, { "epoch": 0.11923862934749874, "grad_norm": 11.781419307822668, "learning_rate": 4.941279100539988e-05, "loss": 2.481, "mean_token_accuracy": 0.41917725205421447, "step": 118385 }, { "epoch": 0.11924366540060291, "grad_norm": 10.94782684171135, "learning_rate": 4.941270595371809e-05, "loss": 2.6994, "mean_token_accuracy": 0.44482759237289426, "step": 118390 }, { "epoch": 0.11924870145370708, "grad_norm": 11.8883192673968, "learning_rate": 4.941262089595873e-05, "loss": 2.47, "mean_token_accuracy": 0.4448275864124298, "step": 118395 }, { "epoch": 0.11925373750681126, "grad_norm": 9.984444725726966, "learning_rate": 4.941253583212183e-05, "loss": 2.7977, "mean_token_accuracy": 0.4, "step": 118400 }, { "epoch": 0.11925877355991543, "grad_norm": 11.55064352432013, "learning_rate": 4.94124507622074e-05, "loss": 2.2126, "mean_token_accuracy": 0.4482758641242981, "step": 118405 }, { "epoch": 0.1192638096130196, "grad_norm": 10.443933013103306, "learning_rate": 4.941236568621548e-05, "loss": 2.1891, "mean_token_accuracy": 0.47931033968925474, "step": 118410 }, { "epoch": 0.11926884566612378, "grad_norm": 10.56321911840976, "learning_rate": 4.941228060414608e-05, "loss": 2.5508, "mean_token_accuracy": 0.44482758045196535, "step": 118415 }, { "epoch": 0.11927388171922795, "grad_norm": 12.413423540846772, "learning_rate": 4.941219551599922e-05, "loss": 2.4043, "mean_token_accuracy": 0.43103447556495667, "step": 118420 }, { "epoch": 0.11927891777233213, "grad_norm": 9.739692962291747, "learning_rate": 4.9412110421774935e-05, "loss": 2.2191, "mean_token_accuracy": 0.47931033968925474, "step": 118425 }, { "epoch": 0.1192839538254363, "grad_norm": 10.278708269236052, "learning_rate": 4.941202532147324e-05, "loss": 2.6529, "mean_token_accuracy": 0.42413792610168455, "step": 118430 }, { "epoch": 0.11928898987854047, "grad_norm": 12.280283579430765, "learning_rate": 4.9411940215094174e-05, "loss": 2.31, "mean_token_accuracy": 0.42758620381355283, "step": 118435 }, { "epoch": 0.11929402593164465, "grad_norm": 9.466449749542441, "learning_rate": 4.941185510263774e-05, "loss": 2.391, "mean_token_accuracy": 0.4310344815254211, "step": 118440 }, { "epoch": 0.11929906198474882, "grad_norm": 11.407865215387272, "learning_rate": 4.941176998410397e-05, "loss": 2.1453, "mean_token_accuracy": 0.4655172348022461, "step": 118445 }, { "epoch": 0.119304098037853, "grad_norm": 12.10325482713896, "learning_rate": 4.941168485949289e-05, "loss": 2.9337, "mean_token_accuracy": 0.37241379022598264, "step": 118450 }, { "epoch": 0.11930913409095716, "grad_norm": 10.93154155786285, "learning_rate": 4.941159972880453e-05, "loss": 2.2945, "mean_token_accuracy": 0.43103447556495667, "step": 118455 }, { "epoch": 0.11931417014406133, "grad_norm": 11.786331351668094, "learning_rate": 4.94115145920389e-05, "loss": 2.6024, "mean_token_accuracy": 0.4344827592372894, "step": 118460 }, { "epoch": 0.1193192061971655, "grad_norm": 12.373428329367554, "learning_rate": 4.941142944919603e-05, "loss": 2.383, "mean_token_accuracy": 0.4344827592372894, "step": 118465 }, { "epoch": 0.11932424225026968, "grad_norm": 8.579469214844448, "learning_rate": 4.9411344300275944e-05, "loss": 3.0894, "mean_token_accuracy": 0.4103448331356049, "step": 118470 }, { "epoch": 0.11932927830337385, "grad_norm": 12.034593023354848, "learning_rate": 4.9411259145278666e-05, "loss": 2.2998, "mean_token_accuracy": 0.46896552443504336, "step": 118475 }, { "epoch": 0.11933431435647802, "grad_norm": 10.751329233718838, "learning_rate": 4.941117398420422e-05, "loss": 2.2987, "mean_token_accuracy": 0.4137930989265442, "step": 118480 }, { "epoch": 0.1193393504095822, "grad_norm": 12.21522408430805, "learning_rate": 4.941108881705263e-05, "loss": 2.4447, "mean_token_accuracy": 0.4275861978530884, "step": 118485 }, { "epoch": 0.11934438646268637, "grad_norm": 12.249848250650917, "learning_rate": 4.941100364382391e-05, "loss": 2.862, "mean_token_accuracy": 0.38620689511299133, "step": 118490 }, { "epoch": 0.11934942251579055, "grad_norm": 13.804779691914534, "learning_rate": 4.9410918464518094e-05, "loss": 2.4697, "mean_token_accuracy": 0.4448275864124298, "step": 118495 }, { "epoch": 0.11935445856889472, "grad_norm": 10.03047140658655, "learning_rate": 4.94108332791352e-05, "loss": 2.7607, "mean_token_accuracy": 0.4310344934463501, "step": 118500 }, { "epoch": 0.1193594946219989, "grad_norm": 9.0664881963372, "learning_rate": 4.9410748087675265e-05, "loss": 2.1456, "mean_token_accuracy": 0.4413793087005615, "step": 118505 }, { "epoch": 0.11936453067510307, "grad_norm": 10.215796004246767, "learning_rate": 4.941066289013829e-05, "loss": 2.8482, "mean_token_accuracy": 0.41379311084747317, "step": 118510 }, { "epoch": 0.11936956672820724, "grad_norm": 10.41967782928123, "learning_rate": 4.9410577686524325e-05, "loss": 2.5428, "mean_token_accuracy": 0.4034482777118683, "step": 118515 }, { "epoch": 0.11937460278131141, "grad_norm": 11.804039453028354, "learning_rate": 4.941049247683337e-05, "loss": 2.6252, "mean_token_accuracy": 0.41566848158836367, "step": 118520 }, { "epoch": 0.11937963883441557, "grad_norm": 10.249753588947353, "learning_rate": 4.941040726106546e-05, "loss": 2.3151, "mean_token_accuracy": 0.41034482717514037, "step": 118525 }, { "epoch": 0.11938467488751975, "grad_norm": 11.667652439456687, "learning_rate": 4.941032203922061e-05, "loss": 2.4205, "mean_token_accuracy": 0.4068965494632721, "step": 118530 }, { "epoch": 0.11938971094062392, "grad_norm": 9.204732011029918, "learning_rate": 4.9410236811298865e-05, "loss": 2.1406, "mean_token_accuracy": 0.4517241418361664, "step": 118535 }, { "epoch": 0.1193947469937281, "grad_norm": 10.816973514215896, "learning_rate": 4.941015157730022e-05, "loss": 2.1132, "mean_token_accuracy": 0.4931034505367279, "step": 118540 }, { "epoch": 0.11939978304683227, "grad_norm": 10.342434586976344, "learning_rate": 4.941006633722472e-05, "loss": 2.422, "mean_token_accuracy": 0.42413793206214906, "step": 118545 }, { "epoch": 0.11940481909993644, "grad_norm": 9.624978956037522, "learning_rate": 4.9409981091072384e-05, "loss": 2.3141, "mean_token_accuracy": 0.46551724672317507, "step": 118550 }, { "epoch": 0.11940985515304062, "grad_norm": 9.520798323294272, "learning_rate": 4.9409895838843225e-05, "loss": 2.215, "mean_token_accuracy": 0.47241379618644713, "step": 118555 }, { "epoch": 0.11941489120614479, "grad_norm": 9.925576465575528, "learning_rate": 4.940981058053728e-05, "loss": 2.3774, "mean_token_accuracy": 0.4310344815254211, "step": 118560 }, { "epoch": 0.11941992725924896, "grad_norm": 11.098376916598838, "learning_rate": 4.9409725316154564e-05, "loss": 2.5175, "mean_token_accuracy": 0.3999999940395355, "step": 118565 }, { "epoch": 0.11942496331235314, "grad_norm": 10.26660720285367, "learning_rate": 4.940964004569511e-05, "loss": 2.3985, "mean_token_accuracy": 0.4034482777118683, "step": 118570 }, { "epoch": 0.11942999936545731, "grad_norm": 10.316315067683968, "learning_rate": 4.940955476915893e-05, "loss": 2.3255, "mean_token_accuracy": 0.4344827651977539, "step": 118575 }, { "epoch": 0.11943503541856149, "grad_norm": 11.201046746577662, "learning_rate": 4.9409469486546056e-05, "loss": 2.8227, "mean_token_accuracy": 0.4034482717514038, "step": 118580 }, { "epoch": 0.11944007147166566, "grad_norm": 9.27490743014243, "learning_rate": 4.9409384197856506e-05, "loss": 2.5329, "mean_token_accuracy": 0.4090139091014862, "step": 118585 }, { "epoch": 0.11944510752476983, "grad_norm": 9.621219968579341, "learning_rate": 4.940929890309031e-05, "loss": 2.0157, "mean_token_accuracy": 0.482758617401123, "step": 118590 }, { "epoch": 0.119450143577874, "grad_norm": 8.885591944982341, "learning_rate": 4.940921360224749e-05, "loss": 2.6776, "mean_token_accuracy": 0.37241379618644715, "step": 118595 }, { "epoch": 0.11945517963097817, "grad_norm": 11.219647294579152, "learning_rate": 4.9409128295328063e-05, "loss": 2.554, "mean_token_accuracy": 0.4, "step": 118600 }, { "epoch": 0.11946021568408234, "grad_norm": 10.447968297480196, "learning_rate": 4.9409042982332057e-05, "loss": 2.4538, "mean_token_accuracy": 0.41724138259887694, "step": 118605 }, { "epoch": 0.11946525173718651, "grad_norm": 10.576691868508107, "learning_rate": 4.9408957663259496e-05, "loss": 2.3196, "mean_token_accuracy": 0.46551724076271056, "step": 118610 }, { "epoch": 0.11947028779029069, "grad_norm": 11.610870927902063, "learning_rate": 4.940887233811041e-05, "loss": 2.6224, "mean_token_accuracy": 0.4310344934463501, "step": 118615 }, { "epoch": 0.11947532384339486, "grad_norm": 10.370765177794075, "learning_rate": 4.940878700688481e-05, "loss": 2.4932, "mean_token_accuracy": 0.4296430706977844, "step": 118620 }, { "epoch": 0.11948035989649904, "grad_norm": 10.941744439570284, "learning_rate": 4.940870166958273e-05, "loss": 2.1503, "mean_token_accuracy": 0.46896551847457885, "step": 118625 }, { "epoch": 0.11948539594960321, "grad_norm": 9.865619061932657, "learning_rate": 4.940861632620419e-05, "loss": 2.1389, "mean_token_accuracy": 0.47065940499305725, "step": 118630 }, { "epoch": 0.11949043200270738, "grad_norm": 10.012562084091808, "learning_rate": 4.9408530976749214e-05, "loss": 2.3268, "mean_token_accuracy": 0.4137930989265442, "step": 118635 }, { "epoch": 0.11949546805581156, "grad_norm": 12.204932619089906, "learning_rate": 4.940844562121782e-05, "loss": 2.3256, "mean_token_accuracy": 0.4493842303752899, "step": 118640 }, { "epoch": 0.11950050410891573, "grad_norm": 10.940336844614546, "learning_rate": 4.940836025961004e-05, "loss": 2.4426, "mean_token_accuracy": 0.39310344457626345, "step": 118645 }, { "epoch": 0.1195055401620199, "grad_norm": 9.467157973653636, "learning_rate": 4.940827489192589e-05, "loss": 2.5546, "mean_token_accuracy": 0.42413793206214906, "step": 118650 }, { "epoch": 0.11951057621512408, "grad_norm": 13.484174544539952, "learning_rate": 4.940818951816541e-05, "loss": 2.9886, "mean_token_accuracy": 0.38275861740112305, "step": 118655 }, { "epoch": 0.11951561226822825, "grad_norm": 9.145809618443721, "learning_rate": 4.94081041383286e-05, "loss": 2.1042, "mean_token_accuracy": 0.4655172348022461, "step": 118660 }, { "epoch": 0.11952064832133241, "grad_norm": 9.716263629843706, "learning_rate": 4.9408018752415506e-05, "loss": 2.0296, "mean_token_accuracy": 0.5543859660625458, "step": 118665 }, { "epoch": 0.11952568437443659, "grad_norm": 8.783392975159177, "learning_rate": 4.9407933360426135e-05, "loss": 2.4244, "mean_token_accuracy": 0.4586206912994385, "step": 118670 }, { "epoch": 0.11953072042754076, "grad_norm": 9.89286492434891, "learning_rate": 4.9407847962360515e-05, "loss": 2.0519, "mean_token_accuracy": 0.4965517222881317, "step": 118675 }, { "epoch": 0.11953575648064493, "grad_norm": 10.68375550297062, "learning_rate": 4.9407762558218674e-05, "loss": 2.3839, "mean_token_accuracy": 0.4637023627758026, "step": 118680 }, { "epoch": 0.11954079253374911, "grad_norm": 10.995806811562966, "learning_rate": 4.940767714800064e-05, "loss": 2.6065, "mean_token_accuracy": 0.3965517163276672, "step": 118685 }, { "epoch": 0.11954582858685328, "grad_norm": 12.235988455435903, "learning_rate": 4.940759173170642e-05, "loss": 2.0908, "mean_token_accuracy": 0.5, "step": 118690 }, { "epoch": 0.11955086463995745, "grad_norm": 10.52725905394331, "learning_rate": 4.940750630933605e-05, "loss": 2.4397, "mean_token_accuracy": 0.3793103516101837, "step": 118695 }, { "epoch": 0.11955590069306163, "grad_norm": 10.095836068033115, "learning_rate": 4.940742088088956e-05, "loss": 2.3236, "mean_token_accuracy": 0.42226254343986513, "step": 118700 }, { "epoch": 0.1195609367461658, "grad_norm": 10.743249271216033, "learning_rate": 4.9407335446366955e-05, "loss": 2.1919, "mean_token_accuracy": 0.4517241358757019, "step": 118705 }, { "epoch": 0.11956597279926998, "grad_norm": 9.192319439405217, "learning_rate": 4.9407250005768274e-05, "loss": 2.2704, "mean_token_accuracy": 0.46745312213897705, "step": 118710 }, { "epoch": 0.11957100885237415, "grad_norm": 12.105264217980421, "learning_rate": 4.940716455909353e-05, "loss": 2.4528, "mean_token_accuracy": 0.4620689690113068, "step": 118715 }, { "epoch": 0.11957604490547832, "grad_norm": 12.509984843605038, "learning_rate": 4.940707910634275e-05, "loss": 2.8933, "mean_token_accuracy": 0.3620689630508423, "step": 118720 }, { "epoch": 0.1195810809585825, "grad_norm": 10.673559763475271, "learning_rate": 4.940699364751597e-05, "loss": 2.257, "mean_token_accuracy": 0.4620689630508423, "step": 118725 }, { "epoch": 0.11958611701168667, "grad_norm": 11.563129849200525, "learning_rate": 4.94069081826132e-05, "loss": 2.6905, "mean_token_accuracy": 0.41034482717514037, "step": 118730 }, { "epoch": 0.11959115306479083, "grad_norm": 13.04224262956906, "learning_rate": 4.940682271163446e-05, "loss": 2.1283, "mean_token_accuracy": 0.4551724135875702, "step": 118735 }, { "epoch": 0.119596189117895, "grad_norm": 10.060225573720642, "learning_rate": 4.9406737234579795e-05, "loss": 2.5173, "mean_token_accuracy": 0.3931034505367279, "step": 118740 }, { "epoch": 0.11960122517099918, "grad_norm": 11.091293881696107, "learning_rate": 4.9406651751449204e-05, "loss": 2.1876, "mean_token_accuracy": 0.4689655125141144, "step": 118745 }, { "epoch": 0.11960626122410335, "grad_norm": 9.678496445382857, "learning_rate": 4.9406566262242724e-05, "loss": 2.619, "mean_token_accuracy": 0.3655172407627106, "step": 118750 }, { "epoch": 0.11961129727720753, "grad_norm": 10.843439663023569, "learning_rate": 4.9406480766960376e-05, "loss": 2.3407, "mean_token_accuracy": 0.4517241299152374, "step": 118755 }, { "epoch": 0.1196163333303117, "grad_norm": 10.073501246121298, "learning_rate": 4.9406395265602185e-05, "loss": 2.3933, "mean_token_accuracy": 0.4103448212146759, "step": 118760 }, { "epoch": 0.11962136938341587, "grad_norm": 9.748001097700424, "learning_rate": 4.940630975816817e-05, "loss": 2.3229, "mean_token_accuracy": 0.42413793206214906, "step": 118765 }, { "epoch": 0.11962640543652005, "grad_norm": 7.7739619930482515, "learning_rate": 4.9406224244658366e-05, "loss": 2.2761, "mean_token_accuracy": 0.5113300383090973, "step": 118770 }, { "epoch": 0.11963144148962422, "grad_norm": 19.792673048276622, "learning_rate": 4.940613872507278e-05, "loss": 2.449, "mean_token_accuracy": 0.4344827592372894, "step": 118775 }, { "epoch": 0.1196364775427284, "grad_norm": 14.19393826632409, "learning_rate": 4.940605319941146e-05, "loss": 2.6144, "mean_token_accuracy": 0.4034482717514038, "step": 118780 }, { "epoch": 0.11964151359583257, "grad_norm": 10.715374569230379, "learning_rate": 4.940596766767439e-05, "loss": 2.6072, "mean_token_accuracy": 0.41034482717514037, "step": 118785 }, { "epoch": 0.11964654964893674, "grad_norm": 12.56496972200216, "learning_rate": 4.9405882129861635e-05, "loss": 2.7369, "mean_token_accuracy": 0.41034482717514037, "step": 118790 }, { "epoch": 0.11965158570204092, "grad_norm": 10.184012183619025, "learning_rate": 4.9405796585973195e-05, "loss": 2.4989, "mean_token_accuracy": 0.42758620381355283, "step": 118795 }, { "epoch": 0.11965662175514509, "grad_norm": 10.170626396109657, "learning_rate": 4.940571103600911e-05, "loss": 2.4599, "mean_token_accuracy": 0.4620689630508423, "step": 118800 }, { "epoch": 0.11966165780824925, "grad_norm": 9.405842261791056, "learning_rate": 4.940562547996939e-05, "loss": 2.4974, "mean_token_accuracy": 0.4068965494632721, "step": 118805 }, { "epoch": 0.11966669386135342, "grad_norm": 10.300422134341396, "learning_rate": 4.9405539917854056e-05, "loss": 2.7065, "mean_token_accuracy": 0.4464004814624786, "step": 118810 }, { "epoch": 0.1196717299144576, "grad_norm": 10.21148583836709, "learning_rate": 4.940545434966314e-05, "loss": 2.6322, "mean_token_accuracy": 0.41851180195808413, "step": 118815 }, { "epoch": 0.11967676596756177, "grad_norm": 11.207322937327852, "learning_rate": 4.940536877539667e-05, "loss": 2.5619, "mean_token_accuracy": 0.42068966031074523, "step": 118820 }, { "epoch": 0.11968180202066594, "grad_norm": 11.636949975235481, "learning_rate": 4.940528319505467e-05, "loss": 2.4313, "mean_token_accuracy": 0.42068964838981626, "step": 118825 }, { "epoch": 0.11968683807377012, "grad_norm": 11.32954740889244, "learning_rate": 4.940519760863714e-05, "loss": 2.358, "mean_token_accuracy": 0.39310343861579894, "step": 118830 }, { "epoch": 0.11969187412687429, "grad_norm": 10.159525081342787, "learning_rate": 4.940511201614414e-05, "loss": 2.2329, "mean_token_accuracy": 0.42413793206214906, "step": 118835 }, { "epoch": 0.11969691017997847, "grad_norm": 10.902030583230793, "learning_rate": 4.9405026417575666e-05, "loss": 2.379, "mean_token_accuracy": 0.4482758641242981, "step": 118840 }, { "epoch": 0.11970194623308264, "grad_norm": 10.27155430843561, "learning_rate": 4.9404940812931755e-05, "loss": 2.4629, "mean_token_accuracy": 0.42413793206214906, "step": 118845 }, { "epoch": 0.11970698228618681, "grad_norm": 9.977628237820763, "learning_rate": 4.940485520221242e-05, "loss": 2.6085, "mean_token_accuracy": 0.36896551549434664, "step": 118850 }, { "epoch": 0.11971201833929099, "grad_norm": 15.665022171625505, "learning_rate": 4.94047695854177e-05, "loss": 3.0928, "mean_token_accuracy": 0.37241379022598264, "step": 118855 }, { "epoch": 0.11971705439239516, "grad_norm": 11.904276053349044, "learning_rate": 4.940468396254761e-05, "loss": 2.3811, "mean_token_accuracy": 0.4482758641242981, "step": 118860 }, { "epoch": 0.11972209044549934, "grad_norm": 10.897861802254642, "learning_rate": 4.940459833360217e-05, "loss": 2.2718, "mean_token_accuracy": 0.4965517222881317, "step": 118865 }, { "epoch": 0.11972712649860351, "grad_norm": 8.331129149671176, "learning_rate": 4.940451269858141e-05, "loss": 2.0369, "mean_token_accuracy": 0.5241379320621491, "step": 118870 }, { "epoch": 0.11973216255170767, "grad_norm": 8.700785293168776, "learning_rate": 4.9404427057485346e-05, "loss": 2.4496, "mean_token_accuracy": 0.3620689630508423, "step": 118875 }, { "epoch": 0.11973719860481184, "grad_norm": 10.286123215787304, "learning_rate": 4.940434141031402e-05, "loss": 2.7102, "mean_token_accuracy": 0.42413793206214906, "step": 118880 }, { "epoch": 0.11974223465791602, "grad_norm": 9.920525488813636, "learning_rate": 4.940425575706744e-05, "loss": 2.2155, "mean_token_accuracy": 0.47931033968925474, "step": 118885 }, { "epoch": 0.11974727071102019, "grad_norm": 11.999261667656052, "learning_rate": 4.9404170097745625e-05, "loss": 2.3321, "mean_token_accuracy": 0.47822141647338867, "step": 118890 }, { "epoch": 0.11975230676412436, "grad_norm": 12.354776235962683, "learning_rate": 4.940408443234861e-05, "loss": 2.493, "mean_token_accuracy": 0.43448275327682495, "step": 118895 }, { "epoch": 0.11975734281722854, "grad_norm": 9.104268609076932, "learning_rate": 4.940399876087641e-05, "loss": 2.4663, "mean_token_accuracy": 0.4517241299152374, "step": 118900 }, { "epoch": 0.11976237887033271, "grad_norm": 16.04227384387449, "learning_rate": 4.940391308332906e-05, "loss": 2.74, "mean_token_accuracy": 0.41034482717514037, "step": 118905 }, { "epoch": 0.11976741492343689, "grad_norm": 15.218781341166528, "learning_rate": 4.9403827399706584e-05, "loss": 2.2539, "mean_token_accuracy": 0.4586206912994385, "step": 118910 }, { "epoch": 0.11977245097654106, "grad_norm": 12.214424445694076, "learning_rate": 4.940374171000899e-05, "loss": 2.6897, "mean_token_accuracy": 0.38620689511299133, "step": 118915 }, { "epoch": 0.11977748702964523, "grad_norm": 12.256669135965447, "learning_rate": 4.9403656014236315e-05, "loss": 2.4118, "mean_token_accuracy": 0.4103448212146759, "step": 118920 }, { "epoch": 0.1197825230827494, "grad_norm": 10.130158485571638, "learning_rate": 4.9403570312388584e-05, "loss": 2.0252, "mean_token_accuracy": 0.4551724135875702, "step": 118925 }, { "epoch": 0.11978755913585358, "grad_norm": 12.852546942699673, "learning_rate": 4.940348460446581e-05, "loss": 2.5184, "mean_token_accuracy": 0.3965517282485962, "step": 118930 }, { "epoch": 0.11979259518895775, "grad_norm": 10.17935657611278, "learning_rate": 4.940339889046802e-05, "loss": 2.2801, "mean_token_accuracy": 0.4517241358757019, "step": 118935 }, { "epoch": 0.11979763124206193, "grad_norm": 10.456700518548788, "learning_rate": 4.9403313170395244e-05, "loss": 2.3518, "mean_token_accuracy": 0.4034482717514038, "step": 118940 }, { "epoch": 0.11980266729516609, "grad_norm": 10.000661376662674, "learning_rate": 4.94032274442475e-05, "loss": 2.3661, "mean_token_accuracy": 0.4344827473163605, "step": 118945 }, { "epoch": 0.11980770334827026, "grad_norm": 11.3251079797568, "learning_rate": 4.940314171202483e-05, "loss": 2.0831, "mean_token_accuracy": 0.47931033968925474, "step": 118950 }, { "epoch": 0.11981273940137444, "grad_norm": 9.703495039658216, "learning_rate": 4.940305597372723e-05, "loss": 2.5393, "mean_token_accuracy": 0.4034482777118683, "step": 118955 }, { "epoch": 0.11981777545447861, "grad_norm": 9.292965117055608, "learning_rate": 4.940297022935473e-05, "loss": 1.9019, "mean_token_accuracy": 0.5253694593906403, "step": 118960 }, { "epoch": 0.11982281150758278, "grad_norm": 11.01403846788282, "learning_rate": 4.940288447890737e-05, "loss": 2.2578, "mean_token_accuracy": 0.49999998807907103, "step": 118965 }, { "epoch": 0.11982784756068696, "grad_norm": 9.243486076187825, "learning_rate": 4.940279872238515e-05, "loss": 1.9937, "mean_token_accuracy": 0.4896551728248596, "step": 118970 }, { "epoch": 0.11983288361379113, "grad_norm": 12.92874074726818, "learning_rate": 4.9402712959788116e-05, "loss": 2.547, "mean_token_accuracy": 0.4052026689052582, "step": 118975 }, { "epoch": 0.1198379196668953, "grad_norm": 10.171625556908468, "learning_rate": 4.940262719111628e-05, "loss": 2.1989, "mean_token_accuracy": 0.47931033968925474, "step": 118980 }, { "epoch": 0.11984295571999948, "grad_norm": 9.600413939387508, "learning_rate": 4.940254141636968e-05, "loss": 2.3366, "mean_token_accuracy": 0.4172413766384125, "step": 118985 }, { "epoch": 0.11984799177310365, "grad_norm": 9.832874507392205, "learning_rate": 4.940245563554831e-05, "loss": 2.3572, "mean_token_accuracy": 0.4103448331356049, "step": 118990 }, { "epoch": 0.11985302782620783, "grad_norm": 10.908752166369093, "learning_rate": 4.9402369848652225e-05, "loss": 2.3137, "mean_token_accuracy": 0.4527093589305878, "step": 118995 }, { "epoch": 0.119858063879312, "grad_norm": 11.510245501421815, "learning_rate": 4.940228405568143e-05, "loss": 2.9673, "mean_token_accuracy": 0.3620689630508423, "step": 119000 }, { "epoch": 0.11986309993241617, "grad_norm": 10.230521188245369, "learning_rate": 4.940219825663596e-05, "loss": 2.3662, "mean_token_accuracy": 0.4379310369491577, "step": 119005 }, { "epoch": 0.11986813598552035, "grad_norm": 12.91677881265146, "learning_rate": 4.940211245151583e-05, "loss": 2.3097, "mean_token_accuracy": 0.44812030494213106, "step": 119010 }, { "epoch": 0.1198731720386245, "grad_norm": 12.252074748502762, "learning_rate": 4.940202664032107e-05, "loss": 2.3363, "mean_token_accuracy": 0.46896552443504336, "step": 119015 }, { "epoch": 0.11987820809172868, "grad_norm": 13.843651949500092, "learning_rate": 4.94019408230517e-05, "loss": 2.3691, "mean_token_accuracy": 0.44482758045196535, "step": 119020 }, { "epoch": 0.11988324414483285, "grad_norm": 10.150970044974168, "learning_rate": 4.940185499970774e-05, "loss": 2.2829, "mean_token_accuracy": 0.4482758641242981, "step": 119025 }, { "epoch": 0.11988828019793703, "grad_norm": 9.82948620935528, "learning_rate": 4.9401769170289225e-05, "loss": 2.0981, "mean_token_accuracy": 0.46551724076271056, "step": 119030 }, { "epoch": 0.1198933162510412, "grad_norm": 11.337032245245409, "learning_rate": 4.9401683334796174e-05, "loss": 2.3292, "mean_token_accuracy": 0.3896551728248596, "step": 119035 }, { "epoch": 0.11989835230414538, "grad_norm": 13.153124209937824, "learning_rate": 4.94015974932286e-05, "loss": 2.2478, "mean_token_accuracy": 0.413793095946312, "step": 119040 }, { "epoch": 0.11990338835724955, "grad_norm": 14.768692772045624, "learning_rate": 4.9401511645586545e-05, "loss": 2.3267, "mean_token_accuracy": 0.3827586233615875, "step": 119045 }, { "epoch": 0.11990842441035372, "grad_norm": 10.708134388991471, "learning_rate": 4.940142579187002e-05, "loss": 2.6283, "mean_token_accuracy": 0.37931033968925476, "step": 119050 }, { "epoch": 0.1199134604634579, "grad_norm": 10.565276953590889, "learning_rate": 4.940133993207906e-05, "loss": 2.021, "mean_token_accuracy": 0.4586206912994385, "step": 119055 }, { "epoch": 0.11991849651656207, "grad_norm": 9.436655558648924, "learning_rate": 4.940125406621367e-05, "loss": 2.4049, "mean_token_accuracy": 0.4448275864124298, "step": 119060 }, { "epoch": 0.11992353256966624, "grad_norm": 9.163636747168164, "learning_rate": 4.9401168194273894e-05, "loss": 2.12, "mean_token_accuracy": 0.4916515350341797, "step": 119065 }, { "epoch": 0.11992856862277042, "grad_norm": 9.974203461871152, "learning_rate": 4.940108231625974e-05, "loss": 2.6935, "mean_token_accuracy": 0.37586206793785093, "step": 119070 }, { "epoch": 0.11993360467587459, "grad_norm": 11.032710511938467, "learning_rate": 4.940099643217125e-05, "loss": 2.4679, "mean_token_accuracy": 0.4551724135875702, "step": 119075 }, { "epoch": 0.11993864072897877, "grad_norm": 11.328955800219799, "learning_rate": 4.940091054200843e-05, "loss": 2.2894, "mean_token_accuracy": 0.46551724076271056, "step": 119080 }, { "epoch": 0.11994367678208293, "grad_norm": 9.538276974271273, "learning_rate": 4.940082464577131e-05, "loss": 1.9611, "mean_token_accuracy": 0.4724137902259827, "step": 119085 }, { "epoch": 0.1199487128351871, "grad_norm": 9.495828542190486, "learning_rate": 4.940073874345992e-05, "loss": 2.188, "mean_token_accuracy": 0.4517241418361664, "step": 119090 }, { "epoch": 0.11995374888829127, "grad_norm": 12.40922566963609, "learning_rate": 4.940065283507428e-05, "loss": 2.5948, "mean_token_accuracy": 0.34137930870056155, "step": 119095 }, { "epoch": 0.11995878494139545, "grad_norm": 8.266268253995737, "learning_rate": 4.94005669206144e-05, "loss": 2.0969, "mean_token_accuracy": 0.4517241358757019, "step": 119100 }, { "epoch": 0.11996382099449962, "grad_norm": 10.477201096847546, "learning_rate": 4.940048100008032e-05, "loss": 1.9724, "mean_token_accuracy": 0.4655172348022461, "step": 119105 }, { "epoch": 0.1199688570476038, "grad_norm": 10.795744390895273, "learning_rate": 4.940039507347207e-05, "loss": 2.1984, "mean_token_accuracy": 0.4620689570903778, "step": 119110 }, { "epoch": 0.11997389310070797, "grad_norm": 12.252935225346146, "learning_rate": 4.940030914078965e-05, "loss": 2.7286, "mean_token_accuracy": 0.35862069129943847, "step": 119115 }, { "epoch": 0.11997892915381214, "grad_norm": 10.224078113513775, "learning_rate": 4.9400223202033104e-05, "loss": 2.0383, "mean_token_accuracy": 0.4034482777118683, "step": 119120 }, { "epoch": 0.11998396520691632, "grad_norm": 11.5327247403299, "learning_rate": 4.940013725720245e-05, "loss": 2.3646, "mean_token_accuracy": 0.4344827651977539, "step": 119125 }, { "epoch": 0.11998900126002049, "grad_norm": 8.676333968411784, "learning_rate": 4.9400051306297706e-05, "loss": 2.3536, "mean_token_accuracy": 0.42068966031074523, "step": 119130 }, { "epoch": 0.11999403731312466, "grad_norm": 10.610228992034166, "learning_rate": 4.9399965349318914e-05, "loss": 2.1975, "mean_token_accuracy": 0.46551724076271056, "step": 119135 }, { "epoch": 0.11999907336622884, "grad_norm": 10.964166810150786, "learning_rate": 4.9399879386266076e-05, "loss": 1.8283, "mean_token_accuracy": 0.5090744078159333, "step": 119140 }, { "epoch": 0.12000410941933301, "grad_norm": 11.34789436059137, "learning_rate": 4.9399793417139225e-05, "loss": 2.4062, "mean_token_accuracy": 0.4358741700649261, "step": 119145 }, { "epoch": 0.12000914547243718, "grad_norm": 13.675601557621595, "learning_rate": 4.939970744193838e-05, "loss": 2.1142, "mean_token_accuracy": 0.42068964540958403, "step": 119150 }, { "epoch": 0.12001418152554134, "grad_norm": 16.563869672257503, "learning_rate": 4.939962146066358e-05, "loss": 2.7509, "mean_token_accuracy": 0.4034482717514038, "step": 119155 }, { "epoch": 0.12001921757864552, "grad_norm": 9.714801161384313, "learning_rate": 4.939953547331483e-05, "loss": 2.1133, "mean_token_accuracy": 0.5, "step": 119160 }, { "epoch": 0.12002425363174969, "grad_norm": 8.702701082102617, "learning_rate": 4.939944947989217e-05, "loss": 2.6492, "mean_token_accuracy": 0.36896551847457887, "step": 119165 }, { "epoch": 0.12002928968485387, "grad_norm": 11.956838965722106, "learning_rate": 4.93993634803956e-05, "loss": 2.5793, "mean_token_accuracy": 0.4034482777118683, "step": 119170 }, { "epoch": 0.12003432573795804, "grad_norm": 10.97510252987617, "learning_rate": 4.939927747482518e-05, "loss": 2.4492, "mean_token_accuracy": 0.4551724135875702, "step": 119175 }, { "epoch": 0.12003936179106221, "grad_norm": 9.048682434985329, "learning_rate": 4.93991914631809e-05, "loss": 2.1737, "mean_token_accuracy": 0.4434361755847931, "step": 119180 }, { "epoch": 0.12004439784416639, "grad_norm": 10.984697320878448, "learning_rate": 4.93991054454628e-05, "loss": 2.4599, "mean_token_accuracy": 0.42758620381355283, "step": 119185 }, { "epoch": 0.12004943389727056, "grad_norm": 10.492211584079056, "learning_rate": 4.939901942167091e-05, "loss": 2.3248, "mean_token_accuracy": 0.42758620381355283, "step": 119190 }, { "epoch": 0.12005446995037473, "grad_norm": 15.40266057087019, "learning_rate": 4.9398933391805236e-05, "loss": 2.4143, "mean_token_accuracy": 0.4379310429096222, "step": 119195 }, { "epoch": 0.12005950600347891, "grad_norm": 9.522418932194364, "learning_rate": 4.939884735586581e-05, "loss": 2.0445, "mean_token_accuracy": 0.4344827592372894, "step": 119200 }, { "epoch": 0.12006454205658308, "grad_norm": 14.270578034997587, "learning_rate": 4.939876131385266e-05, "loss": 2.0061, "mean_token_accuracy": 0.4482758641242981, "step": 119205 }, { "epoch": 0.12006957810968726, "grad_norm": 12.579368981392484, "learning_rate": 4.9398675265765806e-05, "loss": 2.5752, "mean_token_accuracy": 0.3931034505367279, "step": 119210 }, { "epoch": 0.12007461416279143, "grad_norm": 10.493216865759225, "learning_rate": 4.9398589211605285e-05, "loss": 2.4452, "mean_token_accuracy": 0.4448275864124298, "step": 119215 }, { "epoch": 0.1200796502158956, "grad_norm": 9.083767363237282, "learning_rate": 4.9398503151371096e-05, "loss": 2.0128, "mean_token_accuracy": 0.4655172348022461, "step": 119220 }, { "epoch": 0.12008468626899976, "grad_norm": 8.68131762824565, "learning_rate": 4.939841708506327e-05, "loss": 1.985, "mean_token_accuracy": 0.4586206912994385, "step": 119225 }, { "epoch": 0.12008972232210394, "grad_norm": 9.060833273217225, "learning_rate": 4.939833101268185e-05, "loss": 2.2175, "mean_token_accuracy": 0.5, "step": 119230 }, { "epoch": 0.12009475837520811, "grad_norm": 10.658697829405641, "learning_rate": 4.9398244934226835e-05, "loss": 2.547, "mean_token_accuracy": 0.37241379618644715, "step": 119235 }, { "epoch": 0.12009979442831228, "grad_norm": 11.165869247559232, "learning_rate": 4.939815884969827e-05, "loss": 2.3151, "mean_token_accuracy": 0.44137930274009707, "step": 119240 }, { "epoch": 0.12010483048141646, "grad_norm": 9.00918724149998, "learning_rate": 4.939807275909616e-05, "loss": 2.3159, "mean_token_accuracy": 0.44827585220336913, "step": 119245 }, { "epoch": 0.12010986653452063, "grad_norm": 8.59740497651537, "learning_rate": 4.939798666242054e-05, "loss": 2.4973, "mean_token_accuracy": 0.4259528160095215, "step": 119250 }, { "epoch": 0.1201149025876248, "grad_norm": 12.549208054763874, "learning_rate": 4.939790055967144e-05, "loss": 2.3964, "mean_token_accuracy": 0.4034482717514038, "step": 119255 }, { "epoch": 0.12011993864072898, "grad_norm": 10.297792320242346, "learning_rate": 4.9397814450848866e-05, "loss": 2.378, "mean_token_accuracy": 0.3999999940395355, "step": 119260 }, { "epoch": 0.12012497469383315, "grad_norm": 13.728203886537814, "learning_rate": 4.939772833595286e-05, "loss": 2.1032, "mean_token_accuracy": 0.47241379618644713, "step": 119265 }, { "epoch": 0.12013001074693733, "grad_norm": 9.196916049079698, "learning_rate": 4.939764221498343e-05, "loss": 2.2051, "mean_token_accuracy": 0.44482759237289426, "step": 119270 }, { "epoch": 0.1201350468000415, "grad_norm": 8.360078704652105, "learning_rate": 4.93975560879406e-05, "loss": 2.1462, "mean_token_accuracy": 0.47931034564971925, "step": 119275 }, { "epoch": 0.12014008285314567, "grad_norm": 10.720533760981393, "learning_rate": 4.939746995482441e-05, "loss": 2.229, "mean_token_accuracy": 0.4551724076271057, "step": 119280 }, { "epoch": 0.12014511890624985, "grad_norm": 12.01617068733847, "learning_rate": 4.939738381563488e-05, "loss": 2.8512, "mean_token_accuracy": 0.37586206793785093, "step": 119285 }, { "epoch": 0.12015015495935402, "grad_norm": 10.344463691227435, "learning_rate": 4.939729767037203e-05, "loss": 2.7412, "mean_token_accuracy": 0.39310345649719236, "step": 119290 }, { "epoch": 0.12015519101245818, "grad_norm": 10.8528720313763, "learning_rate": 4.9397211519035876e-05, "loss": 2.5764, "mean_token_accuracy": 0.4034482717514038, "step": 119295 }, { "epoch": 0.12016022706556236, "grad_norm": 10.249421101168654, "learning_rate": 4.939712536162645e-05, "loss": 2.3834, "mean_token_accuracy": 0.46551724076271056, "step": 119300 }, { "epoch": 0.12016526311866653, "grad_norm": 9.908293084728589, "learning_rate": 4.939703919814377e-05, "loss": 2.3114, "mean_token_accuracy": 0.4689655125141144, "step": 119305 }, { "epoch": 0.1201702991717707, "grad_norm": 11.284976380655607, "learning_rate": 4.939695302858787e-05, "loss": 2.4606, "mean_token_accuracy": 0.3896551728248596, "step": 119310 }, { "epoch": 0.12017533522487488, "grad_norm": 11.631437179094759, "learning_rate": 4.9396866852958764e-05, "loss": 2.5469, "mean_token_accuracy": 0.38620689511299133, "step": 119315 }, { "epoch": 0.12018037127797905, "grad_norm": 9.041907164640111, "learning_rate": 4.9396780671256494e-05, "loss": 2.4526, "mean_token_accuracy": 0.4241379380226135, "step": 119320 }, { "epoch": 0.12018540733108322, "grad_norm": 10.280150140189098, "learning_rate": 4.939669448348105e-05, "loss": 2.1474, "mean_token_accuracy": 0.44482759237289426, "step": 119325 }, { "epoch": 0.1201904433841874, "grad_norm": 27.760202456618853, "learning_rate": 4.9396608289632483e-05, "loss": 2.5445, "mean_token_accuracy": 0.4448275864124298, "step": 119330 }, { "epoch": 0.12019547943729157, "grad_norm": 11.669155123272887, "learning_rate": 4.939652208971082e-05, "loss": 2.6354, "mean_token_accuracy": 0.39812461137771604, "step": 119335 }, { "epoch": 0.12020051549039575, "grad_norm": 7.567567153558938, "learning_rate": 4.939643588371607e-05, "loss": 2.1298, "mean_token_accuracy": 0.5019963622093201, "step": 119340 }, { "epoch": 0.12020555154349992, "grad_norm": 11.94060441776718, "learning_rate": 4.939634967164826e-05, "loss": 2.4958, "mean_token_accuracy": 0.3862069010734558, "step": 119345 }, { "epoch": 0.1202105875966041, "grad_norm": 11.562754340638174, "learning_rate": 4.9396263453507415e-05, "loss": 2.7727, "mean_token_accuracy": 0.37586206793785093, "step": 119350 }, { "epoch": 0.12021562364970827, "grad_norm": 10.803906235923012, "learning_rate": 4.939617722929356e-05, "loss": 2.815, "mean_token_accuracy": 0.37586206793785093, "step": 119355 }, { "epoch": 0.12022065970281244, "grad_norm": 10.604244322832534, "learning_rate": 4.939609099900671e-05, "loss": 2.2045, "mean_token_accuracy": 0.4862068951129913, "step": 119360 }, { "epoch": 0.1202256957559166, "grad_norm": 10.367510210182056, "learning_rate": 4.939600476264691e-05, "loss": 2.286, "mean_token_accuracy": 0.4620689690113068, "step": 119365 }, { "epoch": 0.12023073180902077, "grad_norm": 8.71436870723413, "learning_rate": 4.939591852021417e-05, "loss": 2.2074, "mean_token_accuracy": 0.4586206912994385, "step": 119370 }, { "epoch": 0.12023576786212495, "grad_norm": 11.10623928914366, "learning_rate": 4.939583227170851e-05, "loss": 3.0237, "mean_token_accuracy": 0.3448275804519653, "step": 119375 }, { "epoch": 0.12024080391522912, "grad_norm": 11.043683362963579, "learning_rate": 4.9395746017129965e-05, "loss": 2.4732, "mean_token_accuracy": 0.40689656138420105, "step": 119380 }, { "epoch": 0.1202458399683333, "grad_norm": 13.284203050957663, "learning_rate": 4.9395659756478545e-05, "loss": 3.2988, "mean_token_accuracy": 0.36896551251411436, "step": 119385 }, { "epoch": 0.12025087602143747, "grad_norm": 9.54136856788621, "learning_rate": 4.939557348975429e-05, "loss": 2.2835, "mean_token_accuracy": 0.4172413766384125, "step": 119390 }, { "epoch": 0.12025591207454164, "grad_norm": 10.81992871715047, "learning_rate": 4.939548721695721e-05, "loss": 2.1642, "mean_token_accuracy": 0.4517241358757019, "step": 119395 }, { "epoch": 0.12026094812764582, "grad_norm": 10.651737103546157, "learning_rate": 4.939540093808734e-05, "loss": 2.5969, "mean_token_accuracy": 0.36206896901130675, "step": 119400 }, { "epoch": 0.12026598418074999, "grad_norm": 9.287539398613724, "learning_rate": 4.9395314653144696e-05, "loss": 2.1858, "mean_token_accuracy": 0.42758620381355283, "step": 119405 }, { "epoch": 0.12027102023385416, "grad_norm": 9.869456143179955, "learning_rate": 4.9395228362129313e-05, "loss": 2.6165, "mean_token_accuracy": 0.42758620381355283, "step": 119410 }, { "epoch": 0.12027605628695834, "grad_norm": 32.52525249723404, "learning_rate": 4.939514206504119e-05, "loss": 3.0794, "mean_token_accuracy": 0.3965517163276672, "step": 119415 }, { "epoch": 0.12028109234006251, "grad_norm": 12.64574993353301, "learning_rate": 4.939505576188038e-05, "loss": 2.3077, "mean_token_accuracy": 0.4517241358757019, "step": 119420 }, { "epoch": 0.12028612839316669, "grad_norm": 9.992750336531746, "learning_rate": 4.939496945264689e-05, "loss": 2.5118, "mean_token_accuracy": 0.4448275864124298, "step": 119425 }, { "epoch": 0.12029116444627086, "grad_norm": 10.348380643279022, "learning_rate": 4.939488313734076e-05, "loss": 2.0726, "mean_token_accuracy": 0.4862069010734558, "step": 119430 }, { "epoch": 0.12029620049937502, "grad_norm": 11.011436839827741, "learning_rate": 4.939479681596199e-05, "loss": 2.5894, "mean_token_accuracy": 0.4206896543502808, "step": 119435 }, { "epoch": 0.1203012365524792, "grad_norm": 9.745717444248355, "learning_rate": 4.939471048851062e-05, "loss": 2.4751, "mean_token_accuracy": 0.39655172228813174, "step": 119440 }, { "epoch": 0.12030627260558337, "grad_norm": 11.221352869419286, "learning_rate": 4.939462415498666e-05, "loss": 2.1377, "mean_token_accuracy": 0.44827585816383364, "step": 119445 }, { "epoch": 0.12031130865868754, "grad_norm": 12.30248396059743, "learning_rate": 4.9394537815390165e-05, "loss": 2.4484, "mean_token_accuracy": 0.3896551728248596, "step": 119450 }, { "epoch": 0.12031634471179171, "grad_norm": 7.836043482022656, "learning_rate": 4.939445146972112e-05, "loss": 2.1376, "mean_token_accuracy": 0.44137929677963256, "step": 119455 }, { "epoch": 0.12032138076489589, "grad_norm": 10.88343759550977, "learning_rate": 4.939436511797958e-05, "loss": 2.1031, "mean_token_accuracy": 0.4620689630508423, "step": 119460 }, { "epoch": 0.12032641681800006, "grad_norm": 16.91187662092988, "learning_rate": 4.939427876016555e-05, "loss": 2.9889, "mean_token_accuracy": 0.4172413766384125, "step": 119465 }, { "epoch": 0.12033145287110424, "grad_norm": 11.636216546632317, "learning_rate": 4.939419239627906e-05, "loss": 2.2032, "mean_token_accuracy": 0.4569267988204956, "step": 119470 }, { "epoch": 0.12033648892420841, "grad_norm": 10.63139269103073, "learning_rate": 4.939410602632013e-05, "loss": 1.991, "mean_token_accuracy": 0.5090139031410217, "step": 119475 }, { "epoch": 0.12034152497731258, "grad_norm": 9.018000626298614, "learning_rate": 4.93940196502888e-05, "loss": 2.4226, "mean_token_accuracy": 0.3896551728248596, "step": 119480 }, { "epoch": 0.12034656103041676, "grad_norm": 8.911382714420135, "learning_rate": 4.939393326818507e-05, "loss": 1.8531, "mean_token_accuracy": 0.5034482717514038, "step": 119485 }, { "epoch": 0.12035159708352093, "grad_norm": 11.54642673046286, "learning_rate": 4.9393846880008984e-05, "loss": 2.3367, "mean_token_accuracy": 0.43793103098869324, "step": 119490 }, { "epoch": 0.1203566331366251, "grad_norm": 10.832706700787105, "learning_rate": 4.939376048576055e-05, "loss": 2.2838, "mean_token_accuracy": 0.4137930989265442, "step": 119495 }, { "epoch": 0.12036166918972928, "grad_norm": 9.819163447589506, "learning_rate": 4.939367408543981e-05, "loss": 2.1329, "mean_token_accuracy": 0.44482759237289426, "step": 119500 }, { "epoch": 0.12036670524283344, "grad_norm": 9.918766033231737, "learning_rate": 4.939358767904677e-05, "loss": 2.2703, "mean_token_accuracy": 0.42413792610168455, "step": 119505 }, { "epoch": 0.12037174129593761, "grad_norm": 11.972368695892213, "learning_rate": 4.9393501266581465e-05, "loss": 2.3409, "mean_token_accuracy": 0.4620689690113068, "step": 119510 }, { "epoch": 0.12037677734904179, "grad_norm": 11.767434629251118, "learning_rate": 4.939341484804391e-05, "loss": 2.6007, "mean_token_accuracy": 0.38275861740112305, "step": 119515 }, { "epoch": 0.12038181340214596, "grad_norm": 8.979492131450424, "learning_rate": 4.9393328423434146e-05, "loss": 2.5495, "mean_token_accuracy": 0.4413793087005615, "step": 119520 }, { "epoch": 0.12038684945525013, "grad_norm": 9.724939695225908, "learning_rate": 4.939324199275218e-05, "loss": 2.2549, "mean_token_accuracy": 0.44482758045196535, "step": 119525 }, { "epoch": 0.12039188550835431, "grad_norm": 12.575643828864438, "learning_rate": 4.9393155555998034e-05, "loss": 2.6445, "mean_token_accuracy": 0.37931033968925476, "step": 119530 }, { "epoch": 0.12039692156145848, "grad_norm": 11.776350353140675, "learning_rate": 4.939306911317175e-05, "loss": 2.3675, "mean_token_accuracy": 0.39655172526836396, "step": 119535 }, { "epoch": 0.12040195761456265, "grad_norm": 9.704736849706698, "learning_rate": 4.939298266427334e-05, "loss": 2.2858, "mean_token_accuracy": 0.4570477962493896, "step": 119540 }, { "epoch": 0.12040699366766683, "grad_norm": 10.293264739354548, "learning_rate": 4.9392896209302826e-05, "loss": 2.3684, "mean_token_accuracy": 0.4137930989265442, "step": 119545 }, { "epoch": 0.120412029720771, "grad_norm": 11.01948440028967, "learning_rate": 4.9392809748260236e-05, "loss": 2.0536, "mean_token_accuracy": 0.4918935298919678, "step": 119550 }, { "epoch": 0.12041706577387518, "grad_norm": 10.314883741051242, "learning_rate": 4.939272328114559e-05, "loss": 2.0789, "mean_token_accuracy": 0.4896551728248596, "step": 119555 }, { "epoch": 0.12042210182697935, "grad_norm": 9.714599254518184, "learning_rate": 4.939263680795892e-05, "loss": 2.3804, "mean_token_accuracy": 0.42068966031074523, "step": 119560 }, { "epoch": 0.12042713788008352, "grad_norm": 7.991958419485984, "learning_rate": 4.939255032870025e-05, "loss": 2.2102, "mean_token_accuracy": 0.4435571670532227, "step": 119565 }, { "epoch": 0.1204321739331877, "grad_norm": 9.614107222669023, "learning_rate": 4.9392463843369595e-05, "loss": 2.3379, "mean_token_accuracy": 0.40344828367233276, "step": 119570 }, { "epoch": 0.12043720998629186, "grad_norm": 8.794686040579972, "learning_rate": 4.9392377351966986e-05, "loss": 2.1446, "mean_token_accuracy": 0.4620689630508423, "step": 119575 }, { "epoch": 0.12044224603939603, "grad_norm": 12.737362241231565, "learning_rate": 4.939229085449243e-05, "loss": 2.4156, "mean_token_accuracy": 0.42758620381355283, "step": 119580 }, { "epoch": 0.1204472820925002, "grad_norm": 12.01372889612714, "learning_rate": 4.939220435094599e-05, "loss": 2.3528, "mean_token_accuracy": 0.441379314661026, "step": 119585 }, { "epoch": 0.12045231814560438, "grad_norm": 9.591463992722597, "learning_rate": 4.939211784132765e-05, "loss": 2.0362, "mean_token_accuracy": 0.4878402888774872, "step": 119590 }, { "epoch": 0.12045735419870855, "grad_norm": 12.380487522905705, "learning_rate": 4.939203132563745e-05, "loss": 2.5136, "mean_token_accuracy": 0.4448275864124298, "step": 119595 }, { "epoch": 0.12046239025181273, "grad_norm": 10.470770068110566, "learning_rate": 4.939194480387542e-05, "loss": 2.5423, "mean_token_accuracy": 0.4137930929660797, "step": 119600 }, { "epoch": 0.1204674263049169, "grad_norm": 12.650502820147741, "learning_rate": 4.939185827604157e-05, "loss": 2.3645, "mean_token_accuracy": 0.4034482717514038, "step": 119605 }, { "epoch": 0.12047246235802107, "grad_norm": 9.570650699452537, "learning_rate": 4.939177174213593e-05, "loss": 2.2257, "mean_token_accuracy": 0.4206896543502808, "step": 119610 }, { "epoch": 0.12047749841112525, "grad_norm": 11.739066180743068, "learning_rate": 4.939168520215854e-05, "loss": 2.2903, "mean_token_accuracy": 0.417241370677948, "step": 119615 }, { "epoch": 0.12048253446422942, "grad_norm": 11.658804802019306, "learning_rate": 4.9391598656109394e-05, "loss": 2.0974, "mean_token_accuracy": 0.43103447556495667, "step": 119620 }, { "epoch": 0.1204875705173336, "grad_norm": 8.22395582134829, "learning_rate": 4.9391512103988543e-05, "loss": 2.2502, "mean_token_accuracy": 0.5160314559936523, "step": 119625 }, { "epoch": 0.12049260657043777, "grad_norm": 10.632937136180853, "learning_rate": 4.939142554579599e-05, "loss": 2.2095, "mean_token_accuracy": 0.4586206912994385, "step": 119630 }, { "epoch": 0.12049764262354194, "grad_norm": 9.372745707739217, "learning_rate": 4.939133898153178e-05, "loss": 3.4135, "mean_token_accuracy": 0.3620689660310745, "step": 119635 }, { "epoch": 0.12050267867664612, "grad_norm": 12.383556932427554, "learning_rate": 4.9391252411195917e-05, "loss": 2.5519, "mean_token_accuracy": 0.41554749608039854, "step": 119640 }, { "epoch": 0.12050771472975028, "grad_norm": 14.570918422761398, "learning_rate": 4.9391165834788435e-05, "loss": 2.2136, "mean_token_accuracy": 0.47931034564971925, "step": 119645 }, { "epoch": 0.12051275078285445, "grad_norm": 12.057980017328477, "learning_rate": 4.939107925230935e-05, "loss": 2.4286, "mean_token_accuracy": 0.37586206793785093, "step": 119650 }, { "epoch": 0.12051778683595862, "grad_norm": 11.984541302396252, "learning_rate": 4.9390992663758694e-05, "loss": 3.0031, "mean_token_accuracy": 0.3620689630508423, "step": 119655 }, { "epoch": 0.1205228228890628, "grad_norm": 8.9438285946296, "learning_rate": 4.9390906069136494e-05, "loss": 2.102, "mean_token_accuracy": 0.47241378426551817, "step": 119660 }, { "epoch": 0.12052785894216697, "grad_norm": 9.183306602436094, "learning_rate": 4.9390819468442776e-05, "loss": 2.3076, "mean_token_accuracy": 0.4551724135875702, "step": 119665 }, { "epoch": 0.12053289499527114, "grad_norm": 17.02493412871042, "learning_rate": 4.9390732861677546e-05, "loss": 2.5484, "mean_token_accuracy": 0.44137930274009707, "step": 119670 }, { "epoch": 0.12053793104837532, "grad_norm": 10.495945609886947, "learning_rate": 4.9390646248840845e-05, "loss": 2.3382, "mean_token_accuracy": 0.4344827592372894, "step": 119675 }, { "epoch": 0.12054296710147949, "grad_norm": 10.71761475111207, "learning_rate": 4.9390559629932686e-05, "loss": 2.618, "mean_token_accuracy": 0.3999999940395355, "step": 119680 }, { "epoch": 0.12054800315458367, "grad_norm": 10.270847984584083, "learning_rate": 4.939047300495311e-05, "loss": 2.0535, "mean_token_accuracy": 0.46551724076271056, "step": 119685 }, { "epoch": 0.12055303920768784, "grad_norm": 11.069210295224085, "learning_rate": 4.9390386373902124e-05, "loss": 2.1289, "mean_token_accuracy": 0.4620689630508423, "step": 119690 }, { "epoch": 0.12055807526079201, "grad_norm": 12.26007824166588, "learning_rate": 4.9390299736779755e-05, "loss": 2.1972, "mean_token_accuracy": 0.42758620381355283, "step": 119695 }, { "epoch": 0.12056311131389619, "grad_norm": 12.172665597580234, "learning_rate": 4.939021309358603e-05, "loss": 2.5059, "mean_token_accuracy": 0.41034482717514037, "step": 119700 }, { "epoch": 0.12056814736700036, "grad_norm": 10.495685601158664, "learning_rate": 4.9390126444320976e-05, "loss": 2.4284, "mean_token_accuracy": 0.42758620381355283, "step": 119705 }, { "epoch": 0.12057318342010453, "grad_norm": 14.974502979744939, "learning_rate": 4.939003978898461e-05, "loss": 2.473, "mean_token_accuracy": 0.4206896543502808, "step": 119710 }, { "epoch": 0.1205782194732087, "grad_norm": 10.603599990063904, "learning_rate": 4.938995312757696e-05, "loss": 2.0775, "mean_token_accuracy": 0.47586206793785096, "step": 119715 }, { "epoch": 0.12058325552631287, "grad_norm": 10.915449558780491, "learning_rate": 4.938986646009806e-05, "loss": 2.5771, "mean_token_accuracy": 0.47755595445632937, "step": 119720 }, { "epoch": 0.12058829157941704, "grad_norm": 8.697697814021558, "learning_rate": 4.9389779786547905e-05, "loss": 2.2102, "mean_token_accuracy": 0.4517241418361664, "step": 119725 }, { "epoch": 0.12059332763252122, "grad_norm": 9.949494820778709, "learning_rate": 4.938969310692655e-05, "loss": 2.4292, "mean_token_accuracy": 0.44827585816383364, "step": 119730 }, { "epoch": 0.12059836368562539, "grad_norm": 7.897445737047752, "learning_rate": 4.9389606421234006e-05, "loss": 1.9606, "mean_token_accuracy": 0.49516030550003054, "step": 119735 }, { "epoch": 0.12060339973872956, "grad_norm": 8.208998202976456, "learning_rate": 4.93895197294703e-05, "loss": 2.2496, "mean_token_accuracy": 0.47241379618644713, "step": 119740 }, { "epoch": 0.12060843579183374, "grad_norm": 10.776803221210308, "learning_rate": 4.9389433031635456e-05, "loss": 2.4423, "mean_token_accuracy": 0.42413793206214906, "step": 119745 }, { "epoch": 0.12061347184493791, "grad_norm": 10.536352296005909, "learning_rate": 4.938934632772949e-05, "loss": 2.3323, "mean_token_accuracy": 0.4344827592372894, "step": 119750 }, { "epoch": 0.12061850789804208, "grad_norm": 10.659730998003429, "learning_rate": 4.938925961775243e-05, "loss": 2.2688, "mean_token_accuracy": 0.44827587008476255, "step": 119755 }, { "epoch": 0.12062354395114626, "grad_norm": 10.37958100675098, "learning_rate": 4.938917290170431e-05, "loss": 2.218, "mean_token_accuracy": 0.44827585816383364, "step": 119760 }, { "epoch": 0.12062858000425043, "grad_norm": 11.495114283050215, "learning_rate": 4.938908617958514e-05, "loss": 2.1074, "mean_token_accuracy": 0.5026013255119324, "step": 119765 }, { "epoch": 0.1206336160573546, "grad_norm": 11.40299953602754, "learning_rate": 4.938899945139496e-05, "loss": 2.265, "mean_token_accuracy": 0.4689655065536499, "step": 119770 }, { "epoch": 0.12063865211045878, "grad_norm": 10.071225830748558, "learning_rate": 4.9388912717133774e-05, "loss": 2.3125, "mean_token_accuracy": 0.46551724672317507, "step": 119775 }, { "epoch": 0.12064368816356295, "grad_norm": 10.156495815418682, "learning_rate": 4.938882597680162e-05, "loss": 2.5974, "mean_token_accuracy": 0.4, "step": 119780 }, { "epoch": 0.12064872421666711, "grad_norm": 10.775821589432335, "learning_rate": 4.9388739230398515e-05, "loss": 2.5367, "mean_token_accuracy": 0.4068965494632721, "step": 119785 }, { "epoch": 0.12065376026977129, "grad_norm": 11.932509695464233, "learning_rate": 4.938865247792449e-05, "loss": 2.3439, "mean_token_accuracy": 0.4560344874858856, "step": 119790 }, { "epoch": 0.12065879632287546, "grad_norm": 10.289316619823074, "learning_rate": 4.938856571937956e-05, "loss": 2.5038, "mean_token_accuracy": 0.43103447556495667, "step": 119795 }, { "epoch": 0.12066383237597963, "grad_norm": 10.219116144507854, "learning_rate": 4.9388478954763764e-05, "loss": 2.2071, "mean_token_accuracy": 0.4918935298919678, "step": 119800 }, { "epoch": 0.12066886842908381, "grad_norm": 19.40126364999409, "learning_rate": 4.9388392184077106e-05, "loss": 2.546, "mean_token_accuracy": 0.4310344815254211, "step": 119805 }, { "epoch": 0.12067390448218798, "grad_norm": 11.096735218936674, "learning_rate": 4.938830540731963e-05, "loss": 2.2491, "mean_token_accuracy": 0.4931034564971924, "step": 119810 }, { "epoch": 0.12067894053529216, "grad_norm": 11.279328338358503, "learning_rate": 4.938821862449135e-05, "loss": 2.6181, "mean_token_accuracy": 0.42758620977401735, "step": 119815 }, { "epoch": 0.12068397658839633, "grad_norm": 10.461129544658773, "learning_rate": 4.9388131835592284e-05, "loss": 2.448, "mean_token_accuracy": 0.3931034505367279, "step": 119820 }, { "epoch": 0.1206890126415005, "grad_norm": 15.349564432918356, "learning_rate": 4.938804504062247e-05, "loss": 2.4651, "mean_token_accuracy": 0.4344827592372894, "step": 119825 }, { "epoch": 0.12069404869460468, "grad_norm": 10.463945414669572, "learning_rate": 4.938795823958192e-05, "loss": 2.7383, "mean_token_accuracy": 0.42758620977401735, "step": 119830 }, { "epoch": 0.12069908474770885, "grad_norm": 11.141218834272754, "learning_rate": 4.9387871432470665e-05, "loss": 2.6949, "mean_token_accuracy": 0.4206896543502808, "step": 119835 }, { "epoch": 0.12070412080081303, "grad_norm": 9.729841806931288, "learning_rate": 4.938778461928874e-05, "loss": 2.3785, "mean_token_accuracy": 0.4586206912994385, "step": 119840 }, { "epoch": 0.1207091568539172, "grad_norm": 9.286168167246696, "learning_rate": 4.938769780003614e-05, "loss": 2.0795, "mean_token_accuracy": 0.5000000059604645, "step": 119845 }, { "epoch": 0.12071419290702137, "grad_norm": 10.922441992397061, "learning_rate": 4.938761097471291e-05, "loss": 2.6727, "mean_token_accuracy": 0.36551723778247835, "step": 119850 }, { "epoch": 0.12071922896012553, "grad_norm": 13.17136120137017, "learning_rate": 4.938752414331906e-05, "loss": 2.2017, "mean_token_accuracy": 0.49999998807907103, "step": 119855 }, { "epoch": 0.1207242650132297, "grad_norm": 12.032808538737187, "learning_rate": 4.9387437305854634e-05, "loss": 2.4743, "mean_token_accuracy": 0.3758620619773865, "step": 119860 }, { "epoch": 0.12072930106633388, "grad_norm": 11.144776189053026, "learning_rate": 4.9387350462319647e-05, "loss": 2.4078, "mean_token_accuracy": 0.42413792610168455, "step": 119865 }, { "epoch": 0.12073433711943805, "grad_norm": 11.093021360465691, "learning_rate": 4.9387263612714116e-05, "loss": 2.6446, "mean_token_accuracy": 0.3931034505367279, "step": 119870 }, { "epoch": 0.12073937317254223, "grad_norm": 10.068329441708558, "learning_rate": 4.9387176757038077e-05, "loss": 2.1654, "mean_token_accuracy": 0.47586206793785096, "step": 119875 }, { "epoch": 0.1207444092256464, "grad_norm": 11.692878504966988, "learning_rate": 4.938708989529154e-05, "loss": 2.7517, "mean_token_accuracy": 0.38965516686439516, "step": 119880 }, { "epoch": 0.12074944527875058, "grad_norm": 9.56749402782208, "learning_rate": 4.9387003027474545e-05, "loss": 2.5233, "mean_token_accuracy": 0.38620689511299133, "step": 119885 }, { "epoch": 0.12075448133185475, "grad_norm": 9.429646089478142, "learning_rate": 4.93869161535871e-05, "loss": 2.4559, "mean_token_accuracy": 0.43448275327682495, "step": 119890 }, { "epoch": 0.12075951738495892, "grad_norm": 11.126281550131901, "learning_rate": 4.938682927362925e-05, "loss": 2.3736, "mean_token_accuracy": 0.467271625995636, "step": 119895 }, { "epoch": 0.1207645534380631, "grad_norm": 12.089545852863473, "learning_rate": 4.938674238760099e-05, "loss": 2.7572, "mean_token_accuracy": 0.38275861740112305, "step": 119900 }, { "epoch": 0.12076958949116727, "grad_norm": 11.928383775352179, "learning_rate": 4.938665549550238e-05, "loss": 2.8054, "mean_token_accuracy": 0.37241379618644715, "step": 119905 }, { "epoch": 0.12077462554427144, "grad_norm": 13.112245779094343, "learning_rate": 4.9386568597333406e-05, "loss": 2.8366, "mean_token_accuracy": 0.4, "step": 119910 }, { "epoch": 0.12077966159737562, "grad_norm": 11.663632670937336, "learning_rate": 4.9386481693094115e-05, "loss": 2.5705, "mean_token_accuracy": 0.38965516686439516, "step": 119915 }, { "epoch": 0.12078469765047979, "grad_norm": 12.139700838623277, "learning_rate": 4.938639478278453e-05, "loss": 2.3511, "mean_token_accuracy": 0.4517241418361664, "step": 119920 }, { "epoch": 0.12078973370358395, "grad_norm": 10.141972203242688, "learning_rate": 4.938630786640467e-05, "loss": 2.7594, "mean_token_accuracy": 0.3655172407627106, "step": 119925 }, { "epoch": 0.12079476975668813, "grad_norm": 9.07689169863671, "learning_rate": 4.938622094395456e-05, "loss": 2.3086, "mean_token_accuracy": 0.46551724076271056, "step": 119930 }, { "epoch": 0.1207998058097923, "grad_norm": 9.862111892403567, "learning_rate": 4.9386134015434225e-05, "loss": 2.5544, "mean_token_accuracy": 0.41379310488700866, "step": 119935 }, { "epoch": 0.12080484186289647, "grad_norm": 10.557784555330421, "learning_rate": 4.93860470808437e-05, "loss": 2.6732, "mean_token_accuracy": 0.43103447556495667, "step": 119940 }, { "epoch": 0.12080987791600065, "grad_norm": 9.70815899929393, "learning_rate": 4.9385960140182986e-05, "loss": 2.3673, "mean_token_accuracy": 0.4413793087005615, "step": 119945 }, { "epoch": 0.12081491396910482, "grad_norm": 9.443494375477073, "learning_rate": 4.9385873193452124e-05, "loss": 2.6375, "mean_token_accuracy": 0.41034482419490814, "step": 119950 }, { "epoch": 0.120819950022209, "grad_norm": 10.138093443799047, "learning_rate": 4.9385786240651126e-05, "loss": 2.3327, "mean_token_accuracy": 0.4620689690113068, "step": 119955 }, { "epoch": 0.12082498607531317, "grad_norm": 13.190709410072337, "learning_rate": 4.9385699281780026e-05, "loss": 3.5428, "mean_token_accuracy": 0.3793103456497192, "step": 119960 }, { "epoch": 0.12083002212841734, "grad_norm": 10.022703388811944, "learning_rate": 4.938561231683885e-05, "loss": 2.1006, "mean_token_accuracy": 0.4551724076271057, "step": 119965 }, { "epoch": 0.12083505818152152, "grad_norm": 17.57306208010187, "learning_rate": 4.938552534582762e-05, "loss": 2.576, "mean_token_accuracy": 0.41379311084747317, "step": 119970 }, { "epoch": 0.12084009423462569, "grad_norm": 9.476523944353834, "learning_rate": 4.938543836874635e-05, "loss": 2.0163, "mean_token_accuracy": 0.4655172318220139, "step": 119975 }, { "epoch": 0.12084513028772986, "grad_norm": 11.95481414916916, "learning_rate": 4.938535138559508e-05, "loss": 2.4738, "mean_token_accuracy": 0.4896551728248596, "step": 119980 }, { "epoch": 0.12085016634083404, "grad_norm": 8.58745794557383, "learning_rate": 4.938526439637383e-05, "loss": 2.0912, "mean_token_accuracy": 0.45862069725990295, "step": 119985 }, { "epoch": 0.12085520239393821, "grad_norm": 19.392899335778417, "learning_rate": 4.938517740108261e-05, "loss": 2.6767, "mean_token_accuracy": 0.4517241418361664, "step": 119990 }, { "epoch": 0.12086023844704237, "grad_norm": 10.971620664565677, "learning_rate": 4.9385090399721457e-05, "loss": 3.3046, "mean_token_accuracy": 0.3448275774717331, "step": 119995 }, { "epoch": 0.12086527450014654, "grad_norm": 12.210256226935813, "learning_rate": 4.938500339229039e-05, "loss": 2.5852, "mean_token_accuracy": 0.38275861740112305, "step": 120000 }, { "epoch": 0.12087031055325072, "grad_norm": 12.116814794560296, "learning_rate": 4.938491637878943e-05, "loss": 2.7835, "mean_token_accuracy": 0.4, "step": 120005 }, { "epoch": 0.12087534660635489, "grad_norm": 9.982573184528107, "learning_rate": 4.9384829359218624e-05, "loss": 2.0066, "mean_token_accuracy": 0.47931034564971925, "step": 120010 }, { "epoch": 0.12088038265945907, "grad_norm": 9.237819035520596, "learning_rate": 4.938474233357797e-05, "loss": 2.2866, "mean_token_accuracy": 0.46896551847457885, "step": 120015 }, { "epoch": 0.12088541871256324, "grad_norm": 11.201955113769582, "learning_rate": 4.9384655301867504e-05, "loss": 2.7916, "mean_token_accuracy": 0.4, "step": 120020 }, { "epoch": 0.12089045476566741, "grad_norm": 10.385106638494413, "learning_rate": 4.938456826408724e-05, "loss": 2.1148, "mean_token_accuracy": 0.47931034564971925, "step": 120025 }, { "epoch": 0.12089549081877159, "grad_norm": 13.535292564317503, "learning_rate": 4.9384481220237215e-05, "loss": 2.4539, "mean_token_accuracy": 0.3827586233615875, "step": 120030 }, { "epoch": 0.12090052687187576, "grad_norm": 17.069054210040363, "learning_rate": 4.938439417031745e-05, "loss": 2.4322, "mean_token_accuracy": 0.4, "step": 120035 }, { "epoch": 0.12090556292497993, "grad_norm": 8.657831299887144, "learning_rate": 4.938430711432796e-05, "loss": 2.1443, "mean_token_accuracy": 0.47586206793785096, "step": 120040 }, { "epoch": 0.12091059897808411, "grad_norm": 11.789636023570297, "learning_rate": 4.938422005226878e-05, "loss": 2.2108, "mean_token_accuracy": 0.4814277052879333, "step": 120045 }, { "epoch": 0.12091563503118828, "grad_norm": 12.160959023374206, "learning_rate": 4.9384132984139936e-05, "loss": 2.3242, "mean_token_accuracy": 0.3808832347393036, "step": 120050 }, { "epoch": 0.12092067108429246, "grad_norm": 11.746523812364865, "learning_rate": 4.938404590994144e-05, "loss": 2.5708, "mean_token_accuracy": 0.3999999940395355, "step": 120055 }, { "epoch": 0.12092570713739663, "grad_norm": 11.00083660213163, "learning_rate": 4.938395882967332e-05, "loss": 2.1925, "mean_token_accuracy": 0.4413793087005615, "step": 120060 }, { "epoch": 0.12093074319050079, "grad_norm": 10.349146856917724, "learning_rate": 4.9383871743335597e-05, "loss": 2.0897, "mean_token_accuracy": 0.4344827592372894, "step": 120065 }, { "epoch": 0.12093577924360496, "grad_norm": 10.460742382898891, "learning_rate": 4.938378465092831e-05, "loss": 2.8577, "mean_token_accuracy": 0.4275862157344818, "step": 120070 }, { "epoch": 0.12094081529670914, "grad_norm": 12.77132394219928, "learning_rate": 4.938369755245147e-05, "loss": 2.1322, "mean_token_accuracy": 0.4862068951129913, "step": 120075 }, { "epoch": 0.12094585134981331, "grad_norm": 13.448642945068208, "learning_rate": 4.938361044790511e-05, "loss": 2.7866, "mean_token_accuracy": 0.4103448331356049, "step": 120080 }, { "epoch": 0.12095088740291748, "grad_norm": 9.766243083882312, "learning_rate": 4.938352333728924e-05, "loss": 2.6188, "mean_token_accuracy": 0.43103448748588563, "step": 120085 }, { "epoch": 0.12095592345602166, "grad_norm": 9.926836684641998, "learning_rate": 4.9383436220603894e-05, "loss": 2.3454, "mean_token_accuracy": 0.4379310250282288, "step": 120090 }, { "epoch": 0.12096095950912583, "grad_norm": 9.773049601919867, "learning_rate": 4.93833490978491e-05, "loss": 2.3402, "mean_token_accuracy": 0.4137930989265442, "step": 120095 }, { "epoch": 0.12096599556223, "grad_norm": 9.190953412641445, "learning_rate": 4.938326196902488e-05, "loss": 2.1136, "mean_token_accuracy": 0.482758617401123, "step": 120100 }, { "epoch": 0.12097103161533418, "grad_norm": 11.33808946977182, "learning_rate": 4.9383174834131254e-05, "loss": 2.4902, "mean_token_accuracy": 0.4366606116294861, "step": 120105 }, { "epoch": 0.12097606766843835, "grad_norm": 10.95698627837282, "learning_rate": 4.938308769316824e-05, "loss": 2.1453, "mean_token_accuracy": 0.4517241358757019, "step": 120110 }, { "epoch": 0.12098110372154253, "grad_norm": 11.447831193440226, "learning_rate": 4.9383000546135875e-05, "loss": 2.3365, "mean_token_accuracy": 0.4379310369491577, "step": 120115 }, { "epoch": 0.1209861397746467, "grad_norm": 13.032614444825704, "learning_rate": 4.938291339303418e-05, "loss": 2.5661, "mean_token_accuracy": 0.42413792610168455, "step": 120120 }, { "epoch": 0.12099117582775087, "grad_norm": 10.607033673857114, "learning_rate": 4.9382826233863176e-05, "loss": 2.1775, "mean_token_accuracy": 0.47416818141937256, "step": 120125 }, { "epoch": 0.12099621188085505, "grad_norm": 10.928678481999963, "learning_rate": 4.938273906862289e-05, "loss": 2.188, "mean_token_accuracy": 0.43793103098869324, "step": 120130 }, { "epoch": 0.12100124793395921, "grad_norm": 8.716496323919689, "learning_rate": 4.938265189731335e-05, "loss": 2.1433, "mean_token_accuracy": 0.4464004814624786, "step": 120135 }, { "epoch": 0.12100628398706338, "grad_norm": 9.82406300903284, "learning_rate": 4.938256471993457e-05, "loss": 1.7709, "mean_token_accuracy": 0.5068965554237366, "step": 120140 }, { "epoch": 0.12101132004016756, "grad_norm": 12.211974652897986, "learning_rate": 4.938247753648658e-05, "loss": 2.6315, "mean_token_accuracy": 0.3896551728248596, "step": 120145 }, { "epoch": 0.12101635609327173, "grad_norm": 10.989976602793591, "learning_rate": 4.938239034696939e-05, "loss": 2.4892, "mean_token_accuracy": 0.3947973370552063, "step": 120150 }, { "epoch": 0.1210213921463759, "grad_norm": 7.8069879239806905, "learning_rate": 4.938230315138305e-05, "loss": 3.1158, "mean_token_accuracy": 0.3655172407627106, "step": 120155 }, { "epoch": 0.12102642819948008, "grad_norm": 10.614186702616331, "learning_rate": 4.938221594972757e-05, "loss": 2.252, "mean_token_accuracy": 0.4413793087005615, "step": 120160 }, { "epoch": 0.12103146425258425, "grad_norm": 12.378579465273567, "learning_rate": 4.938212874200298e-05, "loss": 2.154, "mean_token_accuracy": 0.4562807857990265, "step": 120165 }, { "epoch": 0.12103650030568842, "grad_norm": 8.096885142546059, "learning_rate": 4.938204152820929e-05, "loss": 2.1504, "mean_token_accuracy": 0.4379310369491577, "step": 120170 }, { "epoch": 0.1210415363587926, "grad_norm": 6.968283989863469, "learning_rate": 4.938195430834654e-05, "loss": 2.1627, "mean_token_accuracy": 0.5124016880989075, "step": 120175 }, { "epoch": 0.12104657241189677, "grad_norm": 9.825609365282302, "learning_rate": 4.938186708241475e-05, "loss": 2.096, "mean_token_accuracy": 0.4620689630508423, "step": 120180 }, { "epoch": 0.12105160846500095, "grad_norm": 9.305946906054047, "learning_rate": 4.938177985041394e-05, "loss": 2.0768, "mean_token_accuracy": 0.4551724135875702, "step": 120185 }, { "epoch": 0.12105664451810512, "grad_norm": 11.453103384495998, "learning_rate": 4.938169261234414e-05, "loss": 2.3913, "mean_token_accuracy": 0.4103448212146759, "step": 120190 }, { "epoch": 0.12106168057120929, "grad_norm": 10.977990146632083, "learning_rate": 4.9381605368205366e-05, "loss": 2.1762, "mean_token_accuracy": 0.48275862336158754, "step": 120195 }, { "epoch": 0.12106671662431347, "grad_norm": 11.151734148888854, "learning_rate": 4.938151811799765e-05, "loss": 2.5549, "mean_token_accuracy": 0.4263157844543457, "step": 120200 }, { "epoch": 0.12107175267741763, "grad_norm": 13.424320880941268, "learning_rate": 4.938143086172101e-05, "loss": 2.8915, "mean_token_accuracy": 0.4034482777118683, "step": 120205 }, { "epoch": 0.1210767887305218, "grad_norm": 11.285235111386482, "learning_rate": 4.9381343599375474e-05, "loss": 2.7379, "mean_token_accuracy": 0.42413792610168455, "step": 120210 }, { "epoch": 0.12108182478362597, "grad_norm": 10.718240452057701, "learning_rate": 4.9381256330961076e-05, "loss": 2.492, "mean_token_accuracy": 0.39655172228813174, "step": 120215 }, { "epoch": 0.12108686083673015, "grad_norm": 13.86921907798621, "learning_rate": 4.938116905647782e-05, "loss": 2.3665, "mean_token_accuracy": 0.4137930989265442, "step": 120220 }, { "epoch": 0.12109189688983432, "grad_norm": 11.210185715616591, "learning_rate": 4.938108177592574e-05, "loss": 2.2386, "mean_token_accuracy": 0.4571082890033722, "step": 120225 }, { "epoch": 0.1210969329429385, "grad_norm": 9.603845382617719, "learning_rate": 4.938099448930486e-05, "loss": 2.3047, "mean_token_accuracy": 0.44137930274009707, "step": 120230 }, { "epoch": 0.12110196899604267, "grad_norm": 10.672811966930382, "learning_rate": 4.9380907196615205e-05, "loss": 2.659, "mean_token_accuracy": 0.4206896543502808, "step": 120235 }, { "epoch": 0.12110700504914684, "grad_norm": 9.996195800708499, "learning_rate": 4.9380819897856806e-05, "loss": 2.3678, "mean_token_accuracy": 0.4068965494632721, "step": 120240 }, { "epoch": 0.12111204110225102, "grad_norm": 10.361795962037187, "learning_rate": 4.9380732593029675e-05, "loss": 2.6602, "mean_token_accuracy": 0.4, "step": 120245 }, { "epoch": 0.12111707715535519, "grad_norm": 12.399482218678184, "learning_rate": 4.9380645282133836e-05, "loss": 2.8659, "mean_token_accuracy": 0.3620689570903778, "step": 120250 }, { "epoch": 0.12112211320845936, "grad_norm": 10.285962165738681, "learning_rate": 4.9380557965169325e-05, "loss": 2.495, "mean_token_accuracy": 0.42068966031074523, "step": 120255 }, { "epoch": 0.12112714926156354, "grad_norm": 9.784592649551751, "learning_rate": 4.938047064213616e-05, "loss": 2.4922, "mean_token_accuracy": 0.4, "step": 120260 }, { "epoch": 0.12113218531466771, "grad_norm": 12.99517676049138, "learning_rate": 4.938038331303436e-05, "loss": 2.4436, "mean_token_accuracy": 0.45517240166664125, "step": 120265 }, { "epoch": 0.12113722136777189, "grad_norm": 9.92430422859366, "learning_rate": 4.9380295977863954e-05, "loss": 2.1558, "mean_token_accuracy": 0.48802178502082827, "step": 120270 }, { "epoch": 0.12114225742087605, "grad_norm": 9.64559470193739, "learning_rate": 4.9380208636624976e-05, "loss": 2.5619, "mean_token_accuracy": 0.4, "step": 120275 }, { "epoch": 0.12114729347398022, "grad_norm": 9.997121108832845, "learning_rate": 4.938012128931743e-05, "loss": 2.3047, "mean_token_accuracy": 0.45221675634384156, "step": 120280 }, { "epoch": 0.1211523295270844, "grad_norm": 9.814901312998252, "learning_rate": 4.9380033935941356e-05, "loss": 2.2789, "mean_token_accuracy": 0.4172413766384125, "step": 120285 }, { "epoch": 0.12115736558018857, "grad_norm": 10.38207030718613, "learning_rate": 4.9379946576496774e-05, "loss": 2.351, "mean_token_accuracy": 0.41724138259887694, "step": 120290 }, { "epoch": 0.12116240163329274, "grad_norm": 15.741537450101857, "learning_rate": 4.93798592109837e-05, "loss": 2.8712, "mean_token_accuracy": 0.3793103516101837, "step": 120295 }, { "epoch": 0.12116743768639691, "grad_norm": 10.194906733545208, "learning_rate": 4.9379771839402175e-05, "loss": 2.1374, "mean_token_accuracy": 0.47931034564971925, "step": 120300 }, { "epoch": 0.12117247373950109, "grad_norm": 11.166133493141777, "learning_rate": 4.9379684461752205e-05, "loss": 2.4095, "mean_token_accuracy": 0.39310344457626345, "step": 120305 }, { "epoch": 0.12117750979260526, "grad_norm": 11.285412342761997, "learning_rate": 4.937959707803382e-05, "loss": 2.3926, "mean_token_accuracy": 0.4103448212146759, "step": 120310 }, { "epoch": 0.12118254584570944, "grad_norm": 9.47469002559648, "learning_rate": 4.937950968824706e-05, "loss": 2.3099, "mean_token_accuracy": 0.42758620381355283, "step": 120315 }, { "epoch": 0.12118758189881361, "grad_norm": 11.423265110601163, "learning_rate": 4.937942229239192e-05, "loss": 2.2817, "mean_token_accuracy": 0.441379314661026, "step": 120320 }, { "epoch": 0.12119261795191778, "grad_norm": 11.600232200419542, "learning_rate": 4.937933489046846e-05, "loss": 2.6292, "mean_token_accuracy": 0.3931034505367279, "step": 120325 }, { "epoch": 0.12119765400502196, "grad_norm": 9.398905621716544, "learning_rate": 4.9379247482476674e-05, "loss": 2.3636, "mean_token_accuracy": 0.38620689511299133, "step": 120330 }, { "epoch": 0.12120269005812613, "grad_norm": 9.29751415571977, "learning_rate": 4.93791600684166e-05, "loss": 2.1308, "mean_token_accuracy": 0.4620689630508423, "step": 120335 }, { "epoch": 0.1212077261112303, "grad_norm": 10.86078431887358, "learning_rate": 4.937907264828825e-05, "loss": 2.5892, "mean_token_accuracy": 0.4137930989265442, "step": 120340 }, { "epoch": 0.12121276216433446, "grad_norm": 10.236869872636051, "learning_rate": 4.9378985222091675e-05, "loss": 2.4435, "mean_token_accuracy": 0.43103448748588563, "step": 120345 }, { "epoch": 0.12121779821743864, "grad_norm": 9.665608131417388, "learning_rate": 4.937889778982687e-05, "loss": 2.4599, "mean_token_accuracy": 0.4310344785451889, "step": 120350 }, { "epoch": 0.12122283427054281, "grad_norm": 14.653260246763328, "learning_rate": 4.937881035149387e-05, "loss": 2.489, "mean_token_accuracy": 0.46551724076271056, "step": 120355 }, { "epoch": 0.12122787032364699, "grad_norm": 8.42060325037053, "learning_rate": 4.937872290709271e-05, "loss": 2.8, "mean_token_accuracy": 0.4551724076271057, "step": 120360 }, { "epoch": 0.12123290637675116, "grad_norm": 10.600841404884385, "learning_rate": 4.93786354566234e-05, "loss": 2.4621, "mean_token_accuracy": 0.4103448331356049, "step": 120365 }, { "epoch": 0.12123794242985533, "grad_norm": 12.088405929896956, "learning_rate": 4.937854800008596e-05, "loss": 2.3426, "mean_token_accuracy": 0.38620689511299133, "step": 120370 }, { "epoch": 0.12124297848295951, "grad_norm": 11.0242749881951, "learning_rate": 4.937846053748044e-05, "loss": 2.2108, "mean_token_accuracy": 0.46896551847457885, "step": 120375 }, { "epoch": 0.12124801453606368, "grad_norm": 9.603891355691717, "learning_rate": 4.937837306880683e-05, "loss": 2.9046, "mean_token_accuracy": 0.4034482717514038, "step": 120380 }, { "epoch": 0.12125305058916785, "grad_norm": 11.782482720778125, "learning_rate": 4.937828559406518e-05, "loss": 2.7371, "mean_token_accuracy": 0.4068965554237366, "step": 120385 }, { "epoch": 0.12125808664227203, "grad_norm": 9.161066812798278, "learning_rate": 4.9378198113255504e-05, "loss": 2.5352, "mean_token_accuracy": 0.44827585220336913, "step": 120390 }, { "epoch": 0.1212631226953762, "grad_norm": 11.593063145764928, "learning_rate": 4.937811062637783e-05, "loss": 2.522, "mean_token_accuracy": 0.4103448212146759, "step": 120395 }, { "epoch": 0.12126815874848038, "grad_norm": 12.426460319688541, "learning_rate": 4.937802313343217e-05, "loss": 2.65, "mean_token_accuracy": 0.38620689511299133, "step": 120400 }, { "epoch": 0.12127319480158455, "grad_norm": 9.873844750451001, "learning_rate": 4.937793563441858e-05, "loss": 2.4676, "mean_token_accuracy": 0.41724138259887694, "step": 120405 }, { "epoch": 0.12127823085468872, "grad_norm": 11.272156999710116, "learning_rate": 4.937784812933705e-05, "loss": 2.3113, "mean_token_accuracy": 0.48965516686439514, "step": 120410 }, { "epoch": 0.12128326690779288, "grad_norm": 11.720457982146641, "learning_rate": 4.9377760618187614e-05, "loss": 2.2135, "mean_token_accuracy": 0.4655172348022461, "step": 120415 }, { "epoch": 0.12128830296089706, "grad_norm": 10.601317848607376, "learning_rate": 4.9377673100970304e-05, "loss": 2.4268, "mean_token_accuracy": 0.458620685338974, "step": 120420 }, { "epoch": 0.12129333901400123, "grad_norm": 15.489294899875768, "learning_rate": 4.9377585577685136e-05, "loss": 2.7776, "mean_token_accuracy": 0.4344827592372894, "step": 120425 }, { "epoch": 0.1212983750671054, "grad_norm": 10.375758239062606, "learning_rate": 4.937749804833214e-05, "loss": 2.5071, "mean_token_accuracy": 0.4068965494632721, "step": 120430 }, { "epoch": 0.12130341112020958, "grad_norm": 10.828522449640985, "learning_rate": 4.937741051291135e-05, "loss": 2.4512, "mean_token_accuracy": 0.4103448331356049, "step": 120435 }, { "epoch": 0.12130844717331375, "grad_norm": 9.255624322829284, "learning_rate": 4.9377322971422764e-05, "loss": 2.7181, "mean_token_accuracy": 0.4344827592372894, "step": 120440 }, { "epoch": 0.12131348322641793, "grad_norm": 11.375533692867764, "learning_rate": 4.937723542386643e-05, "loss": 2.543, "mean_token_accuracy": 0.37931033968925476, "step": 120445 }, { "epoch": 0.1213185192795221, "grad_norm": 9.163183192069422, "learning_rate": 4.937714787024235e-05, "loss": 2.5502, "mean_token_accuracy": 0.38620689511299133, "step": 120450 }, { "epoch": 0.12132355533262627, "grad_norm": 12.600403676226422, "learning_rate": 4.937706031055057e-05, "loss": 2.7572, "mean_token_accuracy": 0.3703569293022156, "step": 120455 }, { "epoch": 0.12132859138573045, "grad_norm": 14.277540634614661, "learning_rate": 4.9376972744791105e-05, "loss": 2.491, "mean_token_accuracy": 0.4034482717514038, "step": 120460 }, { "epoch": 0.12133362743883462, "grad_norm": 11.17707346836547, "learning_rate": 4.937688517296399e-05, "loss": 2.571, "mean_token_accuracy": 0.4034482717514038, "step": 120465 }, { "epoch": 0.1213386634919388, "grad_norm": 10.11725155944679, "learning_rate": 4.9376797595069224e-05, "loss": 2.3732, "mean_token_accuracy": 0.417241370677948, "step": 120470 }, { "epoch": 0.12134369954504297, "grad_norm": 10.42665940094799, "learning_rate": 4.937671001110685e-05, "loss": 2.559, "mean_token_accuracy": 0.39655172228813174, "step": 120475 }, { "epoch": 0.12134873559814714, "grad_norm": 10.256120201551772, "learning_rate": 4.9376622421076894e-05, "loss": 2.4898, "mean_token_accuracy": 0.4344827651977539, "step": 120480 }, { "epoch": 0.1213537716512513, "grad_norm": 16.923836450998838, "learning_rate": 4.937653482497938e-05, "loss": 2.4644, "mean_token_accuracy": 0.4517241299152374, "step": 120485 }, { "epoch": 0.12135880770435548, "grad_norm": 12.059392930689311, "learning_rate": 4.937644722281431e-05, "loss": 2.5091, "mean_token_accuracy": 0.41893526911735535, "step": 120490 }, { "epoch": 0.12136384375745965, "grad_norm": 9.94589110734491, "learning_rate": 4.937635961458174e-05, "loss": 2.4979, "mean_token_accuracy": 0.4655172348022461, "step": 120495 }, { "epoch": 0.12136887981056382, "grad_norm": 10.345628469984126, "learning_rate": 4.9376272000281676e-05, "loss": 2.5816, "mean_token_accuracy": 0.41203871965408323, "step": 120500 }, { "epoch": 0.121373915863668, "grad_norm": 11.58355614348508, "learning_rate": 4.9376184379914144e-05, "loss": 2.5957, "mean_token_accuracy": 0.41379310488700866, "step": 120505 }, { "epoch": 0.12137895191677217, "grad_norm": 10.063103209053208, "learning_rate": 4.9376096753479173e-05, "loss": 2.1287, "mean_token_accuracy": 0.44827585220336913, "step": 120510 }, { "epoch": 0.12138398796987634, "grad_norm": 10.785798079029975, "learning_rate": 4.9376009120976784e-05, "loss": 2.3553, "mean_token_accuracy": 0.39655172228813174, "step": 120515 }, { "epoch": 0.12138902402298052, "grad_norm": 9.27298247722596, "learning_rate": 4.937592148240701e-05, "loss": 2.5951, "mean_token_accuracy": 0.3931034505367279, "step": 120520 }, { "epoch": 0.12139406007608469, "grad_norm": 10.36050684981949, "learning_rate": 4.9375833837769855e-05, "loss": 2.213, "mean_token_accuracy": 0.4448275864124298, "step": 120525 }, { "epoch": 0.12139909612918887, "grad_norm": 11.145933623528881, "learning_rate": 4.9375746187065364e-05, "loss": 2.2229, "mean_token_accuracy": 0.44137930274009707, "step": 120530 }, { "epoch": 0.12140413218229304, "grad_norm": 11.326628551346337, "learning_rate": 4.937565853029355e-05, "loss": 2.5891, "mean_token_accuracy": 0.3793103456497192, "step": 120535 }, { "epoch": 0.12140916823539721, "grad_norm": 9.883880007910863, "learning_rate": 4.937557086745444e-05, "loss": 2.3097, "mean_token_accuracy": 0.47241378426551817, "step": 120540 }, { "epoch": 0.12141420428850139, "grad_norm": 8.91231419260317, "learning_rate": 4.9375483198548055e-05, "loss": 2.299, "mean_token_accuracy": 0.45892316699028013, "step": 120545 }, { "epoch": 0.12141924034160556, "grad_norm": 11.438918901227437, "learning_rate": 4.937539552357443e-05, "loss": 2.4636, "mean_token_accuracy": 0.4137930989265442, "step": 120550 }, { "epoch": 0.12142427639470972, "grad_norm": 10.001018463477065, "learning_rate": 4.937530784253358e-05, "loss": 2.399, "mean_token_accuracy": 0.4310344815254211, "step": 120555 }, { "epoch": 0.1214293124478139, "grad_norm": 11.553979131488429, "learning_rate": 4.9375220155425534e-05, "loss": 2.5057, "mean_token_accuracy": 0.4103448212146759, "step": 120560 }, { "epoch": 0.12143434850091807, "grad_norm": 9.547920778014074, "learning_rate": 4.937513246225031e-05, "loss": 2.1815, "mean_token_accuracy": 0.44137930274009707, "step": 120565 }, { "epoch": 0.12143938455402224, "grad_norm": 11.02842244577163, "learning_rate": 4.9375044763007944e-05, "loss": 2.3135, "mean_token_accuracy": 0.39848760068416594, "step": 120570 }, { "epoch": 0.12144442060712642, "grad_norm": 10.233708769251557, "learning_rate": 4.9374957057698444e-05, "loss": 2.5839, "mean_token_accuracy": 0.41379310488700866, "step": 120575 }, { "epoch": 0.12144945666023059, "grad_norm": 9.63693952197964, "learning_rate": 4.937486934632185e-05, "loss": 2.2923, "mean_token_accuracy": 0.43103448748588563, "step": 120580 }, { "epoch": 0.12145449271333476, "grad_norm": 11.712512949514247, "learning_rate": 4.937478162887818e-05, "loss": 2.3935, "mean_token_accuracy": 0.37241379022598264, "step": 120585 }, { "epoch": 0.12145952876643894, "grad_norm": 10.196705029413163, "learning_rate": 4.937469390536744e-05, "loss": 3.0547, "mean_token_accuracy": 0.3862069010734558, "step": 120590 }, { "epoch": 0.12146456481954311, "grad_norm": 11.673843445773649, "learning_rate": 4.937460617578969e-05, "loss": 2.4666, "mean_token_accuracy": 0.47339901328086853, "step": 120595 }, { "epoch": 0.12146960087264728, "grad_norm": 11.051134910077542, "learning_rate": 4.937451844014493e-05, "loss": 2.3782, "mean_token_accuracy": 0.4137930989265442, "step": 120600 }, { "epoch": 0.12147463692575146, "grad_norm": 9.662490137264092, "learning_rate": 4.9374430698433195e-05, "loss": 2.2281, "mean_token_accuracy": 0.4310344815254211, "step": 120605 }, { "epoch": 0.12147967297885563, "grad_norm": 10.594896887822555, "learning_rate": 4.93743429506545e-05, "loss": 2.1138, "mean_token_accuracy": 0.4586206912994385, "step": 120610 }, { "epoch": 0.1214847090319598, "grad_norm": 10.164091822738284, "learning_rate": 4.937425519680888e-05, "loss": 2.3946, "mean_token_accuracy": 0.4137930989265442, "step": 120615 }, { "epoch": 0.12148974508506398, "grad_norm": 8.900498666653574, "learning_rate": 4.937416743689636e-05, "loss": 2.0881, "mean_token_accuracy": 0.4344827651977539, "step": 120620 }, { "epoch": 0.12149478113816814, "grad_norm": 9.72999431506868, "learning_rate": 4.937407967091694e-05, "loss": 2.4121, "mean_token_accuracy": 0.44482758045196535, "step": 120625 }, { "epoch": 0.12149981719127231, "grad_norm": 10.054439081019352, "learning_rate": 4.9373991898870666e-05, "loss": 2.8045, "mean_token_accuracy": 0.3655172407627106, "step": 120630 }, { "epoch": 0.12150485324437649, "grad_norm": 10.00976193556926, "learning_rate": 4.937390412075757e-05, "loss": 2.0481, "mean_token_accuracy": 0.5122807025909424, "step": 120635 }, { "epoch": 0.12150988929748066, "grad_norm": 11.67198068390328, "learning_rate": 4.937381633657766e-05, "loss": 2.1867, "mean_token_accuracy": 0.4862068951129913, "step": 120640 }, { "epoch": 0.12151492535058483, "grad_norm": 9.697137386991486, "learning_rate": 4.937372854633097e-05, "loss": 2.7594, "mean_token_accuracy": 0.3551724076271057, "step": 120645 }, { "epoch": 0.12151996140368901, "grad_norm": 10.94089020360911, "learning_rate": 4.937364075001751e-05, "loss": 2.2663, "mean_token_accuracy": 0.4620689630508423, "step": 120650 }, { "epoch": 0.12152499745679318, "grad_norm": 12.503431871236701, "learning_rate": 4.937355294763732e-05, "loss": 2.2608, "mean_token_accuracy": 0.4517241358757019, "step": 120655 }, { "epoch": 0.12153003350989736, "grad_norm": 11.678741885218328, "learning_rate": 4.937346513919042e-05, "loss": 2.9036, "mean_token_accuracy": 0.4068965494632721, "step": 120660 }, { "epoch": 0.12153506956300153, "grad_norm": 10.005761581058362, "learning_rate": 4.9373377324676823e-05, "loss": 2.2571, "mean_token_accuracy": 0.43793103098869324, "step": 120665 }, { "epoch": 0.1215401056161057, "grad_norm": 7.548786495964204, "learning_rate": 4.937328950409657e-05, "loss": 2.2689, "mean_token_accuracy": 0.49546279907226565, "step": 120670 }, { "epoch": 0.12154514166920988, "grad_norm": 9.387003846558114, "learning_rate": 4.937320167744968e-05, "loss": 2.6214, "mean_token_accuracy": 0.40689654648303986, "step": 120675 }, { "epoch": 0.12155017772231405, "grad_norm": 14.687873187352, "learning_rate": 4.937311384473617e-05, "loss": 2.1185, "mean_token_accuracy": 0.5, "step": 120680 }, { "epoch": 0.12155521377541822, "grad_norm": 9.67346853750333, "learning_rate": 4.937302600595608e-05, "loss": 2.4354, "mean_token_accuracy": 0.42413792610168455, "step": 120685 }, { "epoch": 0.1215602498285224, "grad_norm": 8.824522417738727, "learning_rate": 4.9372938161109414e-05, "loss": 1.9263, "mean_token_accuracy": 0.5103448390960693, "step": 120690 }, { "epoch": 0.12156528588162656, "grad_norm": 9.415608002719981, "learning_rate": 4.937285031019622e-05, "loss": 2.2339, "mean_token_accuracy": 0.4499092519283295, "step": 120695 }, { "epoch": 0.12157032193473073, "grad_norm": 13.767608234370705, "learning_rate": 4.93727624532165e-05, "loss": 2.7323, "mean_token_accuracy": 0.3793103456497192, "step": 120700 }, { "epoch": 0.1215753579878349, "grad_norm": 10.245752921240404, "learning_rate": 4.9372674590170285e-05, "loss": 2.534, "mean_token_accuracy": 0.3793103456497192, "step": 120705 }, { "epoch": 0.12158039404093908, "grad_norm": 10.983964394106494, "learning_rate": 4.937258672105761e-05, "loss": 2.1828, "mean_token_accuracy": 0.4551724076271057, "step": 120710 }, { "epoch": 0.12158543009404325, "grad_norm": 11.822016746933091, "learning_rate": 4.937249884587849e-05, "loss": 2.3725, "mean_token_accuracy": 0.39310343861579894, "step": 120715 }, { "epoch": 0.12159046614714743, "grad_norm": 10.601322210612762, "learning_rate": 4.937241096463295e-05, "loss": 2.9515, "mean_token_accuracy": 0.36551723778247835, "step": 120720 }, { "epoch": 0.1215955022002516, "grad_norm": 12.6458548777165, "learning_rate": 4.937232307732101e-05, "loss": 2.7574, "mean_token_accuracy": 0.4, "step": 120725 }, { "epoch": 0.12160053825335577, "grad_norm": 11.763389772036374, "learning_rate": 4.937223518394271e-05, "loss": 2.1901, "mean_token_accuracy": 0.4413793087005615, "step": 120730 }, { "epoch": 0.12160557430645995, "grad_norm": 9.92799858890616, "learning_rate": 4.9372147284498055e-05, "loss": 2.4223, "mean_token_accuracy": 0.37931033968925476, "step": 120735 }, { "epoch": 0.12161061035956412, "grad_norm": 12.086966002312767, "learning_rate": 4.9372059378987087e-05, "loss": 2.2654, "mean_token_accuracy": 0.4275862157344818, "step": 120740 }, { "epoch": 0.1216156464126683, "grad_norm": 8.773783071522493, "learning_rate": 4.937197146740981e-05, "loss": 2.1319, "mean_token_accuracy": 0.441379314661026, "step": 120745 }, { "epoch": 0.12162068246577247, "grad_norm": 10.49362533066179, "learning_rate": 4.937188354976626e-05, "loss": 2.3509, "mean_token_accuracy": 0.4137930989265442, "step": 120750 }, { "epoch": 0.12162571851887664, "grad_norm": 9.880684153305749, "learning_rate": 4.937179562605648e-05, "loss": 2.3882, "mean_token_accuracy": 0.4068965524435043, "step": 120755 }, { "epoch": 0.12163075457198082, "grad_norm": 8.57741112309702, "learning_rate": 4.9371707696280455e-05, "loss": 1.9318, "mean_token_accuracy": 0.49866907596588134, "step": 120760 }, { "epoch": 0.12163579062508498, "grad_norm": 10.615805398260516, "learning_rate": 4.937161976043824e-05, "loss": 2.8547, "mean_token_accuracy": 0.358620685338974, "step": 120765 }, { "epoch": 0.12164082667818915, "grad_norm": 9.87325862359283, "learning_rate": 4.9371531818529846e-05, "loss": 2.6827, "mean_token_accuracy": 0.3896551728248596, "step": 120770 }, { "epoch": 0.12164586273129332, "grad_norm": 9.93872256032969, "learning_rate": 4.937144387055531e-05, "loss": 2.2824, "mean_token_accuracy": 0.4655172348022461, "step": 120775 }, { "epoch": 0.1216508987843975, "grad_norm": 10.41750432024745, "learning_rate": 4.9371355916514636e-05, "loss": 2.307, "mean_token_accuracy": 0.4517241358757019, "step": 120780 }, { "epoch": 0.12165593483750167, "grad_norm": 12.950178847478162, "learning_rate": 4.937126795640786e-05, "loss": 2.5854, "mean_token_accuracy": 0.3793103456497192, "step": 120785 }, { "epoch": 0.12166097089060585, "grad_norm": 9.266068436255633, "learning_rate": 4.9371179990235016e-05, "loss": 1.9095, "mean_token_accuracy": 0.5310344874858857, "step": 120790 }, { "epoch": 0.12166600694371002, "grad_norm": 11.248566923948843, "learning_rate": 4.937109201799611e-05, "loss": 2.5359, "mean_token_accuracy": 0.4068965494632721, "step": 120795 }, { "epoch": 0.1216710429968142, "grad_norm": 12.076005330854338, "learning_rate": 4.937100403969118e-05, "loss": 2.3918, "mean_token_accuracy": 0.4137930929660797, "step": 120800 }, { "epoch": 0.12167607904991837, "grad_norm": 12.589601539914614, "learning_rate": 4.9370916055320246e-05, "loss": 2.3997, "mean_token_accuracy": 0.42068966031074523, "step": 120805 }, { "epoch": 0.12168111510302254, "grad_norm": 12.587271314276455, "learning_rate": 4.937082806488333e-05, "loss": 2.3232, "mean_token_accuracy": 0.4103448212146759, "step": 120810 }, { "epoch": 0.12168615115612672, "grad_norm": 14.264993183318627, "learning_rate": 4.937074006838046e-05, "loss": 2.5165, "mean_token_accuracy": 0.44277071952819824, "step": 120815 }, { "epoch": 0.12169118720923089, "grad_norm": 10.280872429740363, "learning_rate": 4.9370652065811655e-05, "loss": 2.3239, "mean_token_accuracy": 0.43448275327682495, "step": 120820 }, { "epoch": 0.12169622326233506, "grad_norm": 10.382145245856712, "learning_rate": 4.9370564057176946e-05, "loss": 2.4227, "mean_token_accuracy": 0.4034482777118683, "step": 120825 }, { "epoch": 0.12170125931543924, "grad_norm": 11.046465818948418, "learning_rate": 4.937047604247635e-05, "loss": 2.5073, "mean_token_accuracy": 0.41034482717514037, "step": 120830 }, { "epoch": 0.1217062953685434, "grad_norm": 11.645857120861283, "learning_rate": 4.9370388021709906e-05, "loss": 2.6629, "mean_token_accuracy": 0.38275861740112305, "step": 120835 }, { "epoch": 0.12171133142164757, "grad_norm": 14.515156698114708, "learning_rate": 4.9370299994877616e-05, "loss": 2.9101, "mean_token_accuracy": 0.37241379022598264, "step": 120840 }, { "epoch": 0.12171636747475174, "grad_norm": 9.620447827615553, "learning_rate": 4.937021196197952e-05, "loss": 2.4242, "mean_token_accuracy": 0.4172413766384125, "step": 120845 }, { "epoch": 0.12172140352785592, "grad_norm": 11.891240101430462, "learning_rate": 4.9370123923015645e-05, "loss": 2.389, "mean_token_accuracy": 0.4103448212146759, "step": 120850 }, { "epoch": 0.12172643958096009, "grad_norm": 12.24415331286597, "learning_rate": 4.9370035877986004e-05, "loss": 2.6654, "mean_token_accuracy": 0.43793103098869324, "step": 120855 }, { "epoch": 0.12173147563406427, "grad_norm": 16.10458497593291, "learning_rate": 4.936994782689063e-05, "loss": 2.8543, "mean_token_accuracy": 0.39310344457626345, "step": 120860 }, { "epoch": 0.12173651168716844, "grad_norm": 9.974693272520863, "learning_rate": 4.936985976972955e-05, "loss": 2.6449, "mean_token_accuracy": 0.42068966031074523, "step": 120865 }, { "epoch": 0.12174154774027261, "grad_norm": 12.418056434408932, "learning_rate": 4.936977170650277e-05, "loss": 2.375, "mean_token_accuracy": 0.4034482777118683, "step": 120870 }, { "epoch": 0.12174658379337679, "grad_norm": 12.80926923393782, "learning_rate": 4.9369683637210326e-05, "loss": 2.5236, "mean_token_accuracy": 0.4517241299152374, "step": 120875 }, { "epoch": 0.12175161984648096, "grad_norm": 11.305023198755583, "learning_rate": 4.9369595561852264e-05, "loss": 2.505, "mean_token_accuracy": 0.43448275327682495, "step": 120880 }, { "epoch": 0.12175665589958513, "grad_norm": 12.939012358247384, "learning_rate": 4.9369507480428565e-05, "loss": 2.2778, "mean_token_accuracy": 0.4931034505367279, "step": 120885 }, { "epoch": 0.12176169195268931, "grad_norm": 11.262067885166859, "learning_rate": 4.936941939293929e-05, "loss": 2.814, "mean_token_accuracy": 0.37931033968925476, "step": 120890 }, { "epoch": 0.12176672800579348, "grad_norm": 11.422960089008283, "learning_rate": 4.9369331299384446e-05, "loss": 2.3471, "mean_token_accuracy": 0.4310344815254211, "step": 120895 }, { "epoch": 0.12177176405889764, "grad_norm": 11.016005044908395, "learning_rate": 4.9369243199764066e-05, "loss": 2.3323, "mean_token_accuracy": 0.4068965494632721, "step": 120900 }, { "epoch": 0.12177680011200182, "grad_norm": 11.228697001756135, "learning_rate": 4.9369155094078166e-05, "loss": 2.3851, "mean_token_accuracy": 0.42413793206214906, "step": 120905 }, { "epoch": 0.12178183616510599, "grad_norm": 9.543767393004085, "learning_rate": 4.936906698232677e-05, "loss": 2.0445, "mean_token_accuracy": 0.49999998807907103, "step": 120910 }, { "epoch": 0.12178687221821016, "grad_norm": 10.888401323398396, "learning_rate": 4.936897886450991e-05, "loss": 2.2918, "mean_token_accuracy": 0.4413793087005615, "step": 120915 }, { "epoch": 0.12179190827131434, "grad_norm": 9.677847150006269, "learning_rate": 4.9368890740627605e-05, "loss": 2.2958, "mean_token_accuracy": 0.4398064136505127, "step": 120920 }, { "epoch": 0.12179694432441851, "grad_norm": 10.724600558176078, "learning_rate": 4.936880261067989e-05, "loss": 2.3988, "mean_token_accuracy": 0.44827587008476255, "step": 120925 }, { "epoch": 0.12180198037752268, "grad_norm": 11.078514878068315, "learning_rate": 4.936871447466677e-05, "loss": 2.0681, "mean_token_accuracy": 0.47447065711021424, "step": 120930 }, { "epoch": 0.12180701643062686, "grad_norm": 10.307876550711665, "learning_rate": 4.936862633258829e-05, "loss": 2.3099, "mean_token_accuracy": 0.41034482419490814, "step": 120935 }, { "epoch": 0.12181205248373103, "grad_norm": 15.823716962417205, "learning_rate": 4.936853818444446e-05, "loss": 2.4597, "mean_token_accuracy": 0.43103448748588563, "step": 120940 }, { "epoch": 0.1218170885368352, "grad_norm": 10.768261005979381, "learning_rate": 4.9368450030235305e-05, "loss": 2.4851, "mean_token_accuracy": 0.4379310369491577, "step": 120945 }, { "epoch": 0.12182212458993938, "grad_norm": 12.907811499072487, "learning_rate": 4.936836186996087e-05, "loss": 2.3195, "mean_token_accuracy": 0.4482758641242981, "step": 120950 }, { "epoch": 0.12182716064304355, "grad_norm": 56.72053711103278, "learning_rate": 4.936827370362115e-05, "loss": 2.285, "mean_token_accuracy": 0.4413793087005615, "step": 120955 }, { "epoch": 0.12183219669614773, "grad_norm": 14.416886640099078, "learning_rate": 4.9368185531216176e-05, "loss": 2.6549, "mean_token_accuracy": 0.46727163791656495, "step": 120960 }, { "epoch": 0.1218372327492519, "grad_norm": 9.846233034514107, "learning_rate": 4.9368097352745994e-05, "loss": 2.1298, "mean_token_accuracy": 0.4655172348022461, "step": 120965 }, { "epoch": 0.12184226880235606, "grad_norm": 10.118920275099352, "learning_rate": 4.936800916821061e-05, "loss": 2.6429, "mean_token_accuracy": 0.42068966031074523, "step": 120970 }, { "epoch": 0.12184730485546023, "grad_norm": 12.882912051116485, "learning_rate": 4.936792097761005e-05, "loss": 2.8345, "mean_token_accuracy": 0.4034482777118683, "step": 120975 }, { "epoch": 0.12185234090856441, "grad_norm": 10.87623865508724, "learning_rate": 4.936783278094433e-05, "loss": 2.2185, "mean_token_accuracy": 0.47586206197738645, "step": 120980 }, { "epoch": 0.12185737696166858, "grad_norm": 11.412181938900813, "learning_rate": 4.93677445782135e-05, "loss": 2.2828, "mean_token_accuracy": 0.4517241358757019, "step": 120985 }, { "epoch": 0.12186241301477276, "grad_norm": 9.268336075017007, "learning_rate": 4.936765636941757e-05, "loss": 2.323, "mean_token_accuracy": 0.41379310488700866, "step": 120990 }, { "epoch": 0.12186744906787693, "grad_norm": 8.857694394723612, "learning_rate": 4.9367568154556554e-05, "loss": 2.5201, "mean_token_accuracy": 0.4413793087005615, "step": 120995 }, { "epoch": 0.1218724851209811, "grad_norm": 10.560603745111912, "learning_rate": 4.936747993363049e-05, "loss": 2.5293, "mean_token_accuracy": 0.4379310369491577, "step": 121000 }, { "epoch": 0.12187752117408528, "grad_norm": 10.309961578868663, "learning_rate": 4.93673917066394e-05, "loss": 2.0209, "mean_token_accuracy": 0.48965516686439514, "step": 121005 }, { "epoch": 0.12188255722718945, "grad_norm": 11.298540414880438, "learning_rate": 4.93673034735833e-05, "loss": 2.6075, "mean_token_accuracy": 0.36206896901130675, "step": 121010 }, { "epoch": 0.12188759328029362, "grad_norm": 15.477266608422392, "learning_rate": 4.9367215234462235e-05, "loss": 2.8664, "mean_token_accuracy": 0.3793103516101837, "step": 121015 }, { "epoch": 0.1218926293333978, "grad_norm": 11.486352501534608, "learning_rate": 4.9367126989276215e-05, "loss": 2.4449, "mean_token_accuracy": 0.39655172228813174, "step": 121020 }, { "epoch": 0.12189766538650197, "grad_norm": 8.983045730980086, "learning_rate": 4.936703873802526e-05, "loss": 2.614, "mean_token_accuracy": 0.4034482717514038, "step": 121025 }, { "epoch": 0.12190270143960615, "grad_norm": 10.376790430148095, "learning_rate": 4.9366950480709405e-05, "loss": 2.5675, "mean_token_accuracy": 0.38965516686439516, "step": 121030 }, { "epoch": 0.12190773749271032, "grad_norm": 11.162068342335486, "learning_rate": 4.936686221732866e-05, "loss": 1.9516, "mean_token_accuracy": 0.5068965494632721, "step": 121035 }, { "epoch": 0.12191277354581448, "grad_norm": 10.600636211328618, "learning_rate": 4.9366773947883064e-05, "loss": 2.4381, "mean_token_accuracy": 0.4275861978530884, "step": 121040 }, { "epoch": 0.12191780959891865, "grad_norm": 10.652840561018301, "learning_rate": 4.936668567237264e-05, "loss": 2.4576, "mean_token_accuracy": 0.38275861740112305, "step": 121045 }, { "epoch": 0.12192284565202283, "grad_norm": 9.656538319058857, "learning_rate": 4.93665973907974e-05, "loss": 2.1529, "mean_token_accuracy": 0.47931034564971925, "step": 121050 }, { "epoch": 0.121927881705127, "grad_norm": 12.661305047453597, "learning_rate": 4.936650910315739e-05, "loss": 2.1475, "mean_token_accuracy": 0.4758620738983154, "step": 121055 }, { "epoch": 0.12193291775823117, "grad_norm": 11.217138411052392, "learning_rate": 4.936642080945262e-05, "loss": 2.5425, "mean_token_accuracy": 0.3827586203813553, "step": 121060 }, { "epoch": 0.12193795381133535, "grad_norm": 9.37308839789053, "learning_rate": 4.936633250968311e-05, "loss": 2.2179, "mean_token_accuracy": 0.4241379380226135, "step": 121065 }, { "epoch": 0.12194298986443952, "grad_norm": 8.51889995827723, "learning_rate": 4.9366244203848896e-05, "loss": 2.2586, "mean_token_accuracy": 0.45172414779663084, "step": 121070 }, { "epoch": 0.1219480259175437, "grad_norm": 9.516899425169772, "learning_rate": 4.936615589194999e-05, "loss": 2.337, "mean_token_accuracy": 0.4931034445762634, "step": 121075 }, { "epoch": 0.12195306197064787, "grad_norm": 10.179137119713541, "learning_rate": 4.936606757398643e-05, "loss": 2.1674, "mean_token_accuracy": 0.5119177162647247, "step": 121080 }, { "epoch": 0.12195809802375204, "grad_norm": 11.19400002726952, "learning_rate": 4.936597924995824e-05, "loss": 2.6164, "mean_token_accuracy": 0.4034482777118683, "step": 121085 }, { "epoch": 0.12196313407685622, "grad_norm": 10.478852863741832, "learning_rate": 4.9365890919865435e-05, "loss": 2.0486, "mean_token_accuracy": 0.47586206793785096, "step": 121090 }, { "epoch": 0.12196817012996039, "grad_norm": 12.88581354153339, "learning_rate": 4.936580258370804e-05, "loss": 2.7266, "mean_token_accuracy": 0.4137930929660797, "step": 121095 }, { "epoch": 0.12197320618306456, "grad_norm": 12.614571098767058, "learning_rate": 4.936571424148609e-05, "loss": 2.8028, "mean_token_accuracy": 0.37586206793785093, "step": 121100 }, { "epoch": 0.12197824223616874, "grad_norm": 11.257747289112984, "learning_rate": 4.93656258931996e-05, "loss": 2.2517, "mean_token_accuracy": 0.43103448748588563, "step": 121105 }, { "epoch": 0.1219832782892729, "grad_norm": 9.710058035522483, "learning_rate": 4.9365537538848596e-05, "loss": 2.5217, "mean_token_accuracy": 0.4034482717514038, "step": 121110 }, { "epoch": 0.12198831434237707, "grad_norm": 12.99042040873905, "learning_rate": 4.93654491784331e-05, "loss": 2.7372, "mean_token_accuracy": 0.39310344457626345, "step": 121115 }, { "epoch": 0.12199335039548125, "grad_norm": 12.102373886652407, "learning_rate": 4.936536081195314e-05, "loss": 2.2319, "mean_token_accuracy": 0.5034482717514038, "step": 121120 }, { "epoch": 0.12199838644858542, "grad_norm": 8.671251500205502, "learning_rate": 4.9365272439408746e-05, "loss": 2.7123, "mean_token_accuracy": 0.3793103486299515, "step": 121125 }, { "epoch": 0.12200342250168959, "grad_norm": 10.510099727228706, "learning_rate": 4.936518406079993e-05, "loss": 2.5184, "mean_token_accuracy": 0.4, "step": 121130 }, { "epoch": 0.12200845855479377, "grad_norm": 9.75872879010189, "learning_rate": 4.9365095676126736e-05, "loss": 2.1933, "mean_token_accuracy": 0.3931034505367279, "step": 121135 }, { "epoch": 0.12201349460789794, "grad_norm": 9.514115811381586, "learning_rate": 4.9365007285389156e-05, "loss": 2.163, "mean_token_accuracy": 0.44482758045196535, "step": 121140 }, { "epoch": 0.12201853066100211, "grad_norm": 11.194827149128853, "learning_rate": 4.9364918888587254e-05, "loss": 2.4977, "mean_token_accuracy": 0.41379310488700866, "step": 121145 }, { "epoch": 0.12202356671410629, "grad_norm": 9.179973665571547, "learning_rate": 4.936483048572103e-05, "loss": 1.9788, "mean_token_accuracy": 0.4707804024219513, "step": 121150 }, { "epoch": 0.12202860276721046, "grad_norm": 10.146628749200255, "learning_rate": 4.936474207679051e-05, "loss": 1.9078, "mean_token_accuracy": 0.4983666121959686, "step": 121155 }, { "epoch": 0.12203363882031464, "grad_norm": 9.934749783353913, "learning_rate": 4.936465366179572e-05, "loss": 2.5568, "mean_token_accuracy": 0.4379310369491577, "step": 121160 }, { "epoch": 0.12203867487341881, "grad_norm": 10.040597958827709, "learning_rate": 4.93645652407367e-05, "loss": 3.0408, "mean_token_accuracy": 0.33236539363861084, "step": 121165 }, { "epoch": 0.12204371092652298, "grad_norm": 12.860581249828853, "learning_rate": 4.9364476813613444e-05, "loss": 2.8723, "mean_token_accuracy": 0.37586206793785093, "step": 121170 }, { "epoch": 0.12204874697962716, "grad_norm": 9.970452458169836, "learning_rate": 4.936438838042601e-05, "loss": 2.3145, "mean_token_accuracy": 0.42413793206214906, "step": 121175 }, { "epoch": 0.12205378303273132, "grad_norm": 10.359453306483196, "learning_rate": 4.93642999411744e-05, "loss": 2.4171, "mean_token_accuracy": 0.4206896543502808, "step": 121180 }, { "epoch": 0.12205881908583549, "grad_norm": 9.92680492800865, "learning_rate": 4.936421149585864e-05, "loss": 2.2162, "mean_token_accuracy": 0.441379314661026, "step": 121185 }, { "epoch": 0.12206385513893966, "grad_norm": 9.15110478953717, "learning_rate": 4.936412304447876e-05, "loss": 2.2946, "mean_token_accuracy": 0.4137930989265442, "step": 121190 }, { "epoch": 0.12206889119204384, "grad_norm": 10.021739379620538, "learning_rate": 4.936403458703479e-05, "loss": 2.4026, "mean_token_accuracy": 0.42413793206214906, "step": 121195 }, { "epoch": 0.12207392724514801, "grad_norm": 17.75240690086342, "learning_rate": 4.936394612352674e-05, "loss": 2.6913, "mean_token_accuracy": 0.4172413766384125, "step": 121200 }, { "epoch": 0.12207896329825219, "grad_norm": 8.899132038704407, "learning_rate": 4.936385765395465e-05, "loss": 1.9303, "mean_token_accuracy": 0.4870689630508423, "step": 121205 }, { "epoch": 0.12208399935135636, "grad_norm": 11.503835137540108, "learning_rate": 4.936376917831853e-05, "loss": 2.5138, "mean_token_accuracy": 0.4172413766384125, "step": 121210 }, { "epoch": 0.12208903540446053, "grad_norm": 12.009769717857852, "learning_rate": 4.936368069661842e-05, "loss": 2.3726, "mean_token_accuracy": 0.4379310369491577, "step": 121215 }, { "epoch": 0.1220940714575647, "grad_norm": 14.429799926429205, "learning_rate": 4.936359220885434e-05, "loss": 2.7104, "mean_token_accuracy": 0.35862068831920624, "step": 121220 }, { "epoch": 0.12209910751066888, "grad_norm": 9.638666670722689, "learning_rate": 4.93635037150263e-05, "loss": 2.4151, "mean_token_accuracy": 0.4379310369491577, "step": 121225 }, { "epoch": 0.12210414356377305, "grad_norm": 12.674910190507333, "learning_rate": 4.936341521513434e-05, "loss": 2.6197, "mean_token_accuracy": 0.41034482717514037, "step": 121230 }, { "epoch": 0.12210917961687723, "grad_norm": 14.569840522821229, "learning_rate": 4.9363326709178484e-05, "loss": 2.6321, "mean_token_accuracy": 0.36896551251411436, "step": 121235 }, { "epoch": 0.1221142156699814, "grad_norm": 10.48847662719086, "learning_rate": 4.936323819715874e-05, "loss": 2.5415, "mean_token_accuracy": 0.42413792610168455, "step": 121240 }, { "epoch": 0.12211925172308558, "grad_norm": 11.810558738847785, "learning_rate": 4.9363149679075166e-05, "loss": 2.6024, "mean_token_accuracy": 0.3896551728248596, "step": 121245 }, { "epoch": 0.12212428777618974, "grad_norm": 10.183504769037581, "learning_rate": 4.9363061154927756e-05, "loss": 2.5701, "mean_token_accuracy": 0.37931033968925476, "step": 121250 }, { "epoch": 0.12212932382929391, "grad_norm": 12.193471720602403, "learning_rate": 4.9362972624716545e-05, "loss": 2.3607, "mean_token_accuracy": 0.4344827473163605, "step": 121255 }, { "epoch": 0.12213435988239808, "grad_norm": 12.938878458011072, "learning_rate": 4.9362884088441554e-05, "loss": 2.2493, "mean_token_accuracy": 0.46896551847457885, "step": 121260 }, { "epoch": 0.12213939593550226, "grad_norm": 11.36122923191146, "learning_rate": 4.936279554610281e-05, "loss": 2.7301, "mean_token_accuracy": 0.4206896543502808, "step": 121265 }, { "epoch": 0.12214443198860643, "grad_norm": 9.776324450531078, "learning_rate": 4.936270699770034e-05, "loss": 2.2343, "mean_token_accuracy": 0.4620689690113068, "step": 121270 }, { "epoch": 0.1221494680417106, "grad_norm": 14.679221488187956, "learning_rate": 4.9362618443234164e-05, "loss": 2.6374, "mean_token_accuracy": 0.4034482717514038, "step": 121275 }, { "epoch": 0.12215450409481478, "grad_norm": 11.263835252959014, "learning_rate": 4.936252988270431e-05, "loss": 2.2481, "mean_token_accuracy": 0.4260738015174866, "step": 121280 }, { "epoch": 0.12215954014791895, "grad_norm": 10.71129766485962, "learning_rate": 4.9362441316110804e-05, "loss": 2.3893, "mean_token_accuracy": 0.42413793206214906, "step": 121285 }, { "epoch": 0.12216457620102313, "grad_norm": 8.405130328536725, "learning_rate": 4.9362352743453675e-05, "loss": 2.0884, "mean_token_accuracy": 0.4517241358757019, "step": 121290 }, { "epoch": 0.1221696122541273, "grad_norm": 11.797503653762023, "learning_rate": 4.936226416473293e-05, "loss": 2.3586, "mean_token_accuracy": 0.4517241418361664, "step": 121295 }, { "epoch": 0.12217464830723147, "grad_norm": 13.954412993068757, "learning_rate": 4.936217557994861e-05, "loss": 2.7041, "mean_token_accuracy": 0.40689654350280763, "step": 121300 }, { "epoch": 0.12217968436033565, "grad_norm": 12.415919735727012, "learning_rate": 4.936208698910073e-05, "loss": 2.5593, "mean_token_accuracy": 0.42413792610168455, "step": 121305 }, { "epoch": 0.12218472041343982, "grad_norm": 10.828909108272393, "learning_rate": 4.9361998392189327e-05, "loss": 2.4364, "mean_token_accuracy": 0.4275862157344818, "step": 121310 }, { "epoch": 0.122189756466544, "grad_norm": 13.159279686413846, "learning_rate": 4.936190978921441e-05, "loss": 2.4755, "mean_token_accuracy": 0.4461584985256195, "step": 121315 }, { "epoch": 0.12219479251964815, "grad_norm": 8.356807728290756, "learning_rate": 4.9361821180176007e-05, "loss": 1.9142, "mean_token_accuracy": 0.4965517342090607, "step": 121320 }, { "epoch": 0.12219982857275233, "grad_norm": 13.356515919549372, "learning_rate": 4.936173256507415e-05, "loss": 2.7456, "mean_token_accuracy": 0.4241379201412201, "step": 121325 }, { "epoch": 0.1222048646258565, "grad_norm": 15.580783028937352, "learning_rate": 4.936164394390887e-05, "loss": 2.2568, "mean_token_accuracy": 0.4206896543502808, "step": 121330 }, { "epoch": 0.12220990067896068, "grad_norm": 9.987052801556084, "learning_rate": 4.936155531668017e-05, "loss": 2.4574, "mean_token_accuracy": 0.4034482777118683, "step": 121335 }, { "epoch": 0.12221493673206485, "grad_norm": 13.054548295401002, "learning_rate": 4.9361466683388085e-05, "loss": 2.2296, "mean_token_accuracy": 0.46551724076271056, "step": 121340 }, { "epoch": 0.12221997278516902, "grad_norm": 10.620575354067812, "learning_rate": 4.9361378044032646e-05, "loss": 2.4148, "mean_token_accuracy": 0.4310344815254211, "step": 121345 }, { "epoch": 0.1222250088382732, "grad_norm": 12.219362256712568, "learning_rate": 4.936128939861387e-05, "loss": 2.5817, "mean_token_accuracy": 0.37931033968925476, "step": 121350 }, { "epoch": 0.12223004489137737, "grad_norm": 10.844771906941629, "learning_rate": 4.9361200747131785e-05, "loss": 2.597, "mean_token_accuracy": 0.43448275327682495, "step": 121355 }, { "epoch": 0.12223508094448154, "grad_norm": 11.191620068907104, "learning_rate": 4.9361112089586416e-05, "loss": 2.5029, "mean_token_accuracy": 0.4034482777118683, "step": 121360 }, { "epoch": 0.12224011699758572, "grad_norm": 10.12592926822746, "learning_rate": 4.936102342597778e-05, "loss": 2.5111, "mean_token_accuracy": 0.41379310488700866, "step": 121365 }, { "epoch": 0.12224515305068989, "grad_norm": 10.03368269690278, "learning_rate": 4.936093475630592e-05, "loss": 2.2759, "mean_token_accuracy": 0.44482759237289426, "step": 121370 }, { "epoch": 0.12225018910379407, "grad_norm": 9.109986488930303, "learning_rate": 4.9360846080570834e-05, "loss": 1.9092, "mean_token_accuracy": 0.5036902606487275, "step": 121375 }, { "epoch": 0.12225522515689824, "grad_norm": 11.97127112414645, "learning_rate": 4.936075739877257e-05, "loss": 2.3626, "mean_token_accuracy": 0.42758620381355283, "step": 121380 }, { "epoch": 0.12226026121000241, "grad_norm": 12.70064289269278, "learning_rate": 4.9360668710911137e-05, "loss": 3.175, "mean_token_accuracy": 0.3594676285982132, "step": 121385 }, { "epoch": 0.12226529726310657, "grad_norm": 10.032753153666066, "learning_rate": 4.936058001698657e-05, "loss": 2.4278, "mean_token_accuracy": 0.42068964838981626, "step": 121390 }, { "epoch": 0.12227033331621075, "grad_norm": 12.213514883895792, "learning_rate": 4.936049131699889e-05, "loss": 2.5609, "mean_token_accuracy": 0.4068965494632721, "step": 121395 }, { "epoch": 0.12227536936931492, "grad_norm": 11.153800724567695, "learning_rate": 4.9360402610948115e-05, "loss": 2.422, "mean_token_accuracy": 0.38620689511299133, "step": 121400 }, { "epoch": 0.1222804054224191, "grad_norm": 11.833366417666083, "learning_rate": 4.936031389883428e-05, "loss": 2.5362, "mean_token_accuracy": 0.4, "step": 121405 }, { "epoch": 0.12228544147552327, "grad_norm": 9.554982204496632, "learning_rate": 4.936022518065741e-05, "loss": 2.3373, "mean_token_accuracy": 0.4310344815254211, "step": 121410 }, { "epoch": 0.12229047752862744, "grad_norm": 9.903573236209526, "learning_rate": 4.9360136456417524e-05, "loss": 2.6248, "mean_token_accuracy": 0.42758620381355283, "step": 121415 }, { "epoch": 0.12229551358173162, "grad_norm": 9.361240432603488, "learning_rate": 4.936004772611464e-05, "loss": 2.3405, "mean_token_accuracy": 0.41724138259887694, "step": 121420 }, { "epoch": 0.12230054963483579, "grad_norm": 14.207082360371928, "learning_rate": 4.935995898974879e-05, "loss": 2.6811, "mean_token_accuracy": 0.37241379022598264, "step": 121425 }, { "epoch": 0.12230558568793996, "grad_norm": 11.99375107440776, "learning_rate": 4.9359870247320007e-05, "loss": 2.5141, "mean_token_accuracy": 0.4261947989463806, "step": 121430 }, { "epoch": 0.12231062174104414, "grad_norm": 8.80324942000633, "learning_rate": 4.9359781498828303e-05, "loss": 2.1206, "mean_token_accuracy": 0.482758617401123, "step": 121435 }, { "epoch": 0.12231565779414831, "grad_norm": 11.111146703953166, "learning_rate": 4.935969274427371e-05, "loss": 2.2333, "mean_token_accuracy": 0.4655172348022461, "step": 121440 }, { "epoch": 0.12232069384725248, "grad_norm": 9.926311242152007, "learning_rate": 4.935960398365625e-05, "loss": 2.1255, "mean_token_accuracy": 0.4310344815254211, "step": 121445 }, { "epoch": 0.12232572990035666, "grad_norm": 10.450654889452851, "learning_rate": 4.935951521697594e-05, "loss": 2.0784, "mean_token_accuracy": 0.4862068951129913, "step": 121450 }, { "epoch": 0.12233076595346083, "grad_norm": 10.294057870232526, "learning_rate": 4.935942644423282e-05, "loss": 2.1529, "mean_token_accuracy": 0.4692679882049561, "step": 121455 }, { "epoch": 0.12233580200656499, "grad_norm": 15.892870821234443, "learning_rate": 4.93593376654269e-05, "loss": 2.3106, "mean_token_accuracy": 0.4689655125141144, "step": 121460 }, { "epoch": 0.12234083805966917, "grad_norm": 10.353767881472717, "learning_rate": 4.935924888055822e-05, "loss": 2.1775, "mean_token_accuracy": 0.46424682140350343, "step": 121465 }, { "epoch": 0.12234587411277334, "grad_norm": 10.278426076973306, "learning_rate": 4.935916008962679e-05, "loss": 2.546, "mean_token_accuracy": 0.42758620381355283, "step": 121470 }, { "epoch": 0.12235091016587751, "grad_norm": 13.428204139539321, "learning_rate": 4.935907129263264e-05, "loss": 2.4375, "mean_token_accuracy": 0.38965516686439516, "step": 121475 }, { "epoch": 0.12235594621898169, "grad_norm": 9.050061318899274, "learning_rate": 4.93589824895758e-05, "loss": 2.4069, "mean_token_accuracy": 0.38620689511299133, "step": 121480 }, { "epoch": 0.12236098227208586, "grad_norm": 9.360855229320544, "learning_rate": 4.935889368045628e-05, "loss": 2.4494, "mean_token_accuracy": 0.4551724076271057, "step": 121485 }, { "epoch": 0.12236601832519003, "grad_norm": 9.60718879025586, "learning_rate": 4.9358804865274124e-05, "loss": 2.693, "mean_token_accuracy": 0.42758620381355283, "step": 121490 }, { "epoch": 0.12237105437829421, "grad_norm": 11.53511541930201, "learning_rate": 4.935871604402934e-05, "loss": 2.4695, "mean_token_accuracy": 0.41724138855934145, "step": 121495 }, { "epoch": 0.12237609043139838, "grad_norm": 11.885172029189246, "learning_rate": 4.9358627216721964e-05, "loss": 2.2745, "mean_token_accuracy": 0.4413793087005615, "step": 121500 }, { "epoch": 0.12238112648450256, "grad_norm": 9.970289511704793, "learning_rate": 4.935853838335202e-05, "loss": 2.4042, "mean_token_accuracy": 0.4379310369491577, "step": 121505 }, { "epoch": 0.12238616253760673, "grad_norm": 9.820630526731277, "learning_rate": 4.935844954391951e-05, "loss": 2.165, "mean_token_accuracy": 0.493103438615799, "step": 121510 }, { "epoch": 0.1223911985907109, "grad_norm": 17.48422252904585, "learning_rate": 4.9358360698424495e-05, "loss": 2.5834, "mean_token_accuracy": 0.4206896543502808, "step": 121515 }, { "epoch": 0.12239623464381508, "grad_norm": 12.956607920809779, "learning_rate": 4.935827184686698e-05, "loss": 2.9828, "mean_token_accuracy": 0.4103448212146759, "step": 121520 }, { "epoch": 0.12240127069691925, "grad_norm": 10.05435123886727, "learning_rate": 4.935818298924699e-05, "loss": 2.091, "mean_token_accuracy": 0.47084089517593386, "step": 121525 }, { "epoch": 0.12240630675002341, "grad_norm": 9.384768454592837, "learning_rate": 4.935809412556455e-05, "loss": 2.3141, "mean_token_accuracy": 0.441379314661026, "step": 121530 }, { "epoch": 0.12241134280312758, "grad_norm": 12.261951383600177, "learning_rate": 4.935800525581969e-05, "loss": 2.2676, "mean_token_accuracy": 0.4689655125141144, "step": 121535 }, { "epoch": 0.12241637885623176, "grad_norm": 13.864205712724058, "learning_rate": 4.935791638001243e-05, "loss": 1.9623, "mean_token_accuracy": 0.4620689690113068, "step": 121540 }, { "epoch": 0.12242141490933593, "grad_norm": 8.363041382122102, "learning_rate": 4.9357827498142784e-05, "loss": 2.0423, "mean_token_accuracy": 0.4689655125141144, "step": 121545 }, { "epoch": 0.1224264509624401, "grad_norm": 8.964253345151192, "learning_rate": 4.93577386102108e-05, "loss": 2.6198, "mean_token_accuracy": 0.44482759237289426, "step": 121550 }, { "epoch": 0.12243148701554428, "grad_norm": 11.927986116234097, "learning_rate": 4.9357649716216485e-05, "loss": 2.3706, "mean_token_accuracy": 0.4689655125141144, "step": 121555 }, { "epoch": 0.12243652306864845, "grad_norm": 9.108694779143375, "learning_rate": 4.935756081615988e-05, "loss": 2.4559, "mean_token_accuracy": 0.4413793087005615, "step": 121560 }, { "epoch": 0.12244155912175263, "grad_norm": 10.995882469029445, "learning_rate": 4.935747191004099e-05, "loss": 2.3059, "mean_token_accuracy": 0.46896551847457885, "step": 121565 }, { "epoch": 0.1224465951748568, "grad_norm": 12.557856584590972, "learning_rate": 4.935738299785984e-05, "loss": 2.3171, "mean_token_accuracy": 0.44827585816383364, "step": 121570 }, { "epoch": 0.12245163122796097, "grad_norm": 7.766123913482772, "learning_rate": 4.9357294079616475e-05, "loss": 2.3083, "mean_token_accuracy": 0.44827585816383364, "step": 121575 }, { "epoch": 0.12245666728106515, "grad_norm": 11.862250227676634, "learning_rate": 4.935720515531091e-05, "loss": 2.4781, "mean_token_accuracy": 0.4034482717514038, "step": 121580 }, { "epoch": 0.12246170333416932, "grad_norm": 9.576481949485492, "learning_rate": 4.9357116224943164e-05, "loss": 2.2323, "mean_token_accuracy": 0.44694494605064394, "step": 121585 }, { "epoch": 0.1224667393872735, "grad_norm": 11.820432407760167, "learning_rate": 4.935702728851326e-05, "loss": 2.5114, "mean_token_accuracy": 0.4569872975349426, "step": 121590 }, { "epoch": 0.12247177544037767, "grad_norm": 10.79248982777244, "learning_rate": 4.935693834602124e-05, "loss": 2.233, "mean_token_accuracy": 0.441379314661026, "step": 121595 }, { "epoch": 0.12247681149348183, "grad_norm": 26.80401715911033, "learning_rate": 4.935684939746711e-05, "loss": 2.7738, "mean_token_accuracy": 0.39310344457626345, "step": 121600 }, { "epoch": 0.122481847546586, "grad_norm": 9.832309485629386, "learning_rate": 4.9356760442850894e-05, "loss": 2.0483, "mean_token_accuracy": 0.47931033968925474, "step": 121605 }, { "epoch": 0.12248688359969018, "grad_norm": 9.752849769286588, "learning_rate": 4.935667148217263e-05, "loss": 2.3149, "mean_token_accuracy": 0.4413793087005615, "step": 121610 }, { "epoch": 0.12249191965279435, "grad_norm": 9.915835315350268, "learning_rate": 4.935658251543234e-05, "loss": 2.1885, "mean_token_accuracy": 0.4, "step": 121615 }, { "epoch": 0.12249695570589852, "grad_norm": 17.210321293211422, "learning_rate": 4.935649354263004e-05, "loss": 2.5434, "mean_token_accuracy": 0.42546883821487425, "step": 121620 }, { "epoch": 0.1225019917590027, "grad_norm": 10.719012778819799, "learning_rate": 4.935640456376577e-05, "loss": 2.5969, "mean_token_accuracy": 0.4, "step": 121625 }, { "epoch": 0.12250702781210687, "grad_norm": 13.328770440093699, "learning_rate": 4.9356315578839536e-05, "loss": 2.5945, "mean_token_accuracy": 0.4310344815254211, "step": 121630 }, { "epoch": 0.12251206386521105, "grad_norm": 10.454093315816003, "learning_rate": 4.935622658785137e-05, "loss": 2.3913, "mean_token_accuracy": 0.41034482717514037, "step": 121635 }, { "epoch": 0.12251709991831522, "grad_norm": 11.469231157751647, "learning_rate": 4.935613759080131e-05, "loss": 2.7981, "mean_token_accuracy": 0.3965517282485962, "step": 121640 }, { "epoch": 0.1225221359714194, "grad_norm": 11.811339025463111, "learning_rate": 4.935604858768935e-05, "loss": 2.2884, "mean_token_accuracy": 0.43297035694122316, "step": 121645 }, { "epoch": 0.12252717202452357, "grad_norm": 10.91494700715504, "learning_rate": 4.935595957851554e-05, "loss": 2.407, "mean_token_accuracy": 0.4034482777118683, "step": 121650 }, { "epoch": 0.12253220807762774, "grad_norm": 9.602808461567404, "learning_rate": 4.935587056327991e-05, "loss": 2.3474, "mean_token_accuracy": 0.3965517282485962, "step": 121655 }, { "epoch": 0.12253724413073191, "grad_norm": 9.886630898940915, "learning_rate": 4.9355781541982455e-05, "loss": 2.3762, "mean_token_accuracy": 0.46733213067054746, "step": 121660 }, { "epoch": 0.12254228018383609, "grad_norm": 15.719746930413635, "learning_rate": 4.935569251462323e-05, "loss": 3.0132, "mean_token_accuracy": 0.31034483313560485, "step": 121665 }, { "epoch": 0.12254731623694025, "grad_norm": 9.711687822022741, "learning_rate": 4.9355603481202243e-05, "loss": 2.427, "mean_token_accuracy": 0.38275861740112305, "step": 121670 }, { "epoch": 0.12255235229004442, "grad_norm": 14.215228459804647, "learning_rate": 4.935551444171952e-05, "loss": 3.0844, "mean_token_accuracy": 0.3947368383407593, "step": 121675 }, { "epoch": 0.1225573883431486, "grad_norm": 8.346107333980514, "learning_rate": 4.935542539617509e-05, "loss": 2.9142, "mean_token_accuracy": 0.3758620619773865, "step": 121680 }, { "epoch": 0.12256242439625277, "grad_norm": 10.163008809713059, "learning_rate": 4.935533634456898e-05, "loss": 2.4207, "mean_token_accuracy": 0.46896551847457885, "step": 121685 }, { "epoch": 0.12256746044935694, "grad_norm": 10.430847811801907, "learning_rate": 4.9355247286901206e-05, "loss": 2.3228, "mean_token_accuracy": 0.43448275327682495, "step": 121690 }, { "epoch": 0.12257249650246112, "grad_norm": 10.771167818701835, "learning_rate": 4.9355158223171806e-05, "loss": 2.4486, "mean_token_accuracy": 0.4103448152542114, "step": 121695 }, { "epoch": 0.12257753255556529, "grad_norm": 10.034137743810472, "learning_rate": 4.935506915338079e-05, "loss": 2.6503, "mean_token_accuracy": 0.4068965554237366, "step": 121700 }, { "epoch": 0.12258256860866946, "grad_norm": 10.4542404728085, "learning_rate": 4.935498007752819e-05, "loss": 2.3909, "mean_token_accuracy": 0.41379310488700866, "step": 121705 }, { "epoch": 0.12258760466177364, "grad_norm": 10.983530671415469, "learning_rate": 4.935489099561403e-05, "loss": 2.7458, "mean_token_accuracy": 0.41724138557910917, "step": 121710 }, { "epoch": 0.12259264071487781, "grad_norm": 8.880514656150925, "learning_rate": 4.935480190763833e-05, "loss": 2.5023, "mean_token_accuracy": 0.42413792908191683, "step": 121715 }, { "epoch": 0.12259767676798199, "grad_norm": 8.189523926450258, "learning_rate": 4.935471281360112e-05, "loss": 2.1778, "mean_token_accuracy": 0.44482758045196535, "step": 121720 }, { "epoch": 0.12260271282108616, "grad_norm": 11.20940686563691, "learning_rate": 4.935462371350244e-05, "loss": 2.4978, "mean_token_accuracy": 0.38620689511299133, "step": 121725 }, { "epoch": 0.12260774887419033, "grad_norm": 10.86722975107101, "learning_rate": 4.935453460734228e-05, "loss": 2.4715, "mean_token_accuracy": 0.41034482717514037, "step": 121730 }, { "epoch": 0.12261278492729451, "grad_norm": 8.845563554866562, "learning_rate": 4.935444549512068e-05, "loss": 2.4495, "mean_token_accuracy": 0.44827585816383364, "step": 121735 }, { "epoch": 0.12261782098039867, "grad_norm": 9.751217644349351, "learning_rate": 4.9354356376837676e-05, "loss": 2.2902, "mean_token_accuracy": 0.458620685338974, "step": 121740 }, { "epoch": 0.12262285703350284, "grad_norm": 10.078829783583458, "learning_rate": 4.935426725249329e-05, "loss": 2.405, "mean_token_accuracy": 0.41379310488700866, "step": 121745 }, { "epoch": 0.12262789308660701, "grad_norm": 10.797611699898967, "learning_rate": 4.935417812208754e-05, "loss": 2.2163, "mean_token_accuracy": 0.4448275864124298, "step": 121750 }, { "epoch": 0.12263292913971119, "grad_norm": 12.4847417045229, "learning_rate": 4.935408898562045e-05, "loss": 2.6193, "mean_token_accuracy": 0.37586206793785093, "step": 121755 }, { "epoch": 0.12263796519281536, "grad_norm": 9.386764998884157, "learning_rate": 4.935399984309205e-05, "loss": 2.3367, "mean_token_accuracy": 0.4620689690113068, "step": 121760 }, { "epoch": 0.12264300124591954, "grad_norm": 10.206212944884228, "learning_rate": 4.935391069450235e-05, "loss": 2.1902, "mean_token_accuracy": 0.43793103098869324, "step": 121765 }, { "epoch": 0.12264803729902371, "grad_norm": 8.367255113049477, "learning_rate": 4.93538215398514e-05, "loss": 2.3965, "mean_token_accuracy": 0.4034482717514038, "step": 121770 }, { "epoch": 0.12265307335212788, "grad_norm": 12.882973084132638, "learning_rate": 4.935373237913921e-05, "loss": 2.1428, "mean_token_accuracy": 0.47931033968925474, "step": 121775 }, { "epoch": 0.12265810940523206, "grad_norm": 12.253773367498395, "learning_rate": 4.935364321236579e-05, "loss": 2.527, "mean_token_accuracy": 0.42413792610168455, "step": 121780 }, { "epoch": 0.12266314545833623, "grad_norm": 10.887786789705851, "learning_rate": 4.935355403953119e-05, "loss": 2.4306, "mean_token_accuracy": 0.43793103098869324, "step": 121785 }, { "epoch": 0.1226681815114404, "grad_norm": 12.070078026633132, "learning_rate": 4.935346486063543e-05, "loss": 2.3688, "mean_token_accuracy": 0.44482758045196535, "step": 121790 }, { "epoch": 0.12267321756454458, "grad_norm": 12.326080088770455, "learning_rate": 4.935337567567853e-05, "loss": 2.456, "mean_token_accuracy": 0.42068966031074523, "step": 121795 }, { "epoch": 0.12267825361764875, "grad_norm": 9.437294848283038, "learning_rate": 4.9353286484660505e-05, "loss": 2.2973, "mean_token_accuracy": 0.43103448748588563, "step": 121800 }, { "epoch": 0.12268328967075293, "grad_norm": 9.5412196916285, "learning_rate": 4.9353197287581406e-05, "loss": 2.4068, "mean_token_accuracy": 0.4413793087005615, "step": 121805 }, { "epoch": 0.12268832572385709, "grad_norm": 10.883449010273933, "learning_rate": 4.935310808444123e-05, "loss": 2.4444, "mean_token_accuracy": 0.4896551728248596, "step": 121810 }, { "epoch": 0.12269336177696126, "grad_norm": 10.060457538465636, "learning_rate": 4.935301887524001e-05, "loss": 2.1762, "mean_token_accuracy": 0.441379314661026, "step": 121815 }, { "epoch": 0.12269839783006543, "grad_norm": 9.876891216365344, "learning_rate": 4.9352929659977775e-05, "loss": 2.386, "mean_token_accuracy": 0.4172413766384125, "step": 121820 }, { "epoch": 0.12270343388316961, "grad_norm": 11.566010370984657, "learning_rate": 4.9352840438654554e-05, "loss": 2.5215, "mean_token_accuracy": 0.4103448331356049, "step": 121825 }, { "epoch": 0.12270846993627378, "grad_norm": 10.526634497768276, "learning_rate": 4.9352751211270365e-05, "loss": 2.2711, "mean_token_accuracy": 0.47931034564971925, "step": 121830 }, { "epoch": 0.12271350598937796, "grad_norm": 9.865047703071046, "learning_rate": 4.9352661977825235e-05, "loss": 2.5044, "mean_token_accuracy": 0.4034482777118683, "step": 121835 }, { "epoch": 0.12271854204248213, "grad_norm": 10.104994077691904, "learning_rate": 4.935257273831918e-05, "loss": 2.7055, "mean_token_accuracy": 0.36896551847457887, "step": 121840 }, { "epoch": 0.1227235780955863, "grad_norm": 10.495909520130299, "learning_rate": 4.935248349275224e-05, "loss": 2.5719, "mean_token_accuracy": 0.4034482777118683, "step": 121845 }, { "epoch": 0.12272861414869048, "grad_norm": 9.977928245229942, "learning_rate": 4.9352394241124424e-05, "loss": 2.1629, "mean_token_accuracy": 0.4586206912994385, "step": 121850 }, { "epoch": 0.12273365020179465, "grad_norm": 8.983168042489476, "learning_rate": 4.935230498343577e-05, "loss": 1.9745, "mean_token_accuracy": 0.4604960739612579, "step": 121855 }, { "epoch": 0.12273868625489882, "grad_norm": 10.463266512923543, "learning_rate": 4.935221571968629e-05, "loss": 2.2338, "mean_token_accuracy": 0.4310344815254211, "step": 121860 }, { "epoch": 0.122743722308003, "grad_norm": 10.715422570772402, "learning_rate": 4.935212644987603e-05, "loss": 2.6194, "mean_token_accuracy": 0.4172413766384125, "step": 121865 }, { "epoch": 0.12274875836110717, "grad_norm": 12.817112697024978, "learning_rate": 4.9352037174005e-05, "loss": 2.3101, "mean_token_accuracy": 0.458620685338974, "step": 121870 }, { "epoch": 0.12275379441421135, "grad_norm": 11.155629682729474, "learning_rate": 4.935194789207322e-05, "loss": 2.6317, "mean_token_accuracy": 0.41379310488700866, "step": 121875 }, { "epoch": 0.1227588304673155, "grad_norm": 11.608936094961171, "learning_rate": 4.9351858604080726e-05, "loss": 2.7478, "mean_token_accuracy": 0.458620685338974, "step": 121880 }, { "epoch": 0.12276386652041968, "grad_norm": 9.982194832615862, "learning_rate": 4.9351769310027536e-05, "loss": 2.2063, "mean_token_accuracy": 0.4482758641242981, "step": 121885 }, { "epoch": 0.12276890257352385, "grad_norm": 9.183322155753466, "learning_rate": 4.935168000991367e-05, "loss": 2.2612, "mean_token_accuracy": 0.4310344815254211, "step": 121890 }, { "epoch": 0.12277393862662803, "grad_norm": 11.314825603512896, "learning_rate": 4.935159070373917e-05, "loss": 2.6206, "mean_token_accuracy": 0.38275861740112305, "step": 121895 }, { "epoch": 0.1227789746797322, "grad_norm": 9.898057998260635, "learning_rate": 4.9351501391504035e-05, "loss": 1.8034, "mean_token_accuracy": 0.517241370677948, "step": 121900 }, { "epoch": 0.12278401073283637, "grad_norm": 11.2381078925426, "learning_rate": 4.9351412073208315e-05, "loss": 2.5622, "mean_token_accuracy": 0.3793103456497192, "step": 121905 }, { "epoch": 0.12278904678594055, "grad_norm": 10.566624335440231, "learning_rate": 4.935132274885202e-05, "loss": 2.377, "mean_token_accuracy": 0.41034482419490814, "step": 121910 }, { "epoch": 0.12279408283904472, "grad_norm": 9.181959502235571, "learning_rate": 4.935123341843519e-05, "loss": 2.3739, "mean_token_accuracy": 0.4103448301553726, "step": 121915 }, { "epoch": 0.1227991188921489, "grad_norm": 11.317684730500437, "learning_rate": 4.935114408195782e-05, "loss": 2.62, "mean_token_accuracy": 0.3896551728248596, "step": 121920 }, { "epoch": 0.12280415494525307, "grad_norm": 9.163524177015958, "learning_rate": 4.935105473941997e-05, "loss": 2.1259, "mean_token_accuracy": 0.47586206793785096, "step": 121925 }, { "epoch": 0.12280919099835724, "grad_norm": 8.947398990806773, "learning_rate": 4.935096539082164e-05, "loss": 2.0649, "mean_token_accuracy": 0.46551724672317507, "step": 121930 }, { "epoch": 0.12281422705146142, "grad_norm": 12.908806992140438, "learning_rate": 4.935087603616287e-05, "loss": 2.4393, "mean_token_accuracy": 0.4517241358757019, "step": 121935 }, { "epoch": 0.12281926310456559, "grad_norm": 8.906935245391251, "learning_rate": 4.9350786675443666e-05, "loss": 2.1313, "mean_token_accuracy": 0.42758620381355283, "step": 121940 }, { "epoch": 0.12282429915766976, "grad_norm": 9.254257509208076, "learning_rate": 4.935069730866407e-05, "loss": 2.5688, "mean_token_accuracy": 0.43448275327682495, "step": 121945 }, { "epoch": 0.12282933521077392, "grad_norm": 12.868984027175689, "learning_rate": 4.935060793582411e-05, "loss": 2.8082, "mean_token_accuracy": 0.358620685338974, "step": 121950 }, { "epoch": 0.1228343712638781, "grad_norm": 9.684376588769746, "learning_rate": 4.9350518556923794e-05, "loss": 2.196, "mean_token_accuracy": 0.4413793087005615, "step": 121955 }, { "epoch": 0.12283940731698227, "grad_norm": 32.1243764826919, "learning_rate": 4.935042917196316e-05, "loss": 3.1854, "mean_token_accuracy": 0.36551723778247835, "step": 121960 }, { "epoch": 0.12284444337008645, "grad_norm": 14.493461315580515, "learning_rate": 4.935033978094222e-05, "loss": 2.697, "mean_token_accuracy": 0.42758620381355283, "step": 121965 }, { "epoch": 0.12284947942319062, "grad_norm": 11.240579427875884, "learning_rate": 4.935025038386101e-05, "loss": 2.487, "mean_token_accuracy": 0.3896551728248596, "step": 121970 }, { "epoch": 0.12285451547629479, "grad_norm": 10.173161328823317, "learning_rate": 4.935016098071956e-05, "loss": 2.4905, "mean_token_accuracy": 0.4586206912994385, "step": 121975 }, { "epoch": 0.12285955152939897, "grad_norm": 12.41371511141704, "learning_rate": 4.935007157151788e-05, "loss": 2.1636, "mean_token_accuracy": 0.4877797901630402, "step": 121980 }, { "epoch": 0.12286458758250314, "grad_norm": 9.607398853428961, "learning_rate": 4.9349982156255994e-05, "loss": 2.5129, "mean_token_accuracy": 0.4448275864124298, "step": 121985 }, { "epoch": 0.12286962363560731, "grad_norm": 8.732479306056428, "learning_rate": 4.934989273493395e-05, "loss": 2.3917, "mean_token_accuracy": 0.41724138259887694, "step": 121990 }, { "epoch": 0.12287465968871149, "grad_norm": 11.018997964429701, "learning_rate": 4.934980330755174e-05, "loss": 2.5562, "mean_token_accuracy": 0.3999999940395355, "step": 121995 }, { "epoch": 0.12287969574181566, "grad_norm": 9.88367912052159, "learning_rate": 4.934971387410942e-05, "loss": 2.3735, "mean_token_accuracy": 0.4206896543502808, "step": 122000 }, { "epoch": 0.12288473179491984, "grad_norm": 11.125284335688404, "learning_rate": 4.934962443460699e-05, "loss": 2.4272, "mean_token_accuracy": 0.4310344934463501, "step": 122005 }, { "epoch": 0.12288976784802401, "grad_norm": 9.200645577064995, "learning_rate": 4.934953498904449e-05, "loss": 2.3684, "mean_token_accuracy": 0.4448275864124298, "step": 122010 }, { "epoch": 0.12289480390112818, "grad_norm": 11.973062038301448, "learning_rate": 4.934944553742194e-05, "loss": 2.3666, "mean_token_accuracy": 0.41034482717514037, "step": 122015 }, { "epoch": 0.12289983995423234, "grad_norm": 11.3152855482902, "learning_rate": 4.934935607973936e-05, "loss": 2.8278, "mean_token_accuracy": 0.4013309061527252, "step": 122020 }, { "epoch": 0.12290487600733652, "grad_norm": 9.861293063168498, "learning_rate": 4.934926661599679e-05, "loss": 2.1448, "mean_token_accuracy": 0.4620689630508423, "step": 122025 }, { "epoch": 0.12290991206044069, "grad_norm": 12.701298497665615, "learning_rate": 4.9349177146194236e-05, "loss": 2.7552, "mean_token_accuracy": 0.40145190358161925, "step": 122030 }, { "epoch": 0.12291494811354486, "grad_norm": 11.370258313665408, "learning_rate": 4.9349087670331726e-05, "loss": 2.3378, "mean_token_accuracy": 0.4034482777118683, "step": 122035 }, { "epoch": 0.12291998416664904, "grad_norm": 9.157821819141166, "learning_rate": 4.93489981884093e-05, "loss": 2.0949, "mean_token_accuracy": 0.48275861144065857, "step": 122040 }, { "epoch": 0.12292502021975321, "grad_norm": 11.709418180204672, "learning_rate": 4.934890870042697e-05, "loss": 2.8257, "mean_token_accuracy": 0.3655172407627106, "step": 122045 }, { "epoch": 0.12293005627285739, "grad_norm": 12.674235237540847, "learning_rate": 4.9348819206384764e-05, "loss": 2.3904, "mean_token_accuracy": 0.4620689690113068, "step": 122050 }, { "epoch": 0.12293509232596156, "grad_norm": 7.832461608738996, "learning_rate": 4.9348729706282706e-05, "loss": 2.2918, "mean_token_accuracy": 0.4482758641242981, "step": 122055 }, { "epoch": 0.12294012837906573, "grad_norm": 11.729691535342516, "learning_rate": 4.9348640200120824e-05, "loss": 2.3476, "mean_token_accuracy": 0.41785714626312254, "step": 122060 }, { "epoch": 0.1229451644321699, "grad_norm": 12.683163604003601, "learning_rate": 4.934855068789913e-05, "loss": 2.1076, "mean_token_accuracy": 0.5137931108474731, "step": 122065 }, { "epoch": 0.12295020048527408, "grad_norm": 10.10668550733948, "learning_rate": 4.9348461169617674e-05, "loss": 2.3285, "mean_token_accuracy": 0.4241379201412201, "step": 122070 }, { "epoch": 0.12295523653837825, "grad_norm": 9.60832629517651, "learning_rate": 4.934837164527645e-05, "loss": 2.0307, "mean_token_accuracy": 0.5020568549633027, "step": 122075 }, { "epoch": 0.12296027259148243, "grad_norm": 10.753520562126957, "learning_rate": 4.934828211487551e-05, "loss": 2.6678, "mean_token_accuracy": 0.486699515581131, "step": 122080 }, { "epoch": 0.1229653086445866, "grad_norm": 10.864081625897514, "learning_rate": 4.934819257841487e-05, "loss": 2.5528, "mean_token_accuracy": 0.42068966031074523, "step": 122085 }, { "epoch": 0.12297034469769076, "grad_norm": 10.425084734217068, "learning_rate": 4.934810303589454e-05, "loss": 2.579, "mean_token_accuracy": 0.41034482717514037, "step": 122090 }, { "epoch": 0.12297538075079494, "grad_norm": 9.165616166946, "learning_rate": 4.9348013487314566e-05, "loss": 2.2168, "mean_token_accuracy": 0.44827587008476255, "step": 122095 }, { "epoch": 0.12298041680389911, "grad_norm": 12.455719235845594, "learning_rate": 4.934792393267496e-05, "loss": 2.6802, "mean_token_accuracy": 0.3758620709180832, "step": 122100 }, { "epoch": 0.12298545285700328, "grad_norm": 12.10342353058217, "learning_rate": 4.9347834371975754e-05, "loss": 2.6783, "mean_token_accuracy": 0.4137930989265442, "step": 122105 }, { "epoch": 0.12299048891010746, "grad_norm": 10.266020780838815, "learning_rate": 4.9347744805216966e-05, "loss": 2.1128, "mean_token_accuracy": 0.47586206793785096, "step": 122110 }, { "epoch": 0.12299552496321163, "grad_norm": 10.073603447435751, "learning_rate": 4.934765523239863e-05, "loss": 2.1167, "mean_token_accuracy": 0.5034482717514038, "step": 122115 }, { "epoch": 0.1230005610163158, "grad_norm": 10.152533713681779, "learning_rate": 4.934756565352076e-05, "loss": 2.794, "mean_token_accuracy": 0.40526315569877625, "step": 122120 }, { "epoch": 0.12300559706941998, "grad_norm": 10.1068134619733, "learning_rate": 4.9347476068583386e-05, "loss": 2.4023, "mean_token_accuracy": 0.3896551728248596, "step": 122125 }, { "epoch": 0.12301063312252415, "grad_norm": 10.609370120959813, "learning_rate": 4.9347386477586533e-05, "loss": 2.3438, "mean_token_accuracy": 0.4689655125141144, "step": 122130 }, { "epoch": 0.12301566917562833, "grad_norm": 9.742480497713437, "learning_rate": 4.934729688053023e-05, "loss": 2.6343, "mean_token_accuracy": 0.4206896543502808, "step": 122135 }, { "epoch": 0.1230207052287325, "grad_norm": 68.44647623926005, "learning_rate": 4.934720727741449e-05, "loss": 2.9402, "mean_token_accuracy": 0.3965517312288284, "step": 122140 }, { "epoch": 0.12302574128183667, "grad_norm": 10.335090569920583, "learning_rate": 4.934711766823935e-05, "loss": 1.9034, "mean_token_accuracy": 0.5137930989265442, "step": 122145 }, { "epoch": 0.12303077733494085, "grad_norm": 10.942605504454992, "learning_rate": 4.934702805300484e-05, "loss": 2.4326, "mean_token_accuracy": 0.4776769459247589, "step": 122150 }, { "epoch": 0.12303581338804502, "grad_norm": 10.417116363783542, "learning_rate": 4.9346938431710966e-05, "loss": 1.9756, "mean_token_accuracy": 0.4974591672420502, "step": 122155 }, { "epoch": 0.12304084944114918, "grad_norm": 14.272467142322608, "learning_rate": 4.934684880435776e-05, "loss": 2.4885, "mean_token_accuracy": 0.3931034505367279, "step": 122160 }, { "epoch": 0.12304588549425335, "grad_norm": 10.231096807610847, "learning_rate": 4.934675917094525e-05, "loss": 2.567, "mean_token_accuracy": 0.4000000059604645, "step": 122165 }, { "epoch": 0.12305092154735753, "grad_norm": 10.613035074623335, "learning_rate": 4.934666953147347e-05, "loss": 2.4373, "mean_token_accuracy": 0.4172413766384125, "step": 122170 }, { "epoch": 0.1230559576004617, "grad_norm": 11.604897608991482, "learning_rate": 4.934657988594242e-05, "loss": 2.3155, "mean_token_accuracy": 0.44827585816383364, "step": 122175 }, { "epoch": 0.12306099365356588, "grad_norm": 9.442446180596152, "learning_rate": 4.934649023435215e-05, "loss": 2.4477, "mean_token_accuracy": 0.39310344457626345, "step": 122180 }, { "epoch": 0.12306602970667005, "grad_norm": 14.862346094080515, "learning_rate": 4.9346400576702666e-05, "loss": 2.6597, "mean_token_accuracy": 0.3793103456497192, "step": 122185 }, { "epoch": 0.12307106575977422, "grad_norm": 9.237671255463706, "learning_rate": 4.9346310912994004e-05, "loss": 2.5755, "mean_token_accuracy": 0.4, "step": 122190 }, { "epoch": 0.1230761018128784, "grad_norm": 9.426723561084556, "learning_rate": 4.9346221243226184e-05, "loss": 2.4332, "mean_token_accuracy": 0.44827585220336913, "step": 122195 }, { "epoch": 0.12308113786598257, "grad_norm": 11.685295995903461, "learning_rate": 4.934613156739924e-05, "loss": 2.5591, "mean_token_accuracy": 0.4517241358757019, "step": 122200 }, { "epoch": 0.12308617391908674, "grad_norm": 9.24584563280647, "learning_rate": 4.93460418855132e-05, "loss": 2.0428, "mean_token_accuracy": 0.4862069010734558, "step": 122205 }, { "epoch": 0.12309120997219092, "grad_norm": 9.845804357507516, "learning_rate": 4.934595219756805e-05, "loss": 2.4044, "mean_token_accuracy": 0.43793103098869324, "step": 122210 }, { "epoch": 0.12309624602529509, "grad_norm": 9.59392011588664, "learning_rate": 4.934586250356386e-05, "loss": 2.4527, "mean_token_accuracy": 0.3793103516101837, "step": 122215 }, { "epoch": 0.12310128207839927, "grad_norm": 10.38199541003397, "learning_rate": 4.934577280350064e-05, "loss": 2.5265, "mean_token_accuracy": 0.4517241358757019, "step": 122220 }, { "epoch": 0.12310631813150344, "grad_norm": 11.495513406816896, "learning_rate": 4.934568309737841e-05, "loss": 2.3304, "mean_token_accuracy": 0.42413792610168455, "step": 122225 }, { "epoch": 0.1231113541846076, "grad_norm": 11.851411438700698, "learning_rate": 4.93455933851972e-05, "loss": 2.0192, "mean_token_accuracy": 0.5034482777118683, "step": 122230 }, { "epoch": 0.12311639023771177, "grad_norm": 14.31969473723742, "learning_rate": 4.9345503666957035e-05, "loss": 2.5795, "mean_token_accuracy": 0.4137930989265442, "step": 122235 }, { "epoch": 0.12312142629081595, "grad_norm": 10.698692759965065, "learning_rate": 4.934541394265794e-05, "loss": 3.2477, "mean_token_accuracy": 0.37241379618644715, "step": 122240 }, { "epoch": 0.12312646234392012, "grad_norm": 10.50615280000507, "learning_rate": 4.9345324212299934e-05, "loss": 2.1866, "mean_token_accuracy": 0.4758620738983154, "step": 122245 }, { "epoch": 0.1231314983970243, "grad_norm": 11.09216276946314, "learning_rate": 4.934523447588304e-05, "loss": 2.3951, "mean_token_accuracy": 0.41034482717514037, "step": 122250 }, { "epoch": 0.12313653445012847, "grad_norm": 13.996994909452836, "learning_rate": 4.9345144733407296e-05, "loss": 2.1654, "mean_token_accuracy": 0.4620689630508423, "step": 122255 }, { "epoch": 0.12314157050323264, "grad_norm": 10.064488265327634, "learning_rate": 4.934505498487271e-05, "loss": 2.2446, "mean_token_accuracy": 0.4482758641242981, "step": 122260 }, { "epoch": 0.12314660655633682, "grad_norm": 12.082994841482064, "learning_rate": 4.934496523027933e-05, "loss": 2.8551, "mean_token_accuracy": 0.37586206793785093, "step": 122265 }, { "epoch": 0.12315164260944099, "grad_norm": 10.886210630920107, "learning_rate": 4.9344875469627166e-05, "loss": 2.3093, "mean_token_accuracy": 0.4551724076271057, "step": 122270 }, { "epoch": 0.12315667866254516, "grad_norm": 10.62891950233889, "learning_rate": 4.934478570291624e-05, "loss": 2.4382, "mean_token_accuracy": 0.40139141082763674, "step": 122275 }, { "epoch": 0.12316171471564934, "grad_norm": 11.946527016663094, "learning_rate": 4.934469593014658e-05, "loss": 2.5413, "mean_token_accuracy": 0.45172414779663084, "step": 122280 }, { "epoch": 0.12316675076875351, "grad_norm": 10.377185977853642, "learning_rate": 4.934460615131821e-05, "loss": 2.3462, "mean_token_accuracy": 0.41034482717514037, "step": 122285 }, { "epoch": 0.12317178682185768, "grad_norm": 8.429718218233317, "learning_rate": 4.934451636643116e-05, "loss": 2.0345, "mean_token_accuracy": 0.49655171632766726, "step": 122290 }, { "epoch": 0.12317682287496186, "grad_norm": 11.707822099177465, "learning_rate": 4.934442657548546e-05, "loss": 2.1893, "mean_token_accuracy": 0.48275862336158754, "step": 122295 }, { "epoch": 0.12318185892806602, "grad_norm": 10.599966945278034, "learning_rate": 4.9344336778481114e-05, "loss": 2.2445, "mean_token_accuracy": 0.44827585816383364, "step": 122300 }, { "epoch": 0.12318689498117019, "grad_norm": 10.3228864071613, "learning_rate": 4.9344246975418165e-05, "loss": 2.4709, "mean_token_accuracy": 0.4137930989265442, "step": 122305 }, { "epoch": 0.12319193103427437, "grad_norm": 19.561972299995883, "learning_rate": 4.934415716629663e-05, "loss": 2.6427, "mean_token_accuracy": 0.4670901417732239, "step": 122310 }, { "epoch": 0.12319696708737854, "grad_norm": 12.098653544273036, "learning_rate": 4.934406735111653e-05, "loss": 2.6018, "mean_token_accuracy": 0.37241379022598264, "step": 122315 }, { "epoch": 0.12320200314048271, "grad_norm": 13.267338246173694, "learning_rate": 4.934397752987791e-05, "loss": 2.8484, "mean_token_accuracy": 0.3999999940395355, "step": 122320 }, { "epoch": 0.12320703919358689, "grad_norm": 11.88545734147257, "learning_rate": 4.934388770258078e-05, "loss": 2.6183, "mean_token_accuracy": 0.4000000059604645, "step": 122325 }, { "epoch": 0.12321207524669106, "grad_norm": 9.549587396754854, "learning_rate": 4.934379786922516e-05, "loss": 2.3662, "mean_token_accuracy": 0.458620685338974, "step": 122330 }, { "epoch": 0.12321711129979523, "grad_norm": 10.598679680548079, "learning_rate": 4.934370802981109e-05, "loss": 2.2729, "mean_token_accuracy": 0.441379314661026, "step": 122335 }, { "epoch": 0.12322214735289941, "grad_norm": 10.873357950340468, "learning_rate": 4.9343618184338576e-05, "loss": 2.3628, "mean_token_accuracy": 0.4257108271121979, "step": 122340 }, { "epoch": 0.12322718340600358, "grad_norm": 8.808599840763735, "learning_rate": 4.934352833280766e-05, "loss": 2.5232, "mean_token_accuracy": 0.39655171930789945, "step": 122345 }, { "epoch": 0.12323221945910776, "grad_norm": 11.236695446009245, "learning_rate": 4.934343847521835e-05, "loss": 2.6449, "mean_token_accuracy": 0.4310344815254211, "step": 122350 }, { "epoch": 0.12323725551221193, "grad_norm": 9.154052568938097, "learning_rate": 4.9343348611570694e-05, "loss": 2.4198, "mean_token_accuracy": 0.42068966031074523, "step": 122355 }, { "epoch": 0.1232422915653161, "grad_norm": 12.884911901827888, "learning_rate": 4.9343258741864704e-05, "loss": 2.7091, "mean_token_accuracy": 0.3551724076271057, "step": 122360 }, { "epoch": 0.12324732761842028, "grad_norm": 9.59221002167568, "learning_rate": 4.934316886610039e-05, "loss": 2.3734, "mean_token_accuracy": 0.4551724135875702, "step": 122365 }, { "epoch": 0.12325236367152444, "grad_norm": 10.957045855982939, "learning_rate": 4.93430789842778e-05, "loss": 2.7854, "mean_token_accuracy": 0.41034482419490814, "step": 122370 }, { "epoch": 0.12325739972462861, "grad_norm": 8.685332501064916, "learning_rate": 4.934298909639695e-05, "loss": 2.3265, "mean_token_accuracy": 0.44827587008476255, "step": 122375 }, { "epoch": 0.12326243577773278, "grad_norm": 12.684931364529685, "learning_rate": 4.9342899202457865e-05, "loss": 2.6232, "mean_token_accuracy": 0.40344828367233276, "step": 122380 }, { "epoch": 0.12326747183083696, "grad_norm": 10.700377474508583, "learning_rate": 4.9342809302460574e-05, "loss": 2.478, "mean_token_accuracy": 0.39310344457626345, "step": 122385 }, { "epoch": 0.12327250788394113, "grad_norm": 10.669205270590549, "learning_rate": 4.934271939640509e-05, "loss": 2.4491, "mean_token_accuracy": 0.44827585220336913, "step": 122390 }, { "epoch": 0.1232775439370453, "grad_norm": 8.964201599565843, "learning_rate": 4.934262948429146e-05, "loss": 2.4483, "mean_token_accuracy": 0.4537809997797012, "step": 122395 }, { "epoch": 0.12328257999014948, "grad_norm": 10.237126865968039, "learning_rate": 4.9342539566119685e-05, "loss": 2.1953, "mean_token_accuracy": 0.46551724076271056, "step": 122400 }, { "epoch": 0.12328761604325365, "grad_norm": 10.495606421078685, "learning_rate": 4.93424496418898e-05, "loss": 2.2949, "mean_token_accuracy": 0.43793103396892546, "step": 122405 }, { "epoch": 0.12329265209635783, "grad_norm": 11.78599740399031, "learning_rate": 4.934235971160183e-05, "loss": 2.7297, "mean_token_accuracy": 0.42068964838981626, "step": 122410 }, { "epoch": 0.123297688149462, "grad_norm": 11.006839787611197, "learning_rate": 4.934226977525581e-05, "loss": 2.2737, "mean_token_accuracy": 0.4551724135875702, "step": 122415 }, { "epoch": 0.12330272420256617, "grad_norm": 8.10602968583033, "learning_rate": 4.934217983285175e-05, "loss": 2.431, "mean_token_accuracy": 0.44609800577163694, "step": 122420 }, { "epoch": 0.12330776025567035, "grad_norm": 10.681590818867242, "learning_rate": 4.934208988438967e-05, "loss": 2.5981, "mean_token_accuracy": 0.4000000059604645, "step": 122425 }, { "epoch": 0.12331279630877452, "grad_norm": 9.501809449986075, "learning_rate": 4.934199992986961e-05, "loss": 2.1827, "mean_token_accuracy": 0.4862069010734558, "step": 122430 }, { "epoch": 0.1233178323618787, "grad_norm": 8.0031911329358, "learning_rate": 4.93419099692916e-05, "loss": 2.1435, "mean_token_accuracy": 0.4379310250282288, "step": 122435 }, { "epoch": 0.12332286841498286, "grad_norm": 18.725685310228762, "learning_rate": 4.934182000265564e-05, "loss": 2.4854, "mean_token_accuracy": 0.4448275864124298, "step": 122440 }, { "epoch": 0.12332790446808703, "grad_norm": 10.02171009051166, "learning_rate": 4.934173002996179e-05, "loss": 2.0844, "mean_token_accuracy": 0.4344827651977539, "step": 122445 }, { "epoch": 0.1233329405211912, "grad_norm": 15.272123740620883, "learning_rate": 4.934164005121004e-05, "loss": 2.5708, "mean_token_accuracy": 0.37241379022598264, "step": 122450 }, { "epoch": 0.12333797657429538, "grad_norm": 9.492198916098461, "learning_rate": 4.934155006640042e-05, "loss": 2.1095, "mean_token_accuracy": 0.43103447556495667, "step": 122455 }, { "epoch": 0.12334301262739955, "grad_norm": 11.532721926617688, "learning_rate": 4.934146007553298e-05, "loss": 2.8655, "mean_token_accuracy": 0.34482758343219755, "step": 122460 }, { "epoch": 0.12334804868050372, "grad_norm": 9.475005782651309, "learning_rate": 4.934137007860773e-05, "loss": 2.2627, "mean_token_accuracy": 0.41379311084747317, "step": 122465 }, { "epoch": 0.1233530847336079, "grad_norm": 14.421185428687581, "learning_rate": 4.934128007562469e-05, "loss": 2.4155, "mean_token_accuracy": 0.4620689630508423, "step": 122470 }, { "epoch": 0.12335812078671207, "grad_norm": 9.488506030382721, "learning_rate": 4.934119006658388e-05, "loss": 1.9484, "mean_token_accuracy": 0.47931033968925474, "step": 122475 }, { "epoch": 0.12336315683981625, "grad_norm": 11.20149929380574, "learning_rate": 4.9341100051485354e-05, "loss": 2.9896, "mean_token_accuracy": 0.3241379290819168, "step": 122480 }, { "epoch": 0.12336819289292042, "grad_norm": 10.523035609920209, "learning_rate": 4.93410100303291e-05, "loss": 2.3309, "mean_token_accuracy": 0.41379310488700866, "step": 122485 }, { "epoch": 0.1233732289460246, "grad_norm": 10.057987553972398, "learning_rate": 4.9340920003115165e-05, "loss": 2.0617, "mean_token_accuracy": 0.4413793087005615, "step": 122490 }, { "epoch": 0.12337826499912877, "grad_norm": 30.662806239973115, "learning_rate": 4.9340829969843576e-05, "loss": 2.5439, "mean_token_accuracy": 0.4586206912994385, "step": 122495 }, { "epoch": 0.12338330105223294, "grad_norm": 10.517830905827733, "learning_rate": 4.9340739930514346e-05, "loss": 2.6503, "mean_token_accuracy": 0.37241379618644715, "step": 122500 }, { "epoch": 0.12338833710533711, "grad_norm": 12.188794124120367, "learning_rate": 4.934064988512751e-05, "loss": 2.4389, "mean_token_accuracy": 0.4551724135875702, "step": 122505 }, { "epoch": 0.12339337315844127, "grad_norm": 14.063493944951889, "learning_rate": 4.934055983368308e-05, "loss": 2.9226, "mean_token_accuracy": 0.42068966031074523, "step": 122510 }, { "epoch": 0.12339840921154545, "grad_norm": 11.751157732192395, "learning_rate": 4.9340469776181096e-05, "loss": 2.2274, "mean_token_accuracy": 0.47931033968925474, "step": 122515 }, { "epoch": 0.12340344526464962, "grad_norm": 14.59810801429352, "learning_rate": 4.9340379712621573e-05, "loss": 2.8038, "mean_token_accuracy": 0.3931034505367279, "step": 122520 }, { "epoch": 0.1234084813177538, "grad_norm": 9.975519892782028, "learning_rate": 4.9340289643004546e-05, "loss": 2.3315, "mean_token_accuracy": 0.4551724135875702, "step": 122525 }, { "epoch": 0.12341351737085797, "grad_norm": 9.167926122192553, "learning_rate": 4.9340199567330027e-05, "loss": 1.7595, "mean_token_accuracy": 0.517241370677948, "step": 122530 }, { "epoch": 0.12341855342396214, "grad_norm": 10.63017394951189, "learning_rate": 4.934010948559804e-05, "loss": 2.6677, "mean_token_accuracy": 0.3517241358757019, "step": 122535 }, { "epoch": 0.12342358947706632, "grad_norm": 10.82094683116335, "learning_rate": 4.934001939780863e-05, "loss": 2.2388, "mean_token_accuracy": 0.4379310369491577, "step": 122540 }, { "epoch": 0.12342862553017049, "grad_norm": 8.789235422813926, "learning_rate": 4.93399293039618e-05, "loss": 2.9268, "mean_token_accuracy": 0.3776769578456879, "step": 122545 }, { "epoch": 0.12343366158327466, "grad_norm": 9.11318163434481, "learning_rate": 4.9339839204057594e-05, "loss": 2.3913, "mean_token_accuracy": 0.3931034505367279, "step": 122550 }, { "epoch": 0.12343869763637884, "grad_norm": 10.645606718158037, "learning_rate": 4.933974909809602e-05, "loss": 2.4794, "mean_token_accuracy": 0.4034482777118683, "step": 122555 }, { "epoch": 0.12344373368948301, "grad_norm": 12.97868220946382, "learning_rate": 4.9339658986077115e-05, "loss": 2.6555, "mean_token_accuracy": 0.41034482717514037, "step": 122560 }, { "epoch": 0.12344876974258719, "grad_norm": 11.268035981086955, "learning_rate": 4.93395688680009e-05, "loss": 2.6434, "mean_token_accuracy": 0.3793103456497192, "step": 122565 }, { "epoch": 0.12345380579569136, "grad_norm": 10.363461799977566, "learning_rate": 4.933947874386739e-05, "loss": 2.1606, "mean_token_accuracy": 0.46551724076271056, "step": 122570 }, { "epoch": 0.12345884184879553, "grad_norm": 9.597729992356948, "learning_rate": 4.933938861367662e-05, "loss": 2.5874, "mean_token_accuracy": 0.43793103098869324, "step": 122575 }, { "epoch": 0.1234638779018997, "grad_norm": 10.214094880742476, "learning_rate": 4.933929847742862e-05, "loss": 2.0039, "mean_token_accuracy": 0.4551724135875702, "step": 122580 }, { "epoch": 0.12346891395500387, "grad_norm": 21.97856787739723, "learning_rate": 4.933920833512341e-05, "loss": 2.4996, "mean_token_accuracy": 0.4517241299152374, "step": 122585 }, { "epoch": 0.12347395000810804, "grad_norm": 9.073023721521006, "learning_rate": 4.9339118186761006e-05, "loss": 2.0413, "mean_token_accuracy": 0.4988505721092224, "step": 122590 }, { "epoch": 0.12347898606121221, "grad_norm": 11.45995196448123, "learning_rate": 4.9339028032341446e-05, "loss": 2.4562, "mean_token_accuracy": 0.42758620381355283, "step": 122595 }, { "epoch": 0.12348402211431639, "grad_norm": 10.166830509264564, "learning_rate": 4.933893787186475e-05, "loss": 2.0681, "mean_token_accuracy": 0.4551724135875702, "step": 122600 }, { "epoch": 0.12348905816742056, "grad_norm": 15.125531378775012, "learning_rate": 4.933884770533094e-05, "loss": 2.8791, "mean_token_accuracy": 0.3551724135875702, "step": 122605 }, { "epoch": 0.12349409422052474, "grad_norm": 11.559263634711028, "learning_rate": 4.9338757532740056e-05, "loss": 2.833, "mean_token_accuracy": 0.358620685338974, "step": 122610 }, { "epoch": 0.12349913027362891, "grad_norm": 8.377145186880623, "learning_rate": 4.93386673540921e-05, "loss": 2.0139, "mean_token_accuracy": 0.5034482777118683, "step": 122615 }, { "epoch": 0.12350416632673308, "grad_norm": 14.01875907962331, "learning_rate": 4.9338577169387114e-05, "loss": 2.4491, "mean_token_accuracy": 0.4620689630508423, "step": 122620 }, { "epoch": 0.12350920237983726, "grad_norm": 9.439122293341352, "learning_rate": 4.933848697862511e-05, "loss": 2.6415, "mean_token_accuracy": 0.3310344755649567, "step": 122625 }, { "epoch": 0.12351423843294143, "grad_norm": 10.854702700968856, "learning_rate": 4.933839678180612e-05, "loss": 2.2741, "mean_token_accuracy": 0.4413793087005615, "step": 122630 }, { "epoch": 0.1235192744860456, "grad_norm": 11.606268337319495, "learning_rate": 4.933830657893018e-05, "loss": 2.6366, "mean_token_accuracy": 0.3931034505367279, "step": 122635 }, { "epoch": 0.12352431053914978, "grad_norm": 11.934936294638902, "learning_rate": 4.9338216369997305e-05, "loss": 2.3036, "mean_token_accuracy": 0.42758620381355283, "step": 122640 }, { "epoch": 0.12352934659225395, "grad_norm": 10.6581479301926, "learning_rate": 4.93381261550075e-05, "loss": 2.1824, "mean_token_accuracy": 0.5034482717514038, "step": 122645 }, { "epoch": 0.12353438264535811, "grad_norm": 10.98119753348063, "learning_rate": 4.933803593396083e-05, "loss": 2.2313, "mean_token_accuracy": 0.4517241358757019, "step": 122650 }, { "epoch": 0.12353941869846229, "grad_norm": 9.386481536184336, "learning_rate": 4.9337945706857286e-05, "loss": 2.2538, "mean_token_accuracy": 0.44482758045196535, "step": 122655 }, { "epoch": 0.12354445475156646, "grad_norm": 8.275189126747991, "learning_rate": 4.933785547369691e-05, "loss": 2.0559, "mean_token_accuracy": 0.4758620738983154, "step": 122660 }, { "epoch": 0.12354949080467063, "grad_norm": 12.947507515004895, "learning_rate": 4.9337765234479724e-05, "loss": 2.4047, "mean_token_accuracy": 0.4068965554237366, "step": 122665 }, { "epoch": 0.12355452685777481, "grad_norm": 11.591014604155436, "learning_rate": 4.9337674989205756e-05, "loss": 2.6539, "mean_token_accuracy": 0.3793103456497192, "step": 122670 }, { "epoch": 0.12355956291087898, "grad_norm": 11.831730717192517, "learning_rate": 4.933758473787502e-05, "loss": 2.614, "mean_token_accuracy": 0.3517241358757019, "step": 122675 }, { "epoch": 0.12356459896398315, "grad_norm": 8.255610433265508, "learning_rate": 4.933749448048755e-05, "loss": 2.0614, "mean_token_accuracy": 0.5034482836723327, "step": 122680 }, { "epoch": 0.12356963501708733, "grad_norm": 9.238959475696083, "learning_rate": 4.933740421704337e-05, "loss": 3.0759, "mean_token_accuracy": 0.3758620619773865, "step": 122685 }, { "epoch": 0.1235746710701915, "grad_norm": 9.687465701072114, "learning_rate": 4.933731394754251e-05, "loss": 2.4747, "mean_token_accuracy": 0.40344826877117157, "step": 122690 }, { "epoch": 0.12357970712329568, "grad_norm": 10.552977557162263, "learning_rate": 4.933722367198499e-05, "loss": 2.3345, "mean_token_accuracy": 0.4367816150188446, "step": 122695 }, { "epoch": 0.12358474317639985, "grad_norm": 9.247735368272822, "learning_rate": 4.9337133390370823e-05, "loss": 2.3351, "mean_token_accuracy": 0.43448275327682495, "step": 122700 }, { "epoch": 0.12358977922950402, "grad_norm": 9.510902667002675, "learning_rate": 4.933704310270005e-05, "loss": 2.187, "mean_token_accuracy": 0.458620685338974, "step": 122705 }, { "epoch": 0.1235948152826082, "grad_norm": 9.1942552925599, "learning_rate": 4.9336952808972695e-05, "loss": 2.1377, "mean_token_accuracy": 0.5034482717514038, "step": 122710 }, { "epoch": 0.12359985133571237, "grad_norm": 12.30740033421167, "learning_rate": 4.933686250918879e-05, "loss": 2.0976, "mean_token_accuracy": 0.5, "step": 122715 }, { "epoch": 0.12360488738881653, "grad_norm": 10.833060155889019, "learning_rate": 4.933677220334833e-05, "loss": 2.1524, "mean_token_accuracy": 0.441379314661026, "step": 122720 }, { "epoch": 0.1236099234419207, "grad_norm": 9.470699214866597, "learning_rate": 4.9336681891451375e-05, "loss": 2.2198, "mean_token_accuracy": 0.4413793087005615, "step": 122725 }, { "epoch": 0.12361495949502488, "grad_norm": 11.043474440367223, "learning_rate": 4.933659157349793e-05, "loss": 2.4286, "mean_token_accuracy": 0.38112522959709166, "step": 122730 }, { "epoch": 0.12361999554812905, "grad_norm": 11.205923135653073, "learning_rate": 4.933650124948801e-05, "loss": 2.5146, "mean_token_accuracy": 0.3965517282485962, "step": 122735 }, { "epoch": 0.12362503160123323, "grad_norm": 13.127696901634849, "learning_rate": 4.9336410919421675e-05, "loss": 2.3849, "mean_token_accuracy": 0.475862056016922, "step": 122740 }, { "epoch": 0.1236300676543374, "grad_norm": 12.189552039999455, "learning_rate": 4.933632058329893e-05, "loss": 2.5459, "mean_token_accuracy": 0.4344827592372894, "step": 122745 }, { "epoch": 0.12363510370744157, "grad_norm": 11.750928280873806, "learning_rate": 4.933623024111979e-05, "loss": 2.7862, "mean_token_accuracy": 0.36896551847457887, "step": 122750 }, { "epoch": 0.12364013976054575, "grad_norm": 11.644090559075515, "learning_rate": 4.9336139892884294e-05, "loss": 2.2513, "mean_token_accuracy": 0.4655172348022461, "step": 122755 }, { "epoch": 0.12364517581364992, "grad_norm": 10.814084565507269, "learning_rate": 4.9336049538592456e-05, "loss": 2.3283, "mean_token_accuracy": 0.4620689630508423, "step": 122760 }, { "epoch": 0.1236502118667541, "grad_norm": 12.056941159123008, "learning_rate": 4.933595917824431e-05, "loss": 2.4188, "mean_token_accuracy": 0.4517241418361664, "step": 122765 }, { "epoch": 0.12365524791985827, "grad_norm": 8.035789403161033, "learning_rate": 4.933586881183989e-05, "loss": 2.117, "mean_token_accuracy": 0.47931034564971925, "step": 122770 }, { "epoch": 0.12366028397296244, "grad_norm": 9.921849775718766, "learning_rate": 4.93357784393792e-05, "loss": 2.5015, "mean_token_accuracy": 0.43448275327682495, "step": 122775 }, { "epoch": 0.12366532002606662, "grad_norm": 12.974777475857632, "learning_rate": 4.933568806086228e-05, "loss": 2.6382, "mean_token_accuracy": 0.3931034505367279, "step": 122780 }, { "epoch": 0.12367035607917079, "grad_norm": 9.115591857408452, "learning_rate": 4.933559767628915e-05, "loss": 2.1798, "mean_token_accuracy": 0.47586206197738645, "step": 122785 }, { "epoch": 0.12367539213227495, "grad_norm": 12.00099751559129, "learning_rate": 4.9335507285659844e-05, "loss": 2.3544, "mean_token_accuracy": 0.4068965494632721, "step": 122790 }, { "epoch": 0.12368042818537912, "grad_norm": 12.223195474719459, "learning_rate": 4.933541688897436e-05, "loss": 2.5474, "mean_token_accuracy": 0.4114531993865967, "step": 122795 }, { "epoch": 0.1236854642384833, "grad_norm": 8.883663320839975, "learning_rate": 4.933532648623276e-05, "loss": 1.9578, "mean_token_accuracy": 0.4793103575706482, "step": 122800 }, { "epoch": 0.12369050029158747, "grad_norm": 9.001479448267245, "learning_rate": 4.933523607743504e-05, "loss": 2.163, "mean_token_accuracy": 0.4517241358757019, "step": 122805 }, { "epoch": 0.12369553634469165, "grad_norm": 10.619966449266244, "learning_rate": 4.933514566258124e-05, "loss": 2.3223, "mean_token_accuracy": 0.38965516686439516, "step": 122810 }, { "epoch": 0.12370057239779582, "grad_norm": 9.622159652964532, "learning_rate": 4.933505524167138e-05, "loss": 2.0597, "mean_token_accuracy": 0.4517241299152374, "step": 122815 }, { "epoch": 0.12370560845089999, "grad_norm": 14.221265806155078, "learning_rate": 4.9334964814705483e-05, "loss": 2.8977, "mean_token_accuracy": 0.3896551728248596, "step": 122820 }, { "epoch": 0.12371064450400417, "grad_norm": 10.585106992767665, "learning_rate": 4.933487438168358e-05, "loss": 2.8013, "mean_token_accuracy": 0.4034482777118683, "step": 122825 }, { "epoch": 0.12371568055710834, "grad_norm": 9.244865405063791, "learning_rate": 4.933478394260569e-05, "loss": 2.3691, "mean_token_accuracy": 0.43103448748588563, "step": 122830 }, { "epoch": 0.12372071661021251, "grad_norm": 8.728101480524408, "learning_rate": 4.933469349747184e-05, "loss": 2.0607, "mean_token_accuracy": 0.4862068951129913, "step": 122835 }, { "epoch": 0.12372575266331669, "grad_norm": 11.918679743896949, "learning_rate": 4.933460304628207e-05, "loss": 2.4647, "mean_token_accuracy": 0.42413792610168455, "step": 122840 }, { "epoch": 0.12373078871642086, "grad_norm": 14.893946559527697, "learning_rate": 4.9334512589036374e-05, "loss": 2.8687, "mean_token_accuracy": 0.3896551728248596, "step": 122845 }, { "epoch": 0.12373582476952504, "grad_norm": 13.07274882812411, "learning_rate": 4.9334422125734804e-05, "loss": 2.6378, "mean_token_accuracy": 0.3862069010734558, "step": 122850 }, { "epoch": 0.12374086082262921, "grad_norm": 9.309205331551194, "learning_rate": 4.933433165637737e-05, "loss": 2.2724, "mean_token_accuracy": 0.43793103098869324, "step": 122855 }, { "epoch": 0.12374589687573337, "grad_norm": 9.830650399023668, "learning_rate": 4.933424118096411e-05, "loss": 2.5416, "mean_token_accuracy": 0.425952810049057, "step": 122860 }, { "epoch": 0.12375093292883754, "grad_norm": 11.682765027821395, "learning_rate": 4.9334150699495044e-05, "loss": 2.6916, "mean_token_accuracy": 0.3379310369491577, "step": 122865 }, { "epoch": 0.12375596898194172, "grad_norm": 12.274278011156381, "learning_rate": 4.933406021197018e-05, "loss": 2.666, "mean_token_accuracy": 0.4379310369491577, "step": 122870 }, { "epoch": 0.12376100503504589, "grad_norm": 9.517303409525839, "learning_rate": 4.933396971838957e-05, "loss": 1.9496, "mean_token_accuracy": 0.5034482777118683, "step": 122875 }, { "epoch": 0.12376604108815006, "grad_norm": 11.183388915112685, "learning_rate": 4.933387921875322e-05, "loss": 2.171, "mean_token_accuracy": 0.4137930989265442, "step": 122880 }, { "epoch": 0.12377107714125424, "grad_norm": 10.102851830475673, "learning_rate": 4.933378871306116e-05, "loss": 2.4229, "mean_token_accuracy": 0.4068965494632721, "step": 122885 }, { "epoch": 0.12377611319435841, "grad_norm": 10.549116762332543, "learning_rate": 4.9333698201313426e-05, "loss": 2.6748, "mean_token_accuracy": 0.3655172407627106, "step": 122890 }, { "epoch": 0.12378114924746259, "grad_norm": 10.412924194872431, "learning_rate": 4.933360768351003e-05, "loss": 2.6036, "mean_token_accuracy": 0.4517241418361664, "step": 122895 }, { "epoch": 0.12378618530056676, "grad_norm": 9.597784032188406, "learning_rate": 4.9333517159651004e-05, "loss": 2.2233, "mean_token_accuracy": 0.46896551847457885, "step": 122900 }, { "epoch": 0.12379122135367093, "grad_norm": 9.62977868901446, "learning_rate": 4.933342662973637e-05, "loss": 2.1076, "mean_token_accuracy": 0.482758629322052, "step": 122905 }, { "epoch": 0.1237962574067751, "grad_norm": 14.18589321538855, "learning_rate": 4.933333609376616e-05, "loss": 2.6625, "mean_token_accuracy": 0.3862068891525269, "step": 122910 }, { "epoch": 0.12380129345987928, "grad_norm": 10.620738254493059, "learning_rate": 4.933324555174038e-05, "loss": 2.3512, "mean_token_accuracy": 0.4206896543502808, "step": 122915 }, { "epoch": 0.12380632951298345, "grad_norm": 12.553274848792418, "learning_rate": 4.9333155003659075e-05, "loss": 2.7671, "mean_token_accuracy": 0.38620689511299133, "step": 122920 }, { "epoch": 0.12381136556608763, "grad_norm": 11.177973919066197, "learning_rate": 4.933306444952227e-05, "loss": 2.3321, "mean_token_accuracy": 0.4137930989265442, "step": 122925 }, { "epoch": 0.12381640161919179, "grad_norm": 15.734728497837217, "learning_rate": 4.933297388932997e-05, "loss": 2.4784, "mean_token_accuracy": 0.4137930989265442, "step": 122930 }, { "epoch": 0.12382143767229596, "grad_norm": 18.686827346883888, "learning_rate": 4.9332883323082215e-05, "loss": 2.2393, "mean_token_accuracy": 0.4206896543502808, "step": 122935 }, { "epoch": 0.12382647372540014, "grad_norm": 11.696466187630268, "learning_rate": 4.933279275077903e-05, "loss": 2.4513, "mean_token_accuracy": 0.42413793206214906, "step": 122940 }, { "epoch": 0.12383150977850431, "grad_norm": 14.387234670852038, "learning_rate": 4.933270217242044e-05, "loss": 2.0492, "mean_token_accuracy": 0.4758620738983154, "step": 122945 }, { "epoch": 0.12383654583160848, "grad_norm": 9.64707468667973, "learning_rate": 4.933261158800647e-05, "loss": 2.4413, "mean_token_accuracy": 0.39655172228813174, "step": 122950 }, { "epoch": 0.12384158188471266, "grad_norm": 12.283068189896253, "learning_rate": 4.9332520997537147e-05, "loss": 2.5172, "mean_token_accuracy": 0.4034482717514038, "step": 122955 }, { "epoch": 0.12384661793781683, "grad_norm": 9.61040748359341, "learning_rate": 4.933243040101248e-05, "loss": 2.8036, "mean_token_accuracy": 0.4034482717514038, "step": 122960 }, { "epoch": 0.123851653990921, "grad_norm": 9.911546105077123, "learning_rate": 4.933233979843252e-05, "loss": 2.1647, "mean_token_accuracy": 0.45862067937850953, "step": 122965 }, { "epoch": 0.12385669004402518, "grad_norm": 10.78625900590272, "learning_rate": 4.933224918979727e-05, "loss": 2.1092, "mean_token_accuracy": 0.4551724135875702, "step": 122970 }, { "epoch": 0.12386172609712935, "grad_norm": 10.395474681117163, "learning_rate": 4.933215857510677e-05, "loss": 2.5441, "mean_token_accuracy": 0.4172413766384125, "step": 122975 }, { "epoch": 0.12386676215023353, "grad_norm": 8.471404191387302, "learning_rate": 4.933206795436104e-05, "loss": 2.1045, "mean_token_accuracy": 0.4620689630508423, "step": 122980 }, { "epoch": 0.1238717982033377, "grad_norm": 10.138903641802825, "learning_rate": 4.93319773275601e-05, "loss": 2.6696, "mean_token_accuracy": 0.34827586114406583, "step": 122985 }, { "epoch": 0.12387683425644187, "grad_norm": 9.879582244721448, "learning_rate": 4.933188669470399e-05, "loss": 2.0361, "mean_token_accuracy": 0.5137930989265442, "step": 122990 }, { "epoch": 0.12388187030954605, "grad_norm": 9.295481233671834, "learning_rate": 4.933179605579271e-05, "loss": 2.3187, "mean_token_accuracy": 0.42758620977401735, "step": 122995 }, { "epoch": 0.1238869063626502, "grad_norm": 11.421037056117518, "learning_rate": 4.933170541082631e-05, "loss": 2.6755, "mean_token_accuracy": 0.3482758641242981, "step": 123000 }, { "epoch": 0.12389194241575438, "grad_norm": 10.437684618484582, "learning_rate": 4.93316147598048e-05, "loss": 2.3647, "mean_token_accuracy": 0.4413793087005615, "step": 123005 }, { "epoch": 0.12389697846885855, "grad_norm": 10.476839742944929, "learning_rate": 4.933152410272821e-05, "loss": 2.7695, "mean_token_accuracy": 0.41034482717514037, "step": 123010 }, { "epoch": 0.12390201452196273, "grad_norm": 10.838014396490431, "learning_rate": 4.933143343959657e-05, "loss": 2.1852, "mean_token_accuracy": 0.4379310369491577, "step": 123015 }, { "epoch": 0.1239070505750669, "grad_norm": 11.5087287255968, "learning_rate": 4.9331342770409894e-05, "loss": 2.3872, "mean_token_accuracy": 0.4068965494632721, "step": 123020 }, { "epoch": 0.12391208662817108, "grad_norm": 9.389222967909676, "learning_rate": 4.933125209516822e-05, "loss": 2.7633, "mean_token_accuracy": 0.408771938085556, "step": 123025 }, { "epoch": 0.12391712268127525, "grad_norm": 10.308981962959685, "learning_rate": 4.933116141387156e-05, "loss": 2.3194, "mean_token_accuracy": 0.42413793206214906, "step": 123030 }, { "epoch": 0.12392215873437942, "grad_norm": 9.561279221556552, "learning_rate": 4.9331070726519956e-05, "loss": 2.9644, "mean_token_accuracy": 0.43448275327682495, "step": 123035 }, { "epoch": 0.1239271947874836, "grad_norm": 10.0573677422884, "learning_rate": 4.933098003311341e-05, "loss": 2.4854, "mean_token_accuracy": 0.42413793206214906, "step": 123040 }, { "epoch": 0.12393223084058777, "grad_norm": 10.431477206753401, "learning_rate": 4.933088933365197e-05, "loss": 2.5239, "mean_token_accuracy": 0.37241379022598264, "step": 123045 }, { "epoch": 0.12393726689369194, "grad_norm": 10.867068217849392, "learning_rate": 4.933079862813565e-05, "loss": 2.0881, "mean_token_accuracy": 0.5172413647174835, "step": 123050 }, { "epoch": 0.12394230294679612, "grad_norm": 9.542101754828474, "learning_rate": 4.933070791656448e-05, "loss": 2.2698, "mean_token_accuracy": 0.47931033968925474, "step": 123055 }, { "epoch": 0.12394733899990029, "grad_norm": 9.421025680142774, "learning_rate": 4.933061719893848e-05, "loss": 2.5452, "mean_token_accuracy": 0.44137930274009707, "step": 123060 }, { "epoch": 0.12395237505300447, "grad_norm": 11.830683477115663, "learning_rate": 4.9330526475257674e-05, "loss": 2.4479, "mean_token_accuracy": 0.3655172407627106, "step": 123065 }, { "epoch": 0.12395741110610863, "grad_norm": 8.705518853903019, "learning_rate": 4.9330435745522086e-05, "loss": 2.5622, "mean_token_accuracy": 0.4517241358757019, "step": 123070 }, { "epoch": 0.1239624471592128, "grad_norm": 9.681590034965595, "learning_rate": 4.933034500973175e-05, "loss": 2.1431, "mean_token_accuracy": 0.4655172348022461, "step": 123075 }, { "epoch": 0.12396748321231697, "grad_norm": 11.509248704375837, "learning_rate": 4.9330254267886684e-05, "loss": 2.4454, "mean_token_accuracy": 0.4206896543502808, "step": 123080 }, { "epoch": 0.12397251926542115, "grad_norm": 10.983950387277721, "learning_rate": 4.9330163519986916e-05, "loss": 2.4705, "mean_token_accuracy": 0.4586206912994385, "step": 123085 }, { "epoch": 0.12397755531852532, "grad_norm": 9.778208279643033, "learning_rate": 4.933007276603247e-05, "loss": 2.3174, "mean_token_accuracy": 0.4103448212146759, "step": 123090 }, { "epoch": 0.1239825913716295, "grad_norm": 12.598905892946542, "learning_rate": 4.9329982006023374e-05, "loss": 2.2357, "mean_token_accuracy": 0.4137930989265442, "step": 123095 }, { "epoch": 0.12398762742473367, "grad_norm": 12.948293171149169, "learning_rate": 4.9329891239959656e-05, "loss": 2.2673, "mean_token_accuracy": 0.4931034445762634, "step": 123100 }, { "epoch": 0.12399266347783784, "grad_norm": 12.287551972168124, "learning_rate": 4.932980046784133e-05, "loss": 2.1795, "mean_token_accuracy": 0.4608590483665466, "step": 123105 }, { "epoch": 0.12399769953094202, "grad_norm": 10.708187140126693, "learning_rate": 4.932970968966842e-05, "loss": 2.518, "mean_token_accuracy": 0.3999999940395355, "step": 123110 }, { "epoch": 0.12400273558404619, "grad_norm": 11.383412162028849, "learning_rate": 4.932961890544097e-05, "loss": 2.5119, "mean_token_accuracy": 0.39655172228813174, "step": 123115 }, { "epoch": 0.12400777163715036, "grad_norm": 10.48135826335423, "learning_rate": 4.9329528115158994e-05, "loss": 2.4048, "mean_token_accuracy": 0.4275862157344818, "step": 123120 }, { "epoch": 0.12401280769025454, "grad_norm": 11.668377404142335, "learning_rate": 4.932943731882252e-05, "loss": 2.3271, "mean_token_accuracy": 0.4620689690113068, "step": 123125 }, { "epoch": 0.12401784374335871, "grad_norm": 18.12678406395544, "learning_rate": 4.9329346516431564e-05, "loss": 2.7186, "mean_token_accuracy": 0.334482753276825, "step": 123130 }, { "epoch": 0.12402287979646288, "grad_norm": 10.52793786279384, "learning_rate": 4.932925570798615e-05, "loss": 2.1032, "mean_token_accuracy": 0.44827585816383364, "step": 123135 }, { "epoch": 0.12402791584956704, "grad_norm": 12.860449506042098, "learning_rate": 4.932916489348632e-05, "loss": 2.5461, "mean_token_accuracy": 0.38275861740112305, "step": 123140 }, { "epoch": 0.12403295190267122, "grad_norm": 10.48567017880704, "learning_rate": 4.932907407293209e-05, "loss": 2.2084, "mean_token_accuracy": 0.47931034564971925, "step": 123145 }, { "epoch": 0.12403798795577539, "grad_norm": 10.094704907297531, "learning_rate": 4.932898324632348e-05, "loss": 2.1186, "mean_token_accuracy": 0.5137930929660797, "step": 123150 }, { "epoch": 0.12404302400887957, "grad_norm": 11.368293399874839, "learning_rate": 4.932889241366052e-05, "loss": 2.3863, "mean_token_accuracy": 0.4172413766384125, "step": 123155 }, { "epoch": 0.12404806006198374, "grad_norm": 9.861717710956027, "learning_rate": 4.932880157494324e-05, "loss": 2.1843, "mean_token_accuracy": 0.4, "step": 123160 }, { "epoch": 0.12405309611508791, "grad_norm": 11.714800578611175, "learning_rate": 4.9328710730171655e-05, "loss": 2.5869, "mean_token_accuracy": 0.4069570451974869, "step": 123165 }, { "epoch": 0.12405813216819209, "grad_norm": 10.010907623165348, "learning_rate": 4.9328619879345797e-05, "loss": 2.5001, "mean_token_accuracy": 0.38275861740112305, "step": 123170 }, { "epoch": 0.12406316822129626, "grad_norm": 16.96044715898209, "learning_rate": 4.93285290224657e-05, "loss": 2.5044, "mean_token_accuracy": 0.4551724135875702, "step": 123175 }, { "epoch": 0.12406820427440043, "grad_norm": 10.72564551454885, "learning_rate": 4.932843815953136e-05, "loss": 2.5379, "mean_token_accuracy": 0.42413792610168455, "step": 123180 }, { "epoch": 0.12407324032750461, "grad_norm": 12.05064641159864, "learning_rate": 4.932834729054284e-05, "loss": 2.3526, "mean_token_accuracy": 0.39999999701976774, "step": 123185 }, { "epoch": 0.12407827638060878, "grad_norm": 9.475588833538952, "learning_rate": 4.932825641550013e-05, "loss": 2.034, "mean_token_accuracy": 0.5, "step": 123190 }, { "epoch": 0.12408331243371296, "grad_norm": 10.048130366180112, "learning_rate": 4.932816553440329e-05, "loss": 2.758, "mean_token_accuracy": 0.36896551549434664, "step": 123195 }, { "epoch": 0.12408834848681713, "grad_norm": 21.957440087304068, "learning_rate": 4.9328074647252314e-05, "loss": 2.641, "mean_token_accuracy": 0.43436177372932433, "step": 123200 }, { "epoch": 0.1240933845399213, "grad_norm": 9.74255369131871, "learning_rate": 4.932798375404724e-05, "loss": 2.1832, "mean_token_accuracy": 0.47241379618644713, "step": 123205 }, { "epoch": 0.12409842059302546, "grad_norm": 13.61766829837327, "learning_rate": 4.9327892854788094e-05, "loss": 2.3567, "mean_token_accuracy": 0.4103448212146759, "step": 123210 }, { "epoch": 0.12410345664612964, "grad_norm": 14.615082523035545, "learning_rate": 4.93278019494749e-05, "loss": 2.5977, "mean_token_accuracy": 0.38965516686439516, "step": 123215 }, { "epoch": 0.12410849269923381, "grad_norm": 8.428534949801909, "learning_rate": 4.9327711038107696e-05, "loss": 2.0244, "mean_token_accuracy": 0.48374384045600893, "step": 123220 }, { "epoch": 0.12411352875233798, "grad_norm": 10.762917444876706, "learning_rate": 4.932762012068648e-05, "loss": 2.419, "mean_token_accuracy": 0.4517241418361664, "step": 123225 }, { "epoch": 0.12411856480544216, "grad_norm": 10.747234994252992, "learning_rate": 4.9327529197211304e-05, "loss": 2.5998, "mean_token_accuracy": 0.4087114453315735, "step": 123230 }, { "epoch": 0.12412360085854633, "grad_norm": 9.57360757696649, "learning_rate": 4.9327438267682166e-05, "loss": 2.1839, "mean_token_accuracy": 0.4620689630508423, "step": 123235 }, { "epoch": 0.1241286369116505, "grad_norm": 8.406422987196738, "learning_rate": 4.932734733209912e-05, "loss": 2.3269, "mean_token_accuracy": 0.43103448748588563, "step": 123240 }, { "epoch": 0.12413367296475468, "grad_norm": 12.941132001325842, "learning_rate": 4.9327256390462176e-05, "loss": 2.7547, "mean_token_accuracy": 0.38965516686439516, "step": 123245 }, { "epoch": 0.12413870901785885, "grad_norm": 9.752449969786012, "learning_rate": 4.932716544277135e-05, "loss": 2.4135, "mean_token_accuracy": 0.42413793206214906, "step": 123250 }, { "epoch": 0.12414374507096303, "grad_norm": 13.615187740349615, "learning_rate": 4.9327074489026686e-05, "loss": 2.3145, "mean_token_accuracy": 0.458620685338974, "step": 123255 }, { "epoch": 0.1241487811240672, "grad_norm": 10.040319267060978, "learning_rate": 4.93269835292282e-05, "loss": 2.373, "mean_token_accuracy": 0.43103447556495667, "step": 123260 }, { "epoch": 0.12415381717717137, "grad_norm": 10.079910092066099, "learning_rate": 4.932689256337592e-05, "loss": 2.6837, "mean_token_accuracy": 0.36896551251411436, "step": 123265 }, { "epoch": 0.12415885323027555, "grad_norm": 10.38608072450349, "learning_rate": 4.932680159146987e-05, "loss": 2.4664, "mean_token_accuracy": 0.4570477962493896, "step": 123270 }, { "epoch": 0.12416388928337972, "grad_norm": 10.729621282416062, "learning_rate": 4.932671061351008e-05, "loss": 2.4258, "mean_token_accuracy": 0.42068966031074523, "step": 123275 }, { "epoch": 0.12416892533648388, "grad_norm": 11.276463566834492, "learning_rate": 4.9326619629496565e-05, "loss": 2.602, "mean_token_accuracy": 0.3965517282485962, "step": 123280 }, { "epoch": 0.12417396138958806, "grad_norm": 10.133533547966623, "learning_rate": 4.932652863942936e-05, "loss": 2.3539, "mean_token_accuracy": 0.4172413766384125, "step": 123285 }, { "epoch": 0.12417899744269223, "grad_norm": 13.013234062376828, "learning_rate": 4.932643764330848e-05, "loss": 2.6983, "mean_token_accuracy": 0.3655172407627106, "step": 123290 }, { "epoch": 0.1241840334957964, "grad_norm": 11.833425017667839, "learning_rate": 4.932634664113396e-05, "loss": 2.2789, "mean_token_accuracy": 0.4344827651977539, "step": 123295 }, { "epoch": 0.12418906954890058, "grad_norm": 11.757532075886633, "learning_rate": 4.932625563290582e-05, "loss": 2.8312, "mean_token_accuracy": 0.3862068891525269, "step": 123300 }, { "epoch": 0.12419410560200475, "grad_norm": 11.451683336274522, "learning_rate": 4.9326164618624086e-05, "loss": 2.4466, "mean_token_accuracy": 0.4586206942796707, "step": 123305 }, { "epoch": 0.12419914165510892, "grad_norm": 11.259269763179793, "learning_rate": 4.932607359828879e-05, "loss": 2.3162, "mean_token_accuracy": 0.40689654350280763, "step": 123310 }, { "epoch": 0.1242041777082131, "grad_norm": 11.918246905982889, "learning_rate": 4.932598257189994e-05, "loss": 2.6922, "mean_token_accuracy": 0.36896551847457887, "step": 123315 }, { "epoch": 0.12420921376131727, "grad_norm": 10.53536661795408, "learning_rate": 4.9325891539457575e-05, "loss": 2.3382, "mean_token_accuracy": 0.4344827592372894, "step": 123320 }, { "epoch": 0.12421424981442145, "grad_norm": 8.986325722805827, "learning_rate": 4.932580050096172e-05, "loss": 2.5828, "mean_token_accuracy": 0.417241370677948, "step": 123325 }, { "epoch": 0.12421928586752562, "grad_norm": 9.110786002475594, "learning_rate": 4.9325709456412407e-05, "loss": 2.6606, "mean_token_accuracy": 0.4310344815254211, "step": 123330 }, { "epoch": 0.1242243219206298, "grad_norm": 19.228952943598653, "learning_rate": 4.932561840580964e-05, "loss": 2.6777, "mean_token_accuracy": 0.4344827651977539, "step": 123335 }, { "epoch": 0.12422935797373397, "grad_norm": 10.83522742378952, "learning_rate": 4.932552734915345e-05, "loss": 2.6914, "mean_token_accuracy": 0.3931034505367279, "step": 123340 }, { "epoch": 0.12423439402683814, "grad_norm": 10.136030393349158, "learning_rate": 4.932543628644388e-05, "loss": 2.1136, "mean_token_accuracy": 0.46049606800079346, "step": 123345 }, { "epoch": 0.1242394300799423, "grad_norm": 9.226142623918228, "learning_rate": 4.9325345217680945e-05, "loss": 2.0239, "mean_token_accuracy": 0.4603750824928284, "step": 123350 }, { "epoch": 0.12424446613304647, "grad_norm": 12.026389118068465, "learning_rate": 4.932525414286467e-05, "loss": 2.127, "mean_token_accuracy": 0.4851784646511078, "step": 123355 }, { "epoch": 0.12424950218615065, "grad_norm": 10.800474833113938, "learning_rate": 4.932516306199507e-05, "loss": 2.3156, "mean_token_accuracy": 0.441379314661026, "step": 123360 }, { "epoch": 0.12425453823925482, "grad_norm": 11.772811955549438, "learning_rate": 4.932507197507219e-05, "loss": 2.4715, "mean_token_accuracy": 0.4206896543502808, "step": 123365 }, { "epoch": 0.124259574292359, "grad_norm": 12.381118087819438, "learning_rate": 4.932498088209604e-05, "loss": 2.582, "mean_token_accuracy": 0.42758620381355283, "step": 123370 }, { "epoch": 0.12426461034546317, "grad_norm": 23.817773174982808, "learning_rate": 4.932488978306665e-05, "loss": 2.4687, "mean_token_accuracy": 0.4310344815254211, "step": 123375 }, { "epoch": 0.12426964639856734, "grad_norm": 10.785328934366714, "learning_rate": 4.932479867798404e-05, "loss": 2.3527, "mean_token_accuracy": 0.44482758045196535, "step": 123380 }, { "epoch": 0.12427468245167152, "grad_norm": 7.588134819157633, "learning_rate": 4.932470756684825e-05, "loss": 2.3163, "mean_token_accuracy": 0.45583788156509397, "step": 123385 }, { "epoch": 0.12427971850477569, "grad_norm": 9.970622714855644, "learning_rate": 4.932461644965929e-05, "loss": 2.4369, "mean_token_accuracy": 0.4172413766384125, "step": 123390 }, { "epoch": 0.12428475455787986, "grad_norm": 10.703460641311072, "learning_rate": 4.932452532641719e-05, "loss": 2.4344, "mean_token_accuracy": 0.42413793206214906, "step": 123395 }, { "epoch": 0.12428979061098404, "grad_norm": 11.057351957639316, "learning_rate": 4.932443419712198e-05, "loss": 2.4941, "mean_token_accuracy": 0.4344827592372894, "step": 123400 }, { "epoch": 0.12429482666408821, "grad_norm": 12.06777601976704, "learning_rate": 4.932434306177368e-05, "loss": 2.5108, "mean_token_accuracy": 0.44482758045196535, "step": 123405 }, { "epoch": 0.12429986271719239, "grad_norm": 9.745327129200179, "learning_rate": 4.932425192037232e-05, "loss": 2.5553, "mean_token_accuracy": 0.4323048949241638, "step": 123410 }, { "epoch": 0.12430489877029656, "grad_norm": 12.494454986223051, "learning_rate": 4.932416077291792e-05, "loss": 2.7016, "mean_token_accuracy": 0.3793103516101837, "step": 123415 }, { "epoch": 0.12430993482340072, "grad_norm": 10.671681937722541, "learning_rate": 4.9324069619410515e-05, "loss": 2.4252, "mean_token_accuracy": 0.42413792610168455, "step": 123420 }, { "epoch": 0.1243149708765049, "grad_norm": 9.71090263272129, "learning_rate": 4.932397845985011e-05, "loss": 2.7505, "mean_token_accuracy": 0.3999999940395355, "step": 123425 }, { "epoch": 0.12432000692960907, "grad_norm": 9.547100984754968, "learning_rate": 4.932388729423675e-05, "loss": 2.267, "mean_token_accuracy": 0.4413793087005615, "step": 123430 }, { "epoch": 0.12432504298271324, "grad_norm": 28.65387918051316, "learning_rate": 4.932379612257045e-05, "loss": 2.7288, "mean_token_accuracy": 0.41034482717514037, "step": 123435 }, { "epoch": 0.12433007903581741, "grad_norm": 9.559524436937783, "learning_rate": 4.932370494485125e-05, "loss": 2.3979, "mean_token_accuracy": 0.3862069010734558, "step": 123440 }, { "epoch": 0.12433511508892159, "grad_norm": 8.859240559581517, "learning_rate": 4.932361376107915e-05, "loss": 1.9736, "mean_token_accuracy": 0.5124016880989075, "step": 123445 }, { "epoch": 0.12434015114202576, "grad_norm": 13.439340088597277, "learning_rate": 4.9323522571254195e-05, "loss": 2.5032, "mean_token_accuracy": 0.41724138259887694, "step": 123450 }, { "epoch": 0.12434518719512994, "grad_norm": 14.494201727076739, "learning_rate": 4.932343137537641e-05, "loss": 2.0793, "mean_token_accuracy": 0.49999999403953554, "step": 123455 }, { "epoch": 0.12435022324823411, "grad_norm": 13.525405972438469, "learning_rate": 4.9323340173445804e-05, "loss": 2.6097, "mean_token_accuracy": 0.41724138259887694, "step": 123460 }, { "epoch": 0.12435525930133828, "grad_norm": 10.659670098934646, "learning_rate": 4.932324896546242e-05, "loss": 2.1852, "mean_token_accuracy": 0.47586206793785096, "step": 123465 }, { "epoch": 0.12436029535444246, "grad_norm": 13.545564239805788, "learning_rate": 4.9323157751426277e-05, "loss": 2.5635, "mean_token_accuracy": 0.40344826579093934, "step": 123470 }, { "epoch": 0.12436533140754663, "grad_norm": 12.404792712341317, "learning_rate": 4.9323066531337394e-05, "loss": 2.5232, "mean_token_accuracy": 0.42068966031074523, "step": 123475 }, { "epoch": 0.1243703674606508, "grad_norm": 11.971366749414326, "learning_rate": 4.9322975305195803e-05, "loss": 2.4219, "mean_token_accuracy": 0.47241378426551817, "step": 123480 }, { "epoch": 0.12437540351375498, "grad_norm": 10.491526317177453, "learning_rate": 4.932288407300153e-05, "loss": 2.0759, "mean_token_accuracy": 0.4793103516101837, "step": 123485 }, { "epoch": 0.12438043956685914, "grad_norm": 14.121802508591614, "learning_rate": 4.932279283475461e-05, "loss": 2.8156, "mean_token_accuracy": 0.3862068891525269, "step": 123490 }, { "epoch": 0.12438547561996331, "grad_norm": 8.981840407513966, "learning_rate": 4.932270159045504e-05, "loss": 2.1874, "mean_token_accuracy": 0.4379310369491577, "step": 123495 }, { "epoch": 0.12439051167306749, "grad_norm": 10.878311789490034, "learning_rate": 4.932261034010287e-05, "loss": 2.2241, "mean_token_accuracy": 0.4310344815254211, "step": 123500 }, { "epoch": 0.12439554772617166, "grad_norm": 14.448846822573158, "learning_rate": 4.932251908369812e-05, "loss": 2.5753, "mean_token_accuracy": 0.41379310488700866, "step": 123505 }, { "epoch": 0.12440058377927583, "grad_norm": 9.363857420735751, "learning_rate": 4.932242782124081e-05, "loss": 2.1954, "mean_token_accuracy": 0.42758620977401735, "step": 123510 }, { "epoch": 0.12440561983238001, "grad_norm": 12.516416195348718, "learning_rate": 4.932233655273096e-05, "loss": 2.1983, "mean_token_accuracy": 0.4604355752468109, "step": 123515 }, { "epoch": 0.12441065588548418, "grad_norm": 11.037564589852316, "learning_rate": 4.932224527816861e-05, "loss": 2.5965, "mean_token_accuracy": 0.4034482777118683, "step": 123520 }, { "epoch": 0.12441569193858835, "grad_norm": 12.07528768859811, "learning_rate": 4.932215399755379e-05, "loss": 2.8248, "mean_token_accuracy": 0.38620689511299133, "step": 123525 }, { "epoch": 0.12442072799169253, "grad_norm": 10.434405104204275, "learning_rate": 4.9322062710886505e-05, "loss": 2.1271, "mean_token_accuracy": 0.4896551728248596, "step": 123530 }, { "epoch": 0.1244257640447967, "grad_norm": 11.304892198752592, "learning_rate": 4.932197141816678e-05, "loss": 2.4779, "mean_token_accuracy": 0.4379310369491577, "step": 123535 }, { "epoch": 0.12443080009790088, "grad_norm": 9.920643442686885, "learning_rate": 4.9321880119394666e-05, "loss": 2.0384, "mean_token_accuracy": 0.482758617401123, "step": 123540 }, { "epoch": 0.12443583615100505, "grad_norm": 11.921665429899907, "learning_rate": 4.9321788814570165e-05, "loss": 2.6862, "mean_token_accuracy": 0.3827586114406586, "step": 123545 }, { "epoch": 0.12444087220410922, "grad_norm": 14.894451234447269, "learning_rate": 4.93216975036933e-05, "loss": 2.2954, "mean_token_accuracy": 0.4849364757537842, "step": 123550 }, { "epoch": 0.1244459082572134, "grad_norm": 10.677267425999865, "learning_rate": 4.9321606186764126e-05, "loss": 2.759, "mean_token_accuracy": 0.39310344457626345, "step": 123555 }, { "epoch": 0.12445094431031756, "grad_norm": 11.453531964760659, "learning_rate": 4.932151486378263e-05, "loss": 2.448, "mean_token_accuracy": 0.42068966031074523, "step": 123560 }, { "epoch": 0.12445598036342173, "grad_norm": 12.149339625850565, "learning_rate": 4.932142353474887e-05, "loss": 2.5669, "mean_token_accuracy": 0.37586206793785093, "step": 123565 }, { "epoch": 0.1244610164165259, "grad_norm": 14.387005558601993, "learning_rate": 4.932133219966285e-05, "loss": 2.3753, "mean_token_accuracy": 0.4896551609039307, "step": 123570 }, { "epoch": 0.12446605246963008, "grad_norm": 9.610242161842296, "learning_rate": 4.9321240858524597e-05, "loss": 1.9729, "mean_token_accuracy": 0.46551724672317507, "step": 123575 }, { "epoch": 0.12447108852273425, "grad_norm": 10.729990638589701, "learning_rate": 4.9321149511334144e-05, "loss": 1.9673, "mean_token_accuracy": 0.4871921122074127, "step": 123580 }, { "epoch": 0.12447612457583843, "grad_norm": 12.314099501332782, "learning_rate": 4.932105815809152e-05, "loss": 2.6641, "mean_token_accuracy": 0.4103448331356049, "step": 123585 }, { "epoch": 0.1244811606289426, "grad_norm": 11.820789226694464, "learning_rate": 4.9320966798796736e-05, "loss": 2.2736, "mean_token_accuracy": 0.47931034564971925, "step": 123590 }, { "epoch": 0.12448619668204677, "grad_norm": 10.065627258347645, "learning_rate": 4.9320875433449835e-05, "loss": 2.3747, "mean_token_accuracy": 0.45517241954803467, "step": 123595 }, { "epoch": 0.12449123273515095, "grad_norm": 9.663683918750145, "learning_rate": 4.932078406205083e-05, "loss": 2.3449, "mean_token_accuracy": 0.42068964838981626, "step": 123600 }, { "epoch": 0.12449626878825512, "grad_norm": 9.791134050849395, "learning_rate": 4.9320692684599745e-05, "loss": 2.5731, "mean_token_accuracy": 0.36551723480224607, "step": 123605 }, { "epoch": 0.1245013048413593, "grad_norm": 10.175382639824074, "learning_rate": 4.932060130109661e-05, "loss": 2.5262, "mean_token_accuracy": 0.4344827592372894, "step": 123610 }, { "epoch": 0.12450634089446347, "grad_norm": 10.568509304617983, "learning_rate": 4.9320509911541446e-05, "loss": 2.331, "mean_token_accuracy": 0.46896552443504336, "step": 123615 }, { "epoch": 0.12451137694756764, "grad_norm": 10.804222047710022, "learning_rate": 4.932041851593429e-05, "loss": 2.434, "mean_token_accuracy": 0.4206896543502808, "step": 123620 }, { "epoch": 0.12451641300067182, "grad_norm": 10.42270728526008, "learning_rate": 4.9320327114275156e-05, "loss": 2.3402, "mean_token_accuracy": 0.4153659999370575, "step": 123625 }, { "epoch": 0.12452144905377598, "grad_norm": 12.493308769494055, "learning_rate": 4.932023570656407e-05, "loss": 2.4953, "mean_token_accuracy": 0.3931034505367279, "step": 123630 }, { "epoch": 0.12452648510688015, "grad_norm": 9.52786883902121, "learning_rate": 4.932014429280107e-05, "loss": 2.2242, "mean_token_accuracy": 0.4482758641242981, "step": 123635 }, { "epoch": 0.12453152115998432, "grad_norm": 10.849111844048021, "learning_rate": 4.932005287298616e-05, "loss": 2.4742, "mean_token_accuracy": 0.4344827592372894, "step": 123640 }, { "epoch": 0.1245365572130885, "grad_norm": 10.348852917568902, "learning_rate": 4.931996144711938e-05, "loss": 2.4332, "mean_token_accuracy": 0.4517241299152374, "step": 123645 }, { "epoch": 0.12454159326619267, "grad_norm": 9.367672234804413, "learning_rate": 4.931987001520076e-05, "loss": 2.4281, "mean_token_accuracy": 0.4517241299152374, "step": 123650 }, { "epoch": 0.12454662931929684, "grad_norm": 9.344785450827878, "learning_rate": 4.931977857723031e-05, "loss": 2.0822, "mean_token_accuracy": 0.48965516686439514, "step": 123655 }, { "epoch": 0.12455166537240102, "grad_norm": 11.600768022953147, "learning_rate": 4.9319687133208066e-05, "loss": 2.2734, "mean_token_accuracy": 0.43793103098869324, "step": 123660 }, { "epoch": 0.12455670142550519, "grad_norm": 10.30137558229247, "learning_rate": 4.9319595683134045e-05, "loss": 2.341, "mean_token_accuracy": 0.3965517282485962, "step": 123665 }, { "epoch": 0.12456173747860937, "grad_norm": 9.901523654434826, "learning_rate": 4.9319504227008284e-05, "loss": 2.5494, "mean_token_accuracy": 0.39655172526836396, "step": 123670 }, { "epoch": 0.12456677353171354, "grad_norm": 12.095347484241078, "learning_rate": 4.9319412764830806e-05, "loss": 2.3354, "mean_token_accuracy": 0.42758620381355283, "step": 123675 }, { "epoch": 0.12457180958481771, "grad_norm": 9.619847395745683, "learning_rate": 4.931932129660162e-05, "loss": 2.4084, "mean_token_accuracy": 0.44482758045196535, "step": 123680 }, { "epoch": 0.12457684563792189, "grad_norm": 9.395549674515834, "learning_rate": 4.9319229822320774e-05, "loss": 2.1872, "mean_token_accuracy": 0.44827585816383364, "step": 123685 }, { "epoch": 0.12458188169102606, "grad_norm": 10.512749270574066, "learning_rate": 4.931913834198828e-05, "loss": 2.4194, "mean_token_accuracy": 0.3620689630508423, "step": 123690 }, { "epoch": 0.12458691774413024, "grad_norm": 11.996159906718693, "learning_rate": 4.931904685560416e-05, "loss": 3.0487, "mean_token_accuracy": 0.3551724135875702, "step": 123695 }, { "epoch": 0.1245919537972344, "grad_norm": 10.081222082945459, "learning_rate": 4.9318955363168456e-05, "loss": 2.545, "mean_token_accuracy": 0.38620689511299133, "step": 123700 }, { "epoch": 0.12459698985033857, "grad_norm": 9.23894621754333, "learning_rate": 4.931886386468118e-05, "loss": 2.2581, "mean_token_accuracy": 0.44827585816383364, "step": 123705 }, { "epoch": 0.12460202590344274, "grad_norm": 13.542145021256491, "learning_rate": 4.931877236014236e-05, "loss": 2.8101, "mean_token_accuracy": 0.39310344457626345, "step": 123710 }, { "epoch": 0.12460706195654692, "grad_norm": 11.581129341966413, "learning_rate": 4.931868084955203e-05, "loss": 2.7297, "mean_token_accuracy": 0.37586206793785093, "step": 123715 }, { "epoch": 0.12461209800965109, "grad_norm": 11.798533917299554, "learning_rate": 4.9318589332910195e-05, "loss": 2.9002, "mean_token_accuracy": 0.4241379380226135, "step": 123720 }, { "epoch": 0.12461713406275526, "grad_norm": 9.640623293799536, "learning_rate": 4.93184978102169e-05, "loss": 2.0202, "mean_token_accuracy": 0.5206896543502808, "step": 123725 }, { "epoch": 0.12462217011585944, "grad_norm": 9.617962068195393, "learning_rate": 4.9318406281472154e-05, "loss": 2.5705, "mean_token_accuracy": 0.36551723480224607, "step": 123730 }, { "epoch": 0.12462720616896361, "grad_norm": 9.604813469913651, "learning_rate": 4.9318314746676006e-05, "loss": 2.3539, "mean_token_accuracy": 0.4344827592372894, "step": 123735 }, { "epoch": 0.12463224222206779, "grad_norm": 9.077493884672778, "learning_rate": 4.931822320582846e-05, "loss": 2.2066, "mean_token_accuracy": 0.42068964838981626, "step": 123740 }, { "epoch": 0.12463727827517196, "grad_norm": 10.867110172558268, "learning_rate": 4.9318131658929545e-05, "loss": 2.8088, "mean_token_accuracy": 0.4068965494632721, "step": 123745 }, { "epoch": 0.12464231432827613, "grad_norm": 9.392779864386263, "learning_rate": 4.931804010597929e-05, "loss": 2.4038, "mean_token_accuracy": 0.4620689690113068, "step": 123750 }, { "epoch": 0.1246473503813803, "grad_norm": 9.77315457119676, "learning_rate": 4.931794854697773e-05, "loss": 2.7775, "mean_token_accuracy": 0.417241370677948, "step": 123755 }, { "epoch": 0.12465238643448448, "grad_norm": 8.83455450371274, "learning_rate": 4.931785698192487e-05, "loss": 2.4564, "mean_token_accuracy": 0.44827585816383364, "step": 123760 }, { "epoch": 0.12465742248758865, "grad_norm": 12.050637375279532, "learning_rate": 4.9317765410820746e-05, "loss": 2.2942, "mean_token_accuracy": 0.4344827651977539, "step": 123765 }, { "epoch": 0.12466245854069281, "grad_norm": 11.707590265022704, "learning_rate": 4.931767383366539e-05, "loss": 2.2418, "mean_token_accuracy": 0.44694494605064394, "step": 123770 }, { "epoch": 0.12466749459379699, "grad_norm": 9.189279777389158, "learning_rate": 4.9317582250458814e-05, "loss": 2.6886, "mean_token_accuracy": 0.43998789191246035, "step": 123775 }, { "epoch": 0.12467253064690116, "grad_norm": 15.773591276214393, "learning_rate": 4.931749066120106e-05, "loss": 2.4981, "mean_token_accuracy": 0.4, "step": 123780 }, { "epoch": 0.12467756670000534, "grad_norm": 8.863097339242998, "learning_rate": 4.931739906589214e-05, "loss": 2.3278, "mean_token_accuracy": 0.47114336490631104, "step": 123785 }, { "epoch": 0.12468260275310951, "grad_norm": 12.017006178921458, "learning_rate": 4.931730746453208e-05, "loss": 2.3017, "mean_token_accuracy": 0.4586206912994385, "step": 123790 }, { "epoch": 0.12468763880621368, "grad_norm": 10.598740583238024, "learning_rate": 4.9317215857120903e-05, "loss": 2.1881, "mean_token_accuracy": 0.4551724076271057, "step": 123795 }, { "epoch": 0.12469267485931786, "grad_norm": 11.231135437901454, "learning_rate": 4.9317124243658644e-05, "loss": 2.8643, "mean_token_accuracy": 0.4034482717514038, "step": 123800 }, { "epoch": 0.12469771091242203, "grad_norm": 11.894096075202533, "learning_rate": 4.931703262414533e-05, "loss": 2.7349, "mean_token_accuracy": 0.3517241358757019, "step": 123805 }, { "epoch": 0.1247027469655262, "grad_norm": 10.344794595110173, "learning_rate": 4.931694099858098e-05, "loss": 2.2276, "mean_token_accuracy": 0.42758620381355283, "step": 123810 }, { "epoch": 0.12470778301863038, "grad_norm": 11.748590401556678, "learning_rate": 4.931684936696561e-05, "loss": 2.0791, "mean_token_accuracy": 0.4620689630508423, "step": 123815 }, { "epoch": 0.12471281907173455, "grad_norm": 9.635866922146365, "learning_rate": 4.931675772929926e-05, "loss": 1.8313, "mean_token_accuracy": 0.5344827592372894, "step": 123820 }, { "epoch": 0.12471785512483873, "grad_norm": 10.695390381346565, "learning_rate": 4.931666608558196e-05, "loss": 2.4097, "mean_token_accuracy": 0.44827585816383364, "step": 123825 }, { "epoch": 0.1247228911779429, "grad_norm": 12.305061301430387, "learning_rate": 4.931657443581371e-05, "loss": 2.7296, "mean_token_accuracy": 0.3965517282485962, "step": 123830 }, { "epoch": 0.12472792723104707, "grad_norm": 11.57831007506338, "learning_rate": 4.931648277999457e-05, "loss": 2.5923, "mean_token_accuracy": 0.4137930989265442, "step": 123835 }, { "epoch": 0.12473296328415123, "grad_norm": 11.896689409926472, "learning_rate": 4.9316391118124535e-05, "loss": 2.1688, "mean_token_accuracy": 0.43793103098869324, "step": 123840 }, { "epoch": 0.1247379993372554, "grad_norm": 10.781291657583932, "learning_rate": 4.931629945020364e-05, "loss": 2.1756, "mean_token_accuracy": 0.44827587008476255, "step": 123845 }, { "epoch": 0.12474303539035958, "grad_norm": 6.429427710512313, "learning_rate": 4.931620777623192e-05, "loss": 1.7935, "mean_token_accuracy": 0.500738924741745, "step": 123850 }, { "epoch": 0.12474807144346375, "grad_norm": 10.912106315867291, "learning_rate": 4.9316116096209394e-05, "loss": 2.6199, "mean_token_accuracy": 0.38620689511299133, "step": 123855 }, { "epoch": 0.12475310749656793, "grad_norm": 11.403112544925696, "learning_rate": 4.931602441013608e-05, "loss": 2.5271, "mean_token_accuracy": 0.4413793087005615, "step": 123860 }, { "epoch": 0.1247581435496721, "grad_norm": 10.326074684639051, "learning_rate": 4.9315932718012015e-05, "loss": 1.9635, "mean_token_accuracy": 0.4586206912994385, "step": 123865 }, { "epoch": 0.12476317960277628, "grad_norm": 11.579898695866243, "learning_rate": 4.9315841019837226e-05, "loss": 2.3312, "mean_token_accuracy": 0.4172413766384125, "step": 123870 }, { "epoch": 0.12476821565588045, "grad_norm": 12.279130019615577, "learning_rate": 4.931574931561172e-05, "loss": 2.3592, "mean_token_accuracy": 0.44482758045196535, "step": 123875 }, { "epoch": 0.12477325170898462, "grad_norm": 11.004844148836165, "learning_rate": 4.931565760533555e-05, "loss": 2.486, "mean_token_accuracy": 0.36896551251411436, "step": 123880 }, { "epoch": 0.1247782877620888, "grad_norm": 10.991898578040617, "learning_rate": 4.931556588900871e-05, "loss": 2.5265, "mean_token_accuracy": 0.4344827592372894, "step": 123885 }, { "epoch": 0.12478332381519297, "grad_norm": 10.865961846740692, "learning_rate": 4.931547416663125e-05, "loss": 2.2762, "mean_token_accuracy": 0.43248639106750486, "step": 123890 }, { "epoch": 0.12478835986829714, "grad_norm": 11.425831672949833, "learning_rate": 4.931538243820318e-05, "loss": 2.376, "mean_token_accuracy": 0.4620689630508423, "step": 123895 }, { "epoch": 0.12479339592140132, "grad_norm": 11.182051213736225, "learning_rate": 4.931529070372454e-05, "loss": 2.5771, "mean_token_accuracy": 0.4103448212146759, "step": 123900 }, { "epoch": 0.12479843197450549, "grad_norm": 10.699551092116875, "learning_rate": 4.9315198963195343e-05, "loss": 2.304, "mean_token_accuracy": 0.42758620977401735, "step": 123905 }, { "epoch": 0.12480346802760965, "grad_norm": 9.59465081003794, "learning_rate": 4.931510721661562e-05, "loss": 2.2041, "mean_token_accuracy": 0.4551724255084991, "step": 123910 }, { "epoch": 0.12480850408071383, "grad_norm": 11.962476885429458, "learning_rate": 4.931501546398539e-05, "loss": 2.441, "mean_token_accuracy": 0.3896551728248596, "step": 123915 }, { "epoch": 0.124813540133818, "grad_norm": 10.79373016696175, "learning_rate": 4.93149237053047e-05, "loss": 1.9983, "mean_token_accuracy": 0.482758617401123, "step": 123920 }, { "epoch": 0.12481857618692217, "grad_norm": 12.24720353234997, "learning_rate": 4.931483194057355e-05, "loss": 2.6165, "mean_token_accuracy": 0.40508167147636415, "step": 123925 }, { "epoch": 0.12482361224002635, "grad_norm": 10.906314179567266, "learning_rate": 4.9314740169791976e-05, "loss": 2.8854, "mean_token_accuracy": 0.39310344457626345, "step": 123930 }, { "epoch": 0.12482864829313052, "grad_norm": 8.57757051376491, "learning_rate": 4.931464839296e-05, "loss": 2.1679, "mean_token_accuracy": 0.4551724076271057, "step": 123935 }, { "epoch": 0.1248336843462347, "grad_norm": 9.317363075229544, "learning_rate": 4.931455661007765e-05, "loss": 2.1262, "mean_token_accuracy": 0.48965516686439514, "step": 123940 }, { "epoch": 0.12483872039933887, "grad_norm": 9.020899315593375, "learning_rate": 4.931446482114495e-05, "loss": 2.1771, "mean_token_accuracy": 0.43103447556495667, "step": 123945 }, { "epoch": 0.12484375645244304, "grad_norm": 11.216994947024483, "learning_rate": 4.931437302616193e-05, "loss": 2.6884, "mean_token_accuracy": 0.42068966031074523, "step": 123950 }, { "epoch": 0.12484879250554722, "grad_norm": 13.577766699237666, "learning_rate": 4.931428122512861e-05, "loss": 2.2794, "mean_token_accuracy": 0.4607380509376526, "step": 123955 }, { "epoch": 0.12485382855865139, "grad_norm": 8.915431276976955, "learning_rate": 4.9314189418045027e-05, "loss": 1.9651, "mean_token_accuracy": 0.5034482836723327, "step": 123960 }, { "epoch": 0.12485886461175556, "grad_norm": 10.207325715047853, "learning_rate": 4.931409760491118e-05, "loss": 2.4525, "mean_token_accuracy": 0.4517241358757019, "step": 123965 }, { "epoch": 0.12486390066485974, "grad_norm": 10.3904599476029, "learning_rate": 4.9314005785727125e-05, "loss": 2.1991, "mean_token_accuracy": 0.4551724076271057, "step": 123970 }, { "epoch": 0.12486893671796391, "grad_norm": 12.275869292310661, "learning_rate": 4.9313913960492866e-05, "loss": 2.4438, "mean_token_accuracy": 0.41560798287391665, "step": 123975 }, { "epoch": 0.12487397277106807, "grad_norm": 11.654252935057961, "learning_rate": 4.931382212920844e-05, "loss": 2.6634, "mean_token_accuracy": 0.41724138259887694, "step": 123980 }, { "epoch": 0.12487900882417224, "grad_norm": 10.64454127005554, "learning_rate": 4.931373029187387e-05, "loss": 1.859, "mean_token_accuracy": 0.44482757449150084, "step": 123985 }, { "epoch": 0.12488404487727642, "grad_norm": 9.17738753882329, "learning_rate": 4.9313638448489186e-05, "loss": 2.1325, "mean_token_accuracy": 0.4225045442581177, "step": 123990 }, { "epoch": 0.12488908093038059, "grad_norm": 8.155930147871105, "learning_rate": 4.93135465990544e-05, "loss": 2.1224, "mean_token_accuracy": 0.4379310369491577, "step": 123995 }, { "epoch": 0.12489411698348477, "grad_norm": 13.913259919205482, "learning_rate": 4.931345474356955e-05, "loss": 2.3732, "mean_token_accuracy": 0.42758620977401735, "step": 124000 }, { "epoch": 0.12489915303658894, "grad_norm": 9.275250141998018, "learning_rate": 4.931336288203465e-05, "loss": 2.2718, "mean_token_accuracy": 0.4586206912994385, "step": 124005 }, { "epoch": 0.12490418908969311, "grad_norm": 11.45198936517712, "learning_rate": 4.931327101444974e-05, "loss": 2.5811, "mean_token_accuracy": 0.4103448212146759, "step": 124010 }, { "epoch": 0.12490922514279729, "grad_norm": 9.764758178106982, "learning_rate": 4.931317914081483e-05, "loss": 2.7661, "mean_token_accuracy": 0.4551724135875702, "step": 124015 }, { "epoch": 0.12491426119590146, "grad_norm": 11.54259689616539, "learning_rate": 4.931308726112996e-05, "loss": 2.6193, "mean_token_accuracy": 0.437931028008461, "step": 124020 }, { "epoch": 0.12491929724900563, "grad_norm": 10.856792115832969, "learning_rate": 4.931299537539515e-05, "loss": 2.476, "mean_token_accuracy": 0.43103448748588563, "step": 124025 }, { "epoch": 0.12492433330210981, "grad_norm": 10.82719994541945, "learning_rate": 4.931290348361043e-05, "loss": 2.2318, "mean_token_accuracy": 0.43103448748588563, "step": 124030 }, { "epoch": 0.12492936935521398, "grad_norm": 11.730860017126803, "learning_rate": 4.93128115857758e-05, "loss": 2.4042, "mean_token_accuracy": 0.4034482717514038, "step": 124035 }, { "epoch": 0.12493440540831816, "grad_norm": 12.511839415280349, "learning_rate": 4.931271968189132e-05, "loss": 2.5389, "mean_token_accuracy": 0.41724138259887694, "step": 124040 }, { "epoch": 0.12493944146142233, "grad_norm": 11.472001760787498, "learning_rate": 4.9312627771956995e-05, "loss": 2.4237, "mean_token_accuracy": 0.40344828367233276, "step": 124045 }, { "epoch": 0.12494447751452649, "grad_norm": 9.800059066481898, "learning_rate": 4.931253585597287e-05, "loss": 2.2778, "mean_token_accuracy": 0.4310344815254211, "step": 124050 }, { "epoch": 0.12494951356763066, "grad_norm": 12.022123126888289, "learning_rate": 4.931244393393894e-05, "loss": 2.9999, "mean_token_accuracy": 0.37931033968925476, "step": 124055 }, { "epoch": 0.12495454962073484, "grad_norm": 9.869224805735533, "learning_rate": 4.9312352005855255e-05, "loss": 1.8331, "mean_token_accuracy": 0.5228675186634064, "step": 124060 }, { "epoch": 0.12495958567383901, "grad_norm": 8.535249595321716, "learning_rate": 4.9312260071721826e-05, "loss": 1.9784, "mean_token_accuracy": 0.4862068951129913, "step": 124065 }, { "epoch": 0.12496462172694318, "grad_norm": 13.554037841516994, "learning_rate": 4.931216813153869e-05, "loss": 2.4976, "mean_token_accuracy": 0.39655172228813174, "step": 124070 }, { "epoch": 0.12496965778004736, "grad_norm": 8.215018749067184, "learning_rate": 4.931207618530588e-05, "loss": 2.3306, "mean_token_accuracy": 0.42413793206214906, "step": 124075 }, { "epoch": 0.12497469383315153, "grad_norm": 10.58593383937158, "learning_rate": 4.9311984233023386e-05, "loss": 2.1368, "mean_token_accuracy": 0.4551724076271057, "step": 124080 }, { "epoch": 0.1249797298862557, "grad_norm": 15.386159952830395, "learning_rate": 4.9311892274691276e-05, "loss": 2.7046, "mean_token_accuracy": 0.3827586233615875, "step": 124085 }, { "epoch": 0.12498476593935988, "grad_norm": 8.918774474895576, "learning_rate": 4.931180031030955e-05, "loss": 2.0983, "mean_token_accuracy": 0.4172413766384125, "step": 124090 }, { "epoch": 0.12498980199246405, "grad_norm": 11.663443381614654, "learning_rate": 4.931170833987823e-05, "loss": 2.3922, "mean_token_accuracy": 0.4358374387025833, "step": 124095 }, { "epoch": 0.12499483804556823, "grad_norm": 10.212163572717571, "learning_rate": 4.931161636339737e-05, "loss": 2.4685, "mean_token_accuracy": 0.4206896543502808, "step": 124100 }, { "epoch": 0.1249998740986724, "grad_norm": 10.752643245639343, "learning_rate": 4.931152438086696e-05, "loss": 2.26, "mean_token_accuracy": 0.4361766457557678, "step": 124105 }, { "epoch": 0.12500491015177656, "grad_norm": 9.792429972102655, "learning_rate": 4.931143239228705e-05, "loss": 2.4898, "mean_token_accuracy": 0.4186932861804962, "step": 124110 }, { "epoch": 0.12500994620488073, "grad_norm": 12.945059019387644, "learning_rate": 4.9311340397657655e-05, "loss": 2.3639, "mean_token_accuracy": 0.4103448331356049, "step": 124115 }, { "epoch": 0.1250149822579849, "grad_norm": 10.903339494566122, "learning_rate": 4.9311248396978804e-05, "loss": 2.4642, "mean_token_accuracy": 0.417241370677948, "step": 124120 }, { "epoch": 0.12502001831108908, "grad_norm": 9.756111801907657, "learning_rate": 4.9311156390250525e-05, "loss": 2.3362, "mean_token_accuracy": 0.44827585816383364, "step": 124125 }, { "epoch": 0.12502505436419326, "grad_norm": 21.707479036469287, "learning_rate": 4.931106437747284e-05, "loss": 2.4704, "mean_token_accuracy": 0.43793103098869324, "step": 124130 }, { "epoch": 0.12503009041729743, "grad_norm": 9.421060213031076, "learning_rate": 4.931097235864577e-05, "loss": 2.4808, "mean_token_accuracy": 0.44137930274009707, "step": 124135 }, { "epoch": 0.1250351264704016, "grad_norm": 11.522459449156207, "learning_rate": 4.931088033376935e-05, "loss": 2.7515, "mean_token_accuracy": 0.3793103486299515, "step": 124140 }, { "epoch": 0.12504016252350578, "grad_norm": 10.196326021331092, "learning_rate": 4.93107883028436e-05, "loss": 2.306, "mean_token_accuracy": 0.4689655125141144, "step": 124145 }, { "epoch": 0.12504519857660995, "grad_norm": 11.545179991732585, "learning_rate": 4.931069626586855e-05, "loss": 1.9724, "mean_token_accuracy": 0.5295297801494598, "step": 124150 }, { "epoch": 0.12505023462971412, "grad_norm": 14.09801574966443, "learning_rate": 4.931060422284422e-05, "loss": 2.4379, "mean_token_accuracy": 0.3896551728248596, "step": 124155 }, { "epoch": 0.1250552706828183, "grad_norm": 10.258868445131919, "learning_rate": 4.9310512173770626e-05, "loss": 2.3348, "mean_token_accuracy": 0.41379311084747317, "step": 124160 }, { "epoch": 0.12506030673592247, "grad_norm": 9.476993458056684, "learning_rate": 4.931042011864782e-05, "loss": 2.4549, "mean_token_accuracy": 0.41724138259887694, "step": 124165 }, { "epoch": 0.12506534278902665, "grad_norm": 11.347618134338601, "learning_rate": 4.9310328057475805e-05, "loss": 2.6114, "mean_token_accuracy": 0.42413793206214906, "step": 124170 }, { "epoch": 0.12507037884213082, "grad_norm": 10.38833527401101, "learning_rate": 4.9310235990254614e-05, "loss": 2.1821, "mean_token_accuracy": 0.4655172348022461, "step": 124175 }, { "epoch": 0.125075414895235, "grad_norm": 9.497734017409657, "learning_rate": 4.931014391698428e-05, "loss": 2.242, "mean_token_accuracy": 0.4448275864124298, "step": 124180 }, { "epoch": 0.12508045094833917, "grad_norm": 14.710155365013705, "learning_rate": 4.931005183766482e-05, "loss": 2.6883, "mean_token_accuracy": 0.4344827592372894, "step": 124185 }, { "epoch": 0.12508548700144334, "grad_norm": 11.80418681181357, "learning_rate": 4.930995975229625e-05, "loss": 3.2515, "mean_token_accuracy": 0.34137930870056155, "step": 124190 }, { "epoch": 0.12509052305454751, "grad_norm": 14.032108909157538, "learning_rate": 4.9309867660878616e-05, "loss": 2.4535, "mean_token_accuracy": 0.42068966031074523, "step": 124195 }, { "epoch": 0.1250955591076517, "grad_norm": 13.390933802810478, "learning_rate": 4.930977556341194e-05, "loss": 2.4829, "mean_token_accuracy": 0.42413792610168455, "step": 124200 }, { "epoch": 0.12510059516075586, "grad_norm": 12.24163069549035, "learning_rate": 4.930968345989622e-05, "loss": 2.4697, "mean_token_accuracy": 0.4154264986515045, "step": 124205 }, { "epoch": 0.12510563121386004, "grad_norm": 10.36573334788406, "learning_rate": 4.930959135033153e-05, "loss": 2.7618, "mean_token_accuracy": 0.39310344457626345, "step": 124210 }, { "epoch": 0.1251106672669642, "grad_norm": 10.286740525438994, "learning_rate": 4.930949923471785e-05, "loss": 2.3208, "mean_token_accuracy": 0.4586206912994385, "step": 124215 }, { "epoch": 0.12511570332006838, "grad_norm": 10.726593501320407, "learning_rate": 4.930940711305522e-05, "loss": 2.2779, "mean_token_accuracy": 0.46551724672317507, "step": 124220 }, { "epoch": 0.12512073937317253, "grad_norm": 10.501920826463534, "learning_rate": 4.930931498534368e-05, "loss": 2.1967, "mean_token_accuracy": 0.4396854221820831, "step": 124225 }, { "epoch": 0.1251257754262767, "grad_norm": 9.10472974801024, "learning_rate": 4.930922285158324e-05, "loss": 1.8772, "mean_token_accuracy": 0.5137930989265442, "step": 124230 }, { "epoch": 0.12513081147938088, "grad_norm": 11.832084788221232, "learning_rate": 4.9309130711773936e-05, "loss": 2.7032, "mean_token_accuracy": 0.43103448748588563, "step": 124235 }, { "epoch": 0.12513584753248505, "grad_norm": 10.79371384366645, "learning_rate": 4.9309038565915786e-05, "loss": 2.7187, "mean_token_accuracy": 0.41724138259887694, "step": 124240 }, { "epoch": 0.12514088358558922, "grad_norm": 15.330319638646554, "learning_rate": 4.930894641400882e-05, "loss": 2.374, "mean_token_accuracy": 0.4000000059604645, "step": 124245 }, { "epoch": 0.1251459196386934, "grad_norm": 9.256353586588736, "learning_rate": 4.930885425605306e-05, "loss": 2.1746, "mean_token_accuracy": 0.4931034445762634, "step": 124250 }, { "epoch": 0.12515095569179757, "grad_norm": 9.520127108943822, "learning_rate": 4.9308762092048535e-05, "loss": 2.1476, "mean_token_accuracy": 0.4811857223510742, "step": 124255 }, { "epoch": 0.12515599174490175, "grad_norm": 12.72107528571997, "learning_rate": 4.930866992199527e-05, "loss": 2.418, "mean_token_accuracy": 0.4517241358757019, "step": 124260 }, { "epoch": 0.12516102779800592, "grad_norm": 11.166375304005644, "learning_rate": 4.930857774589327e-05, "loss": 2.5324, "mean_token_accuracy": 0.3999999940395355, "step": 124265 }, { "epoch": 0.1251660638511101, "grad_norm": 10.106412227290502, "learning_rate": 4.93084855637426e-05, "loss": 2.4309, "mean_token_accuracy": 0.4034482777118683, "step": 124270 }, { "epoch": 0.12517109990421427, "grad_norm": 9.806830703261674, "learning_rate": 4.930839337554326e-05, "loss": 2.3157, "mean_token_accuracy": 0.3999999940395355, "step": 124275 }, { "epoch": 0.12517613595731844, "grad_norm": 12.062694964583635, "learning_rate": 4.930830118129528e-05, "loss": 2.1704, "mean_token_accuracy": 0.5103448271751404, "step": 124280 }, { "epoch": 0.12518117201042261, "grad_norm": 11.332874025154965, "learning_rate": 4.930820898099869e-05, "loss": 2.2498, "mean_token_accuracy": 0.45517241954803467, "step": 124285 }, { "epoch": 0.1251862080635268, "grad_norm": 10.850540724342629, "learning_rate": 4.93081167746535e-05, "loss": 2.2788, "mean_token_accuracy": 0.4413793087005615, "step": 124290 }, { "epoch": 0.12519124411663096, "grad_norm": 10.745253987207922, "learning_rate": 4.930802456225976e-05, "loss": 1.9893, "mean_token_accuracy": 0.47126436829566953, "step": 124295 }, { "epoch": 0.12519628016973514, "grad_norm": 12.566878173001326, "learning_rate": 4.930793234381748e-05, "loss": 2.5113, "mean_token_accuracy": 0.3793103456497192, "step": 124300 }, { "epoch": 0.1252013162228393, "grad_norm": 9.90817146343094, "learning_rate": 4.9307840119326687e-05, "loss": 2.2328, "mean_token_accuracy": 0.47241378426551817, "step": 124305 }, { "epoch": 0.12520635227594348, "grad_norm": 8.875456236458364, "learning_rate": 4.930774788878741e-05, "loss": 2.4195, "mean_token_accuracy": 0.39310344457626345, "step": 124310 }, { "epoch": 0.12521138832904766, "grad_norm": 8.679191996897456, "learning_rate": 4.930765565219967e-05, "loss": 2.0784, "mean_token_accuracy": 0.4551724076271057, "step": 124315 }, { "epoch": 0.12521642438215183, "grad_norm": 12.42731798861218, "learning_rate": 4.930756340956349e-05, "loss": 2.538, "mean_token_accuracy": 0.4435571640729904, "step": 124320 }, { "epoch": 0.125221460435256, "grad_norm": 11.811407222849237, "learning_rate": 4.930747116087891e-05, "loss": 2.7656, "mean_token_accuracy": 0.37241379022598264, "step": 124325 }, { "epoch": 0.12522649648836018, "grad_norm": 8.669276835630525, "learning_rate": 4.930737890614595e-05, "loss": 2.0491, "mean_token_accuracy": 0.45862067937850953, "step": 124330 }, { "epoch": 0.12523153254146435, "grad_norm": 11.280309414632294, "learning_rate": 4.9307286645364624e-05, "loss": 2.314, "mean_token_accuracy": 0.4655172348022461, "step": 124335 }, { "epoch": 0.12523656859456853, "grad_norm": 9.16049354730678, "learning_rate": 4.930719437853496e-05, "loss": 2.4316, "mean_token_accuracy": 0.4068965494632721, "step": 124340 }, { "epoch": 0.1252416046476727, "grad_norm": 10.812821904576646, "learning_rate": 4.9307102105657006e-05, "loss": 2.2273, "mean_token_accuracy": 0.4137930989265442, "step": 124345 }, { "epoch": 0.12524664070077687, "grad_norm": 10.701569128505094, "learning_rate": 4.930700982673076e-05, "loss": 2.2037, "mean_token_accuracy": 0.4620689630508423, "step": 124350 }, { "epoch": 0.12525167675388105, "grad_norm": 12.091054923666615, "learning_rate": 4.9306917541756256e-05, "loss": 2.5064, "mean_token_accuracy": 0.43103447556495667, "step": 124355 }, { "epoch": 0.12525671280698522, "grad_norm": 10.86908834460403, "learning_rate": 4.930682525073353e-05, "loss": 2.3314, "mean_token_accuracy": 0.42068966031074523, "step": 124360 }, { "epoch": 0.12526174886008937, "grad_norm": 11.236026503204053, "learning_rate": 4.930673295366259e-05, "loss": 2.2179, "mean_token_accuracy": 0.4137930989265442, "step": 124365 }, { "epoch": 0.12526678491319354, "grad_norm": 8.872651251415567, "learning_rate": 4.930664065054349e-05, "loss": 2.1659, "mean_token_accuracy": 0.4379310369491577, "step": 124370 }, { "epoch": 0.12527182096629771, "grad_norm": 11.003687844283842, "learning_rate": 4.930654834137622e-05, "loss": 2.7956, "mean_token_accuracy": 0.4344827592372894, "step": 124375 }, { "epoch": 0.1252768570194019, "grad_norm": 11.387695870284572, "learning_rate": 4.930645602616082e-05, "loss": 2.3843, "mean_token_accuracy": 0.4482758641242981, "step": 124380 }, { "epoch": 0.12528189307250606, "grad_norm": 9.0184280692249, "learning_rate": 4.930636370489733e-05, "loss": 2.3452, "mean_token_accuracy": 0.44827585220336913, "step": 124385 }, { "epoch": 0.12528692912561024, "grad_norm": 10.555524392227948, "learning_rate": 4.930627137758576e-05, "loss": 2.416, "mean_token_accuracy": 0.3793103456497192, "step": 124390 }, { "epoch": 0.1252919651787144, "grad_norm": 12.984583476416342, "learning_rate": 4.930617904422613e-05, "loss": 2.2601, "mean_token_accuracy": 0.4448275864124298, "step": 124395 }, { "epoch": 0.12529700123181858, "grad_norm": 9.799760564074004, "learning_rate": 4.930608670481849e-05, "loss": 2.4443, "mean_token_accuracy": 0.4482758641242981, "step": 124400 }, { "epoch": 0.12530203728492276, "grad_norm": 9.708679615962772, "learning_rate": 4.9305994359362844e-05, "loss": 2.1101, "mean_token_accuracy": 0.4848154902458191, "step": 124405 }, { "epoch": 0.12530707333802693, "grad_norm": 13.767479336738107, "learning_rate": 4.930590200785922e-05, "loss": 2.4934, "mean_token_accuracy": 0.43103448748588563, "step": 124410 }, { "epoch": 0.1253121093911311, "grad_norm": 10.372723597984143, "learning_rate": 4.930580965030766e-05, "loss": 2.624, "mean_token_accuracy": 0.42068964838981626, "step": 124415 }, { "epoch": 0.12531714544423528, "grad_norm": 9.657501886421322, "learning_rate": 4.930571728670816e-05, "loss": 1.95, "mean_token_accuracy": 0.5103448152542114, "step": 124420 }, { "epoch": 0.12532218149733945, "grad_norm": 22.344905047164442, "learning_rate": 4.930562491706077e-05, "loss": 2.1606, "mean_token_accuracy": 0.4896551609039307, "step": 124425 }, { "epoch": 0.12532721755044363, "grad_norm": 9.981920369758555, "learning_rate": 4.9305532541365514e-05, "loss": 2.3096, "mean_token_accuracy": 0.43793103098869324, "step": 124430 }, { "epoch": 0.1253322536035478, "grad_norm": 11.536663812063374, "learning_rate": 4.930544015962241e-05, "loss": 2.64, "mean_token_accuracy": 0.3620689570903778, "step": 124435 }, { "epoch": 0.12533728965665197, "grad_norm": 13.416887935152237, "learning_rate": 4.9305347771831484e-05, "loss": 2.6735, "mean_token_accuracy": 0.39310344457626345, "step": 124440 }, { "epoch": 0.12534232570975615, "grad_norm": 11.819190433372986, "learning_rate": 4.9305255377992756e-05, "loss": 2.2816, "mean_token_accuracy": 0.4551724135875702, "step": 124445 }, { "epoch": 0.12534736176286032, "grad_norm": 10.157211201410727, "learning_rate": 4.9305162978106274e-05, "loss": 2.2673, "mean_token_accuracy": 0.4640048503875732, "step": 124450 }, { "epoch": 0.1253523978159645, "grad_norm": 10.378394586946198, "learning_rate": 4.930507057217204e-05, "loss": 2.298, "mean_token_accuracy": 0.42758620977401735, "step": 124455 }, { "epoch": 0.12535743386906867, "grad_norm": 12.713838081300732, "learning_rate": 4.930497816019009e-05, "loss": 2.7325, "mean_token_accuracy": 0.36551724672317504, "step": 124460 }, { "epoch": 0.12536246992217284, "grad_norm": 9.907453395459001, "learning_rate": 4.9304885742160445e-05, "loss": 2.0588, "mean_token_accuracy": 0.45517241954803467, "step": 124465 }, { "epoch": 0.12536750597527702, "grad_norm": 11.534492418789997, "learning_rate": 4.9304793318083135e-05, "loss": 2.1591, "mean_token_accuracy": 0.46009852886199953, "step": 124470 }, { "epoch": 0.1253725420283812, "grad_norm": 10.900154180569425, "learning_rate": 4.930470088795819e-05, "loss": 2.1837, "mean_token_accuracy": 0.42068964838981626, "step": 124475 }, { "epoch": 0.12537757808148536, "grad_norm": 10.898282998411263, "learning_rate": 4.930460845178562e-05, "loss": 2.7508, "mean_token_accuracy": 0.3827586233615875, "step": 124480 }, { "epoch": 0.12538261413458954, "grad_norm": 12.61184354425924, "learning_rate": 4.9304516009565474e-05, "loss": 2.4133, "mean_token_accuracy": 0.4034482777118683, "step": 124485 }, { "epoch": 0.1253876501876937, "grad_norm": 9.62049562423281, "learning_rate": 4.930442356129775e-05, "loss": 2.7313, "mean_token_accuracy": 0.3862069010734558, "step": 124490 }, { "epoch": 0.12539268624079788, "grad_norm": 9.27471690784353, "learning_rate": 4.930433110698249e-05, "loss": 2.2691, "mean_token_accuracy": 0.42934058904647826, "step": 124495 }, { "epoch": 0.12539772229390206, "grad_norm": 12.108371796427354, "learning_rate": 4.9304238646619726e-05, "loss": 2.2935, "mean_token_accuracy": 0.4551724076271057, "step": 124500 }, { "epoch": 0.1254027583470062, "grad_norm": 13.159421005456878, "learning_rate": 4.930414618020947e-05, "loss": 2.5133, "mean_token_accuracy": 0.4758620738983154, "step": 124505 }, { "epoch": 0.12540779440011038, "grad_norm": 10.628612301447237, "learning_rate": 4.9304053707751755e-05, "loss": 2.4069, "mean_token_accuracy": 0.44827587008476255, "step": 124510 }, { "epoch": 0.12541283045321455, "grad_norm": 11.661209873602834, "learning_rate": 4.93039612292466e-05, "loss": 2.2758, "mean_token_accuracy": 0.45704780220985414, "step": 124515 }, { "epoch": 0.12541786650631873, "grad_norm": 11.928862655309173, "learning_rate": 4.930386874469404e-05, "loss": 2.3238, "mean_token_accuracy": 0.43641863465309144, "step": 124520 }, { "epoch": 0.1254229025594229, "grad_norm": 11.161303972046165, "learning_rate": 4.930377625409409e-05, "loss": 2.8901, "mean_token_accuracy": 0.43448275327682495, "step": 124525 }, { "epoch": 0.12542793861252707, "grad_norm": 11.51545051422143, "learning_rate": 4.9303683757446786e-05, "loss": 2.3218, "mean_token_accuracy": 0.42758620381355283, "step": 124530 }, { "epoch": 0.12543297466563125, "grad_norm": 11.795054491219824, "learning_rate": 4.930359125475215e-05, "loss": 2.4242, "mean_token_accuracy": 0.41034482717514037, "step": 124535 }, { "epoch": 0.12543801071873542, "grad_norm": 11.97099122066195, "learning_rate": 4.93034987460102e-05, "loss": 2.6628, "mean_token_accuracy": 0.39999999701976774, "step": 124540 }, { "epoch": 0.1254430467718396, "grad_norm": 9.952526010113953, "learning_rate": 4.930340623122097e-05, "loss": 2.6825, "mean_token_accuracy": 0.3551724076271057, "step": 124545 }, { "epoch": 0.12544808282494377, "grad_norm": 9.958362861784732, "learning_rate": 4.930331371038449e-05, "loss": 2.396, "mean_token_accuracy": 0.4620689690113068, "step": 124550 }, { "epoch": 0.12545311887804794, "grad_norm": 9.934306306390218, "learning_rate": 4.9303221183500774e-05, "loss": 2.7908, "mean_token_accuracy": 0.3793103456497192, "step": 124555 }, { "epoch": 0.12545815493115212, "grad_norm": 14.938526734115726, "learning_rate": 4.930312865056985e-05, "loss": 2.8463, "mean_token_accuracy": 0.42758620977401735, "step": 124560 }, { "epoch": 0.1254631909842563, "grad_norm": 9.940380520462925, "learning_rate": 4.930303611159176e-05, "loss": 2.1645, "mean_token_accuracy": 0.44827585816383364, "step": 124565 }, { "epoch": 0.12546822703736046, "grad_norm": 11.730876966483073, "learning_rate": 4.9302943566566504e-05, "loss": 2.6275, "mean_token_accuracy": 0.3827586203813553, "step": 124570 }, { "epoch": 0.12547326309046464, "grad_norm": 10.734082551112081, "learning_rate": 4.930285101549413e-05, "loss": 2.6104, "mean_token_accuracy": 0.4, "step": 124575 }, { "epoch": 0.1254782991435688, "grad_norm": 11.250192412186836, "learning_rate": 4.930275845837465e-05, "loss": 2.4856, "mean_token_accuracy": 0.4724137783050537, "step": 124580 }, { "epoch": 0.12548333519667298, "grad_norm": 12.33069578576654, "learning_rate": 4.930266589520809e-05, "loss": 2.8329, "mean_token_accuracy": 0.4103448212146759, "step": 124585 }, { "epoch": 0.12548837124977716, "grad_norm": 11.16268165731229, "learning_rate": 4.930257332599448e-05, "loss": 2.4668, "mean_token_accuracy": 0.39999999701976774, "step": 124590 }, { "epoch": 0.12549340730288133, "grad_norm": 16.9342047945548, "learning_rate": 4.930248075073385e-05, "loss": 2.2973, "mean_token_accuracy": 0.43793103098869324, "step": 124595 }, { "epoch": 0.1254984433559855, "grad_norm": 8.674582059178801, "learning_rate": 4.930238816942621e-05, "loss": 2.1273, "mean_token_accuracy": 0.45862067937850953, "step": 124600 }, { "epoch": 0.12550347940908968, "grad_norm": 9.246244899952336, "learning_rate": 4.9302295582071606e-05, "loss": 2.3476, "mean_token_accuracy": 0.39310344457626345, "step": 124605 }, { "epoch": 0.12550851546219385, "grad_norm": 13.81039599773027, "learning_rate": 4.930220298867005e-05, "loss": 2.2275, "mean_token_accuracy": 0.4448275864124298, "step": 124610 }, { "epoch": 0.12551355151529803, "grad_norm": 10.68915342013985, "learning_rate": 4.930211038922157e-05, "loss": 2.5478, "mean_token_accuracy": 0.4156684875488281, "step": 124615 }, { "epoch": 0.1255185875684022, "grad_norm": 12.119654851182881, "learning_rate": 4.93020177837262e-05, "loss": 2.5325, "mean_token_accuracy": 0.43448275327682495, "step": 124620 }, { "epoch": 0.12552362362150637, "grad_norm": 10.375049626621571, "learning_rate": 4.9301925172183954e-05, "loss": 2.1615, "mean_token_accuracy": 0.46551724076271056, "step": 124625 }, { "epoch": 0.12552865967461055, "grad_norm": 7.42633046238009, "learning_rate": 4.930183255459486e-05, "loss": 2.3654, "mean_token_accuracy": 0.4551724076271057, "step": 124630 }, { "epoch": 0.12553369572771472, "grad_norm": 10.70101445203032, "learning_rate": 4.930173993095896e-05, "loss": 2.2136, "mean_token_accuracy": 0.42758620977401735, "step": 124635 }, { "epoch": 0.12553873178081887, "grad_norm": 11.388928004284875, "learning_rate": 4.9301647301276254e-05, "loss": 2.4813, "mean_token_accuracy": 0.4241379201412201, "step": 124640 }, { "epoch": 0.12554376783392304, "grad_norm": 14.501089363676828, "learning_rate": 4.9301554665546784e-05, "loss": 2.8524, "mean_token_accuracy": 0.4000000059604645, "step": 124645 }, { "epoch": 0.12554880388702722, "grad_norm": 12.210052158703052, "learning_rate": 4.9301462023770575e-05, "loss": 2.691, "mean_token_accuracy": 0.44482759237289426, "step": 124650 }, { "epoch": 0.1255538399401314, "grad_norm": 12.062600488128512, "learning_rate": 4.9301369375947636e-05, "loss": 2.8426, "mean_token_accuracy": 0.37241379618644715, "step": 124655 }, { "epoch": 0.12555887599323556, "grad_norm": 9.623788881445236, "learning_rate": 4.930127672207801e-05, "loss": 2.6817, "mean_token_accuracy": 0.41034482717514037, "step": 124660 }, { "epoch": 0.12556391204633974, "grad_norm": 10.865980085018004, "learning_rate": 4.930118406216173e-05, "loss": 2.1651, "mean_token_accuracy": 0.47931033968925474, "step": 124665 }, { "epoch": 0.1255689480994439, "grad_norm": 10.824042084364352, "learning_rate": 4.9301091396198804e-05, "loss": 2.1779, "mean_token_accuracy": 0.44827585816383364, "step": 124670 }, { "epoch": 0.12557398415254808, "grad_norm": 15.09230145508599, "learning_rate": 4.9300998724189266e-05, "loss": 2.3435, "mean_token_accuracy": 0.38620689511299133, "step": 124675 }, { "epoch": 0.12557902020565226, "grad_norm": 13.185264547451006, "learning_rate": 4.930090604613313e-05, "loss": 1.9523, "mean_token_accuracy": 0.48275861144065857, "step": 124680 }, { "epoch": 0.12558405625875643, "grad_norm": 14.11166633829688, "learning_rate": 4.930081336203044e-05, "loss": 2.3738, "mean_token_accuracy": 0.42413793206214906, "step": 124685 }, { "epoch": 0.1255890923118606, "grad_norm": 9.379666024803575, "learning_rate": 4.930072067188121e-05, "loss": 2.1793, "mean_token_accuracy": 0.48275862336158754, "step": 124690 }, { "epoch": 0.12559412836496478, "grad_norm": 11.276653192053859, "learning_rate": 4.930062797568547e-05, "loss": 2.5849, "mean_token_accuracy": 0.41724138259887694, "step": 124695 }, { "epoch": 0.12559916441806895, "grad_norm": 10.60472390318333, "learning_rate": 4.9300535273443244e-05, "loss": 2.2812, "mean_token_accuracy": 0.43793103098869324, "step": 124700 }, { "epoch": 0.12560420047117313, "grad_norm": 10.09012991853654, "learning_rate": 4.930044256515456e-05, "loss": 2.267, "mean_token_accuracy": 0.39310343861579894, "step": 124705 }, { "epoch": 0.1256092365242773, "grad_norm": 15.513915624820415, "learning_rate": 4.9300349850819436e-05, "loss": 2.5037, "mean_token_accuracy": 0.4517241418361664, "step": 124710 }, { "epoch": 0.12561427257738148, "grad_norm": 13.02613968089426, "learning_rate": 4.930025713043791e-05, "loss": 2.6672, "mean_token_accuracy": 0.3999999940395355, "step": 124715 }, { "epoch": 0.12561930863048565, "grad_norm": 9.649172974368424, "learning_rate": 4.930016440401e-05, "loss": 2.2826, "mean_token_accuracy": 0.482758629322052, "step": 124720 }, { "epoch": 0.12562434468358982, "grad_norm": 9.343040850393614, "learning_rate": 4.930007167153574e-05, "loss": 2.2532, "mean_token_accuracy": 0.44482758045196535, "step": 124725 }, { "epoch": 0.125629380736694, "grad_norm": 11.10436546194367, "learning_rate": 4.929997893301513e-05, "loss": 2.4143, "mean_token_accuracy": 0.4498487651348114, "step": 124730 }, { "epoch": 0.12563441678979817, "grad_norm": 10.163303957350657, "learning_rate": 4.929988618844823e-05, "loss": 2.4618, "mean_token_accuracy": 0.4206896543502808, "step": 124735 }, { "epoch": 0.12563945284290234, "grad_norm": 9.585957170919789, "learning_rate": 4.929979343783505e-05, "loss": 2.4401, "mean_token_accuracy": 0.42758620381355283, "step": 124740 }, { "epoch": 0.12564448889600652, "grad_norm": 14.754339119992299, "learning_rate": 4.929970068117561e-05, "loss": 2.5607, "mean_token_accuracy": 0.4275862157344818, "step": 124745 }, { "epoch": 0.1256495249491107, "grad_norm": 11.08677609217498, "learning_rate": 4.929960791846994e-05, "loss": 2.4467, "mean_token_accuracy": 0.43448275327682495, "step": 124750 }, { "epoch": 0.12565456100221487, "grad_norm": 10.362906917031145, "learning_rate": 4.929951514971807e-05, "loss": 2.3062, "mean_token_accuracy": 0.4172413766384125, "step": 124755 }, { "epoch": 0.12565959705531904, "grad_norm": 13.472080325072039, "learning_rate": 4.929942237492002e-05, "loss": 2.1709, "mean_token_accuracy": 0.4862068951129913, "step": 124760 }, { "epoch": 0.1256646331084232, "grad_norm": 17.291678219400506, "learning_rate": 4.929932959407583e-05, "loss": 2.9935, "mean_token_accuracy": 0.3482758641242981, "step": 124765 }, { "epoch": 0.1256696691615274, "grad_norm": 9.183559872479305, "learning_rate": 4.9299236807185504e-05, "loss": 2.1417, "mean_token_accuracy": 0.4620689690113068, "step": 124770 }, { "epoch": 0.12567470521463156, "grad_norm": 11.532906173862996, "learning_rate": 4.929914401424909e-05, "loss": 2.276, "mean_token_accuracy": 0.4724137902259827, "step": 124775 }, { "epoch": 0.1256797412677357, "grad_norm": 11.780900114022156, "learning_rate": 4.9299051215266596e-05, "loss": 2.226, "mean_token_accuracy": 0.41034482717514037, "step": 124780 }, { "epoch": 0.12568477732083988, "grad_norm": 9.762984920155311, "learning_rate": 4.929895841023805e-05, "loss": 2.2261, "mean_token_accuracy": 0.42758620381355283, "step": 124785 }, { "epoch": 0.12568981337394405, "grad_norm": 11.369020675362362, "learning_rate": 4.929886559916349e-05, "loss": 2.2596, "mean_token_accuracy": 0.42758620381355283, "step": 124790 }, { "epoch": 0.12569484942704823, "grad_norm": 10.077714583183026, "learning_rate": 4.929877278204292e-05, "loss": 2.2478, "mean_token_accuracy": 0.4458128035068512, "step": 124795 }, { "epoch": 0.1256998854801524, "grad_norm": 10.542518272892835, "learning_rate": 4.929867995887638e-05, "loss": 2.1669, "mean_token_accuracy": 0.4637023627758026, "step": 124800 }, { "epoch": 0.12570492153325658, "grad_norm": 11.245086223219726, "learning_rate": 4.929858712966391e-05, "loss": 2.3923, "mean_token_accuracy": 0.4793103337287903, "step": 124805 }, { "epoch": 0.12570995758636075, "grad_norm": 11.32158876893631, "learning_rate": 4.9298494294405504e-05, "loss": 2.251, "mean_token_accuracy": 0.48275862336158754, "step": 124810 }, { "epoch": 0.12571499363946492, "grad_norm": 9.263053814466303, "learning_rate": 4.929840145310121e-05, "loss": 3.0557, "mean_token_accuracy": 0.3551724076271057, "step": 124815 }, { "epoch": 0.1257200296925691, "grad_norm": 11.393053418786424, "learning_rate": 4.9298308605751056e-05, "loss": 2.505, "mean_token_accuracy": 0.4379310369491577, "step": 124820 }, { "epoch": 0.12572506574567327, "grad_norm": 10.776018376078913, "learning_rate": 4.9298215752355056e-05, "loss": 2.3024, "mean_token_accuracy": 0.42758620977401735, "step": 124825 }, { "epoch": 0.12573010179877744, "grad_norm": 10.63415756330727, "learning_rate": 4.9298122892913234e-05, "loss": 2.3193, "mean_token_accuracy": 0.47767695784568787, "step": 124830 }, { "epoch": 0.12573513785188162, "grad_norm": 10.115554482214815, "learning_rate": 4.929803002742562e-05, "loss": 2.3358, "mean_token_accuracy": 0.4275862157344818, "step": 124835 }, { "epoch": 0.1257401739049858, "grad_norm": 9.617740132383002, "learning_rate": 4.9297937155892256e-05, "loss": 2.2648, "mean_token_accuracy": 0.41530550122261045, "step": 124840 }, { "epoch": 0.12574520995808997, "grad_norm": 11.469696881723728, "learning_rate": 4.9297844278313134e-05, "loss": 2.6002, "mean_token_accuracy": 0.3931034505367279, "step": 124845 }, { "epoch": 0.12575024601119414, "grad_norm": 12.607824152460605, "learning_rate": 4.929775139468831e-05, "loss": 2.2668, "mean_token_accuracy": 0.4517241418361664, "step": 124850 }, { "epoch": 0.1257552820642983, "grad_norm": 10.04156455023881, "learning_rate": 4.929765850501779e-05, "loss": 2.303, "mean_token_accuracy": 0.47931033968925474, "step": 124855 }, { "epoch": 0.1257603181174025, "grad_norm": 13.204351552175593, "learning_rate": 4.9297565609301615e-05, "loss": 2.3704, "mean_token_accuracy": 0.3793103456497192, "step": 124860 }, { "epoch": 0.12576535417050666, "grad_norm": 14.151111459345502, "learning_rate": 4.9297472707539804e-05, "loss": 2.5039, "mean_token_accuracy": 0.43103447258472444, "step": 124865 }, { "epoch": 0.12577039022361083, "grad_norm": 10.287347106515607, "learning_rate": 4.929737979973238e-05, "loss": 2.6358, "mean_token_accuracy": 0.4401088833808899, "step": 124870 }, { "epoch": 0.125775426276715, "grad_norm": 9.898973397022498, "learning_rate": 4.929728688587937e-05, "loss": 2.1612, "mean_token_accuracy": 0.47241380214691164, "step": 124875 }, { "epoch": 0.12578046232981918, "grad_norm": 12.015950662774873, "learning_rate": 4.92971939659808e-05, "loss": 2.102, "mean_token_accuracy": 0.4655172348022461, "step": 124880 }, { "epoch": 0.12578549838292336, "grad_norm": 9.199675212517754, "learning_rate": 4.92971010400367e-05, "loss": 2.1101, "mean_token_accuracy": 0.45862067937850953, "step": 124885 }, { "epoch": 0.12579053443602753, "grad_norm": 10.71574399802233, "learning_rate": 4.92970081080471e-05, "loss": 2.2249, "mean_token_accuracy": 0.44482759237289426, "step": 124890 }, { "epoch": 0.1257955704891317, "grad_norm": 11.570246221300328, "learning_rate": 4.929691517001201e-05, "loss": 2.5286, "mean_token_accuracy": 0.3896551728248596, "step": 124895 }, { "epoch": 0.12580060654223588, "grad_norm": 12.693753011582569, "learning_rate": 4.929682222593147e-05, "loss": 2.3181, "mean_token_accuracy": 0.4448275864124298, "step": 124900 }, { "epoch": 0.12580564259534005, "grad_norm": 10.029822738281057, "learning_rate": 4.929672927580549e-05, "loss": 2.2635, "mean_token_accuracy": 0.42758620977401735, "step": 124905 }, { "epoch": 0.12581067864844422, "grad_norm": 11.466763209368024, "learning_rate": 4.929663631963411e-05, "loss": 2.0675, "mean_token_accuracy": 0.4551724135875702, "step": 124910 }, { "epoch": 0.1258157147015484, "grad_norm": 10.077935727122965, "learning_rate": 4.929654335741736e-05, "loss": 2.1194, "mean_token_accuracy": 0.48965516686439514, "step": 124915 }, { "epoch": 0.12582075075465254, "grad_norm": 12.88302234832316, "learning_rate": 4.929645038915525e-05, "loss": 2.7777, "mean_token_accuracy": 0.38620689511299133, "step": 124920 }, { "epoch": 0.12582578680775672, "grad_norm": 10.336974789921443, "learning_rate": 4.929635741484781e-05, "loss": 2.2853, "mean_token_accuracy": 0.46551724076271056, "step": 124925 }, { "epoch": 0.1258308228608609, "grad_norm": 10.71216876597711, "learning_rate": 4.929626443449507e-05, "loss": 2.6768, "mean_token_accuracy": 0.4413793087005615, "step": 124930 }, { "epoch": 0.12583585891396507, "grad_norm": 10.596842791236762, "learning_rate": 4.9296171448097056e-05, "loss": 2.4019, "mean_token_accuracy": 0.4758620738983154, "step": 124935 }, { "epoch": 0.12584089496706924, "grad_norm": 12.31550445280161, "learning_rate": 4.92960784556538e-05, "loss": 2.3664, "mean_token_accuracy": 0.4103448212146759, "step": 124940 }, { "epoch": 0.1258459310201734, "grad_norm": 9.933005697531689, "learning_rate": 4.9295985457165314e-05, "loss": 2.7421, "mean_token_accuracy": 0.4068965554237366, "step": 124945 }, { "epoch": 0.1258509670732776, "grad_norm": 10.145829888384782, "learning_rate": 4.929589245263163e-05, "loss": 2.7587, "mean_token_accuracy": 0.34137930870056155, "step": 124950 }, { "epoch": 0.12585600312638176, "grad_norm": 12.329540906777055, "learning_rate": 4.929579944205277e-05, "loss": 2.4852, "mean_token_accuracy": 0.420689657330513, "step": 124955 }, { "epoch": 0.12586103917948593, "grad_norm": 8.66417948719421, "learning_rate": 4.9295706425428775e-05, "loss": 2.1594, "mean_token_accuracy": 0.4448275864124298, "step": 124960 }, { "epoch": 0.1258660752325901, "grad_norm": 8.781055132942296, "learning_rate": 4.9295613402759645e-05, "loss": 2.0965, "mean_token_accuracy": 0.4689655065536499, "step": 124965 }, { "epoch": 0.12587111128569428, "grad_norm": 10.36706656518254, "learning_rate": 4.929552037404544e-05, "loss": 2.1067, "mean_token_accuracy": 0.4848154842853546, "step": 124970 }, { "epoch": 0.12587614733879846, "grad_norm": 9.672273847009492, "learning_rate": 4.929542733928615e-05, "loss": 2.553, "mean_token_accuracy": 0.41034482717514037, "step": 124975 }, { "epoch": 0.12588118339190263, "grad_norm": 10.864068710218195, "learning_rate": 4.9295334298481815e-05, "loss": 2.2643, "mean_token_accuracy": 0.4655172348022461, "step": 124980 }, { "epoch": 0.1258862194450068, "grad_norm": 10.403609574552823, "learning_rate": 4.9295241251632476e-05, "loss": 2.2046, "mean_token_accuracy": 0.441379314661026, "step": 124985 }, { "epoch": 0.12589125549811098, "grad_norm": 10.583532540482432, "learning_rate": 4.9295148198738134e-05, "loss": 2.5579, "mean_token_accuracy": 0.4137930989265442, "step": 124990 }, { "epoch": 0.12589629155121515, "grad_norm": 10.344399787512279, "learning_rate": 4.929505513979883e-05, "loss": 2.2059, "mean_token_accuracy": 0.4724137902259827, "step": 124995 }, { "epoch": 0.12590132760431932, "grad_norm": 8.62941402833875, "learning_rate": 4.929496207481459e-05, "loss": 2.9367, "mean_token_accuracy": 0.41034482717514037, "step": 125000 }, { "epoch": 0.1259063636574235, "grad_norm": 11.420862615541132, "learning_rate": 4.929486900378543e-05, "loss": 2.5274, "mean_token_accuracy": 0.4310344815254211, "step": 125005 }, { "epoch": 0.12591139971052767, "grad_norm": 9.742107056323428, "learning_rate": 4.929477592671138e-05, "loss": 1.8625, "mean_token_accuracy": 0.4845130205154419, "step": 125010 }, { "epoch": 0.12591643576363185, "grad_norm": 9.898257793177857, "learning_rate": 4.9294682843592474e-05, "loss": 2.3977, "mean_token_accuracy": 0.41034482717514037, "step": 125015 }, { "epoch": 0.12592147181673602, "grad_norm": 10.394117451096738, "learning_rate": 4.929458975442873e-05, "loss": 2.4513, "mean_token_accuracy": 0.4517241299152374, "step": 125020 }, { "epoch": 0.1259265078698402, "grad_norm": 14.143012275106507, "learning_rate": 4.929449665922017e-05, "loss": 2.4695, "mean_token_accuracy": 0.4620689630508423, "step": 125025 }, { "epoch": 0.12593154392294437, "grad_norm": 13.89995037920458, "learning_rate": 4.9294403557966835e-05, "loss": 2.5268, "mean_token_accuracy": 0.3999999940395355, "step": 125030 }, { "epoch": 0.12593657997604854, "grad_norm": 11.73473090263776, "learning_rate": 4.929431045066873e-05, "loss": 2.5304, "mean_token_accuracy": 0.3793103456497192, "step": 125035 }, { "epoch": 0.12594161602915271, "grad_norm": 14.821295776122207, "learning_rate": 4.92942173373259e-05, "loss": 2.8426, "mean_token_accuracy": 0.35172413289546967, "step": 125040 }, { "epoch": 0.1259466520822569, "grad_norm": 10.112050581830612, "learning_rate": 4.929412421793836e-05, "loss": 2.3848, "mean_token_accuracy": 0.3793103516101837, "step": 125045 }, { "epoch": 0.12595168813536106, "grad_norm": 10.959863838491733, "learning_rate": 4.929403109250614e-05, "loss": 2.4967, "mean_token_accuracy": 0.46551724076271056, "step": 125050 }, { "epoch": 0.12595672418846524, "grad_norm": 10.8585325509035, "learning_rate": 4.9293937961029264e-05, "loss": 2.3296, "mean_token_accuracy": 0.4034482777118683, "step": 125055 }, { "epoch": 0.12596176024156938, "grad_norm": 10.824946314020456, "learning_rate": 4.929384482350775e-05, "loss": 2.491, "mean_token_accuracy": 0.44482758045196535, "step": 125060 }, { "epoch": 0.12596679629467356, "grad_norm": 11.690886979181048, "learning_rate": 4.9293751679941636e-05, "loss": 2.3739, "mean_token_accuracy": 0.46382335424423216, "step": 125065 }, { "epoch": 0.12597183234777773, "grad_norm": 11.006663606558181, "learning_rate": 4.929365853033096e-05, "loss": 2.187, "mean_token_accuracy": 0.42413792610168455, "step": 125070 }, { "epoch": 0.1259768684008819, "grad_norm": 9.46632307608489, "learning_rate": 4.929356537467571e-05, "loss": 2.5671, "mean_token_accuracy": 0.42068966031074523, "step": 125075 }, { "epoch": 0.12598190445398608, "grad_norm": 12.498023763067543, "learning_rate": 4.929347221297594e-05, "loss": 2.6123, "mean_token_accuracy": 0.4034482777118683, "step": 125080 }, { "epoch": 0.12598694050709025, "grad_norm": 13.837864983827453, "learning_rate": 4.929337904523167e-05, "loss": 2.345, "mean_token_accuracy": 0.4379310429096222, "step": 125085 }, { "epoch": 0.12599197656019442, "grad_norm": 11.005831427290232, "learning_rate": 4.929328587144292e-05, "loss": 2.44, "mean_token_accuracy": 0.4068965554237366, "step": 125090 }, { "epoch": 0.1259970126132986, "grad_norm": 10.100789607065176, "learning_rate": 4.929319269160972e-05, "loss": 2.4461, "mean_token_accuracy": 0.41724138259887694, "step": 125095 }, { "epoch": 0.12600204866640277, "grad_norm": 12.183378943133926, "learning_rate": 4.929309950573211e-05, "loss": 2.639, "mean_token_accuracy": 0.44827585816383364, "step": 125100 }, { "epoch": 0.12600708471950695, "grad_norm": 10.780438410034664, "learning_rate": 4.9293006313810095e-05, "loss": 2.1722, "mean_token_accuracy": 0.4448275864124298, "step": 125105 }, { "epoch": 0.12601212077261112, "grad_norm": 10.16260196750849, "learning_rate": 4.929291311584371e-05, "loss": 2.1611, "mean_token_accuracy": 0.47586206793785096, "step": 125110 }, { "epoch": 0.1260171568257153, "grad_norm": 10.302701842205792, "learning_rate": 4.9292819911832976e-05, "loss": 2.5502, "mean_token_accuracy": 0.4172413766384125, "step": 125115 }, { "epoch": 0.12602219287881947, "grad_norm": 9.628048275734054, "learning_rate": 4.9292726701777924e-05, "loss": 2.144, "mean_token_accuracy": 0.4482758641242981, "step": 125120 }, { "epoch": 0.12602722893192364, "grad_norm": 8.94019705587569, "learning_rate": 4.9292633485678575e-05, "loss": 1.9248, "mean_token_accuracy": 0.5401088833808899, "step": 125125 }, { "epoch": 0.12603226498502781, "grad_norm": 10.65335797697744, "learning_rate": 4.929254026353496e-05, "loss": 2.6787, "mean_token_accuracy": 0.37931033968925476, "step": 125130 }, { "epoch": 0.126037301038132, "grad_norm": 10.931902697564423, "learning_rate": 4.929244703534711e-05, "loss": 2.6358, "mean_token_accuracy": 0.3965517282485962, "step": 125135 }, { "epoch": 0.12604233709123616, "grad_norm": 10.615660359355696, "learning_rate": 4.9292353801115034e-05, "loss": 2.5811, "mean_token_accuracy": 0.39310344457626345, "step": 125140 }, { "epoch": 0.12604737314434034, "grad_norm": 11.283569643290294, "learning_rate": 4.929226056083876e-05, "loss": 2.4803, "mean_token_accuracy": 0.42758620381355283, "step": 125145 }, { "epoch": 0.1260524091974445, "grad_norm": 21.173688337041085, "learning_rate": 4.929216731451834e-05, "loss": 2.5651, "mean_token_accuracy": 0.40344828069210054, "step": 125150 }, { "epoch": 0.12605744525054868, "grad_norm": 10.645529630885825, "learning_rate": 4.929207406215377e-05, "loss": 2.5582, "mean_token_accuracy": 0.4034482777118683, "step": 125155 }, { "epoch": 0.12606248130365286, "grad_norm": 10.613206518269015, "learning_rate": 4.92919808037451e-05, "loss": 2.6919, "mean_token_accuracy": 0.38275861740112305, "step": 125160 }, { "epoch": 0.12606751735675703, "grad_norm": 9.177729620095056, "learning_rate": 4.9291887539292324e-05, "loss": 2.2705, "mean_token_accuracy": 0.4586206912994385, "step": 125165 }, { "epoch": 0.1260725534098612, "grad_norm": 10.18131320489993, "learning_rate": 4.92917942687955e-05, "loss": 2.1232, "mean_token_accuracy": 0.4206896543502808, "step": 125170 }, { "epoch": 0.12607758946296538, "grad_norm": 9.84210706704598, "learning_rate": 4.9291700992254626e-05, "loss": 2.2631, "mean_token_accuracy": 0.43968542814254763, "step": 125175 }, { "epoch": 0.12608262551606955, "grad_norm": 12.288679026138501, "learning_rate": 4.9291607709669756e-05, "loss": 2.4131, "mean_token_accuracy": 0.38620689511299133, "step": 125180 }, { "epoch": 0.12608766156917373, "grad_norm": 9.6532293476981, "learning_rate": 4.929151442104089e-05, "loss": 2.3215, "mean_token_accuracy": 0.41379311084747317, "step": 125185 }, { "epoch": 0.1260926976222779, "grad_norm": 10.20791223464611, "learning_rate": 4.9291421126368084e-05, "loss": 2.0508, "mean_token_accuracy": 0.47586206197738645, "step": 125190 }, { "epoch": 0.12609773367538207, "grad_norm": 10.628761295181024, "learning_rate": 4.9291327825651336e-05, "loss": 2.5726, "mean_token_accuracy": 0.4310344815254211, "step": 125195 }, { "epoch": 0.12610276972848622, "grad_norm": 9.70843707323825, "learning_rate": 4.929123451889068e-05, "loss": 2.2057, "mean_token_accuracy": 0.44827585816383364, "step": 125200 }, { "epoch": 0.1261078057815904, "grad_norm": 8.989997491979045, "learning_rate": 4.929114120608615e-05, "loss": 2.0401, "mean_token_accuracy": 0.47241379618644713, "step": 125205 }, { "epoch": 0.12611284183469457, "grad_norm": 10.665395672430744, "learning_rate": 4.9291047887237754e-05, "loss": 2.4025, "mean_token_accuracy": 0.4310344815254211, "step": 125210 }, { "epoch": 0.12611787788779874, "grad_norm": 10.825138193270469, "learning_rate": 4.929095456234554e-05, "loss": 2.367, "mean_token_accuracy": 0.42758620977401735, "step": 125215 }, { "epoch": 0.12612291394090291, "grad_norm": 11.750569379245496, "learning_rate": 4.929086123140952e-05, "loss": 2.2446, "mean_token_accuracy": 0.4517241358757019, "step": 125220 }, { "epoch": 0.1261279499940071, "grad_norm": 10.331386419265293, "learning_rate": 4.929076789442973e-05, "loss": 2.1454, "mean_token_accuracy": 0.4551724076271057, "step": 125225 }, { "epoch": 0.12613298604711126, "grad_norm": 9.907021810398078, "learning_rate": 4.929067455140618e-05, "loss": 2.6215, "mean_token_accuracy": 0.4068965554237366, "step": 125230 }, { "epoch": 0.12613802210021544, "grad_norm": 9.801439438960116, "learning_rate": 4.929058120233891e-05, "loss": 2.8114, "mean_token_accuracy": 0.4034482777118683, "step": 125235 }, { "epoch": 0.1261430581533196, "grad_norm": 10.784841116248522, "learning_rate": 4.929048784722794e-05, "loss": 2.1726, "mean_token_accuracy": 0.47931033968925474, "step": 125240 }, { "epoch": 0.12614809420642378, "grad_norm": 10.47706261029374, "learning_rate": 4.9290394486073294e-05, "loss": 2.4178, "mean_token_accuracy": 0.44137930274009707, "step": 125245 }, { "epoch": 0.12615313025952796, "grad_norm": 9.606461496733194, "learning_rate": 4.9290301118875003e-05, "loss": 2.3324, "mean_token_accuracy": 0.4724137902259827, "step": 125250 }, { "epoch": 0.12615816631263213, "grad_norm": 11.004724308533243, "learning_rate": 4.929020774563309e-05, "loss": 2.591, "mean_token_accuracy": 0.358620697259903, "step": 125255 }, { "epoch": 0.1261632023657363, "grad_norm": 9.406589197659763, "learning_rate": 4.929011436634759e-05, "loss": 2.3462, "mean_token_accuracy": 0.4551724076271057, "step": 125260 }, { "epoch": 0.12616823841884048, "grad_norm": 16.60781137389467, "learning_rate": 4.929002098101851e-05, "loss": 2.735, "mean_token_accuracy": 0.3896551638841629, "step": 125265 }, { "epoch": 0.12617327447194465, "grad_norm": 11.919541753701893, "learning_rate": 4.9289927589645883e-05, "loss": 2.6369, "mean_token_accuracy": 0.4068965554237366, "step": 125270 }, { "epoch": 0.12617831052504883, "grad_norm": 11.030222231153909, "learning_rate": 4.928983419222975e-05, "loss": 2.2932, "mean_token_accuracy": 0.4551724076271057, "step": 125275 }, { "epoch": 0.126183346578153, "grad_norm": 9.509624470021091, "learning_rate": 4.928974078877012e-05, "loss": 2.5445, "mean_token_accuracy": 0.382758629322052, "step": 125280 }, { "epoch": 0.12618838263125717, "grad_norm": 9.999143345040915, "learning_rate": 4.9289647379267024e-05, "loss": 2.7804, "mean_token_accuracy": 0.4, "step": 125285 }, { "epoch": 0.12619341868436135, "grad_norm": 10.630657938917718, "learning_rate": 4.9289553963720485e-05, "loss": 2.4329, "mean_token_accuracy": 0.4206896543502808, "step": 125290 }, { "epoch": 0.12619845473746552, "grad_norm": 10.706565739502814, "learning_rate": 4.9289460542130536e-05, "loss": 2.504, "mean_token_accuracy": 0.3827586114406586, "step": 125295 }, { "epoch": 0.1262034907905697, "grad_norm": 11.87256073857122, "learning_rate": 4.92893671144972e-05, "loss": 2.3704, "mean_token_accuracy": 0.4000000059604645, "step": 125300 }, { "epoch": 0.12620852684367387, "grad_norm": 10.824801603564477, "learning_rate": 4.9289273680820494e-05, "loss": 2.3819, "mean_token_accuracy": 0.4068965494632721, "step": 125305 }, { "epoch": 0.12621356289677804, "grad_norm": 15.291171512413726, "learning_rate": 4.9289180241100455e-05, "loss": 2.7953, "mean_token_accuracy": 0.41379310488700866, "step": 125310 }, { "epoch": 0.12621859894988222, "grad_norm": 8.183961806346213, "learning_rate": 4.9289086795337114e-05, "loss": 2.1634, "mean_token_accuracy": 0.43793103098869324, "step": 125315 }, { "epoch": 0.1262236350029864, "grad_norm": 9.322994671475339, "learning_rate": 4.9288993343530484e-05, "loss": 2.2329, "mean_token_accuracy": 0.43793103098869324, "step": 125320 }, { "epoch": 0.12622867105609056, "grad_norm": 12.536703364082399, "learning_rate": 4.9288899885680586e-05, "loss": 2.631, "mean_token_accuracy": 0.42758620381355283, "step": 125325 }, { "epoch": 0.12623370710919474, "grad_norm": 13.493922756640167, "learning_rate": 4.928880642178747e-05, "loss": 2.5095, "mean_token_accuracy": 0.4344827592372894, "step": 125330 }, { "epoch": 0.1262387431622989, "grad_norm": 11.094542147196364, "learning_rate": 4.928871295185114e-05, "loss": 2.5548, "mean_token_accuracy": 0.39655172228813174, "step": 125335 }, { "epoch": 0.12624377921540306, "grad_norm": 8.895421205279188, "learning_rate": 4.928861947587163e-05, "loss": 2.5806, "mean_token_accuracy": 0.42068964838981626, "step": 125340 }, { "epoch": 0.12624881526850723, "grad_norm": 9.610954266568175, "learning_rate": 4.928852599384896e-05, "loss": 2.5595, "mean_token_accuracy": 0.4344827592372894, "step": 125345 }, { "epoch": 0.1262538513216114, "grad_norm": 10.962175878236517, "learning_rate": 4.9288432505783175e-05, "loss": 2.0971, "mean_token_accuracy": 0.47586206197738645, "step": 125350 }, { "epoch": 0.12625888737471558, "grad_norm": 14.625286193677272, "learning_rate": 4.9288339011674276e-05, "loss": 2.6845, "mean_token_accuracy": 0.37241379022598264, "step": 125355 }, { "epoch": 0.12626392342781975, "grad_norm": 12.327264770028144, "learning_rate": 4.92882455115223e-05, "loss": 2.7291, "mean_token_accuracy": 0.38620689511299133, "step": 125360 }, { "epoch": 0.12626895948092393, "grad_norm": 11.019882851579643, "learning_rate": 4.928815200532727e-05, "loss": 2.4473, "mean_token_accuracy": 0.4379310369491577, "step": 125365 }, { "epoch": 0.1262739955340281, "grad_norm": 10.90551572072697, "learning_rate": 4.928805849308922e-05, "loss": 2.2083, "mean_token_accuracy": 0.4724137902259827, "step": 125370 }, { "epoch": 0.12627903158713227, "grad_norm": 8.654918445428313, "learning_rate": 4.928796497480817e-05, "loss": 2.2839, "mean_token_accuracy": 0.4551724135875702, "step": 125375 }, { "epoch": 0.12628406764023645, "grad_norm": 14.226194401493888, "learning_rate": 4.9287871450484146e-05, "loss": 2.368, "mean_token_accuracy": 0.4172413766384125, "step": 125380 }, { "epoch": 0.12628910369334062, "grad_norm": 9.501190271952192, "learning_rate": 4.9287777920117176e-05, "loss": 2.5419, "mean_token_accuracy": 0.42068964838981626, "step": 125385 }, { "epoch": 0.1262941397464448, "grad_norm": 14.654793063213871, "learning_rate": 4.928768438370728e-05, "loss": 2.5638, "mean_token_accuracy": 0.441379314661026, "step": 125390 }, { "epoch": 0.12629917579954897, "grad_norm": 7.9908159369372385, "learning_rate": 4.9287590841254493e-05, "loss": 2.2217, "mean_token_accuracy": 0.4517241358757019, "step": 125395 }, { "epoch": 0.12630421185265314, "grad_norm": 11.148710354915991, "learning_rate": 4.9287497292758836e-05, "loss": 2.4963, "mean_token_accuracy": 0.4186932921409607, "step": 125400 }, { "epoch": 0.12630924790575732, "grad_norm": 10.61996935053498, "learning_rate": 4.928740373822034e-05, "loss": 2.538, "mean_token_accuracy": 0.4468844473361969, "step": 125405 }, { "epoch": 0.1263142839588615, "grad_norm": 11.302727946872293, "learning_rate": 4.928731017763902e-05, "loss": 1.9627, "mean_token_accuracy": 0.44646098613739016, "step": 125410 }, { "epoch": 0.12631932001196566, "grad_norm": 12.82476906672126, "learning_rate": 4.92872166110149e-05, "loss": 2.3222, "mean_token_accuracy": 0.4413793087005615, "step": 125415 }, { "epoch": 0.12632435606506984, "grad_norm": 10.035710143545849, "learning_rate": 4.928712303834803e-05, "loss": 2.4011, "mean_token_accuracy": 0.43103447556495667, "step": 125420 }, { "epoch": 0.126329392118174, "grad_norm": 10.420871992617379, "learning_rate": 4.9287029459638414e-05, "loss": 2.2528, "mean_token_accuracy": 0.4288566291332245, "step": 125425 }, { "epoch": 0.12633442817127818, "grad_norm": 11.096351566011899, "learning_rate": 4.928693587488608e-05, "loss": 2.3015, "mean_token_accuracy": 0.46551724672317507, "step": 125430 }, { "epoch": 0.12633946422438236, "grad_norm": 14.962162675219886, "learning_rate": 4.928684228409107e-05, "loss": 2.6576, "mean_token_accuracy": 0.37241379022598264, "step": 125435 }, { "epoch": 0.12634450027748653, "grad_norm": 9.744584065333749, "learning_rate": 4.9286748687253384e-05, "loss": 2.0041, "mean_token_accuracy": 0.4778325021266937, "step": 125440 }, { "epoch": 0.1263495363305907, "grad_norm": 9.188978448825495, "learning_rate": 4.928665508437307e-05, "loss": 2.1169, "mean_token_accuracy": 0.4896551728248596, "step": 125445 }, { "epoch": 0.12635457238369488, "grad_norm": 9.679030618989241, "learning_rate": 4.928656147545014e-05, "loss": 2.2549, "mean_token_accuracy": 0.4367211163043976, "step": 125450 }, { "epoch": 0.12635960843679905, "grad_norm": 9.980158435699172, "learning_rate": 4.928646786048463e-05, "loss": 2.3197, "mean_token_accuracy": 0.46358137130737304, "step": 125455 }, { "epoch": 0.12636464448990323, "grad_norm": 10.314676378958742, "learning_rate": 4.9286374239476555e-05, "loss": 2.7281, "mean_token_accuracy": 0.4034482777118683, "step": 125460 }, { "epoch": 0.1263696805430074, "grad_norm": 11.940209851895457, "learning_rate": 4.9286280612425964e-05, "loss": 2.5419, "mean_token_accuracy": 0.39655172228813174, "step": 125465 }, { "epoch": 0.12637471659611157, "grad_norm": 11.328375333734817, "learning_rate": 4.928618697933286e-05, "loss": 2.5356, "mean_token_accuracy": 0.36896551549434664, "step": 125470 }, { "epoch": 0.12637975264921575, "grad_norm": 9.765939095629442, "learning_rate": 4.928609334019727e-05, "loss": 2.39, "mean_token_accuracy": 0.482758617401123, "step": 125475 }, { "epoch": 0.1263847887023199, "grad_norm": 11.57498601297852, "learning_rate": 4.928599969501922e-05, "loss": 2.0353, "mean_token_accuracy": 0.4931034445762634, "step": 125480 }, { "epoch": 0.12638982475542407, "grad_norm": 12.448634704061009, "learning_rate": 4.928590604379875e-05, "loss": 2.6564, "mean_token_accuracy": 0.4275862157344818, "step": 125485 }, { "epoch": 0.12639486080852824, "grad_norm": 14.055194407238002, "learning_rate": 4.9285812386535884e-05, "loss": 2.3279, "mean_token_accuracy": 0.4448275864124298, "step": 125490 }, { "epoch": 0.12639989686163242, "grad_norm": 11.505836065007093, "learning_rate": 4.9285718723230636e-05, "loss": 2.6071, "mean_token_accuracy": 0.4247537016868591, "step": 125495 }, { "epoch": 0.1264049329147366, "grad_norm": 11.697771985600228, "learning_rate": 4.928562505388304e-05, "loss": 2.153, "mean_token_accuracy": 0.43103448748588563, "step": 125500 }, { "epoch": 0.12640996896784076, "grad_norm": 9.868091003841489, "learning_rate": 4.928553137849311e-05, "loss": 2.6435, "mean_token_accuracy": 0.3689655244350433, "step": 125505 }, { "epoch": 0.12641500502094494, "grad_norm": 11.493527042101217, "learning_rate": 4.928543769706089e-05, "loss": 2.6694, "mean_token_accuracy": 0.4103448152542114, "step": 125510 }, { "epoch": 0.1264200410740491, "grad_norm": 9.167007892768117, "learning_rate": 4.92853440095864e-05, "loss": 2.2183, "mean_token_accuracy": 0.46551724076271056, "step": 125515 }, { "epoch": 0.12642507712715328, "grad_norm": 21.044905122756113, "learning_rate": 4.928525031606966e-05, "loss": 2.4057, "mean_token_accuracy": 0.42758620977401735, "step": 125520 }, { "epoch": 0.12643011318025746, "grad_norm": 9.62231005158025, "learning_rate": 4.9285156616510694e-05, "loss": 2.578, "mean_token_accuracy": 0.4413793087005615, "step": 125525 }, { "epoch": 0.12643514923336163, "grad_norm": 9.952977222937193, "learning_rate": 4.928506291090954e-05, "loss": 2.5167, "mean_token_accuracy": 0.4379310369491577, "step": 125530 }, { "epoch": 0.1264401852864658, "grad_norm": 11.778850765784465, "learning_rate": 4.928496919926622e-05, "loss": 2.8928, "mean_token_accuracy": 0.37241379022598264, "step": 125535 }, { "epoch": 0.12644522133956998, "grad_norm": 11.751936236388296, "learning_rate": 4.928487548158075e-05, "loss": 2.6896, "mean_token_accuracy": 0.3999999940395355, "step": 125540 }, { "epoch": 0.12645025739267415, "grad_norm": 11.833655397496972, "learning_rate": 4.928478175785317e-05, "loss": 2.681, "mean_token_accuracy": 0.39310344457626345, "step": 125545 }, { "epoch": 0.12645529344577833, "grad_norm": 8.958547432045819, "learning_rate": 4.928468802808349e-05, "loss": 1.9739, "mean_token_accuracy": 0.5034482836723327, "step": 125550 }, { "epoch": 0.1264603294988825, "grad_norm": 11.790643284422298, "learning_rate": 4.928459429227176e-05, "loss": 2.6511, "mean_token_accuracy": 0.334482753276825, "step": 125555 }, { "epoch": 0.12646536555198667, "grad_norm": 12.113374873844297, "learning_rate": 4.9284500550417986e-05, "loss": 2.6445, "mean_token_accuracy": 0.4, "step": 125560 }, { "epoch": 0.12647040160509085, "grad_norm": 9.572315590352302, "learning_rate": 4.9284406802522194e-05, "loss": 2.5127, "mean_token_accuracy": 0.4379310369491577, "step": 125565 }, { "epoch": 0.12647543765819502, "grad_norm": 17.7582823476348, "learning_rate": 4.928431304858442e-05, "loss": 3.2016, "mean_token_accuracy": 0.3999999940395355, "step": 125570 }, { "epoch": 0.1264804737112992, "grad_norm": 9.815793387767407, "learning_rate": 4.928421928860468e-05, "loss": 2.3778, "mean_token_accuracy": 0.4551724135875702, "step": 125575 }, { "epoch": 0.12648550976440337, "grad_norm": 11.28466485145402, "learning_rate": 4.928412552258301e-05, "loss": 2.3067, "mean_token_accuracy": 0.48275861144065857, "step": 125580 }, { "epoch": 0.12649054581750754, "grad_norm": 12.078761570883694, "learning_rate": 4.9284031750519434e-05, "loss": 2.2491, "mean_token_accuracy": 0.44827585816383364, "step": 125585 }, { "epoch": 0.12649558187061172, "grad_norm": 10.881890212445043, "learning_rate": 4.928393797241397e-05, "loss": 2.3827, "mean_token_accuracy": 0.4344827592372894, "step": 125590 }, { "epoch": 0.1265006179237159, "grad_norm": 10.144265724688271, "learning_rate": 4.9283844188266657e-05, "loss": 2.2691, "mean_token_accuracy": 0.47586206793785096, "step": 125595 }, { "epoch": 0.12650565397682006, "grad_norm": 9.992885385037306, "learning_rate": 4.9283750398077507e-05, "loss": 2.138, "mean_token_accuracy": 0.4517241418361664, "step": 125600 }, { "epoch": 0.12651069002992424, "grad_norm": 11.102061507671314, "learning_rate": 4.928365660184656e-05, "loss": 2.3777, "mean_token_accuracy": 0.3896551728248596, "step": 125605 }, { "epoch": 0.1265157260830284, "grad_norm": 10.846652540957106, "learning_rate": 4.9283562799573826e-05, "loss": 2.3766, "mean_token_accuracy": 0.46551724076271056, "step": 125610 }, { "epoch": 0.1265207621361326, "grad_norm": 10.072721302397126, "learning_rate": 4.9283468991259344e-05, "loss": 2.5078, "mean_token_accuracy": 0.42413792610168455, "step": 125615 }, { "epoch": 0.12652579818923673, "grad_norm": 11.758240459043593, "learning_rate": 4.9283375176903124e-05, "loss": 2.3474, "mean_token_accuracy": 0.4, "step": 125620 }, { "epoch": 0.1265308342423409, "grad_norm": 10.5553537837584, "learning_rate": 4.9283281356505215e-05, "loss": 2.0944, "mean_token_accuracy": 0.44482758045196535, "step": 125625 }, { "epoch": 0.12653587029544508, "grad_norm": 20.73416552728234, "learning_rate": 4.9283187530065636e-05, "loss": 2.3979, "mean_token_accuracy": 0.3896551728248596, "step": 125630 }, { "epoch": 0.12654090634854925, "grad_norm": 12.130696116556242, "learning_rate": 4.92830936975844e-05, "loss": 2.6901, "mean_token_accuracy": 0.42758620977401735, "step": 125635 }, { "epoch": 0.12654594240165343, "grad_norm": 11.034498458822736, "learning_rate": 4.928299985906155e-05, "loss": 2.3368, "mean_token_accuracy": 0.4310344934463501, "step": 125640 }, { "epoch": 0.1265509784547576, "grad_norm": 10.19478699878888, "learning_rate": 4.9282906014497096e-05, "loss": 2.0181, "mean_token_accuracy": 0.4862068951129913, "step": 125645 }, { "epoch": 0.12655601450786177, "grad_norm": 10.351644092984687, "learning_rate": 4.928281216389107e-05, "loss": 2.1446, "mean_token_accuracy": 0.5034482836723327, "step": 125650 }, { "epoch": 0.12656105056096595, "grad_norm": 9.809617944629604, "learning_rate": 4.9282718307243506e-05, "loss": 2.2823, "mean_token_accuracy": 0.46560150384902954, "step": 125655 }, { "epoch": 0.12656608661407012, "grad_norm": 10.746901483546806, "learning_rate": 4.928262444455443e-05, "loss": 2.2662, "mean_token_accuracy": 0.4310344815254211, "step": 125660 }, { "epoch": 0.1265711226671743, "grad_norm": 10.798428574052325, "learning_rate": 4.928253057582385e-05, "loss": 2.5561, "mean_token_accuracy": 0.43793103098869324, "step": 125665 }, { "epoch": 0.12657615872027847, "grad_norm": 10.845569390647395, "learning_rate": 4.9282436701051806e-05, "loss": 2.5372, "mean_token_accuracy": 0.42068966031074523, "step": 125670 }, { "epoch": 0.12658119477338264, "grad_norm": 10.442403129634714, "learning_rate": 4.928234282023832e-05, "loss": 2.5139, "mean_token_accuracy": 0.41379310488700866, "step": 125675 }, { "epoch": 0.12658623082648682, "grad_norm": 9.695632224657807, "learning_rate": 4.928224893338342e-05, "loss": 1.9068, "mean_token_accuracy": 0.5310344696044922, "step": 125680 }, { "epoch": 0.126591266879591, "grad_norm": 11.866102981580292, "learning_rate": 4.928215504048714e-05, "loss": 2.6371, "mean_token_accuracy": 0.42413793206214906, "step": 125685 }, { "epoch": 0.12659630293269516, "grad_norm": 14.304503777762402, "learning_rate": 4.92820611415495e-05, "loss": 2.6484, "mean_token_accuracy": 0.43309134244918823, "step": 125690 }, { "epoch": 0.12660133898579934, "grad_norm": 10.683574373781973, "learning_rate": 4.928196723657051e-05, "loss": 2.1932, "mean_token_accuracy": 0.4344827592372894, "step": 125695 }, { "epoch": 0.1266063750389035, "grad_norm": 9.647674244008142, "learning_rate": 4.928187332555023e-05, "loss": 2.3392, "mean_token_accuracy": 0.45517241954803467, "step": 125700 }, { "epoch": 0.1266114110920077, "grad_norm": 11.951268303104852, "learning_rate": 4.928177940848865e-05, "loss": 2.3583, "mean_token_accuracy": 0.41034482419490814, "step": 125705 }, { "epoch": 0.12661644714511186, "grad_norm": 11.867609282460064, "learning_rate": 4.928168548538581e-05, "loss": 2.466, "mean_token_accuracy": 0.36551723480224607, "step": 125710 }, { "epoch": 0.12662148319821603, "grad_norm": 10.59241334516758, "learning_rate": 4.9281591556241754e-05, "loss": 2.4093, "mean_token_accuracy": 0.4034482777118683, "step": 125715 }, { "epoch": 0.1266265192513202, "grad_norm": 9.858919147885919, "learning_rate": 4.9281497621056474e-05, "loss": 2.2759, "mean_token_accuracy": 0.42413792610168455, "step": 125720 }, { "epoch": 0.12663155530442438, "grad_norm": 12.625832124887527, "learning_rate": 4.9281403679830026e-05, "loss": 2.8458, "mean_token_accuracy": 0.41034482717514037, "step": 125725 }, { "epoch": 0.12663659135752856, "grad_norm": 10.127678075054163, "learning_rate": 4.928130973256242e-05, "loss": 2.0577, "mean_token_accuracy": 0.46551724076271056, "step": 125730 }, { "epoch": 0.12664162741063273, "grad_norm": 13.32731949070172, "learning_rate": 4.9281215779253695e-05, "loss": 3.0282, "mean_token_accuracy": 0.40000000298023225, "step": 125735 }, { "epoch": 0.1266466634637369, "grad_norm": 11.92925612436529, "learning_rate": 4.928112181990386e-05, "loss": 2.4612, "mean_token_accuracy": 0.41034482717514037, "step": 125740 }, { "epoch": 0.12665169951684108, "grad_norm": 15.30487666368016, "learning_rate": 4.9281027854512945e-05, "loss": 2.6929, "mean_token_accuracy": 0.43448275327682495, "step": 125745 }, { "epoch": 0.12665673556994525, "grad_norm": 8.888813138442623, "learning_rate": 4.928093388308099e-05, "loss": 2.357, "mean_token_accuracy": 0.42758620977401735, "step": 125750 }, { "epoch": 0.12666177162304942, "grad_norm": 11.469065745541219, "learning_rate": 4.928083990560801e-05, "loss": 2.2921, "mean_token_accuracy": 0.44137930274009707, "step": 125755 }, { "epoch": 0.12666680767615357, "grad_norm": 9.732309707326875, "learning_rate": 4.928074592209403e-05, "loss": 2.5668, "mean_token_accuracy": 0.3965517282485962, "step": 125760 }, { "epoch": 0.12667184372925774, "grad_norm": 12.14781775355426, "learning_rate": 4.928065193253908e-05, "loss": 2.7583, "mean_token_accuracy": 0.41034482717514037, "step": 125765 }, { "epoch": 0.12667687978236192, "grad_norm": 9.858763586687843, "learning_rate": 4.928055793694319e-05, "loss": 2.2642, "mean_token_accuracy": 0.4813672125339508, "step": 125770 }, { "epoch": 0.1266819158354661, "grad_norm": 10.476294704595, "learning_rate": 4.928046393530637e-05, "loss": 2.4076, "mean_token_accuracy": 0.44482757449150084, "step": 125775 }, { "epoch": 0.12668695188857027, "grad_norm": 9.384607881273848, "learning_rate": 4.9280369927628666e-05, "loss": 2.2854, "mean_token_accuracy": 0.47241379618644713, "step": 125780 }, { "epoch": 0.12669198794167444, "grad_norm": 10.989054703167026, "learning_rate": 4.928027591391009e-05, "loss": 2.4529, "mean_token_accuracy": 0.43103447556495667, "step": 125785 }, { "epoch": 0.1266970239947786, "grad_norm": 11.787620489354616, "learning_rate": 4.928018189415068e-05, "loss": 2.2382, "mean_token_accuracy": 0.4517241418361664, "step": 125790 }, { "epoch": 0.1267020600478828, "grad_norm": 13.057123464911285, "learning_rate": 4.928008786835045e-05, "loss": 2.4118, "mean_token_accuracy": 0.4483968555927277, "step": 125795 }, { "epoch": 0.12670709610098696, "grad_norm": 11.90658399725102, "learning_rate": 4.9279993836509434e-05, "loss": 2.5213, "mean_token_accuracy": 0.46751360297203065, "step": 125800 }, { "epoch": 0.12671213215409113, "grad_norm": 11.202323943168969, "learning_rate": 4.9279899798627657e-05, "loss": 3.0484, "mean_token_accuracy": 0.42413793206214906, "step": 125805 }, { "epoch": 0.1267171682071953, "grad_norm": 11.892098534929213, "learning_rate": 4.927980575470513e-05, "loss": 2.4668, "mean_token_accuracy": 0.3896551728248596, "step": 125810 }, { "epoch": 0.12672220426029948, "grad_norm": 10.765266469873094, "learning_rate": 4.927971170474191e-05, "loss": 2.5185, "mean_token_accuracy": 0.4241379380226135, "step": 125815 }, { "epoch": 0.12672724031340366, "grad_norm": 11.553713110870778, "learning_rate": 4.9279617648738e-05, "loss": 2.1007, "mean_token_accuracy": 0.482758617401123, "step": 125820 }, { "epoch": 0.12673227636650783, "grad_norm": 9.135975429448365, "learning_rate": 4.9279523586693425e-05, "loss": 1.9383, "mean_token_accuracy": 0.4965517222881317, "step": 125825 }, { "epoch": 0.126737312419612, "grad_norm": 11.53238773394369, "learning_rate": 4.927942951860823e-05, "loss": 2.6553, "mean_token_accuracy": 0.4172413766384125, "step": 125830 }, { "epoch": 0.12674234847271618, "grad_norm": 11.28761532049412, "learning_rate": 4.927933544448242e-05, "loss": 2.4189, "mean_token_accuracy": 0.47586206793785096, "step": 125835 }, { "epoch": 0.12674738452582035, "grad_norm": 9.666054691713788, "learning_rate": 4.927924136431603e-05, "loss": 2.0141, "mean_token_accuracy": 0.517241382598877, "step": 125840 }, { "epoch": 0.12675242057892452, "grad_norm": 10.877253332283415, "learning_rate": 4.9279147278109095e-05, "loss": 2.3106, "mean_token_accuracy": 0.4103448212146759, "step": 125845 }, { "epoch": 0.1267574566320287, "grad_norm": 10.64344925255087, "learning_rate": 4.927905318586162e-05, "loss": 2.3265, "mean_token_accuracy": 0.4206896543502808, "step": 125850 }, { "epoch": 0.12676249268513287, "grad_norm": 12.002022163480463, "learning_rate": 4.9278959087573654e-05, "loss": 2.4491, "mean_token_accuracy": 0.3896551728248596, "step": 125855 }, { "epoch": 0.12676752873823705, "grad_norm": 10.491001765712781, "learning_rate": 4.9278864983245205e-05, "loss": 2.3777, "mean_token_accuracy": 0.4172413766384125, "step": 125860 }, { "epoch": 0.12677256479134122, "grad_norm": 8.231952688544519, "learning_rate": 4.927877087287631e-05, "loss": 2.2266, "mean_token_accuracy": 0.4620689570903778, "step": 125865 }, { "epoch": 0.1267776008444454, "grad_norm": 12.772871663660506, "learning_rate": 4.927867675646699e-05, "loss": 2.5991, "mean_token_accuracy": 0.3689655214548111, "step": 125870 }, { "epoch": 0.12678263689754957, "grad_norm": 10.57896307688069, "learning_rate": 4.927858263401727e-05, "loss": 2.2857, "mean_token_accuracy": 0.42758620977401735, "step": 125875 }, { "epoch": 0.12678767295065374, "grad_norm": 9.537771282310928, "learning_rate": 4.9278488505527186e-05, "loss": 2.2944, "mean_token_accuracy": 0.47241379618644713, "step": 125880 }, { "epoch": 0.12679270900375791, "grad_norm": 12.42032507431532, "learning_rate": 4.927839437099675e-05, "loss": 2.7396, "mean_token_accuracy": 0.4068965494632721, "step": 125885 }, { "epoch": 0.1267977450568621, "grad_norm": 11.367172733771271, "learning_rate": 4.927830023042601e-05, "loss": 2.7015, "mean_token_accuracy": 0.38620689511299133, "step": 125890 }, { "epoch": 0.12680278110996626, "grad_norm": 8.617554411637004, "learning_rate": 4.927820608381496e-05, "loss": 2.1608, "mean_token_accuracy": 0.4586206912994385, "step": 125895 }, { "epoch": 0.1268078171630704, "grad_norm": 12.765817488545972, "learning_rate": 4.927811193116364e-05, "loss": 2.8886, "mean_token_accuracy": 0.3758620709180832, "step": 125900 }, { "epoch": 0.12681285321617458, "grad_norm": 12.779562712700741, "learning_rate": 4.927801777247209e-05, "loss": 1.9414, "mean_token_accuracy": 0.5331518352031708, "step": 125905 }, { "epoch": 0.12681788926927876, "grad_norm": 10.848155771989825, "learning_rate": 4.9277923607740326e-05, "loss": 2.7266, "mean_token_accuracy": 0.3931034505367279, "step": 125910 }, { "epoch": 0.12682292532238293, "grad_norm": 15.294449732672954, "learning_rate": 4.927782943696838e-05, "loss": 2.2415, "mean_token_accuracy": 0.46551724076271056, "step": 125915 }, { "epoch": 0.1268279613754871, "grad_norm": 10.61165736207434, "learning_rate": 4.927773526015626e-05, "loss": 2.274, "mean_token_accuracy": 0.4780399203300476, "step": 125920 }, { "epoch": 0.12683299742859128, "grad_norm": 10.924617714631845, "learning_rate": 4.9277641077304004e-05, "loss": 2.7693, "mean_token_accuracy": 0.44295220375061034, "step": 125925 }, { "epoch": 0.12683803348169545, "grad_norm": 8.537161209579299, "learning_rate": 4.927754688841164e-05, "loss": 1.8594, "mean_token_accuracy": 0.5310344874858857, "step": 125930 }, { "epoch": 0.12684306953479962, "grad_norm": 10.52313436943759, "learning_rate": 4.9277452693479195e-05, "loss": 2.4275, "mean_token_accuracy": 0.441379314661026, "step": 125935 }, { "epoch": 0.1268481055879038, "grad_norm": 11.018736252574792, "learning_rate": 4.927735849250669e-05, "loss": 2.6516, "mean_token_accuracy": 0.4137930989265442, "step": 125940 }, { "epoch": 0.12685314164100797, "grad_norm": 11.05401076865327, "learning_rate": 4.927726428549416e-05, "loss": 2.5095, "mean_token_accuracy": 0.43448275327682495, "step": 125945 }, { "epoch": 0.12685817769411215, "grad_norm": 10.578709637466218, "learning_rate": 4.927717007244161e-05, "loss": 2.379, "mean_token_accuracy": 0.4862069010734558, "step": 125950 }, { "epoch": 0.12686321374721632, "grad_norm": 9.160598917720971, "learning_rate": 4.927707585334909e-05, "loss": 2.3359, "mean_token_accuracy": 0.4275862008333206, "step": 125955 }, { "epoch": 0.1268682498003205, "grad_norm": 12.49224326408125, "learning_rate": 4.9276981628216626e-05, "loss": 2.8845, "mean_token_accuracy": 0.3655172407627106, "step": 125960 }, { "epoch": 0.12687328585342467, "grad_norm": 9.7104464524469, "learning_rate": 4.927688739704423e-05, "loss": 2.0831, "mean_token_accuracy": 0.47931034564971925, "step": 125965 }, { "epoch": 0.12687832190652884, "grad_norm": 11.040785223097275, "learning_rate": 4.927679315983192e-05, "loss": 2.6082, "mean_token_accuracy": 0.43103448748588563, "step": 125970 }, { "epoch": 0.12688335795963301, "grad_norm": 12.631956779654836, "learning_rate": 4.9276698916579736e-05, "loss": 2.0813, "mean_token_accuracy": 0.5, "step": 125975 }, { "epoch": 0.1268883940127372, "grad_norm": 10.699209321960875, "learning_rate": 4.927660466728772e-05, "loss": 2.5479, "mean_token_accuracy": 0.4413793087005615, "step": 125980 }, { "epoch": 0.12689343006584136, "grad_norm": 11.853747468741728, "learning_rate": 4.9276510411955865e-05, "loss": 2.1796, "mean_token_accuracy": 0.4551724135875702, "step": 125985 }, { "epoch": 0.12689846611894554, "grad_norm": 11.635591701468837, "learning_rate": 4.927641615058422e-05, "loss": 2.0917, "mean_token_accuracy": 0.48965516686439514, "step": 125990 }, { "epoch": 0.1269035021720497, "grad_norm": 12.007231326699971, "learning_rate": 4.927632188317281e-05, "loss": 2.4904, "mean_token_accuracy": 0.43448275327682495, "step": 125995 }, { "epoch": 0.12690853822515388, "grad_norm": 10.520497902007655, "learning_rate": 4.9276227609721646e-05, "loss": 2.2964, "mean_token_accuracy": 0.4689655125141144, "step": 126000 }, { "epoch": 0.12691357427825806, "grad_norm": 15.20270641096983, "learning_rate": 4.9276133330230774e-05, "loss": 2.5535, "mean_token_accuracy": 0.42758620977401735, "step": 126005 }, { "epoch": 0.12691861033136223, "grad_norm": 9.32013585696496, "learning_rate": 4.9276039044700206e-05, "loss": 2.5769, "mean_token_accuracy": 0.4310344815254211, "step": 126010 }, { "epoch": 0.1269236463844664, "grad_norm": 11.859055222617217, "learning_rate": 4.9275944753129974e-05, "loss": 2.4756, "mean_token_accuracy": 0.4172413766384125, "step": 126015 }, { "epoch": 0.12692868243757058, "grad_norm": 8.500413637966444, "learning_rate": 4.927585045552009e-05, "loss": 2.1482, "mean_token_accuracy": 0.46551724076271056, "step": 126020 }, { "epoch": 0.12693371849067475, "grad_norm": 10.795854196155297, "learning_rate": 4.927575615187061e-05, "loss": 2.2955, "mean_token_accuracy": 0.4310344815254211, "step": 126025 }, { "epoch": 0.12693875454377893, "grad_norm": 11.676153816267155, "learning_rate": 4.9275661842181535e-05, "loss": 2.6862, "mean_token_accuracy": 0.41034482717514037, "step": 126030 }, { "epoch": 0.1269437905968831, "grad_norm": 10.209753627457854, "learning_rate": 4.92755675264529e-05, "loss": 2.5684, "mean_token_accuracy": 0.4103448212146759, "step": 126035 }, { "epoch": 0.12694882664998725, "grad_norm": 9.752392978485547, "learning_rate": 4.927547320468473e-05, "loss": 2.8925, "mean_token_accuracy": 0.3379310339689255, "step": 126040 }, { "epoch": 0.12695386270309142, "grad_norm": 11.263508757835163, "learning_rate": 4.927537887687705e-05, "loss": 2.2759, "mean_token_accuracy": 0.4482758641242981, "step": 126045 }, { "epoch": 0.1269588987561956, "grad_norm": 11.100882632609176, "learning_rate": 4.927528454302989e-05, "loss": 2.34, "mean_token_accuracy": 0.4034482717514038, "step": 126050 }, { "epoch": 0.12696393480929977, "grad_norm": 10.708809656733335, "learning_rate": 4.927519020314327e-05, "loss": 1.9983, "mean_token_accuracy": 0.4862069070339203, "step": 126055 }, { "epoch": 0.12696897086240394, "grad_norm": 9.836202796063645, "learning_rate": 4.927509585721722e-05, "loss": 2.183, "mean_token_accuracy": 0.46382336020469667, "step": 126060 }, { "epoch": 0.12697400691550811, "grad_norm": 13.299203499541346, "learning_rate": 4.927500150525177e-05, "loss": 3.0111, "mean_token_accuracy": 0.3931034505367279, "step": 126065 }, { "epoch": 0.1269790429686123, "grad_norm": 11.356182211108925, "learning_rate": 4.927490714724694e-05, "loss": 2.5998, "mean_token_accuracy": 0.36896551251411436, "step": 126070 }, { "epoch": 0.12698407902171646, "grad_norm": 10.975556575741773, "learning_rate": 4.927481278320276e-05, "loss": 2.3455, "mean_token_accuracy": 0.4379310429096222, "step": 126075 }, { "epoch": 0.12698911507482064, "grad_norm": 13.378187000547074, "learning_rate": 4.927471841311925e-05, "loss": 2.3857, "mean_token_accuracy": 0.4517241299152374, "step": 126080 }, { "epoch": 0.1269941511279248, "grad_norm": 12.900145296320545, "learning_rate": 4.9274624036996444e-05, "loss": 2.591, "mean_token_accuracy": 0.39655172228813174, "step": 126085 }, { "epoch": 0.12699918718102898, "grad_norm": 10.271377444815963, "learning_rate": 4.9274529654834365e-05, "loss": 2.9004, "mean_token_accuracy": 0.3655172437429428, "step": 126090 }, { "epoch": 0.12700422323413316, "grad_norm": 9.310323407261366, "learning_rate": 4.927443526663304e-05, "loss": 2.3656, "mean_token_accuracy": 0.4551724135875702, "step": 126095 }, { "epoch": 0.12700925928723733, "grad_norm": 11.23852980567283, "learning_rate": 4.927434087239249e-05, "loss": 2.2402, "mean_token_accuracy": 0.4329098641872406, "step": 126100 }, { "epoch": 0.1270142953403415, "grad_norm": 11.554499275540394, "learning_rate": 4.927424647211274e-05, "loss": 3.3686, "mean_token_accuracy": 0.3482758581638336, "step": 126105 }, { "epoch": 0.12701933139344568, "grad_norm": 12.759030625814802, "learning_rate": 4.927415206579384e-05, "loss": 2.4493, "mean_token_accuracy": 0.3896551728248596, "step": 126110 }, { "epoch": 0.12702436744654985, "grad_norm": 8.250148875832812, "learning_rate": 4.927405765343579e-05, "loss": 2.4197, "mean_token_accuracy": 0.4310344815254211, "step": 126115 }, { "epoch": 0.12702940349965403, "grad_norm": 9.495345883736034, "learning_rate": 4.9273963235038614e-05, "loss": 2.4691, "mean_token_accuracy": 0.4310344815254211, "step": 126120 }, { "epoch": 0.1270344395527582, "grad_norm": 10.927679756824796, "learning_rate": 4.927386881060236e-05, "loss": 2.5678, "mean_token_accuracy": 0.39655172228813174, "step": 126125 }, { "epoch": 0.12703947560586237, "grad_norm": 11.57393533609862, "learning_rate": 4.927377438012703e-05, "loss": 2.4389, "mean_token_accuracy": 0.4413793087005615, "step": 126130 }, { "epoch": 0.12704451165896655, "grad_norm": 11.680836305810228, "learning_rate": 4.927367994361267e-05, "loss": 2.9498, "mean_token_accuracy": 0.4172413766384125, "step": 126135 }, { "epoch": 0.12704954771207072, "grad_norm": 8.774422747857063, "learning_rate": 4.927358550105929e-05, "loss": 2.2867, "mean_token_accuracy": 0.4586206912994385, "step": 126140 }, { "epoch": 0.1270545837651749, "grad_norm": 9.592288354276853, "learning_rate": 4.9273491052466946e-05, "loss": 2.5243, "mean_token_accuracy": 0.4034482717514038, "step": 126145 }, { "epoch": 0.12705961981827907, "grad_norm": 10.222573659945471, "learning_rate": 4.927339659783563e-05, "loss": 2.3097, "mean_token_accuracy": 0.4620689690113068, "step": 126150 }, { "epoch": 0.12706465587138324, "grad_norm": 7.7184142181124855, "learning_rate": 4.927330213716537e-05, "loss": 2.2712, "mean_token_accuracy": 0.441379314661026, "step": 126155 }, { "epoch": 0.12706969192448742, "grad_norm": 8.746859491145443, "learning_rate": 4.927320767045622e-05, "loss": 2.3161, "mean_token_accuracy": 0.4551724076271057, "step": 126160 }, { "epoch": 0.1270747279775916, "grad_norm": 10.099152115326639, "learning_rate": 4.927311319770818e-05, "loss": 1.9294, "mean_token_accuracy": 0.5115763545036316, "step": 126165 }, { "epoch": 0.12707976403069576, "grad_norm": 12.579689390553792, "learning_rate": 4.927301871892129e-05, "loss": 2.7328, "mean_token_accuracy": 0.4, "step": 126170 }, { "epoch": 0.12708480008379994, "grad_norm": 10.084073752844548, "learning_rate": 4.927292423409557e-05, "loss": 2.2837, "mean_token_accuracy": 0.4172413766384125, "step": 126175 }, { "epoch": 0.12708983613690408, "grad_norm": 12.628416672666575, "learning_rate": 4.927282974323105e-05, "loss": 2.4772, "mean_token_accuracy": 0.42068966627120974, "step": 126180 }, { "epoch": 0.12709487219000826, "grad_norm": 11.732637109279727, "learning_rate": 4.927273524632776e-05, "loss": 2.5584, "mean_token_accuracy": 0.37241379022598264, "step": 126185 }, { "epoch": 0.12709990824311243, "grad_norm": 11.225813804845815, "learning_rate": 4.9272640743385716e-05, "loss": 2.4805, "mean_token_accuracy": 0.39655172526836396, "step": 126190 }, { "epoch": 0.1271049442962166, "grad_norm": 7.941056891899211, "learning_rate": 4.927254623440495e-05, "loss": 2.2199, "mean_token_accuracy": 0.47096188068389894, "step": 126195 }, { "epoch": 0.12710998034932078, "grad_norm": 8.890275672568116, "learning_rate": 4.927245171938549e-05, "loss": 2.1447, "mean_token_accuracy": 0.46412582993507384, "step": 126200 }, { "epoch": 0.12711501640242495, "grad_norm": 9.62779900760923, "learning_rate": 4.9272357198327345e-05, "loss": 2.0407, "mean_token_accuracy": 0.47803992629051206, "step": 126205 }, { "epoch": 0.12712005245552913, "grad_norm": 12.390036725574676, "learning_rate": 4.927226267123057e-05, "loss": 2.6304, "mean_token_accuracy": 0.41379310488700866, "step": 126210 }, { "epoch": 0.1271250885086333, "grad_norm": 11.363172950723811, "learning_rate": 4.927216813809517e-05, "loss": 2.5063, "mean_token_accuracy": 0.44827585220336913, "step": 126215 }, { "epoch": 0.12713012456173747, "grad_norm": 10.612965705980761, "learning_rate": 4.927207359892117e-05, "loss": 2.4224, "mean_token_accuracy": 0.37241379022598264, "step": 126220 }, { "epoch": 0.12713516061484165, "grad_norm": 10.421657051158945, "learning_rate": 4.9271979053708626e-05, "loss": 2.668, "mean_token_accuracy": 0.4034482717514038, "step": 126225 }, { "epoch": 0.12714019666794582, "grad_norm": 13.430643635519768, "learning_rate": 4.9271884502457526e-05, "loss": 2.3038, "mean_token_accuracy": 0.4137930989265442, "step": 126230 }, { "epoch": 0.12714523272105, "grad_norm": 11.092229422373455, "learning_rate": 4.927178994516792e-05, "loss": 2.4333, "mean_token_accuracy": 0.3965517163276672, "step": 126235 }, { "epoch": 0.12715026877415417, "grad_norm": 12.403814249020611, "learning_rate": 4.927169538183982e-05, "loss": 2.1577, "mean_token_accuracy": 0.4551724076271057, "step": 126240 }, { "epoch": 0.12715530482725834, "grad_norm": 9.75285635893045, "learning_rate": 4.927160081247327e-05, "loss": 2.3119, "mean_token_accuracy": 0.4551724135875702, "step": 126245 }, { "epoch": 0.12716034088036252, "grad_norm": 13.412757811935899, "learning_rate": 4.927150623706827e-05, "loss": 2.4692, "mean_token_accuracy": 0.458620685338974, "step": 126250 }, { "epoch": 0.1271653769334667, "grad_norm": 8.598081282392796, "learning_rate": 4.9271411655624874e-05, "loss": 2.879, "mean_token_accuracy": 0.3793103516101837, "step": 126255 }, { "epoch": 0.12717041298657086, "grad_norm": 9.855782686600524, "learning_rate": 4.927131706814309e-05, "loss": 2.8979, "mean_token_accuracy": 0.39310345351696013, "step": 126260 }, { "epoch": 0.12717544903967504, "grad_norm": 9.936308898051733, "learning_rate": 4.9271222474622956e-05, "loss": 2.0479, "mean_token_accuracy": 0.47931034564971925, "step": 126265 }, { "epoch": 0.1271804850927792, "grad_norm": 10.129650455500053, "learning_rate": 4.927112787506448e-05, "loss": 2.0105, "mean_token_accuracy": 0.4882637679576874, "step": 126270 }, { "epoch": 0.12718552114588338, "grad_norm": 11.483971175291135, "learning_rate": 4.9271033269467716e-05, "loss": 2.5026, "mean_token_accuracy": 0.42068966031074523, "step": 126275 }, { "epoch": 0.12719055719898756, "grad_norm": 10.138688224848085, "learning_rate": 4.927093865783266e-05, "loss": 2.5921, "mean_token_accuracy": 0.41379310488700866, "step": 126280 }, { "epoch": 0.12719559325209173, "grad_norm": 9.909470183148706, "learning_rate": 4.927084404015936e-05, "loss": 2.4916, "mean_token_accuracy": 0.41379310488700866, "step": 126285 }, { "epoch": 0.1272006293051959, "grad_norm": 10.510144087976744, "learning_rate": 4.927074941644784e-05, "loss": 2.2887, "mean_token_accuracy": 0.48275862336158754, "step": 126290 }, { "epoch": 0.12720566535830008, "grad_norm": 10.479268481324404, "learning_rate": 4.927065478669811e-05, "loss": 2.4535, "mean_token_accuracy": 0.41379310488700866, "step": 126295 }, { "epoch": 0.12721070141140425, "grad_norm": 8.061558553294134, "learning_rate": 4.9270560150910215e-05, "loss": 2.1166, "mean_token_accuracy": 0.5051421761512757, "step": 126300 }, { "epoch": 0.12721573746450843, "grad_norm": 9.899583273039967, "learning_rate": 4.9270465509084175e-05, "loss": 1.7906, "mean_token_accuracy": 0.5068965494632721, "step": 126305 }, { "epoch": 0.1272207735176126, "grad_norm": 10.573630394345253, "learning_rate": 4.927037086122001e-05, "loss": 2.4518, "mean_token_accuracy": 0.41034482717514037, "step": 126310 }, { "epoch": 0.12722580957071677, "grad_norm": 10.924930949289928, "learning_rate": 4.9270276207317754e-05, "loss": 2.7127, "mean_token_accuracy": 0.37241379022598264, "step": 126315 }, { "epoch": 0.12723084562382092, "grad_norm": 9.901593932786334, "learning_rate": 4.9270181547377433e-05, "loss": 2.664, "mean_token_accuracy": 0.4206896543502808, "step": 126320 }, { "epoch": 0.1272358816769251, "grad_norm": 9.58202496628284, "learning_rate": 4.927008688139907e-05, "loss": 2.4454, "mean_token_accuracy": 0.43448275327682495, "step": 126325 }, { "epoch": 0.12724091773002927, "grad_norm": 11.105729245350968, "learning_rate": 4.9269992209382695e-05, "loss": 2.5745, "mean_token_accuracy": 0.38112522661685944, "step": 126330 }, { "epoch": 0.12724595378313344, "grad_norm": 10.55666765294462, "learning_rate": 4.926989753132833e-05, "loss": 2.2971, "mean_token_accuracy": 0.42758620977401735, "step": 126335 }, { "epoch": 0.12725098983623762, "grad_norm": 11.385150936882395, "learning_rate": 4.926980284723599e-05, "loss": 2.1933, "mean_token_accuracy": 0.4482758641242981, "step": 126340 }, { "epoch": 0.1272560258893418, "grad_norm": 8.739827855266293, "learning_rate": 4.926970815710573e-05, "loss": 2.2255, "mean_token_accuracy": 0.44827585816383364, "step": 126345 }, { "epoch": 0.12726106194244596, "grad_norm": 9.726817323603358, "learning_rate": 4.926961346093756e-05, "loss": 2.2247, "mean_token_accuracy": 0.4517241358757019, "step": 126350 }, { "epoch": 0.12726609799555014, "grad_norm": 12.34747528232104, "learning_rate": 4.92695187587315e-05, "loss": 2.5598, "mean_token_accuracy": 0.41034482717514037, "step": 126355 }, { "epoch": 0.1272711340486543, "grad_norm": 8.80877887276747, "learning_rate": 4.926942405048758e-05, "loss": 2.334, "mean_token_accuracy": 0.4344827592372894, "step": 126360 }, { "epoch": 0.12727617010175848, "grad_norm": 11.165485289462055, "learning_rate": 4.926932933620584e-05, "loss": 2.2251, "mean_token_accuracy": 0.43103448748588563, "step": 126365 }, { "epoch": 0.12728120615486266, "grad_norm": 10.46804713058949, "learning_rate": 4.926923461588628e-05, "loss": 2.6067, "mean_token_accuracy": 0.4294615924358368, "step": 126370 }, { "epoch": 0.12728624220796683, "grad_norm": 10.98617950668715, "learning_rate": 4.926913988952895e-05, "loss": 2.3449, "mean_token_accuracy": 0.4517241358757019, "step": 126375 }, { "epoch": 0.127291278261071, "grad_norm": 10.645373637133252, "learning_rate": 4.926904515713387e-05, "loss": 2.4209, "mean_token_accuracy": 0.48814277052879335, "step": 126380 }, { "epoch": 0.12729631431417518, "grad_norm": 12.310859565002135, "learning_rate": 4.9268950418701066e-05, "loss": 2.2172, "mean_token_accuracy": 0.4793103337287903, "step": 126385 }, { "epoch": 0.12730135036727935, "grad_norm": 10.674351654876757, "learning_rate": 4.926885567423055e-05, "loss": 2.4508, "mean_token_accuracy": 0.4275861978530884, "step": 126390 }, { "epoch": 0.12730638642038353, "grad_norm": 15.355195517702839, "learning_rate": 4.926876092372237e-05, "loss": 2.5824, "mean_token_accuracy": 0.42068964838981626, "step": 126395 }, { "epoch": 0.1273114224734877, "grad_norm": 9.354645539103366, "learning_rate": 4.9268666167176547e-05, "loss": 2.2881, "mean_token_accuracy": 0.4724137902259827, "step": 126400 }, { "epoch": 0.12731645852659187, "grad_norm": 10.575320321669027, "learning_rate": 4.926857140459309e-05, "loss": 1.8653, "mean_token_accuracy": 0.5206896603107453, "step": 126405 }, { "epoch": 0.12732149457969605, "grad_norm": 10.266682275879177, "learning_rate": 4.9268476635972054e-05, "loss": 2.7302, "mean_token_accuracy": 0.3931034505367279, "step": 126410 }, { "epoch": 0.12732653063280022, "grad_norm": 10.495592614842995, "learning_rate": 4.926838186131344e-05, "loss": 2.1113, "mean_token_accuracy": 0.4881427764892578, "step": 126415 }, { "epoch": 0.1273315666859044, "grad_norm": 10.83245621046197, "learning_rate": 4.9268287080617284e-05, "loss": 2.3692, "mean_token_accuracy": 0.4206896543502808, "step": 126420 }, { "epoch": 0.12733660273900857, "grad_norm": 10.577581259228142, "learning_rate": 4.926819229388362e-05, "loss": 2.2124, "mean_token_accuracy": 0.4379310369491577, "step": 126425 }, { "epoch": 0.12734163879211274, "grad_norm": 11.524831788194373, "learning_rate": 4.926809750111246e-05, "loss": 2.6795, "mean_token_accuracy": 0.4241379380226135, "step": 126430 }, { "epoch": 0.12734667484521692, "grad_norm": 9.555904086448859, "learning_rate": 4.926800270230383e-05, "loss": 2.4417, "mean_token_accuracy": 0.44827585220336913, "step": 126435 }, { "epoch": 0.1273517108983211, "grad_norm": 10.767659136283019, "learning_rate": 4.926790789745778e-05, "loss": 2.5106, "mean_token_accuracy": 0.42068964838981626, "step": 126440 }, { "epoch": 0.12735674695142526, "grad_norm": 10.637516448783286, "learning_rate": 4.926781308657431e-05, "loss": 2.7729, "mean_token_accuracy": 0.3827586233615875, "step": 126445 }, { "epoch": 0.12736178300452944, "grad_norm": 13.003034501984684, "learning_rate": 4.926771826965346e-05, "loss": 2.4798, "mean_token_accuracy": 0.4103448212146759, "step": 126450 }, { "epoch": 0.1273668190576336, "grad_norm": 10.043682963341322, "learning_rate": 4.9267623446695244e-05, "loss": 2.5028, "mean_token_accuracy": 0.44137930274009707, "step": 126455 }, { "epoch": 0.12737185511073776, "grad_norm": 9.288098370214811, "learning_rate": 4.92675286176997e-05, "loss": 2.4132, "mean_token_accuracy": 0.48275862336158754, "step": 126460 }, { "epoch": 0.12737689116384193, "grad_norm": 15.561254059001973, "learning_rate": 4.926743378266686e-05, "loss": 2.3064, "mean_token_accuracy": 0.4620689630508423, "step": 126465 }, { "epoch": 0.1273819272169461, "grad_norm": 10.657211722902776, "learning_rate": 4.926733894159673e-05, "loss": 2.5582, "mean_token_accuracy": 0.3915305495262146, "step": 126470 }, { "epoch": 0.12738696327005028, "grad_norm": 10.108846251623126, "learning_rate": 4.9267244094489346e-05, "loss": 2.3001, "mean_token_accuracy": 0.4482758641242981, "step": 126475 }, { "epoch": 0.12739199932315445, "grad_norm": 9.68266235903544, "learning_rate": 4.926714924134475e-05, "loss": 2.7632, "mean_token_accuracy": 0.40344826579093934, "step": 126480 }, { "epoch": 0.12739703537625863, "grad_norm": 8.813745223752033, "learning_rate": 4.926705438216294e-05, "loss": 2.3415, "mean_token_accuracy": 0.4448275864124298, "step": 126485 }, { "epoch": 0.1274020714293628, "grad_norm": 11.138610458056178, "learning_rate": 4.926695951694396e-05, "loss": 2.4439, "mean_token_accuracy": 0.4206896543502808, "step": 126490 }, { "epoch": 0.12740710748246697, "grad_norm": 12.766977686041347, "learning_rate": 4.926686464568783e-05, "loss": 2.697, "mean_token_accuracy": 0.4379310250282288, "step": 126495 }, { "epoch": 0.12741214353557115, "grad_norm": 9.7252725679758, "learning_rate": 4.926676976839459e-05, "loss": 2.0266, "mean_token_accuracy": 0.4793103516101837, "step": 126500 }, { "epoch": 0.12741717958867532, "grad_norm": 13.430226307482679, "learning_rate": 4.926667488506425e-05, "loss": 2.2601, "mean_token_accuracy": 0.43448275327682495, "step": 126505 }, { "epoch": 0.1274222156417795, "grad_norm": 9.432532705921734, "learning_rate": 4.9266579995696834e-05, "loss": 2.1758, "mean_token_accuracy": 0.49655171632766726, "step": 126510 }, { "epoch": 0.12742725169488367, "grad_norm": 17.202132188727933, "learning_rate": 4.926648510029237e-05, "loss": 2.3422, "mean_token_accuracy": 0.4379310369491577, "step": 126515 }, { "epoch": 0.12743228774798784, "grad_norm": 10.708867121674936, "learning_rate": 4.926639019885091e-05, "loss": 2.379, "mean_token_accuracy": 0.42068966031074523, "step": 126520 }, { "epoch": 0.12743732380109202, "grad_norm": 8.986516714934993, "learning_rate": 4.926629529137245e-05, "loss": 2.1604, "mean_token_accuracy": 0.4586206912994385, "step": 126525 }, { "epoch": 0.1274423598541962, "grad_norm": 8.583356398084181, "learning_rate": 4.9266200377857026e-05, "loss": 2.1423, "mean_token_accuracy": 0.5160919666290283, "step": 126530 }, { "epoch": 0.12744739590730036, "grad_norm": 11.232109148260191, "learning_rate": 4.9266105458304665e-05, "loss": 2.1981, "mean_token_accuracy": 0.5034482717514038, "step": 126535 }, { "epoch": 0.12745243196040454, "grad_norm": 11.077759855187457, "learning_rate": 4.92660105327154e-05, "loss": 2.2887, "mean_token_accuracy": 0.41034482717514037, "step": 126540 }, { "epoch": 0.1274574680135087, "grad_norm": 11.363657292745028, "learning_rate": 4.9265915601089246e-05, "loss": 2.6995, "mean_token_accuracy": 0.4034482777118683, "step": 126545 }, { "epoch": 0.12746250406661289, "grad_norm": 10.641174029367203, "learning_rate": 4.926582066342623e-05, "loss": 2.5183, "mean_token_accuracy": 0.41034482717514037, "step": 126550 }, { "epoch": 0.12746754011971706, "grad_norm": 10.112679768361234, "learning_rate": 4.926572571972638e-05, "loss": 2.3635, "mean_token_accuracy": 0.41034482717514037, "step": 126555 }, { "epoch": 0.12747257617282123, "grad_norm": 8.072071754023415, "learning_rate": 4.9265630769989744e-05, "loss": 1.9905, "mean_token_accuracy": 0.46551724076271056, "step": 126560 }, { "epoch": 0.1274776122259254, "grad_norm": 8.702500347073224, "learning_rate": 4.926553581421631e-05, "loss": 2.2123, "mean_token_accuracy": 0.4793103337287903, "step": 126565 }, { "epoch": 0.12748264827902958, "grad_norm": 13.993208313287704, "learning_rate": 4.926544085240614e-05, "loss": 2.4542, "mean_token_accuracy": 0.4482758641242981, "step": 126570 }, { "epoch": 0.12748768433213375, "grad_norm": 10.31942398466338, "learning_rate": 4.9265345884559225e-05, "loss": 2.2075, "mean_token_accuracy": 0.4620689690113068, "step": 126575 }, { "epoch": 0.12749272038523793, "grad_norm": 13.51433793724202, "learning_rate": 4.926525091067562e-05, "loss": 2.8164, "mean_token_accuracy": 0.42758620381355283, "step": 126580 }, { "epoch": 0.1274977564383421, "grad_norm": 10.586864729474502, "learning_rate": 4.926515593075534e-05, "loss": 2.2226, "mean_token_accuracy": 0.4896551787853241, "step": 126585 }, { "epoch": 0.12750279249144628, "grad_norm": 10.929018940348854, "learning_rate": 4.926506094479842e-05, "loss": 2.3437, "mean_token_accuracy": 0.4310344815254211, "step": 126590 }, { "epoch": 0.12750782854455045, "grad_norm": 9.495089533920298, "learning_rate": 4.926496595280487e-05, "loss": 1.7149, "mean_token_accuracy": 0.4948578417301178, "step": 126595 }, { "epoch": 0.1275128645976546, "grad_norm": 9.64151516723201, "learning_rate": 4.9264870954774726e-05, "loss": 1.9267, "mean_token_accuracy": 0.5310344874858857, "step": 126600 }, { "epoch": 0.12751790065075877, "grad_norm": 8.772590219093928, "learning_rate": 4.926477595070802e-05, "loss": 2.3717, "mean_token_accuracy": 0.4119782209396362, "step": 126605 }, { "epoch": 0.12752293670386294, "grad_norm": 12.856256732661985, "learning_rate": 4.9264680940604766e-05, "loss": 2.4908, "mean_token_accuracy": 0.4172413766384125, "step": 126610 }, { "epoch": 0.12752797275696712, "grad_norm": 12.366155219429201, "learning_rate": 4.9264585924464994e-05, "loss": 2.1575, "mean_token_accuracy": 0.4620689630508423, "step": 126615 }, { "epoch": 0.1275330088100713, "grad_norm": 8.931818705571665, "learning_rate": 4.9264490902288743e-05, "loss": 2.5921, "mean_token_accuracy": 0.3531155467033386, "step": 126620 }, { "epoch": 0.12753804486317546, "grad_norm": 10.560233131796643, "learning_rate": 4.926439587407603e-05, "loss": 2.3733, "mean_token_accuracy": 0.441379314661026, "step": 126625 }, { "epoch": 0.12754308091627964, "grad_norm": 11.466093898814218, "learning_rate": 4.926430083982687e-05, "loss": 2.2441, "mean_token_accuracy": 0.42280701398849485, "step": 126630 }, { "epoch": 0.1275481169693838, "grad_norm": 11.523090129326764, "learning_rate": 4.926420579954131e-05, "loss": 2.2938, "mean_token_accuracy": 0.43793103098869324, "step": 126635 }, { "epoch": 0.127553153022488, "grad_norm": 8.508769059063956, "learning_rate": 4.926411075321935e-05, "loss": 2.1088, "mean_token_accuracy": 0.4793103516101837, "step": 126640 }, { "epoch": 0.12755818907559216, "grad_norm": 10.607255769094087, "learning_rate": 4.926401570086105e-05, "loss": 2.3691, "mean_token_accuracy": 0.44137930274009707, "step": 126645 }, { "epoch": 0.12756322512869633, "grad_norm": 10.100091149557151, "learning_rate": 4.926392064246641e-05, "loss": 2.369, "mean_token_accuracy": 0.4344827592372894, "step": 126650 }, { "epoch": 0.1275682611818005, "grad_norm": 13.376922680986132, "learning_rate": 4.9263825578035466e-05, "loss": 2.8752, "mean_token_accuracy": 0.4, "step": 126655 }, { "epoch": 0.12757329723490468, "grad_norm": 11.641374353681552, "learning_rate": 4.926373050756825e-05, "loss": 2.5629, "mean_token_accuracy": 0.4379310250282288, "step": 126660 }, { "epoch": 0.12757833328800885, "grad_norm": 8.761074886129105, "learning_rate": 4.926363543106477e-05, "loss": 1.8578, "mean_token_accuracy": 0.5101266205310822, "step": 126665 }, { "epoch": 0.12758336934111303, "grad_norm": 9.25209989931402, "learning_rate": 4.926354034852507e-05, "loss": 2.2834, "mean_token_accuracy": 0.44670296609401705, "step": 126670 }, { "epoch": 0.1275884053942172, "grad_norm": 12.2968119486088, "learning_rate": 4.926344525994918e-05, "loss": 2.52, "mean_token_accuracy": 0.42758620381355283, "step": 126675 }, { "epoch": 0.12759344144732138, "grad_norm": 15.028851253985428, "learning_rate": 4.9263350165337114e-05, "loss": 2.8971, "mean_token_accuracy": 0.3482758581638336, "step": 126680 }, { "epoch": 0.12759847750042555, "grad_norm": 11.65526991793121, "learning_rate": 4.92632550646889e-05, "loss": 2.4411, "mean_token_accuracy": 0.4068965554237366, "step": 126685 }, { "epoch": 0.12760351355352972, "grad_norm": 13.435179148738603, "learning_rate": 4.9263159958004564e-05, "loss": 2.4512, "mean_token_accuracy": 0.43448275327682495, "step": 126690 }, { "epoch": 0.1276085496066339, "grad_norm": 11.142575610417355, "learning_rate": 4.926306484528414e-05, "loss": 2.2504, "mean_token_accuracy": 0.4537810027599335, "step": 126695 }, { "epoch": 0.12761358565973807, "grad_norm": 11.366193941869742, "learning_rate": 4.926296972652763e-05, "loss": 2.2988, "mean_token_accuracy": 0.42413792610168455, "step": 126700 }, { "epoch": 0.12761862171284225, "grad_norm": 9.492350808900062, "learning_rate": 4.92628746017351e-05, "loss": 2.2662, "mean_token_accuracy": 0.4172413766384125, "step": 126705 }, { "epoch": 0.12762365776594642, "grad_norm": 9.410108035231657, "learning_rate": 4.9262779470906545e-05, "loss": 2.5687, "mean_token_accuracy": 0.41379310488700866, "step": 126710 }, { "epoch": 0.1276286938190506, "grad_norm": 10.048063691775335, "learning_rate": 4.9262684334042006e-05, "loss": 2.3475, "mean_token_accuracy": 0.44827585816383364, "step": 126715 }, { "epoch": 0.12763372987215477, "grad_norm": 9.91279451156627, "learning_rate": 4.9262589191141504e-05, "loss": 2.2665, "mean_token_accuracy": 0.45517241954803467, "step": 126720 }, { "epoch": 0.12763876592525894, "grad_norm": 11.505168197035678, "learning_rate": 4.926249404220507e-05, "loss": 2.7829, "mean_token_accuracy": 0.3379310369491577, "step": 126725 }, { "epoch": 0.12764380197836311, "grad_norm": 10.275022579429226, "learning_rate": 4.9262398887232717e-05, "loss": 2.796, "mean_token_accuracy": 0.3827586233615875, "step": 126730 }, { "epoch": 0.1276488380314673, "grad_norm": 8.43177887188339, "learning_rate": 4.926230372622449e-05, "loss": 2.0841, "mean_token_accuracy": 0.4586206912994385, "step": 126735 }, { "epoch": 0.12765387408457143, "grad_norm": 9.833103960504232, "learning_rate": 4.9262208559180414e-05, "loss": 2.3915, "mean_token_accuracy": 0.42758620977401735, "step": 126740 }, { "epoch": 0.1276589101376756, "grad_norm": 8.668301392831404, "learning_rate": 4.9262113386100493e-05, "loss": 2.3824, "mean_token_accuracy": 0.4, "step": 126745 }, { "epoch": 0.12766394619077978, "grad_norm": 14.967785717041187, "learning_rate": 4.926201820698477e-05, "loss": 2.627, "mean_token_accuracy": 0.3931034505367279, "step": 126750 }, { "epoch": 0.12766898224388395, "grad_norm": 8.005346880875743, "learning_rate": 4.9261923021833284e-05, "loss": 2.2352, "mean_token_accuracy": 0.4689655125141144, "step": 126755 }, { "epoch": 0.12767401829698813, "grad_norm": 13.11089051448881, "learning_rate": 4.9261827830646035e-05, "loss": 3.1929, "mean_token_accuracy": 0.3758620619773865, "step": 126760 }, { "epoch": 0.1276790543500923, "grad_norm": 11.198795776855427, "learning_rate": 4.9261732633423066e-05, "loss": 2.725, "mean_token_accuracy": 0.3896551728248596, "step": 126765 }, { "epoch": 0.12768409040319648, "grad_norm": 11.893137380648179, "learning_rate": 4.92616374301644e-05, "loss": 2.1802, "mean_token_accuracy": 0.4620689630508423, "step": 126770 }, { "epoch": 0.12768912645630065, "grad_norm": 9.040045397192271, "learning_rate": 4.926154222087006e-05, "loss": 2.2651, "mean_token_accuracy": 0.46896551847457885, "step": 126775 }, { "epoch": 0.12769416250940482, "grad_norm": 12.05815741875097, "learning_rate": 4.926144700554008e-05, "loss": 2.3568, "mean_token_accuracy": 0.4241379380226135, "step": 126780 }, { "epoch": 0.127699198562509, "grad_norm": 10.663360429660948, "learning_rate": 4.926135178417448e-05, "loss": 2.8555, "mean_token_accuracy": 0.37586207389831544, "step": 126785 }, { "epoch": 0.12770423461561317, "grad_norm": 10.7243196619777, "learning_rate": 4.9261256556773284e-05, "loss": 2.3048, "mean_token_accuracy": 0.458620685338974, "step": 126790 }, { "epoch": 0.12770927066871735, "grad_norm": 12.378873060511623, "learning_rate": 4.926116132333653e-05, "loss": 2.2625, "mean_token_accuracy": 0.4275861978530884, "step": 126795 }, { "epoch": 0.12771430672182152, "grad_norm": 11.267963760878676, "learning_rate": 4.9261066083864226e-05, "loss": 2.4628, "mean_token_accuracy": 0.4206896543502808, "step": 126800 }, { "epoch": 0.1277193427749257, "grad_norm": 10.039140044980758, "learning_rate": 4.9260970838356415e-05, "loss": 2.2729, "mean_token_accuracy": 0.4551724135875702, "step": 126805 }, { "epoch": 0.12772437882802987, "grad_norm": 11.310061161728843, "learning_rate": 4.9260875586813116e-05, "loss": 2.2114, "mean_token_accuracy": 0.4517241418361664, "step": 126810 }, { "epoch": 0.12772941488113404, "grad_norm": 9.95515123383782, "learning_rate": 4.9260780329234354e-05, "loss": 2.3266, "mean_token_accuracy": 0.4379310369491577, "step": 126815 }, { "epoch": 0.12773445093423821, "grad_norm": 8.628315622092826, "learning_rate": 4.926068506562016e-05, "loss": 2.1622, "mean_token_accuracy": 0.4517241418361664, "step": 126820 }, { "epoch": 0.1277394869873424, "grad_norm": 11.195149438029354, "learning_rate": 4.9260589795970566e-05, "loss": 2.3587, "mean_token_accuracy": 0.42413793206214906, "step": 126825 }, { "epoch": 0.12774452304044656, "grad_norm": 8.914357616401052, "learning_rate": 4.926049452028559e-05, "loss": 2.3972, "mean_token_accuracy": 0.4137930929660797, "step": 126830 }, { "epoch": 0.12774955909355074, "grad_norm": 11.50655367157371, "learning_rate": 4.9260399238565255e-05, "loss": 2.2437, "mean_token_accuracy": 0.4310344815254211, "step": 126835 }, { "epoch": 0.1277545951466549, "grad_norm": 9.787988754799539, "learning_rate": 4.9260303950809596e-05, "loss": 2.1003, "mean_token_accuracy": 0.4586206912994385, "step": 126840 }, { "epoch": 0.12775963119975908, "grad_norm": 9.306597994197702, "learning_rate": 4.926020865701863e-05, "loss": 2.1581, "mean_token_accuracy": 0.44482759237289426, "step": 126845 }, { "epoch": 0.12776466725286326, "grad_norm": 12.729434732711708, "learning_rate": 4.926011335719239e-05, "loss": 2.4881, "mean_token_accuracy": 0.4498487591743469, "step": 126850 }, { "epoch": 0.12776970330596743, "grad_norm": 11.276623286836196, "learning_rate": 4.92600180513309e-05, "loss": 2.6583, "mean_token_accuracy": 0.37241379618644715, "step": 126855 }, { "epoch": 0.1277747393590716, "grad_norm": 10.831769414058755, "learning_rate": 4.9259922739434195e-05, "loss": 2.4984, "mean_token_accuracy": 0.42758620977401735, "step": 126860 }, { "epoch": 0.12777977541217578, "grad_norm": 10.698054881529341, "learning_rate": 4.925982742150228e-05, "loss": 2.7509, "mean_token_accuracy": 0.39310344457626345, "step": 126865 }, { "epoch": 0.12778481146527995, "grad_norm": 9.19272532564477, "learning_rate": 4.925973209753521e-05, "loss": 2.5509, "mean_token_accuracy": 0.4206896543502808, "step": 126870 }, { "epoch": 0.12778984751838413, "grad_norm": 12.02608796593253, "learning_rate": 4.925963676753299e-05, "loss": 2.7366, "mean_token_accuracy": 0.4586206912994385, "step": 126875 }, { "epoch": 0.12779488357148827, "grad_norm": 10.731068961003015, "learning_rate": 4.9259541431495657e-05, "loss": 2.6728, "mean_token_accuracy": 0.3620689660310745, "step": 126880 }, { "epoch": 0.12779991962459245, "grad_norm": 10.631990481014354, "learning_rate": 4.9259446089423244e-05, "loss": 2.375, "mean_token_accuracy": 0.4310344815254211, "step": 126885 }, { "epoch": 0.12780495567769662, "grad_norm": 11.33762696506454, "learning_rate": 4.9259350741315756e-05, "loss": 2.4843, "mean_token_accuracy": 0.4172413766384125, "step": 126890 }, { "epoch": 0.1278099917308008, "grad_norm": 11.22839614285331, "learning_rate": 4.925925538717323e-05, "loss": 2.2964, "mean_token_accuracy": 0.39310344457626345, "step": 126895 }, { "epoch": 0.12781502778390497, "grad_norm": 12.456939660159644, "learning_rate": 4.92591600269957e-05, "loss": 2.3633, "mean_token_accuracy": 0.458620685338974, "step": 126900 }, { "epoch": 0.12782006383700914, "grad_norm": 13.257324270212553, "learning_rate": 4.9259064660783174e-05, "loss": 2.7723, "mean_token_accuracy": 0.3999999940395355, "step": 126905 }, { "epoch": 0.12782509989011331, "grad_norm": 10.561204754156838, "learning_rate": 4.92589692885357e-05, "loss": 2.3811, "mean_token_accuracy": 0.3965517282485962, "step": 126910 }, { "epoch": 0.1278301359432175, "grad_norm": 11.138798056264237, "learning_rate": 4.925887391025329e-05, "loss": 2.4485, "mean_token_accuracy": 0.4310344696044922, "step": 126915 }, { "epoch": 0.12783517199632166, "grad_norm": 12.146522636450808, "learning_rate": 4.925877852593598e-05, "loss": 2.4301, "mean_token_accuracy": 0.4, "step": 126920 }, { "epoch": 0.12784020804942584, "grad_norm": 11.319515796797806, "learning_rate": 4.925868313558379e-05, "loss": 2.533, "mean_token_accuracy": 0.4206896543502808, "step": 126925 }, { "epoch": 0.12784524410253, "grad_norm": 11.103742563527515, "learning_rate": 4.925858773919674e-05, "loss": 2.5852, "mean_token_accuracy": 0.42068966031074523, "step": 126930 }, { "epoch": 0.12785028015563418, "grad_norm": 9.783547542750082, "learning_rate": 4.925849233677487e-05, "loss": 2.1932, "mean_token_accuracy": 0.4965517342090607, "step": 126935 }, { "epoch": 0.12785531620873836, "grad_norm": 12.345273388940988, "learning_rate": 4.925839692831821e-05, "loss": 2.3496, "mean_token_accuracy": 0.43653962314128875, "step": 126940 }, { "epoch": 0.12786035226184253, "grad_norm": 10.654256450051767, "learning_rate": 4.9258301513826766e-05, "loss": 2.2947, "mean_token_accuracy": 0.4, "step": 126945 }, { "epoch": 0.1278653883149467, "grad_norm": 13.083685128526781, "learning_rate": 4.925820609330058e-05, "loss": 2.3788, "mean_token_accuracy": 0.4, "step": 126950 }, { "epoch": 0.12787042436805088, "grad_norm": 11.51521000744791, "learning_rate": 4.9258110666739676e-05, "loss": 2.4277, "mean_token_accuracy": 0.45517241954803467, "step": 126955 }, { "epoch": 0.12787546042115505, "grad_norm": 12.263774892039638, "learning_rate": 4.9258015234144074e-05, "loss": 2.3009, "mean_token_accuracy": 0.4620689630508423, "step": 126960 }, { "epoch": 0.12788049647425923, "grad_norm": 10.367202593000918, "learning_rate": 4.925791979551381e-05, "loss": 2.2849, "mean_token_accuracy": 0.441379314661026, "step": 126965 }, { "epoch": 0.1278855325273634, "grad_norm": 11.010811145056024, "learning_rate": 4.92578243508489e-05, "loss": 2.527, "mean_token_accuracy": 0.3517241418361664, "step": 126970 }, { "epoch": 0.12789056858046757, "grad_norm": 11.879036947544455, "learning_rate": 4.9257728900149385e-05, "loss": 2.2518, "mean_token_accuracy": 0.41724138259887694, "step": 126975 }, { "epoch": 0.12789560463357175, "grad_norm": 11.817416531026135, "learning_rate": 4.9257633443415274e-05, "loss": 2.7877, "mean_token_accuracy": 0.3738657057285309, "step": 126980 }, { "epoch": 0.12790064068667592, "grad_norm": 8.819305555394017, "learning_rate": 4.925753798064661e-05, "loss": 2.4334, "mean_token_accuracy": 0.45698729157447815, "step": 126985 }, { "epoch": 0.1279056767397801, "grad_norm": 10.167271536189954, "learning_rate": 4.925744251184341e-05, "loss": 2.6804, "mean_token_accuracy": 0.3827586114406586, "step": 126990 }, { "epoch": 0.12791071279288427, "grad_norm": 11.95473369184682, "learning_rate": 4.925734703700569e-05, "loss": 2.1363, "mean_token_accuracy": 0.42758620381355283, "step": 126995 }, { "epoch": 0.12791574884598844, "grad_norm": 10.837953761188341, "learning_rate": 4.925725155613351e-05, "loss": 2.418, "mean_token_accuracy": 0.3793103516101837, "step": 127000 }, { "epoch": 0.12792078489909262, "grad_norm": 8.522922760905878, "learning_rate": 4.9257156069226864e-05, "loss": 2.5517, "mean_token_accuracy": 0.4310344815254211, "step": 127005 }, { "epoch": 0.1279258209521968, "grad_norm": 10.661776153818293, "learning_rate": 4.925706057628579e-05, "loss": 2.7257, "mean_token_accuracy": 0.3793103516101837, "step": 127010 }, { "epoch": 0.12793085700530096, "grad_norm": 10.759453543875233, "learning_rate": 4.9256965077310314e-05, "loss": 2.0764, "mean_token_accuracy": 0.42758620977401735, "step": 127015 }, { "epoch": 0.1279358930584051, "grad_norm": 10.132624634933931, "learning_rate": 4.9256869572300455e-05, "loss": 2.4544, "mean_token_accuracy": 0.42413793206214906, "step": 127020 }, { "epoch": 0.12794092911150928, "grad_norm": 9.65406052034216, "learning_rate": 4.925677406125626e-05, "loss": 2.2545, "mean_token_accuracy": 0.44482759237289426, "step": 127025 }, { "epoch": 0.12794596516461346, "grad_norm": 12.313267183044637, "learning_rate": 4.925667854417773e-05, "loss": 3.0173, "mean_token_accuracy": 0.4000000059604645, "step": 127030 }, { "epoch": 0.12795100121771763, "grad_norm": 9.452212010042398, "learning_rate": 4.925658302106491e-05, "loss": 2.5741, "mean_token_accuracy": 0.4344827592372894, "step": 127035 }, { "epoch": 0.1279560372708218, "grad_norm": 9.19184639950459, "learning_rate": 4.925648749191782e-05, "loss": 2.122, "mean_token_accuracy": 0.441379314661026, "step": 127040 }, { "epoch": 0.12796107332392598, "grad_norm": 11.684138239985195, "learning_rate": 4.9256391956736494e-05, "loss": 2.3861, "mean_token_accuracy": 0.46551724076271056, "step": 127045 }, { "epoch": 0.12796610937703015, "grad_norm": 11.67704504528358, "learning_rate": 4.9256296415520944e-05, "loss": 2.3009, "mean_token_accuracy": 0.4379310369491577, "step": 127050 }, { "epoch": 0.12797114543013433, "grad_norm": 11.540006463307481, "learning_rate": 4.9256200868271205e-05, "loss": 2.2267, "mean_token_accuracy": 0.4172413766384125, "step": 127055 }, { "epoch": 0.1279761814832385, "grad_norm": 9.412826689091045, "learning_rate": 4.9256105314987306e-05, "loss": 2.7954, "mean_token_accuracy": 0.41379310488700866, "step": 127060 }, { "epoch": 0.12798121753634267, "grad_norm": 11.417912369468148, "learning_rate": 4.925600975566927e-05, "loss": 2.4841, "mean_token_accuracy": 0.36896551549434664, "step": 127065 }, { "epoch": 0.12798625358944685, "grad_norm": 12.139551665347785, "learning_rate": 4.925591419031712e-05, "loss": 2.2353, "mean_token_accuracy": 0.4413793087005615, "step": 127070 }, { "epoch": 0.12799128964255102, "grad_norm": 10.12006375810203, "learning_rate": 4.9255818618930884e-05, "loss": 2.2912, "mean_token_accuracy": 0.46436781287193296, "step": 127075 }, { "epoch": 0.1279963256956552, "grad_norm": 12.493811221751345, "learning_rate": 4.925572304151059e-05, "loss": 2.4715, "mean_token_accuracy": 0.4344827592372894, "step": 127080 }, { "epoch": 0.12800136174875937, "grad_norm": 12.31332505515964, "learning_rate": 4.925562745805627e-05, "loss": 2.853, "mean_token_accuracy": 0.341379314661026, "step": 127085 }, { "epoch": 0.12800639780186354, "grad_norm": 9.675071818223804, "learning_rate": 4.925553186856795e-05, "loss": 2.3168, "mean_token_accuracy": 0.41724138259887694, "step": 127090 }, { "epoch": 0.12801143385496772, "grad_norm": 14.779468129722208, "learning_rate": 4.925543627304563e-05, "loss": 2.4308, "mean_token_accuracy": 0.4034482777118683, "step": 127095 }, { "epoch": 0.1280164699080719, "grad_norm": 9.866213196686568, "learning_rate": 4.9255340671489386e-05, "loss": 2.2729, "mean_token_accuracy": 0.39655172228813174, "step": 127100 }, { "epoch": 0.12802150596117606, "grad_norm": 13.929116392266403, "learning_rate": 4.92552450638992e-05, "loss": 2.5871, "mean_token_accuracy": 0.47586206197738645, "step": 127105 }, { "epoch": 0.12802654201428024, "grad_norm": 12.175500117205303, "learning_rate": 4.925514945027512e-05, "loss": 2.3948, "mean_token_accuracy": 0.43103448748588563, "step": 127110 }, { "epoch": 0.1280315780673844, "grad_norm": 12.50221174596488, "learning_rate": 4.925505383061717e-05, "loss": 2.3325, "mean_token_accuracy": 0.47791893482208253, "step": 127115 }, { "epoch": 0.12803661412048858, "grad_norm": 9.112202940582087, "learning_rate": 4.925495820492538e-05, "loss": 2.4781, "mean_token_accuracy": 0.3655172407627106, "step": 127120 }, { "epoch": 0.12804165017359276, "grad_norm": 10.764015915914927, "learning_rate": 4.925486257319977e-05, "loss": 2.567, "mean_token_accuracy": 0.45517241954803467, "step": 127125 }, { "epoch": 0.12804668622669693, "grad_norm": 13.662902081936021, "learning_rate": 4.925476693544035e-05, "loss": 2.5529, "mean_token_accuracy": 0.3931034505367279, "step": 127130 }, { "epoch": 0.1280517222798011, "grad_norm": 12.774570797529282, "learning_rate": 4.925467129164719e-05, "loss": 2.3579, "mean_token_accuracy": 0.42758620381355283, "step": 127135 }, { "epoch": 0.12805675833290528, "grad_norm": 10.175613935564426, "learning_rate": 4.925457564182027e-05, "loss": 2.2847, "mean_token_accuracy": 0.42413793206214906, "step": 127140 }, { "epoch": 0.12806179438600945, "grad_norm": 11.607022208415822, "learning_rate": 4.925447998595965e-05, "loss": 2.8177, "mean_token_accuracy": 0.40205685794353485, "step": 127145 }, { "epoch": 0.12806683043911363, "grad_norm": 7.464077015132231, "learning_rate": 4.9254384324065336e-05, "loss": 1.9185, "mean_token_accuracy": 0.5225369393825531, "step": 127150 }, { "epoch": 0.1280718664922178, "grad_norm": 15.118912837057882, "learning_rate": 4.925428865613736e-05, "loss": 2.3115, "mean_token_accuracy": 0.4689655125141144, "step": 127155 }, { "epoch": 0.12807690254532195, "grad_norm": 10.826731697966773, "learning_rate": 4.9254192982175765e-05, "loss": 2.5683, "mean_token_accuracy": 0.4344827592372894, "step": 127160 }, { "epoch": 0.12808193859842612, "grad_norm": 12.394609339012014, "learning_rate": 4.9254097302180554e-05, "loss": 2.8275, "mean_token_accuracy": 0.3999999940395355, "step": 127165 }, { "epoch": 0.1280869746515303, "grad_norm": 9.514137556998087, "learning_rate": 4.9254001616151765e-05, "loss": 2.2196, "mean_token_accuracy": 0.42758620977401735, "step": 127170 }, { "epoch": 0.12809201070463447, "grad_norm": 9.329028999210804, "learning_rate": 4.9253905924089425e-05, "loss": 1.9966, "mean_token_accuracy": 0.45517240166664125, "step": 127175 }, { "epoch": 0.12809704675773864, "grad_norm": 9.595595575056784, "learning_rate": 4.9253810225993554e-05, "loss": 2.225, "mean_token_accuracy": 0.4448275864124298, "step": 127180 }, { "epoch": 0.12810208281084282, "grad_norm": 11.336584626123788, "learning_rate": 4.925371452186418e-05, "loss": 2.2194, "mean_token_accuracy": 0.4931034445762634, "step": 127185 }, { "epoch": 0.128107118863947, "grad_norm": 11.50290157788579, "learning_rate": 4.9253618811701334e-05, "loss": 2.5734, "mean_token_accuracy": 0.39999999701976774, "step": 127190 }, { "epoch": 0.12811215491705116, "grad_norm": 10.304893162485786, "learning_rate": 4.925352309550504e-05, "loss": 2.431, "mean_token_accuracy": 0.44827587008476255, "step": 127195 }, { "epoch": 0.12811719097015534, "grad_norm": 10.3719513078457, "learning_rate": 4.925342737327533e-05, "loss": 2.5209, "mean_token_accuracy": 0.42413792610168455, "step": 127200 }, { "epoch": 0.1281222270232595, "grad_norm": 10.42465515173153, "learning_rate": 4.925333164501222e-05, "loss": 2.6396, "mean_token_accuracy": 0.36896551251411436, "step": 127205 }, { "epoch": 0.12812726307636368, "grad_norm": 9.702067531280344, "learning_rate": 4.9253235910715744e-05, "loss": 2.4835, "mean_token_accuracy": 0.36896551251411436, "step": 127210 }, { "epoch": 0.12813229912946786, "grad_norm": 10.16428887813001, "learning_rate": 4.925314017038593e-05, "loss": 2.5293, "mean_token_accuracy": 0.4295825779438019, "step": 127215 }, { "epoch": 0.12813733518257203, "grad_norm": 7.453574434938719, "learning_rate": 4.92530444240228e-05, "loss": 2.1615, "mean_token_accuracy": 0.48965516686439514, "step": 127220 }, { "epoch": 0.1281423712356762, "grad_norm": 11.525993832832416, "learning_rate": 4.925294867162638e-05, "loss": 2.4023, "mean_token_accuracy": 0.3965517163276672, "step": 127225 }, { "epoch": 0.12814740728878038, "grad_norm": 8.571556145362914, "learning_rate": 4.9252852913196696e-05, "loss": 1.9015, "mean_token_accuracy": 0.5310344755649566, "step": 127230 }, { "epoch": 0.12815244334188455, "grad_norm": 10.611404609252613, "learning_rate": 4.925275714873379e-05, "loss": 2.3414, "mean_token_accuracy": 0.4310344815254211, "step": 127235 }, { "epoch": 0.12815747939498873, "grad_norm": 21.901660959596388, "learning_rate": 4.925266137823766e-05, "loss": 2.6081, "mean_token_accuracy": 0.4644088625907898, "step": 127240 }, { "epoch": 0.1281625154480929, "grad_norm": 11.028341137836424, "learning_rate": 4.9252565601708354e-05, "loss": 2.3647, "mean_token_accuracy": 0.41379310488700866, "step": 127245 }, { "epoch": 0.12816755150119707, "grad_norm": 11.373256720423647, "learning_rate": 4.92524698191459e-05, "loss": 2.3173, "mean_token_accuracy": 0.4344827592372894, "step": 127250 }, { "epoch": 0.12817258755430125, "grad_norm": 10.668431034298811, "learning_rate": 4.925237403055031e-05, "loss": 2.6515, "mean_token_accuracy": 0.3827586233615875, "step": 127255 }, { "epoch": 0.12817762360740542, "grad_norm": 10.611157214423176, "learning_rate": 4.9252278235921616e-05, "loss": 2.3834, "mean_token_accuracy": 0.4517241418361664, "step": 127260 }, { "epoch": 0.1281826596605096, "grad_norm": 11.746070389120103, "learning_rate": 4.925218243525985e-05, "loss": 2.4977, "mean_token_accuracy": 0.44137930274009707, "step": 127265 }, { "epoch": 0.12818769571361377, "grad_norm": 11.253197243240193, "learning_rate": 4.9252086628565034e-05, "loss": 2.3282, "mean_token_accuracy": 0.4034482717514038, "step": 127270 }, { "epoch": 0.12819273176671794, "grad_norm": 10.671099711682324, "learning_rate": 4.925199081583719e-05, "loss": 2.0988, "mean_token_accuracy": 0.46896552443504336, "step": 127275 }, { "epoch": 0.12819776781982212, "grad_norm": 9.405020583357254, "learning_rate": 4.925189499707636e-05, "loss": 2.2536, "mean_token_accuracy": 0.4620689690113068, "step": 127280 }, { "epoch": 0.1282028038729263, "grad_norm": 9.644305615183036, "learning_rate": 4.925179917228256e-05, "loss": 2.1338, "mean_token_accuracy": 0.47931034564971925, "step": 127285 }, { "epoch": 0.12820783992603046, "grad_norm": 12.086707061678714, "learning_rate": 4.925170334145581e-05, "loss": 2.1114, "mean_token_accuracy": 0.46067755818367007, "step": 127290 }, { "epoch": 0.12821287597913464, "grad_norm": 9.009217015545866, "learning_rate": 4.925160750459615e-05, "loss": 2.1778, "mean_token_accuracy": 0.4517241418361664, "step": 127295 }, { "epoch": 0.12821791203223878, "grad_norm": 12.85407971067005, "learning_rate": 4.92515116617036e-05, "loss": 2.1135, "mean_token_accuracy": 0.4724137902259827, "step": 127300 }, { "epoch": 0.12822294808534296, "grad_norm": 10.309255889606387, "learning_rate": 4.9251415812778185e-05, "loss": 2.3609, "mean_token_accuracy": 0.42758620381355283, "step": 127305 }, { "epoch": 0.12822798413844713, "grad_norm": 20.933573288781254, "learning_rate": 4.9251319957819936e-05, "loss": 2.3028, "mean_token_accuracy": 0.4482758641242981, "step": 127310 }, { "epoch": 0.1282330201915513, "grad_norm": 13.329691281463843, "learning_rate": 4.9251224096828876e-05, "loss": 2.413, "mean_token_accuracy": 0.47586206793785096, "step": 127315 }, { "epoch": 0.12823805624465548, "grad_norm": 15.943843930149265, "learning_rate": 4.925112822980503e-05, "loss": 2.5111, "mean_token_accuracy": 0.39068360924720763, "step": 127320 }, { "epoch": 0.12824309229775965, "grad_norm": 10.436887443574433, "learning_rate": 4.925103235674843e-05, "loss": 2.5934, "mean_token_accuracy": 0.4034482777118683, "step": 127325 }, { "epoch": 0.12824812835086383, "grad_norm": 9.44659820454716, "learning_rate": 4.92509364776591e-05, "loss": 2.4543, "mean_token_accuracy": 0.44482759237289426, "step": 127330 }, { "epoch": 0.128253164403968, "grad_norm": 10.18558018446586, "learning_rate": 4.9250840592537076e-05, "loss": 2.0392, "mean_token_accuracy": 0.4758620738983154, "step": 127335 }, { "epoch": 0.12825820045707217, "grad_norm": 10.845008242974112, "learning_rate": 4.925074470138236e-05, "loss": 2.8803, "mean_token_accuracy": 0.341379314661026, "step": 127340 }, { "epoch": 0.12826323651017635, "grad_norm": 9.280594036668052, "learning_rate": 4.9250648804195e-05, "loss": 2.171, "mean_token_accuracy": 0.482758617401123, "step": 127345 }, { "epoch": 0.12826827256328052, "grad_norm": 8.348892104554656, "learning_rate": 4.9250552900975015e-05, "loss": 2.3708, "mean_token_accuracy": 0.4620689630508423, "step": 127350 }, { "epoch": 0.1282733086163847, "grad_norm": 12.328074801651551, "learning_rate": 4.925045699172243e-05, "loss": 2.4891, "mean_token_accuracy": 0.43793103098869324, "step": 127355 }, { "epoch": 0.12827834466948887, "grad_norm": 11.849255652131053, "learning_rate": 4.925036107643728e-05, "loss": 2.5749, "mean_token_accuracy": 0.3793103456497192, "step": 127360 }, { "epoch": 0.12828338072259304, "grad_norm": 10.870795805309061, "learning_rate": 4.9250265155119586e-05, "loss": 2.3152, "mean_token_accuracy": 0.45341803431510924, "step": 127365 }, { "epoch": 0.12828841677569722, "grad_norm": 13.493678797309856, "learning_rate": 4.925016922776937e-05, "loss": 2.7042, "mean_token_accuracy": 0.43793101906776427, "step": 127370 }, { "epoch": 0.1282934528288014, "grad_norm": 9.2207436766657, "learning_rate": 4.925007329438667e-05, "loss": 1.938, "mean_token_accuracy": 0.49655172824859617, "step": 127375 }, { "epoch": 0.12829848888190556, "grad_norm": 10.288247749415964, "learning_rate": 4.9249977354971506e-05, "loss": 2.051, "mean_token_accuracy": 0.4778584361076355, "step": 127380 }, { "epoch": 0.12830352493500974, "grad_norm": 10.173543011809986, "learning_rate": 4.924988140952391e-05, "loss": 2.2012, "mean_token_accuracy": 0.43793103098869324, "step": 127385 }, { "epoch": 0.1283085609881139, "grad_norm": 10.800836878000327, "learning_rate": 4.924978545804389e-05, "loss": 2.0077, "mean_token_accuracy": 0.4931034564971924, "step": 127390 }, { "epoch": 0.12831359704121809, "grad_norm": 9.555954246154421, "learning_rate": 4.924968950053149e-05, "loss": 2.191, "mean_token_accuracy": 0.4931034445762634, "step": 127395 }, { "epoch": 0.12831863309432226, "grad_norm": 9.3459721359432, "learning_rate": 4.924959353698673e-05, "loss": 2.2685, "mean_token_accuracy": 0.4586206912994385, "step": 127400 }, { "epoch": 0.12832366914742643, "grad_norm": 16.841221492904037, "learning_rate": 4.924949756740964e-05, "loss": 2.3733, "mean_token_accuracy": 0.47241379618644713, "step": 127405 }, { "epoch": 0.1283287052005306, "grad_norm": 10.528813251349805, "learning_rate": 4.9249401591800256e-05, "loss": 2.4338, "mean_token_accuracy": 0.42758620381355283, "step": 127410 }, { "epoch": 0.12833374125363478, "grad_norm": 12.323400203574582, "learning_rate": 4.924930561015859e-05, "loss": 2.7101, "mean_token_accuracy": 0.3793103456497192, "step": 127415 }, { "epoch": 0.12833877730673895, "grad_norm": 10.683197484944763, "learning_rate": 4.924920962248466e-05, "loss": 2.5163, "mean_token_accuracy": 0.36896551251411436, "step": 127420 }, { "epoch": 0.12834381335984313, "grad_norm": 11.927005848611316, "learning_rate": 4.924911362877852e-05, "loss": 2.3026, "mean_token_accuracy": 0.43103448748588563, "step": 127425 }, { "epoch": 0.1283488494129473, "grad_norm": 10.382088964590611, "learning_rate": 4.924901762904018e-05, "loss": 2.1833, "mean_token_accuracy": 0.45517241954803467, "step": 127430 }, { "epoch": 0.12835388546605148, "grad_norm": 10.912613604648959, "learning_rate": 4.9248921623269665e-05, "loss": 2.7045, "mean_token_accuracy": 0.43103448748588563, "step": 127435 }, { "epoch": 0.12835892151915562, "grad_norm": 8.791477757425053, "learning_rate": 4.9248825611467e-05, "loss": 2.3226, "mean_token_accuracy": 0.41724138259887694, "step": 127440 }, { "epoch": 0.1283639575722598, "grad_norm": 9.801801918771966, "learning_rate": 4.9248729593632234e-05, "loss": 2.3741, "mean_token_accuracy": 0.4965517222881317, "step": 127445 }, { "epoch": 0.12836899362536397, "grad_norm": 10.980425861458698, "learning_rate": 4.9248633569765364e-05, "loss": 2.0667, "mean_token_accuracy": 0.47241380214691164, "step": 127450 }, { "epoch": 0.12837402967846814, "grad_norm": 10.18514598274303, "learning_rate": 4.924853753986643e-05, "loss": 2.3063, "mean_token_accuracy": 0.458620685338974, "step": 127455 }, { "epoch": 0.12837906573157232, "grad_norm": 15.872264392054353, "learning_rate": 4.924844150393546e-05, "loss": 2.6809, "mean_token_accuracy": 0.3965517282485962, "step": 127460 }, { "epoch": 0.1283841017846765, "grad_norm": 9.711729365108138, "learning_rate": 4.924834546197248e-05, "loss": 2.2877, "mean_token_accuracy": 0.42068964838981626, "step": 127465 }, { "epoch": 0.12838913783778066, "grad_norm": 11.688967012013288, "learning_rate": 4.9248249413977515e-05, "loss": 2.504, "mean_token_accuracy": 0.4081669747829437, "step": 127470 }, { "epoch": 0.12839417389088484, "grad_norm": 12.05582258052738, "learning_rate": 4.9248153359950587e-05, "loss": 2.6158, "mean_token_accuracy": 0.4068965524435043, "step": 127475 }, { "epoch": 0.128399209943989, "grad_norm": 11.699874378755295, "learning_rate": 4.924805729989173e-05, "loss": 2.5124, "mean_token_accuracy": 0.43793103098869324, "step": 127480 }, { "epoch": 0.12840424599709319, "grad_norm": 11.294930048293406, "learning_rate": 4.924796123380097e-05, "loss": 2.5431, "mean_token_accuracy": 0.40344828367233276, "step": 127485 }, { "epoch": 0.12840928205019736, "grad_norm": 12.327184659029557, "learning_rate": 4.924786516167833e-05, "loss": 2.5591, "mean_token_accuracy": 0.4009679317474365, "step": 127490 }, { "epoch": 0.12841431810330153, "grad_norm": 8.143868523905455, "learning_rate": 4.924776908352384e-05, "loss": 2.4144, "mean_token_accuracy": 0.4137930989265442, "step": 127495 }, { "epoch": 0.1284193541564057, "grad_norm": 10.218091600680188, "learning_rate": 4.9247672999337524e-05, "loss": 2.0457, "mean_token_accuracy": 0.4746521532535553, "step": 127500 }, { "epoch": 0.12842439020950988, "grad_norm": 9.665013529497648, "learning_rate": 4.924757690911941e-05, "loss": 2.5364, "mean_token_accuracy": 0.3896551728248596, "step": 127505 }, { "epoch": 0.12842942626261405, "grad_norm": 9.824078209991308, "learning_rate": 4.924748081286953e-05, "loss": 2.293, "mean_token_accuracy": 0.48620688915252686, "step": 127510 }, { "epoch": 0.12843446231571823, "grad_norm": 9.967185374003213, "learning_rate": 4.92473847105879e-05, "loss": 2.3901, "mean_token_accuracy": 0.4551724076271057, "step": 127515 }, { "epoch": 0.1284394983688224, "grad_norm": 9.646116556289856, "learning_rate": 4.924728860227455e-05, "loss": 2.31, "mean_token_accuracy": 0.4172413766384125, "step": 127520 }, { "epoch": 0.12844453442192658, "grad_norm": 10.727863685857805, "learning_rate": 4.924719248792951e-05, "loss": 2.473, "mean_token_accuracy": 0.4000000059604645, "step": 127525 }, { "epoch": 0.12844957047503075, "grad_norm": 10.4072670042468, "learning_rate": 4.9247096367552804e-05, "loss": 2.3919, "mean_token_accuracy": 0.43448275327682495, "step": 127530 }, { "epoch": 0.12845460652813492, "grad_norm": 13.119713129747444, "learning_rate": 4.924700024114446e-05, "loss": 2.5632, "mean_token_accuracy": 0.3551724135875702, "step": 127535 }, { "epoch": 0.1284596425812391, "grad_norm": 11.06028328417857, "learning_rate": 4.924690410870451e-05, "loss": 2.6298, "mean_token_accuracy": 0.4206896543502808, "step": 127540 }, { "epoch": 0.12846467863434327, "grad_norm": 11.149996919638442, "learning_rate": 4.9246807970232964e-05, "loss": 2.3162, "mean_token_accuracy": 0.45680580735206605, "step": 127545 }, { "epoch": 0.12846971468744744, "grad_norm": 12.446012580055408, "learning_rate": 4.9246711825729866e-05, "loss": 2.2609, "mean_token_accuracy": 0.46896551847457885, "step": 127550 }, { "epoch": 0.12847475074055162, "grad_norm": 8.857469839756094, "learning_rate": 4.924661567519524e-05, "loss": 2.3958, "mean_token_accuracy": 0.4206896543502808, "step": 127555 }, { "epoch": 0.1284797867936558, "grad_norm": 17.366067394988328, "learning_rate": 4.92465195186291e-05, "loss": 2.8502, "mean_token_accuracy": 0.4068965494632721, "step": 127560 }, { "epoch": 0.12848482284675997, "grad_norm": 9.524688744223372, "learning_rate": 4.924642335603149e-05, "loss": 2.2596, "mean_token_accuracy": 0.4379310369491577, "step": 127565 }, { "epoch": 0.12848985889986414, "grad_norm": 8.954131992880237, "learning_rate": 4.924632718740243e-05, "loss": 2.0301, "mean_token_accuracy": 0.47931034564971925, "step": 127570 }, { "epoch": 0.1284948949529683, "grad_norm": 10.572252217444298, "learning_rate": 4.924623101274193e-05, "loss": 2.051, "mean_token_accuracy": 0.47586206197738645, "step": 127575 }, { "epoch": 0.12849993100607246, "grad_norm": 9.972141032575564, "learning_rate": 4.924613483205005e-05, "loss": 2.5891, "mean_token_accuracy": 0.38620689511299133, "step": 127580 }, { "epoch": 0.12850496705917663, "grad_norm": 23.634134233052833, "learning_rate": 4.924603864532679e-05, "loss": 2.5493, "mean_token_accuracy": 0.4620689690113068, "step": 127585 }, { "epoch": 0.1285100031122808, "grad_norm": 9.920526659620677, "learning_rate": 4.924594245257219e-05, "loss": 2.1956, "mean_token_accuracy": 0.4862068951129913, "step": 127590 }, { "epoch": 0.12851503916538498, "grad_norm": 9.685879530127387, "learning_rate": 4.9245846253786265e-05, "loss": 2.0457, "mean_token_accuracy": 0.5054446458816528, "step": 127595 }, { "epoch": 0.12852007521848915, "grad_norm": 9.778513673376692, "learning_rate": 4.924575004896905e-05, "loss": 2.2707, "mean_token_accuracy": 0.4551724076271057, "step": 127600 }, { "epoch": 0.12852511127159333, "grad_norm": 11.891404312162072, "learning_rate": 4.924565383812057e-05, "loss": 2.4989, "mean_token_accuracy": 0.37586206793785093, "step": 127605 }, { "epoch": 0.1285301473246975, "grad_norm": 10.75039502544218, "learning_rate": 4.9245557621240856e-05, "loss": 2.5416, "mean_token_accuracy": 0.3862068891525269, "step": 127610 }, { "epoch": 0.12853518337780168, "grad_norm": 9.612919729099593, "learning_rate": 4.924546139832992e-05, "loss": 2.7252, "mean_token_accuracy": 0.40514217019081117, "step": 127615 }, { "epoch": 0.12854021943090585, "grad_norm": 10.037783219321234, "learning_rate": 4.924536516938781e-05, "loss": 2.2765, "mean_token_accuracy": 0.4344827651977539, "step": 127620 }, { "epoch": 0.12854525548401002, "grad_norm": 10.355716639114506, "learning_rate": 4.924526893441454e-05, "loss": 2.578, "mean_token_accuracy": 0.37931033968925476, "step": 127625 }, { "epoch": 0.1285502915371142, "grad_norm": 11.434283329777282, "learning_rate": 4.9245172693410137e-05, "loss": 2.3593, "mean_token_accuracy": 0.42413792610168455, "step": 127630 }, { "epoch": 0.12855532759021837, "grad_norm": 9.883753883517219, "learning_rate": 4.924507644637462e-05, "loss": 2.4999, "mean_token_accuracy": 0.37586207389831544, "step": 127635 }, { "epoch": 0.12856036364332254, "grad_norm": 9.85825155963918, "learning_rate": 4.9244980193308035e-05, "loss": 2.1288, "mean_token_accuracy": 0.41034482717514037, "step": 127640 }, { "epoch": 0.12856539969642672, "grad_norm": 9.490900552391723, "learning_rate": 4.92448839342104e-05, "loss": 2.2676, "mean_token_accuracy": 0.40689654350280763, "step": 127645 }, { "epoch": 0.1285704357495309, "grad_norm": 10.392967148786031, "learning_rate": 4.924478766908174e-05, "loss": 2.3756, "mean_token_accuracy": 0.42758620381355283, "step": 127650 }, { "epoch": 0.12857547180263507, "grad_norm": 10.196652029877601, "learning_rate": 4.924469139792208e-05, "loss": 2.4218, "mean_token_accuracy": 0.4151845157146454, "step": 127655 }, { "epoch": 0.12858050785573924, "grad_norm": 10.106740909292347, "learning_rate": 4.9244595120731454e-05, "loss": 2.5054, "mean_token_accuracy": 0.40889291763305663, "step": 127660 }, { "epoch": 0.1285855439088434, "grad_norm": 8.746469829851762, "learning_rate": 4.9244498837509865e-05, "loss": 2.1263, "mean_token_accuracy": 0.4848759710788727, "step": 127665 }, { "epoch": 0.1285905799619476, "grad_norm": 11.96363592722122, "learning_rate": 4.9244402548257374e-05, "loss": 2.3255, "mean_token_accuracy": 0.4517241299152374, "step": 127670 }, { "epoch": 0.12859561601505176, "grad_norm": 12.873178156372564, "learning_rate": 4.9244306252973995e-05, "loss": 2.1929, "mean_token_accuracy": 0.4620689690113068, "step": 127675 }, { "epoch": 0.12860065206815594, "grad_norm": 12.555515286965361, "learning_rate": 4.924420995165974e-05, "loss": 2.6105, "mean_token_accuracy": 0.42068966031074523, "step": 127680 }, { "epoch": 0.1286056881212601, "grad_norm": 11.257758453877543, "learning_rate": 4.924411364431465e-05, "loss": 2.2747, "mean_token_accuracy": 0.46551724076271056, "step": 127685 }, { "epoch": 0.12861072417436428, "grad_norm": 9.524178075990802, "learning_rate": 4.924401733093875e-05, "loss": 2.2319, "mean_token_accuracy": 0.43448275327682495, "step": 127690 }, { "epoch": 0.12861576022746846, "grad_norm": 10.73306407693994, "learning_rate": 4.924392101153207e-05, "loss": 2.8543, "mean_token_accuracy": 0.4, "step": 127695 }, { "epoch": 0.12862079628057263, "grad_norm": 12.358542560353868, "learning_rate": 4.924382468609463e-05, "loss": 2.5391, "mean_token_accuracy": 0.41034482717514037, "step": 127700 }, { "epoch": 0.1286258323336768, "grad_norm": 12.803602730367725, "learning_rate": 4.9243728354626465e-05, "loss": 2.2057, "mean_token_accuracy": 0.4665024638175964, "step": 127705 }, { "epoch": 0.12863086838678098, "grad_norm": 11.912318131331174, "learning_rate": 4.924363201712758e-05, "loss": 2.0926, "mean_token_accuracy": 0.482758617401123, "step": 127710 }, { "epoch": 0.12863590443988515, "grad_norm": 8.778746103534983, "learning_rate": 4.924353567359804e-05, "loss": 2.2093, "mean_token_accuracy": 0.4517241358757019, "step": 127715 }, { "epoch": 0.1286409404929893, "grad_norm": 11.294805026590723, "learning_rate": 4.9243439324037824e-05, "loss": 2.9956, "mean_token_accuracy": 0.3517241358757019, "step": 127720 }, { "epoch": 0.12864597654609347, "grad_norm": 12.377306215613103, "learning_rate": 4.9243342968447e-05, "loss": 2.149, "mean_token_accuracy": 0.4551724076271057, "step": 127725 }, { "epoch": 0.12865101259919764, "grad_norm": 12.352788698625478, "learning_rate": 4.924324660682558e-05, "loss": 2.0403, "mean_token_accuracy": 0.4689655125141144, "step": 127730 }, { "epoch": 0.12865604865230182, "grad_norm": 12.446138862570537, "learning_rate": 4.924315023917358e-05, "loss": 2.3995, "mean_token_accuracy": 0.4482758641242981, "step": 127735 }, { "epoch": 0.128661084705406, "grad_norm": 9.314516141434398, "learning_rate": 4.924305386549104e-05, "loss": 2.4982, "mean_token_accuracy": 0.44827585816383364, "step": 127740 }, { "epoch": 0.12866612075851017, "grad_norm": 13.349780516521868, "learning_rate": 4.924295748577799e-05, "loss": 2.5256, "mean_token_accuracy": 0.39848759174346926, "step": 127745 }, { "epoch": 0.12867115681161434, "grad_norm": 9.868787205010163, "learning_rate": 4.924286110003444e-05, "loss": 2.539, "mean_token_accuracy": 0.43103448748588563, "step": 127750 }, { "epoch": 0.1286761928647185, "grad_norm": 8.303075561032642, "learning_rate": 4.9242764708260426e-05, "loss": 2.2235, "mean_token_accuracy": 0.46049606800079346, "step": 127755 }, { "epoch": 0.1286812289178227, "grad_norm": 8.700129701718794, "learning_rate": 4.924266831045598e-05, "loss": 2.3138, "mean_token_accuracy": 0.43448275327682495, "step": 127760 }, { "epoch": 0.12868626497092686, "grad_norm": 10.867260357626947, "learning_rate": 4.924257190662112e-05, "loss": 2.4015, "mean_token_accuracy": 0.41863279342651366, "step": 127765 }, { "epoch": 0.12869130102403104, "grad_norm": 9.941276875340995, "learning_rate": 4.924247549675588e-05, "loss": 2.3781, "mean_token_accuracy": 0.4241379201412201, "step": 127770 }, { "epoch": 0.1286963370771352, "grad_norm": 11.761954858608897, "learning_rate": 4.924237908086028e-05, "loss": 2.4607, "mean_token_accuracy": 0.42413792610168455, "step": 127775 }, { "epoch": 0.12870137313023938, "grad_norm": 11.724174152208045, "learning_rate": 4.924228265893436e-05, "loss": 2.4791, "mean_token_accuracy": 0.4379310369491577, "step": 127780 }, { "epoch": 0.12870640918334356, "grad_norm": 12.11535240758572, "learning_rate": 4.924218623097813e-05, "loss": 2.705, "mean_token_accuracy": 0.42413793206214906, "step": 127785 }, { "epoch": 0.12871144523644773, "grad_norm": 11.900259854091827, "learning_rate": 4.924208979699162e-05, "loss": 2.699, "mean_token_accuracy": 0.37241379022598264, "step": 127790 }, { "epoch": 0.1287164812895519, "grad_norm": 9.320682189089904, "learning_rate": 4.924199335697486e-05, "loss": 2.2684, "mean_token_accuracy": 0.4482758641242981, "step": 127795 }, { "epoch": 0.12872151734265608, "grad_norm": 10.173876403857657, "learning_rate": 4.9241896910927884e-05, "loss": 2.077, "mean_token_accuracy": 0.4620689570903778, "step": 127800 }, { "epoch": 0.12872655339576025, "grad_norm": 12.429764339461054, "learning_rate": 4.9241800458850704e-05, "loss": 2.3958, "mean_token_accuracy": 0.4206896543502808, "step": 127805 }, { "epoch": 0.12873158944886443, "grad_norm": 9.717841497686875, "learning_rate": 4.9241704000743354e-05, "loss": 2.219, "mean_token_accuracy": 0.4206896543502808, "step": 127810 }, { "epoch": 0.1287366255019686, "grad_norm": 8.179142170925058, "learning_rate": 4.9241607536605874e-05, "loss": 2.0514, "mean_token_accuracy": 0.47931034564971925, "step": 127815 }, { "epoch": 0.12874166155507277, "grad_norm": 9.36774110027047, "learning_rate": 4.9241511066438265e-05, "loss": 2.1227, "mean_token_accuracy": 0.4517241418361664, "step": 127820 }, { "epoch": 0.12874669760817695, "grad_norm": 16.963848948166888, "learning_rate": 4.9241414590240575e-05, "loss": 2.8065, "mean_token_accuracy": 0.3793103456497192, "step": 127825 }, { "epoch": 0.12875173366128112, "grad_norm": 9.067236700654885, "learning_rate": 4.9241318108012816e-05, "loss": 2.5271, "mean_token_accuracy": 0.42758620381355283, "step": 127830 }, { "epoch": 0.1287567697143853, "grad_norm": 10.033844406102103, "learning_rate": 4.924122161975502e-05, "loss": 2.0759, "mean_token_accuracy": 0.4620689690113068, "step": 127835 }, { "epoch": 0.12876180576748947, "grad_norm": 11.480512654263155, "learning_rate": 4.924112512546722e-05, "loss": 2.5, "mean_token_accuracy": 0.44827587008476255, "step": 127840 }, { "epoch": 0.12876684182059364, "grad_norm": 10.490263782530084, "learning_rate": 4.924102862514945e-05, "loss": 2.4743, "mean_token_accuracy": 0.3551724076271057, "step": 127845 }, { "epoch": 0.12877187787369782, "grad_norm": 11.952263836096163, "learning_rate": 4.92409321188017e-05, "loss": 2.5803, "mean_token_accuracy": 0.4137930989265442, "step": 127850 }, { "epoch": 0.128776913926802, "grad_norm": 12.367147219303193, "learning_rate": 4.9240835606424044e-05, "loss": 2.6585, "mean_token_accuracy": 0.3896551787853241, "step": 127855 }, { "epoch": 0.12878194997990614, "grad_norm": 8.913225504028858, "learning_rate": 4.924073908801647e-05, "loss": 2.8537, "mean_token_accuracy": 0.42413792610168455, "step": 127860 }, { "epoch": 0.1287869860330103, "grad_norm": 13.54135211834433, "learning_rate": 4.924064256357903e-05, "loss": 2.4155, "mean_token_accuracy": 0.38620689511299133, "step": 127865 }, { "epoch": 0.12879202208611448, "grad_norm": 11.067944021256205, "learning_rate": 4.9240546033111734e-05, "loss": 2.4104, "mean_token_accuracy": 0.4137930989265442, "step": 127870 }, { "epoch": 0.12879705813921866, "grad_norm": 8.913873877160082, "learning_rate": 4.924044949661462e-05, "loss": 2.3584, "mean_token_accuracy": 0.4482758641242981, "step": 127875 }, { "epoch": 0.12880209419232283, "grad_norm": 10.619948518487105, "learning_rate": 4.924035295408771e-05, "loss": 2.016, "mean_token_accuracy": 0.4620689690113068, "step": 127880 }, { "epoch": 0.128807130245427, "grad_norm": 8.548220761346169, "learning_rate": 4.924025640553104e-05, "loss": 2.2873, "mean_token_accuracy": 0.41724138855934145, "step": 127885 }, { "epoch": 0.12881216629853118, "grad_norm": 10.81221961159857, "learning_rate": 4.9240159850944624e-05, "loss": 2.9849, "mean_token_accuracy": 0.3758620619773865, "step": 127890 }, { "epoch": 0.12881720235163535, "grad_norm": 10.749276371479498, "learning_rate": 4.9240063290328486e-05, "loss": 2.7106, "mean_token_accuracy": 0.3896551698446274, "step": 127895 }, { "epoch": 0.12882223840473953, "grad_norm": 9.089119627198698, "learning_rate": 4.9239966723682666e-05, "loss": 2.377, "mean_token_accuracy": 0.4379310369491577, "step": 127900 }, { "epoch": 0.1288272744578437, "grad_norm": 7.1446871415420885, "learning_rate": 4.9239870151007184e-05, "loss": 2.2417, "mean_token_accuracy": 0.4551724135875702, "step": 127905 }, { "epoch": 0.12883231051094787, "grad_norm": 10.187841017023604, "learning_rate": 4.9239773572302075e-05, "loss": 2.2897, "mean_token_accuracy": 0.4517241418361664, "step": 127910 }, { "epoch": 0.12883734656405205, "grad_norm": 10.403386119245795, "learning_rate": 4.9239676987567345e-05, "loss": 2.6126, "mean_token_accuracy": 0.4241379350423813, "step": 127915 }, { "epoch": 0.12884238261715622, "grad_norm": 12.326672261273545, "learning_rate": 4.9239580396803054e-05, "loss": 2.734, "mean_token_accuracy": 0.3551724135875702, "step": 127920 }, { "epoch": 0.1288474186702604, "grad_norm": 9.206492008968038, "learning_rate": 4.923948380000919e-05, "loss": 2.032, "mean_token_accuracy": 0.458620685338974, "step": 127925 }, { "epoch": 0.12885245472336457, "grad_norm": 10.773759247858782, "learning_rate": 4.923938719718581e-05, "loss": 2.1891, "mean_token_accuracy": 0.4517241418361664, "step": 127930 }, { "epoch": 0.12885749077646874, "grad_norm": 9.995627006310333, "learning_rate": 4.923929058833293e-05, "loss": 1.9157, "mean_token_accuracy": 0.46551724672317507, "step": 127935 }, { "epoch": 0.12886252682957292, "grad_norm": 9.945657078055119, "learning_rate": 4.923919397345057e-05, "loss": 2.2962, "mean_token_accuracy": 0.47586206793785096, "step": 127940 }, { "epoch": 0.1288675628826771, "grad_norm": 9.875888206298198, "learning_rate": 4.9239097352538764e-05, "loss": 2.3374, "mean_token_accuracy": 0.42413793206214906, "step": 127945 }, { "epoch": 0.12887259893578126, "grad_norm": 12.603182960831314, "learning_rate": 4.923900072559755e-05, "loss": 2.4499, "mean_token_accuracy": 0.4034482717514038, "step": 127950 }, { "epoch": 0.12887763498888544, "grad_norm": 6.387231426374068, "learning_rate": 4.9238904092626934e-05, "loss": 2.1974, "mean_token_accuracy": 0.5034482717514038, "step": 127955 }, { "epoch": 0.1288826710419896, "grad_norm": 12.448383390707571, "learning_rate": 4.923880745362694e-05, "loss": 2.4921, "mean_token_accuracy": 0.41379311084747317, "step": 127960 }, { "epoch": 0.12888770709509378, "grad_norm": 10.895570328460016, "learning_rate": 4.923871080859763e-05, "loss": 2.5131, "mean_token_accuracy": 0.3896551728248596, "step": 127965 }, { "epoch": 0.12889274314819796, "grad_norm": 9.671907138873527, "learning_rate": 4.923861415753899e-05, "loss": 2.376, "mean_token_accuracy": 0.4344827592372894, "step": 127970 }, { "epoch": 0.12889777920130213, "grad_norm": 10.7373867442119, "learning_rate": 4.923851750045107e-05, "loss": 2.0071, "mean_token_accuracy": 0.5151477873325347, "step": 127975 }, { "epoch": 0.1289028152544063, "grad_norm": 10.500383645785941, "learning_rate": 4.923842083733389e-05, "loss": 2.3578, "mean_token_accuracy": 0.43974592089653014, "step": 127980 }, { "epoch": 0.12890785130751048, "grad_norm": 15.368089437734245, "learning_rate": 4.923832416818748e-05, "loss": 2.7182, "mean_token_accuracy": 0.4137930929660797, "step": 127985 }, { "epoch": 0.12891288736061465, "grad_norm": 9.960802926068677, "learning_rate": 4.923822749301186e-05, "loss": 2.6472, "mean_token_accuracy": 0.38965516686439516, "step": 127990 }, { "epoch": 0.12891792341371883, "grad_norm": 10.09790831119215, "learning_rate": 4.923813081180707e-05, "loss": 2.5491, "mean_token_accuracy": 0.42413792610168455, "step": 127995 }, { "epoch": 0.12892295946682297, "grad_norm": 9.149688846576023, "learning_rate": 4.923803412457313e-05, "loss": 2.0772, "mean_token_accuracy": 0.45716878175735476, "step": 128000 }, { "epoch": 0.12892799551992715, "grad_norm": 9.896589071965902, "learning_rate": 4.923793743131006e-05, "loss": 2.3349, "mean_token_accuracy": 0.41379311084747317, "step": 128005 }, { "epoch": 0.12893303157303132, "grad_norm": 9.850544263452704, "learning_rate": 4.923784073201788e-05, "loss": 2.4156, "mean_token_accuracy": 0.41379310488700866, "step": 128010 }, { "epoch": 0.1289380676261355, "grad_norm": 11.280189944010093, "learning_rate": 4.923774402669664e-05, "loss": 2.6709, "mean_token_accuracy": 0.4428917050361633, "step": 128015 }, { "epoch": 0.12894310367923967, "grad_norm": 10.265939459215152, "learning_rate": 4.923764731534636e-05, "loss": 2.373, "mean_token_accuracy": 0.4448275864124298, "step": 128020 }, { "epoch": 0.12894813973234384, "grad_norm": 10.446819118202718, "learning_rate": 4.923755059796706e-05, "loss": 2.4208, "mean_token_accuracy": 0.44827585220336913, "step": 128025 }, { "epoch": 0.12895317578544802, "grad_norm": 8.906470300846742, "learning_rate": 4.923745387455876e-05, "loss": 2.3462, "mean_token_accuracy": 0.4068965554237366, "step": 128030 }, { "epoch": 0.1289582118385522, "grad_norm": 8.416743895264542, "learning_rate": 4.9237357145121504e-05, "loss": 2.3328, "mean_token_accuracy": 0.4103448212146759, "step": 128035 }, { "epoch": 0.12896324789165636, "grad_norm": 12.411916065591448, "learning_rate": 4.9237260409655316e-05, "loss": 2.4392, "mean_token_accuracy": 0.44373866319656374, "step": 128040 }, { "epoch": 0.12896828394476054, "grad_norm": 11.547835284403805, "learning_rate": 4.923716366816021e-05, "loss": 2.5093, "mean_token_accuracy": 0.39310344457626345, "step": 128045 }, { "epoch": 0.1289733199978647, "grad_norm": 13.68837922765251, "learning_rate": 4.9237066920636224e-05, "loss": 2.8612, "mean_token_accuracy": 0.4118572294712067, "step": 128050 }, { "epoch": 0.12897835605096888, "grad_norm": 9.466380823989482, "learning_rate": 4.923697016708338e-05, "loss": 2.134, "mean_token_accuracy": 0.5103448331356049, "step": 128055 }, { "epoch": 0.12898339210407306, "grad_norm": 10.352341153421657, "learning_rate": 4.9236873407501705e-05, "loss": 2.5447, "mean_token_accuracy": 0.3896551728248596, "step": 128060 }, { "epoch": 0.12898842815717723, "grad_norm": 14.339941788551553, "learning_rate": 4.923677664189123e-05, "loss": 2.0564, "mean_token_accuracy": 0.5217980325222016, "step": 128065 }, { "epoch": 0.1289934642102814, "grad_norm": 10.45788698802218, "learning_rate": 4.9236679870251975e-05, "loss": 2.4836, "mean_token_accuracy": 0.42758620381355283, "step": 128070 }, { "epoch": 0.12899850026338558, "grad_norm": 11.187185664102419, "learning_rate": 4.923658309258397e-05, "loss": 2.5896, "mean_token_accuracy": 0.43793103098869324, "step": 128075 }, { "epoch": 0.12900353631648975, "grad_norm": 10.299184261023209, "learning_rate": 4.923648630888725e-05, "loss": 2.341, "mean_token_accuracy": 0.45517241954803467, "step": 128080 }, { "epoch": 0.12900857236959393, "grad_norm": 9.825751820106026, "learning_rate": 4.9236389519161826e-05, "loss": 2.3219, "mean_token_accuracy": 0.4413793087005615, "step": 128085 }, { "epoch": 0.1290136084226981, "grad_norm": 11.382378327600449, "learning_rate": 4.9236292723407736e-05, "loss": 2.4512, "mean_token_accuracy": 0.44482758045196535, "step": 128090 }, { "epoch": 0.12901864447580227, "grad_norm": 12.597371176736944, "learning_rate": 4.923619592162501e-05, "loss": 2.1234, "mean_token_accuracy": 0.4620689630508423, "step": 128095 }, { "epoch": 0.12902368052890645, "grad_norm": 11.599556806224502, "learning_rate": 4.923609911381366e-05, "loss": 2.3214, "mean_token_accuracy": 0.43103447556495667, "step": 128100 }, { "epoch": 0.12902871658201062, "grad_norm": 10.661955965787193, "learning_rate": 4.923600229997373e-05, "loss": 1.9721, "mean_token_accuracy": 0.5241379320621491, "step": 128105 }, { "epoch": 0.1290337526351148, "grad_norm": 9.6460281938407, "learning_rate": 4.9235905480105235e-05, "loss": 2.4721, "mean_token_accuracy": 0.43647912740707395, "step": 128110 }, { "epoch": 0.12903878868821897, "grad_norm": 13.656770520630891, "learning_rate": 4.92358086542082e-05, "loss": 2.7494, "mean_token_accuracy": 0.3655172407627106, "step": 128115 }, { "epoch": 0.12904382474132314, "grad_norm": 10.38199837686001, "learning_rate": 4.923571182228267e-05, "loss": 2.6349, "mean_token_accuracy": 0.3931034505367279, "step": 128120 }, { "epoch": 0.12904886079442732, "grad_norm": 12.340078572625343, "learning_rate": 4.923561498432865e-05, "loss": 2.1448, "mean_token_accuracy": 0.4551724076271057, "step": 128125 }, { "epoch": 0.1290538968475315, "grad_norm": 9.270954449059742, "learning_rate": 4.923551814034618e-05, "loss": 2.5246, "mean_token_accuracy": 0.44137929677963256, "step": 128130 }, { "epoch": 0.12905893290063566, "grad_norm": 9.518150641211344, "learning_rate": 4.923542129033528e-05, "loss": 2.1757, "mean_token_accuracy": 0.4620689570903778, "step": 128135 }, { "epoch": 0.1290639689537398, "grad_norm": 13.75313540086422, "learning_rate": 4.9235324434295976e-05, "loss": 2.2751, "mean_token_accuracy": 0.4534785389900208, "step": 128140 }, { "epoch": 0.12906900500684398, "grad_norm": 10.193098893792, "learning_rate": 4.9235227572228306e-05, "loss": 2.0801, "mean_token_accuracy": 0.4551724076271057, "step": 128145 }, { "epoch": 0.12907404105994816, "grad_norm": 13.76885370745627, "learning_rate": 4.923513070413228e-05, "loss": 2.7368, "mean_token_accuracy": 0.3896551728248596, "step": 128150 }, { "epoch": 0.12907907711305233, "grad_norm": 10.137959076144442, "learning_rate": 4.9235033830007947e-05, "loss": 2.3066, "mean_token_accuracy": 0.4620689630508423, "step": 128155 }, { "epoch": 0.1290841131661565, "grad_norm": 9.497434500079118, "learning_rate": 4.923493694985532e-05, "loss": 2.2012, "mean_token_accuracy": 0.493103438615799, "step": 128160 }, { "epoch": 0.12908914921926068, "grad_norm": 10.618999431179382, "learning_rate": 4.923484006367442e-05, "loss": 2.2725, "mean_token_accuracy": 0.36551723480224607, "step": 128165 }, { "epoch": 0.12909418527236485, "grad_norm": 23.14589823289881, "learning_rate": 4.923474317146528e-05, "loss": 2.4004, "mean_token_accuracy": 0.4310344815254211, "step": 128170 }, { "epoch": 0.12909922132546903, "grad_norm": 11.869449548702043, "learning_rate": 4.9234646273227937e-05, "loss": 2.2808, "mean_token_accuracy": 0.4551724076271057, "step": 128175 }, { "epoch": 0.1291042573785732, "grad_norm": 17.95330533625835, "learning_rate": 4.92345493689624e-05, "loss": 3.1248, "mean_token_accuracy": 0.3793103456497192, "step": 128180 }, { "epoch": 0.12910929343167737, "grad_norm": 11.767762272627898, "learning_rate": 4.923445245866871e-05, "loss": 2.5562, "mean_token_accuracy": 0.42413793206214906, "step": 128185 }, { "epoch": 0.12911432948478155, "grad_norm": 11.455829641652464, "learning_rate": 4.923435554234688e-05, "loss": 2.5565, "mean_token_accuracy": 0.3931034505367279, "step": 128190 }, { "epoch": 0.12911936553788572, "grad_norm": 18.73248184651721, "learning_rate": 4.9234258619996956e-05, "loss": 2.3877, "mean_token_accuracy": 0.47931034564971925, "step": 128195 }, { "epoch": 0.1291244015909899, "grad_norm": 12.882458843004732, "learning_rate": 4.923416169161895e-05, "loss": 2.194, "mean_token_accuracy": 0.44827585220336913, "step": 128200 }, { "epoch": 0.12912943764409407, "grad_norm": 11.47918219496123, "learning_rate": 4.9234064757212894e-05, "loss": 2.3348, "mean_token_accuracy": 0.42413793206214906, "step": 128205 }, { "epoch": 0.12913447369719824, "grad_norm": 10.204767922131863, "learning_rate": 4.9233967816778814e-05, "loss": 2.3474, "mean_token_accuracy": 0.4344827651977539, "step": 128210 }, { "epoch": 0.12913950975030242, "grad_norm": 11.021061055877462, "learning_rate": 4.923387087031674e-05, "loss": 2.3792, "mean_token_accuracy": 0.4, "step": 128215 }, { "epoch": 0.1291445458034066, "grad_norm": 11.659137608060712, "learning_rate": 4.923377391782669e-05, "loss": 2.4856, "mean_token_accuracy": 0.4586206912994385, "step": 128220 }, { "epoch": 0.12914958185651076, "grad_norm": 12.511110755537983, "learning_rate": 4.9233676959308706e-05, "loss": 2.6203, "mean_token_accuracy": 0.4172413766384125, "step": 128225 }, { "epoch": 0.12915461790961494, "grad_norm": 10.143849413269466, "learning_rate": 4.923357999476279e-05, "loss": 2.1678, "mean_token_accuracy": 0.4689655065536499, "step": 128230 }, { "epoch": 0.1291596539627191, "grad_norm": 12.378513577454244, "learning_rate": 4.923348302418899e-05, "loss": 2.2234, "mean_token_accuracy": 0.4275861978530884, "step": 128235 }, { "epoch": 0.12916469001582329, "grad_norm": 9.238350809807386, "learning_rate": 4.923338604758734e-05, "loss": 2.3772, "mean_token_accuracy": 0.38965516686439516, "step": 128240 }, { "epoch": 0.12916972606892746, "grad_norm": 10.017038917496683, "learning_rate": 4.9233289064957844e-05, "loss": 2.6635, "mean_token_accuracy": 0.4068965405225754, "step": 128245 }, { "epoch": 0.12917476212203163, "grad_norm": 13.79515467311658, "learning_rate": 4.923319207630054e-05, "loss": 2.2718, "mean_token_accuracy": 0.4689655125141144, "step": 128250 }, { "epoch": 0.1291797981751358, "grad_norm": 10.141504044822554, "learning_rate": 4.9233095081615454e-05, "loss": 2.4224, "mean_token_accuracy": 0.46551724672317507, "step": 128255 }, { "epoch": 0.12918483422823998, "grad_norm": 9.756983224290494, "learning_rate": 4.923299808090261e-05, "loss": 2.4208, "mean_token_accuracy": 0.4103448212146759, "step": 128260 }, { "epoch": 0.12918987028134415, "grad_norm": 10.809049172746894, "learning_rate": 4.923290107416204e-05, "loss": 2.396, "mean_token_accuracy": 0.4068965494632721, "step": 128265 }, { "epoch": 0.12919490633444833, "grad_norm": 12.873062680950348, "learning_rate": 4.923280406139377e-05, "loss": 2.4719, "mean_token_accuracy": 0.4172413796186447, "step": 128270 }, { "epoch": 0.12919994238755247, "grad_norm": 12.02306164832703, "learning_rate": 4.923270704259783e-05, "loss": 2.3277, "mean_token_accuracy": 0.4517241358757019, "step": 128275 }, { "epoch": 0.12920497844065665, "grad_norm": 10.543216943325918, "learning_rate": 4.9232610017774235e-05, "loss": 2.429, "mean_token_accuracy": 0.4359346628189087, "step": 128280 }, { "epoch": 0.12921001449376082, "grad_norm": 9.285387787629254, "learning_rate": 4.9232512986923027e-05, "loss": 2.3374, "mean_token_accuracy": 0.3965517163276672, "step": 128285 }, { "epoch": 0.129215050546865, "grad_norm": 8.288132963426827, "learning_rate": 4.9232415950044216e-05, "loss": 2.1738, "mean_token_accuracy": 0.4344827592372894, "step": 128290 }, { "epoch": 0.12922008659996917, "grad_norm": 9.100106675981571, "learning_rate": 4.9232318907137845e-05, "loss": 2.3022, "mean_token_accuracy": 0.4310344815254211, "step": 128295 }, { "epoch": 0.12922512265307334, "grad_norm": 8.591850562778362, "learning_rate": 4.923222185820393e-05, "loss": 2.2747, "mean_token_accuracy": 0.4620689630508423, "step": 128300 }, { "epoch": 0.12923015870617752, "grad_norm": 11.711904980572218, "learning_rate": 4.923212480324251e-05, "loss": 2.5069, "mean_token_accuracy": 0.417241370677948, "step": 128305 }, { "epoch": 0.1292351947592817, "grad_norm": 10.441602356776599, "learning_rate": 4.92320277422536e-05, "loss": 2.4624, "mean_token_accuracy": 0.44827585220336913, "step": 128310 }, { "epoch": 0.12924023081238586, "grad_norm": 8.936264956571058, "learning_rate": 4.9231930675237224e-05, "loss": 2.0904, "mean_token_accuracy": 0.4620689511299133, "step": 128315 }, { "epoch": 0.12924526686549004, "grad_norm": 11.696122010133827, "learning_rate": 4.923183360219343e-05, "loss": 2.5087, "mean_token_accuracy": 0.46268472671508787, "step": 128320 }, { "epoch": 0.1292503029185942, "grad_norm": 9.49282278617059, "learning_rate": 4.9231736523122226e-05, "loss": 2.339, "mean_token_accuracy": 0.4781004250049591, "step": 128325 }, { "epoch": 0.12925533897169839, "grad_norm": 10.560907837869582, "learning_rate": 4.923163943802364e-05, "loss": 2.1361, "mean_token_accuracy": 0.4931034445762634, "step": 128330 }, { "epoch": 0.12926037502480256, "grad_norm": 7.868158711451674, "learning_rate": 4.92315423468977e-05, "loss": 1.82, "mean_token_accuracy": 0.5434361755847931, "step": 128335 }, { "epoch": 0.12926541107790673, "grad_norm": 11.559445731840162, "learning_rate": 4.923144524974444e-05, "loss": 2.3789, "mean_token_accuracy": 0.4448275864124298, "step": 128340 }, { "epoch": 0.1292704471310109, "grad_norm": 8.62185157778288, "learning_rate": 4.9231348146563875e-05, "loss": 2.5222, "mean_token_accuracy": 0.4517241418361664, "step": 128345 }, { "epoch": 0.12927548318411508, "grad_norm": 10.499650567420986, "learning_rate": 4.9231251037356054e-05, "loss": 2.6852, "mean_token_accuracy": 0.4, "step": 128350 }, { "epoch": 0.12928051923721925, "grad_norm": 9.987244560985339, "learning_rate": 4.9231153922120983e-05, "loss": 2.2718, "mean_token_accuracy": 0.42758620381355283, "step": 128355 }, { "epoch": 0.12928555529032343, "grad_norm": 11.609162890519203, "learning_rate": 4.923105680085869e-05, "loss": 2.3983, "mean_token_accuracy": 0.44827587008476255, "step": 128360 }, { "epoch": 0.1292905913434276, "grad_norm": 10.785544627357782, "learning_rate": 4.923095967356922e-05, "loss": 2.6034, "mean_token_accuracy": 0.3793103456497192, "step": 128365 }, { "epoch": 0.12929562739653178, "grad_norm": 10.05384063989405, "learning_rate": 4.9230862540252575e-05, "loss": 2.3357, "mean_token_accuracy": 0.4482758462429047, "step": 128370 }, { "epoch": 0.12930066344963595, "grad_norm": 10.077603193180943, "learning_rate": 4.92307654009088e-05, "loss": 2.6173, "mean_token_accuracy": 0.42413793206214906, "step": 128375 }, { "epoch": 0.12930569950274012, "grad_norm": 12.94194483584215, "learning_rate": 4.923066825553791e-05, "loss": 2.4918, "mean_token_accuracy": 0.40689654648303986, "step": 128380 }, { "epoch": 0.1293107355558443, "grad_norm": 8.653185343151973, "learning_rate": 4.9230571104139946e-05, "loss": 1.9475, "mean_token_accuracy": 0.5137931048870087, "step": 128385 }, { "epoch": 0.12931577160894847, "grad_norm": 10.093317038606145, "learning_rate": 4.923047394671492e-05, "loss": 2.7248, "mean_token_accuracy": 0.35172413289546967, "step": 128390 }, { "epoch": 0.12932080766205264, "grad_norm": 9.015370340150174, "learning_rate": 4.923037678326287e-05, "loss": 2.3865, "mean_token_accuracy": 0.43448275327682495, "step": 128395 }, { "epoch": 0.12932584371515682, "grad_norm": 12.727613415682727, "learning_rate": 4.9230279613783815e-05, "loss": 2.718, "mean_token_accuracy": 0.37241379022598264, "step": 128400 }, { "epoch": 0.129330879768261, "grad_norm": 12.893233089636619, "learning_rate": 4.923018243827779e-05, "loss": 2.6024, "mean_token_accuracy": 0.38620689511299133, "step": 128405 }, { "epoch": 0.12933591582136517, "grad_norm": 9.769822701325634, "learning_rate": 4.923008525674482e-05, "loss": 2.3737, "mean_token_accuracy": 0.42758620977401735, "step": 128410 }, { "epoch": 0.1293409518744693, "grad_norm": 11.093732285389486, "learning_rate": 4.922998806918493e-05, "loss": 2.2606, "mean_token_accuracy": 0.44827587008476255, "step": 128415 }, { "epoch": 0.12934598792757349, "grad_norm": 11.831503662923112, "learning_rate": 4.922989087559814e-05, "loss": 2.3646, "mean_token_accuracy": 0.44482759237289426, "step": 128420 }, { "epoch": 0.12935102398067766, "grad_norm": 10.932611173680359, "learning_rate": 4.9229793675984484e-05, "loss": 2.516, "mean_token_accuracy": 0.4, "step": 128425 }, { "epoch": 0.12935606003378183, "grad_norm": 10.720429181341594, "learning_rate": 4.9229696470343995e-05, "loss": 2.5235, "mean_token_accuracy": 0.4103448212146759, "step": 128430 }, { "epoch": 0.129361096086886, "grad_norm": 9.057200822809468, "learning_rate": 4.922959925867669e-05, "loss": 2.3166, "mean_token_accuracy": 0.3999999940395355, "step": 128435 }, { "epoch": 0.12936613213999018, "grad_norm": 11.393283278613834, "learning_rate": 4.92295020409826e-05, "loss": 2.3571, "mean_token_accuracy": 0.42758620977401735, "step": 128440 }, { "epoch": 0.12937116819309435, "grad_norm": 9.417508024936371, "learning_rate": 4.922940481726175e-05, "loss": 2.2085, "mean_token_accuracy": 0.4172413766384125, "step": 128445 }, { "epoch": 0.12937620424619853, "grad_norm": 10.641598745260088, "learning_rate": 4.9229307587514174e-05, "loss": 2.605, "mean_token_accuracy": 0.4000000059604645, "step": 128450 }, { "epoch": 0.1293812402993027, "grad_norm": 12.391252616413993, "learning_rate": 4.9229210351739884e-05, "loss": 2.7849, "mean_token_accuracy": 0.3724137991666794, "step": 128455 }, { "epoch": 0.12938627635240688, "grad_norm": 10.609599197597293, "learning_rate": 4.922911310993892e-05, "loss": 2.4388, "mean_token_accuracy": 0.4137930989265442, "step": 128460 }, { "epoch": 0.12939131240551105, "grad_norm": 11.084332974920564, "learning_rate": 4.922901586211131e-05, "loss": 2.2999, "mean_token_accuracy": 0.4241379380226135, "step": 128465 }, { "epoch": 0.12939634845861522, "grad_norm": 9.929504039439461, "learning_rate": 4.922891860825707e-05, "loss": 2.4784, "mean_token_accuracy": 0.4586206912994385, "step": 128470 }, { "epoch": 0.1294013845117194, "grad_norm": 15.988994757873366, "learning_rate": 4.9228821348376245e-05, "loss": 2.6338, "mean_token_accuracy": 0.4379310250282288, "step": 128475 }, { "epoch": 0.12940642056482357, "grad_norm": 10.086236902092688, "learning_rate": 4.9228724082468846e-05, "loss": 2.2012, "mean_token_accuracy": 0.4448275864124298, "step": 128480 }, { "epoch": 0.12941145661792774, "grad_norm": 9.919850608237327, "learning_rate": 4.92286268105349e-05, "loss": 2.4092, "mean_token_accuracy": 0.42068966031074523, "step": 128485 }, { "epoch": 0.12941649267103192, "grad_norm": 10.36151074613392, "learning_rate": 4.9228529532574436e-05, "loss": 2.6474, "mean_token_accuracy": 0.41034482717514037, "step": 128490 }, { "epoch": 0.1294215287241361, "grad_norm": 10.814078549595507, "learning_rate": 4.9228432248587486e-05, "loss": 1.9901, "mean_token_accuracy": 0.46388384103775027, "step": 128495 }, { "epoch": 0.12942656477724027, "grad_norm": 9.628282644193899, "learning_rate": 4.922833495857408e-05, "loss": 2.3803, "mean_token_accuracy": 0.42649728059768677, "step": 128500 }, { "epoch": 0.12943160083034444, "grad_norm": 10.190468686052826, "learning_rate": 4.922823766253423e-05, "loss": 2.1543, "mean_token_accuracy": 0.4085904359817505, "step": 128505 }, { "epoch": 0.1294366368834486, "grad_norm": 12.736714565369475, "learning_rate": 4.922814036046797e-05, "loss": 2.2055, "mean_token_accuracy": 0.47241379618644713, "step": 128510 }, { "epoch": 0.1294416729365528, "grad_norm": 13.81866931657363, "learning_rate": 4.922804305237534e-05, "loss": 2.1993, "mean_token_accuracy": 0.4551724135875702, "step": 128515 }, { "epoch": 0.12944670898965696, "grad_norm": 11.332984657417704, "learning_rate": 4.922794573825635e-05, "loss": 2.291, "mean_token_accuracy": 0.43448275327682495, "step": 128520 }, { "epoch": 0.12945174504276113, "grad_norm": 10.612774116665511, "learning_rate": 4.9227848418111036e-05, "loss": 2.2284, "mean_token_accuracy": 0.41034482717514037, "step": 128525 }, { "epoch": 0.1294567810958653, "grad_norm": 10.647990408439938, "learning_rate": 4.9227751091939425e-05, "loss": 2.6217, "mean_token_accuracy": 0.3793103456497192, "step": 128530 }, { "epoch": 0.12946181714896948, "grad_norm": 10.215387089311125, "learning_rate": 4.922765375974154e-05, "loss": 2.0988, "mean_token_accuracy": 0.4517241358757019, "step": 128535 }, { "epoch": 0.12946685320207366, "grad_norm": 9.50565017360137, "learning_rate": 4.9227556421517404e-05, "loss": 2.5273, "mean_token_accuracy": 0.48275862336158754, "step": 128540 }, { "epoch": 0.12947188925517783, "grad_norm": 12.570121742660366, "learning_rate": 4.922745907726705e-05, "loss": 2.3682, "mean_token_accuracy": 0.4206896543502808, "step": 128545 }, { "epoch": 0.129476925308282, "grad_norm": 10.587107295593276, "learning_rate": 4.9227361726990505e-05, "loss": 2.6492, "mean_token_accuracy": 0.3999999940395355, "step": 128550 }, { "epoch": 0.12948196136138615, "grad_norm": 10.889903996097548, "learning_rate": 4.92272643706878e-05, "loss": 2.325, "mean_token_accuracy": 0.4551724135875702, "step": 128555 }, { "epoch": 0.12948699741449032, "grad_norm": 11.250836260352825, "learning_rate": 4.9227167008358946e-05, "loss": 2.2902, "mean_token_accuracy": 0.3931034505367279, "step": 128560 }, { "epoch": 0.1294920334675945, "grad_norm": 9.692144480822643, "learning_rate": 4.9227069640003984e-05, "loss": 2.6993, "mean_token_accuracy": 0.46182698011398315, "step": 128565 }, { "epoch": 0.12949706952069867, "grad_norm": 10.364059605856148, "learning_rate": 4.922697226562295e-05, "loss": 2.3901, "mean_token_accuracy": 0.40689654350280763, "step": 128570 }, { "epoch": 0.12950210557380284, "grad_norm": 10.945650258632602, "learning_rate": 4.9226874885215854e-05, "loss": 2.3936, "mean_token_accuracy": 0.42068964838981626, "step": 128575 }, { "epoch": 0.12950714162690702, "grad_norm": 13.475264468376844, "learning_rate": 4.922677749878272e-05, "loss": 2.6797, "mean_token_accuracy": 0.34137930572032926, "step": 128580 }, { "epoch": 0.1295121776800112, "grad_norm": 9.29288252708925, "learning_rate": 4.922668010632359e-05, "loss": 2.2817, "mean_token_accuracy": 0.4068965524435043, "step": 128585 }, { "epoch": 0.12951721373311537, "grad_norm": 15.144942365167076, "learning_rate": 4.922658270783848e-05, "loss": 2.2094, "mean_token_accuracy": 0.4517241358757019, "step": 128590 }, { "epoch": 0.12952224978621954, "grad_norm": 14.56149968575802, "learning_rate": 4.922648530332743e-05, "loss": 2.5125, "mean_token_accuracy": 0.43103448748588563, "step": 128595 }, { "epoch": 0.1295272858393237, "grad_norm": 9.788288204012805, "learning_rate": 4.922638789279045e-05, "loss": 2.5252, "mean_token_accuracy": 0.4310344815254211, "step": 128600 }, { "epoch": 0.1295323218924279, "grad_norm": 11.313123335277009, "learning_rate": 4.922629047622758e-05, "loss": 2.375, "mean_token_accuracy": 0.41379310488700866, "step": 128605 }, { "epoch": 0.12953735794553206, "grad_norm": 11.983241871420145, "learning_rate": 4.922619305363884e-05, "loss": 2.4844, "mean_token_accuracy": 0.4, "step": 128610 }, { "epoch": 0.12954239399863623, "grad_norm": 9.775363386181654, "learning_rate": 4.9226095625024256e-05, "loss": 2.6076, "mean_token_accuracy": 0.3655172407627106, "step": 128615 }, { "epoch": 0.1295474300517404, "grad_norm": 10.321132760324106, "learning_rate": 4.922599819038386e-05, "loss": 2.2395, "mean_token_accuracy": 0.42068966031074523, "step": 128620 }, { "epoch": 0.12955246610484458, "grad_norm": 13.021030834541213, "learning_rate": 4.9225900749717683e-05, "loss": 2.515, "mean_token_accuracy": 0.36896551251411436, "step": 128625 }, { "epoch": 0.12955750215794876, "grad_norm": 10.368196936865282, "learning_rate": 4.922580330302574e-05, "loss": 2.3935, "mean_token_accuracy": 0.4206896543502808, "step": 128630 }, { "epoch": 0.12956253821105293, "grad_norm": 31.510578625425886, "learning_rate": 4.922570585030808e-05, "loss": 3.061, "mean_token_accuracy": 0.3379310369491577, "step": 128635 }, { "epoch": 0.1295675742641571, "grad_norm": 14.779858693807187, "learning_rate": 4.922560839156469e-05, "loss": 2.7028, "mean_token_accuracy": 0.37241379618644715, "step": 128640 }, { "epoch": 0.12957261031726128, "grad_norm": 11.210140774520562, "learning_rate": 4.922551092679564e-05, "loss": 2.3217, "mean_token_accuracy": 0.4413793087005615, "step": 128645 }, { "epoch": 0.12957764637036545, "grad_norm": 10.49835099290548, "learning_rate": 4.922541345600093e-05, "loss": 2.3338, "mean_token_accuracy": 0.4502722263336182, "step": 128650 }, { "epoch": 0.12958268242346963, "grad_norm": 9.933881739230081, "learning_rate": 4.922531597918059e-05, "loss": 2.4273, "mean_token_accuracy": 0.43448275327682495, "step": 128655 }, { "epoch": 0.1295877184765738, "grad_norm": 9.02386435036657, "learning_rate": 4.922521849633467e-05, "loss": 2.3226, "mean_token_accuracy": 0.42758620381355283, "step": 128660 }, { "epoch": 0.12959275452967797, "grad_norm": 9.422806619540497, "learning_rate": 4.922512100746316e-05, "loss": 2.2151, "mean_token_accuracy": 0.45172412395477296, "step": 128665 }, { "epoch": 0.12959779058278215, "grad_norm": 10.513136882419335, "learning_rate": 4.922502351256612e-05, "loss": 2.7507, "mean_token_accuracy": 0.358620685338974, "step": 128670 }, { "epoch": 0.12960282663588632, "grad_norm": 10.102674252622704, "learning_rate": 4.922492601164356e-05, "loss": 2.3054, "mean_token_accuracy": 0.4551724135875702, "step": 128675 }, { "epoch": 0.1296078626889905, "grad_norm": 10.341627093599968, "learning_rate": 4.922482850469551e-05, "loss": 2.5214, "mean_token_accuracy": 0.42068966031074523, "step": 128680 }, { "epoch": 0.12961289874209467, "grad_norm": 9.825369302983255, "learning_rate": 4.9224730991721996e-05, "loss": 2.2416, "mean_token_accuracy": 0.4448275864124298, "step": 128685 }, { "epoch": 0.12961793479519884, "grad_norm": 8.92783688411926, "learning_rate": 4.922463347272305e-05, "loss": 2.0701, "mean_token_accuracy": 0.47241379618644713, "step": 128690 }, { "epoch": 0.129622970848303, "grad_norm": 13.58763556782782, "learning_rate": 4.92245359476987e-05, "loss": 2.4561, "mean_token_accuracy": 0.4103448212146759, "step": 128695 }, { "epoch": 0.12962800690140716, "grad_norm": 10.210042775066235, "learning_rate": 4.922443841664896e-05, "loss": 2.3501, "mean_token_accuracy": 0.39310344457626345, "step": 128700 }, { "epoch": 0.12963304295451133, "grad_norm": 14.199492185502088, "learning_rate": 4.9224340879573864e-05, "loss": 2.4118, "mean_token_accuracy": 0.45985221266746523, "step": 128705 }, { "epoch": 0.1296380790076155, "grad_norm": 9.968943710280945, "learning_rate": 4.922424333647345e-05, "loss": 2.1674, "mean_token_accuracy": 0.46551724672317507, "step": 128710 }, { "epoch": 0.12964311506071968, "grad_norm": 11.371724280845331, "learning_rate": 4.922414578734773e-05, "loss": 2.8526, "mean_token_accuracy": 0.3896551728248596, "step": 128715 }, { "epoch": 0.12964815111382386, "grad_norm": 9.973817709465926, "learning_rate": 4.922404823219674e-05, "loss": 2.3886, "mean_token_accuracy": 0.4365396320819855, "step": 128720 }, { "epoch": 0.12965318716692803, "grad_norm": 10.75277288340718, "learning_rate": 4.92239506710205e-05, "loss": 2.295, "mean_token_accuracy": 0.4068965554237366, "step": 128725 }, { "epoch": 0.1296582232200322, "grad_norm": 11.649506136148458, "learning_rate": 4.922385310381906e-05, "loss": 3.0828, "mean_token_accuracy": 0.3689655065536499, "step": 128730 }, { "epoch": 0.12966325927313638, "grad_norm": 10.35002744805028, "learning_rate": 4.922375553059241e-05, "loss": 2.6444, "mean_token_accuracy": 0.41034482717514037, "step": 128735 }, { "epoch": 0.12966829532624055, "grad_norm": 14.850963433565857, "learning_rate": 4.922365795134059e-05, "loss": 2.3013, "mean_token_accuracy": 0.43793103098869324, "step": 128740 }, { "epoch": 0.12967333137934473, "grad_norm": 10.138083259821439, "learning_rate": 4.922356036606365e-05, "loss": 2.6505, "mean_token_accuracy": 0.39310344457626345, "step": 128745 }, { "epoch": 0.1296783674324489, "grad_norm": 13.169020782694234, "learning_rate": 4.922346277476159e-05, "loss": 2.683, "mean_token_accuracy": 0.4015124022960663, "step": 128750 }, { "epoch": 0.12968340348555307, "grad_norm": 14.682999815421061, "learning_rate": 4.922336517743445e-05, "loss": 2.7534, "mean_token_accuracy": 0.3862069010734558, "step": 128755 }, { "epoch": 0.12968843953865725, "grad_norm": 12.406982092579819, "learning_rate": 4.922326757408224e-05, "loss": 2.5751, "mean_token_accuracy": 0.4034482717514038, "step": 128760 }, { "epoch": 0.12969347559176142, "grad_norm": 9.289541207141683, "learning_rate": 4.9223169964705016e-05, "loss": 2.1228, "mean_token_accuracy": 0.532758629322052, "step": 128765 }, { "epoch": 0.1296985116448656, "grad_norm": 14.905165619608505, "learning_rate": 4.9223072349302784e-05, "loss": 2.479, "mean_token_accuracy": 0.41034482717514037, "step": 128770 }, { "epoch": 0.12970354769796977, "grad_norm": 10.54707094637327, "learning_rate": 4.922297472787558e-05, "loss": 2.369, "mean_token_accuracy": 0.4068965554237366, "step": 128775 }, { "epoch": 0.12970858375107394, "grad_norm": 9.267326814110223, "learning_rate": 4.922287710042342e-05, "loss": 2.0204, "mean_token_accuracy": 0.46551724076271056, "step": 128780 }, { "epoch": 0.12971361980417812, "grad_norm": 11.6991069401333, "learning_rate": 4.922277946694635e-05, "loss": 2.5079, "mean_token_accuracy": 0.45039322376251223, "step": 128785 }, { "epoch": 0.1297186558572823, "grad_norm": 11.040960812283831, "learning_rate": 4.922268182744438e-05, "loss": 2.5548, "mean_token_accuracy": 0.4103448331356049, "step": 128790 }, { "epoch": 0.12972369191038646, "grad_norm": 11.821578228411786, "learning_rate": 4.9222584181917545e-05, "loss": 2.429, "mean_token_accuracy": 0.3724137842655182, "step": 128795 }, { "epoch": 0.12972872796349064, "grad_norm": 14.745171348941362, "learning_rate": 4.922248653036587e-05, "loss": 2.1999, "mean_token_accuracy": 0.482758629322052, "step": 128800 }, { "epoch": 0.1297337640165948, "grad_norm": 9.803592162819138, "learning_rate": 4.922238887278938e-05, "loss": 2.2611, "mean_token_accuracy": 0.4344827592372894, "step": 128805 }, { "epoch": 0.12973880006969898, "grad_norm": 14.078533489124007, "learning_rate": 4.922229120918811e-05, "loss": 2.5571, "mean_token_accuracy": 0.41379310488700866, "step": 128810 }, { "epoch": 0.12974383612280316, "grad_norm": 9.427708135493718, "learning_rate": 4.922219353956208e-05, "loss": 2.7344, "mean_token_accuracy": 0.3620689630508423, "step": 128815 }, { "epoch": 0.12974887217590733, "grad_norm": 10.202429176037935, "learning_rate": 4.922209586391131e-05, "loss": 2.244, "mean_token_accuracy": 0.4586206912994385, "step": 128820 }, { "epoch": 0.1297539082290115, "grad_norm": 7.656981448783427, "learning_rate": 4.9221998182235844e-05, "loss": 1.965, "mean_token_accuracy": 0.4517241358757019, "step": 128825 }, { "epoch": 0.12975894428211568, "grad_norm": 14.059614221938608, "learning_rate": 4.9221900494535704e-05, "loss": 2.5231, "mean_token_accuracy": 0.4068965494632721, "step": 128830 }, { "epoch": 0.12976398033521983, "grad_norm": 9.42369608083215, "learning_rate": 4.9221802800810904e-05, "loss": 2.6977, "mean_token_accuracy": 0.43793103098869324, "step": 128835 }, { "epoch": 0.129769016388324, "grad_norm": 10.432590511032222, "learning_rate": 4.922170510106149e-05, "loss": 2.3689, "mean_token_accuracy": 0.44482759237289426, "step": 128840 }, { "epoch": 0.12977405244142817, "grad_norm": 10.572904177778087, "learning_rate": 4.922160739528747e-05, "loss": 2.4216, "mean_token_accuracy": 0.4068965554237366, "step": 128845 }, { "epoch": 0.12977908849453235, "grad_norm": 9.81605639781412, "learning_rate": 4.9221509683488884e-05, "loss": 1.6755, "mean_token_accuracy": 0.545976996421814, "step": 128850 }, { "epoch": 0.12978412454763652, "grad_norm": 13.612671088942882, "learning_rate": 4.9221411965665766e-05, "loss": 2.2121, "mean_token_accuracy": 0.43448275327682495, "step": 128855 }, { "epoch": 0.1297891606007407, "grad_norm": 13.363442660999057, "learning_rate": 4.922131424181813e-05, "loss": 2.74, "mean_token_accuracy": 0.3793103456497192, "step": 128860 }, { "epoch": 0.12979419665384487, "grad_norm": 12.847790319177335, "learning_rate": 4.9221216511945994e-05, "loss": 2.8904, "mean_token_accuracy": 0.39782213866710664, "step": 128865 }, { "epoch": 0.12979923270694904, "grad_norm": 11.075422676408175, "learning_rate": 4.9221118776049416e-05, "loss": 2.5751, "mean_token_accuracy": 0.4034482777118683, "step": 128870 }, { "epoch": 0.12980426876005322, "grad_norm": 9.254314635761203, "learning_rate": 4.922102103412839e-05, "loss": 2.442, "mean_token_accuracy": 0.44700543880462645, "step": 128875 }, { "epoch": 0.1298093048131574, "grad_norm": 9.202192513016714, "learning_rate": 4.922092328618297e-05, "loss": 2.4319, "mean_token_accuracy": 0.4482758641242981, "step": 128880 }, { "epoch": 0.12981434086626156, "grad_norm": 9.269608318216155, "learning_rate": 4.922082553221315e-05, "loss": 2.1075, "mean_token_accuracy": 0.47931034564971925, "step": 128885 }, { "epoch": 0.12981937691936574, "grad_norm": 9.99024636474826, "learning_rate": 4.9220727772219e-05, "loss": 2.3831, "mean_token_accuracy": 0.4275861978530884, "step": 128890 }, { "epoch": 0.1298244129724699, "grad_norm": 10.679935001270755, "learning_rate": 4.922063000620052e-05, "loss": 2.3392, "mean_token_accuracy": 0.4448275864124298, "step": 128895 }, { "epoch": 0.12982944902557408, "grad_norm": 15.057453091172635, "learning_rate": 4.9220532234157733e-05, "loss": 2.2004, "mean_token_accuracy": 0.43793103098869324, "step": 128900 }, { "epoch": 0.12983448507867826, "grad_norm": 8.823157014977081, "learning_rate": 4.9220434456090684e-05, "loss": 2.7488, "mean_token_accuracy": 0.4, "step": 128905 }, { "epoch": 0.12983952113178243, "grad_norm": 9.661226208373158, "learning_rate": 4.922033667199939e-05, "loss": 2.1303, "mean_token_accuracy": 0.4931034445762634, "step": 128910 }, { "epoch": 0.1298445571848866, "grad_norm": 11.7028942971968, "learning_rate": 4.9220238881883885e-05, "loss": 2.2658, "mean_token_accuracy": 0.44482758045196535, "step": 128915 }, { "epoch": 0.12984959323799078, "grad_norm": 15.302319056815664, "learning_rate": 4.9220141085744175e-05, "loss": 2.4273, "mean_token_accuracy": 0.36551724672317504, "step": 128920 }, { "epoch": 0.12985462929109495, "grad_norm": 9.824674646383706, "learning_rate": 4.922004328358032e-05, "loss": 2.2326, "mean_token_accuracy": 0.4620689630508423, "step": 128925 }, { "epoch": 0.12985966534419913, "grad_norm": 9.23863390844651, "learning_rate": 4.921994547539232e-05, "loss": 2.0856, "mean_token_accuracy": 0.47586206197738645, "step": 128930 }, { "epoch": 0.1298647013973033, "grad_norm": 11.617189621281785, "learning_rate": 4.9219847661180205e-05, "loss": 2.5977, "mean_token_accuracy": 0.3896551787853241, "step": 128935 }, { "epoch": 0.12986973745040747, "grad_norm": 7.843181029924144, "learning_rate": 4.921974984094403e-05, "loss": 2.0132, "mean_token_accuracy": 0.46896551847457885, "step": 128940 }, { "epoch": 0.12987477350351165, "grad_norm": 9.467589054048226, "learning_rate": 4.9219652014683785e-05, "loss": 2.3464, "mean_token_accuracy": 0.4103448331356049, "step": 128945 }, { "epoch": 0.12987980955661582, "grad_norm": 9.934286509569239, "learning_rate": 4.9219554182399526e-05, "loss": 2.7643, "mean_token_accuracy": 0.4044162094593048, "step": 128950 }, { "epoch": 0.12988484560972, "grad_norm": 12.713532099531973, "learning_rate": 4.9219456344091256e-05, "loss": 2.6595, "mean_token_accuracy": 0.35172413289546967, "step": 128955 }, { "epoch": 0.12988988166282417, "grad_norm": 9.237348765735163, "learning_rate": 4.921935849975902e-05, "loss": 2.5821, "mean_token_accuracy": 0.39310345649719236, "step": 128960 }, { "epoch": 0.12989491771592834, "grad_norm": 10.1279886098419, "learning_rate": 4.921926064940283e-05, "loss": 2.0792, "mean_token_accuracy": 0.47931033968925474, "step": 128965 }, { "epoch": 0.12989995376903252, "grad_norm": 10.802393616726349, "learning_rate": 4.921916279302273e-05, "loss": 2.7915, "mean_token_accuracy": 0.3862069010734558, "step": 128970 }, { "epoch": 0.12990498982213666, "grad_norm": 11.344827411072202, "learning_rate": 4.9219064930618734e-05, "loss": 1.9055, "mean_token_accuracy": 0.4804187178611755, "step": 128975 }, { "epoch": 0.12991002587524084, "grad_norm": 10.613188935967989, "learning_rate": 4.921896706219088e-05, "loss": 2.4469, "mean_token_accuracy": 0.4034482777118683, "step": 128980 }, { "epoch": 0.129915061928345, "grad_norm": 15.929356254626578, "learning_rate": 4.921886918773919e-05, "loss": 2.7692, "mean_token_accuracy": 0.3931034505367279, "step": 128985 }, { "epoch": 0.12992009798144918, "grad_norm": 11.052685889955178, "learning_rate": 4.921877130726369e-05, "loss": 2.3434, "mean_token_accuracy": 0.4448275864124298, "step": 128990 }, { "epoch": 0.12992513403455336, "grad_norm": 9.754025641991657, "learning_rate": 4.9218673420764406e-05, "loss": 2.495, "mean_token_accuracy": 0.4103448212146759, "step": 128995 }, { "epoch": 0.12993017008765753, "grad_norm": 9.099272999375952, "learning_rate": 4.9218575528241366e-05, "loss": 2.4259, "mean_token_accuracy": 0.3913490653038025, "step": 129000 }, { "epoch": 0.1299352061407617, "grad_norm": 11.197552428919481, "learning_rate": 4.92184776296946e-05, "loss": 2.3226, "mean_token_accuracy": 0.4551724076271057, "step": 129005 }, { "epoch": 0.12994024219386588, "grad_norm": 10.507743180889685, "learning_rate": 4.921837972512413e-05, "loss": 2.2834, "mean_token_accuracy": 0.41724138259887694, "step": 129010 }, { "epoch": 0.12994527824697005, "grad_norm": 9.535505812762104, "learning_rate": 4.921828181452999e-05, "loss": 2.5572, "mean_token_accuracy": 0.4620689630508423, "step": 129015 }, { "epoch": 0.12995031430007423, "grad_norm": 12.289841951176568, "learning_rate": 4.92181838979122e-05, "loss": 2.629, "mean_token_accuracy": 0.39310344457626345, "step": 129020 }, { "epoch": 0.1299553503531784, "grad_norm": 9.17240638367847, "learning_rate": 4.92180859752708e-05, "loss": 2.196, "mean_token_accuracy": 0.42068966031074523, "step": 129025 }, { "epoch": 0.12996038640628257, "grad_norm": 8.65741313496406, "learning_rate": 4.9217988046605795e-05, "loss": 2.2303, "mean_token_accuracy": 0.4551724135875702, "step": 129030 }, { "epoch": 0.12996542245938675, "grad_norm": 8.179782684164124, "learning_rate": 4.921789011191723e-05, "loss": 2.0353, "mean_token_accuracy": 0.5034482717514038, "step": 129035 }, { "epoch": 0.12997045851249092, "grad_norm": 9.164897471708702, "learning_rate": 4.921779217120513e-05, "loss": 2.8426, "mean_token_accuracy": 0.36896551847457887, "step": 129040 }, { "epoch": 0.1299754945655951, "grad_norm": 10.418627358706344, "learning_rate": 4.9217694224469516e-05, "loss": 2.2527, "mean_token_accuracy": 0.43103447556495667, "step": 129045 }, { "epoch": 0.12998053061869927, "grad_norm": 11.44265481778297, "learning_rate": 4.921759627171042e-05, "loss": 2.1793, "mean_token_accuracy": 0.49999999403953554, "step": 129050 }, { "epoch": 0.12998556667180344, "grad_norm": 9.587542874032676, "learning_rate": 4.921749831292787e-05, "loss": 2.2143, "mean_token_accuracy": 0.493103438615799, "step": 129055 }, { "epoch": 0.12999060272490762, "grad_norm": 10.181623158865825, "learning_rate": 4.921740034812189e-05, "loss": 2.4807, "mean_token_accuracy": 0.36896551847457887, "step": 129060 }, { "epoch": 0.1299956387780118, "grad_norm": 14.319432190892446, "learning_rate": 4.92173023772925e-05, "loss": 2.6834, "mean_token_accuracy": 0.39310343861579894, "step": 129065 }, { "epoch": 0.13000067483111596, "grad_norm": 9.642575956353483, "learning_rate": 4.921720440043974e-05, "loss": 2.2141, "mean_token_accuracy": 0.47931034564971925, "step": 129070 }, { "epoch": 0.13000571088422014, "grad_norm": 11.05650215824258, "learning_rate": 4.921710641756364e-05, "loss": 2.0048, "mean_token_accuracy": 0.4931034564971924, "step": 129075 }, { "epoch": 0.1300107469373243, "grad_norm": 11.343850295285575, "learning_rate": 4.9217008428664206e-05, "loss": 2.2962, "mean_token_accuracy": 0.4344827592372894, "step": 129080 }, { "epoch": 0.13001578299042849, "grad_norm": 13.143462895810625, "learning_rate": 4.921691043374149e-05, "loss": 2.685, "mean_token_accuracy": 0.4034482777118683, "step": 129085 }, { "epoch": 0.13002081904353266, "grad_norm": 9.681457221954199, "learning_rate": 4.92168124327955e-05, "loss": 2.0833, "mean_token_accuracy": 0.42413792610168455, "step": 129090 }, { "epoch": 0.13002585509663683, "grad_norm": 12.861691871039671, "learning_rate": 4.921671442582628e-05, "loss": 2.6104, "mean_token_accuracy": 0.3896551728248596, "step": 129095 }, { "epoch": 0.130030891149741, "grad_norm": 9.534868523340778, "learning_rate": 4.921661641283384e-05, "loss": 2.6672, "mean_token_accuracy": 0.41034483909606934, "step": 129100 }, { "epoch": 0.13003592720284518, "grad_norm": 13.538707796803724, "learning_rate": 4.9216518393818226e-05, "loss": 2.5987, "mean_token_accuracy": 0.43103447556495667, "step": 129105 }, { "epoch": 0.13004096325594935, "grad_norm": 8.648761549755044, "learning_rate": 4.9216420368779436e-05, "loss": 2.171, "mean_token_accuracy": 0.48275862336158754, "step": 129110 }, { "epoch": 0.1300459993090535, "grad_norm": 11.340521407737254, "learning_rate": 4.921632233771753e-05, "loss": 2.1855, "mean_token_accuracy": 0.47241379618644713, "step": 129115 }, { "epoch": 0.13005103536215767, "grad_norm": 8.28950862814899, "learning_rate": 4.9216224300632515e-05, "loss": 2.0552, "mean_token_accuracy": 0.4655172526836395, "step": 129120 }, { "epoch": 0.13005607141526185, "grad_norm": 10.91781274078189, "learning_rate": 4.921612625752443e-05, "loss": 2.4565, "mean_token_accuracy": 0.4034482717514038, "step": 129125 }, { "epoch": 0.13006110746836602, "grad_norm": 8.519807181961898, "learning_rate": 4.9216028208393294e-05, "loss": 2.5394, "mean_token_accuracy": 0.4068965554237366, "step": 129130 }, { "epoch": 0.1300661435214702, "grad_norm": 10.888882690432926, "learning_rate": 4.9215930153239136e-05, "loss": 2.4095, "mean_token_accuracy": 0.4344827592372894, "step": 129135 }, { "epoch": 0.13007117957457437, "grad_norm": 10.000760322978257, "learning_rate": 4.921583209206198e-05, "loss": 2.3519, "mean_token_accuracy": 0.4620689570903778, "step": 129140 }, { "epoch": 0.13007621562767854, "grad_norm": 11.844827946635355, "learning_rate": 4.921573402486187e-05, "loss": 2.3979, "mean_token_accuracy": 0.4103448331356049, "step": 129145 }, { "epoch": 0.13008125168078272, "grad_norm": 10.55951584015606, "learning_rate": 4.9215635951638805e-05, "loss": 2.6713, "mean_token_accuracy": 0.4000000059604645, "step": 129150 }, { "epoch": 0.1300862877338869, "grad_norm": 10.040452606788156, "learning_rate": 4.9215537872392834e-05, "loss": 2.3228, "mean_token_accuracy": 0.4517241358757019, "step": 129155 }, { "epoch": 0.13009132378699106, "grad_norm": 9.823654898282637, "learning_rate": 4.921543978712398e-05, "loss": 2.213, "mean_token_accuracy": 0.4931034505367279, "step": 129160 }, { "epoch": 0.13009635984009524, "grad_norm": 11.925671672230084, "learning_rate": 4.921534169583226e-05, "loss": 2.3796, "mean_token_accuracy": 0.44827585816383364, "step": 129165 }, { "epoch": 0.1301013958931994, "grad_norm": 10.751464219240567, "learning_rate": 4.9215243598517716e-05, "loss": 2.1919, "mean_token_accuracy": 0.42068964838981626, "step": 129170 }, { "epoch": 0.13010643194630359, "grad_norm": 10.406064063478205, "learning_rate": 4.9215145495180366e-05, "loss": 2.2861, "mean_token_accuracy": 0.4310344815254211, "step": 129175 }, { "epoch": 0.13011146799940776, "grad_norm": 9.81013586051914, "learning_rate": 4.921504738582024e-05, "loss": 2.6879, "mean_token_accuracy": 0.39655172228813174, "step": 129180 }, { "epoch": 0.13011650405251193, "grad_norm": 10.003018902046872, "learning_rate": 4.921494927043737e-05, "loss": 2.2703, "mean_token_accuracy": 0.482758617401123, "step": 129185 }, { "epoch": 0.1301215401056161, "grad_norm": 10.69242808223807, "learning_rate": 4.921485114903176e-05, "loss": 2.7214, "mean_token_accuracy": 0.3655172437429428, "step": 129190 }, { "epoch": 0.13012657615872028, "grad_norm": 9.312606287242122, "learning_rate": 4.9214753021603474e-05, "loss": 2.3152, "mean_token_accuracy": 0.3911675691604614, "step": 129195 }, { "epoch": 0.13013161221182445, "grad_norm": 9.997138202707541, "learning_rate": 4.92146548881525e-05, "loss": 2.1418, "mean_token_accuracy": 0.47586206793785096, "step": 129200 }, { "epoch": 0.13013664826492863, "grad_norm": 8.934825362531758, "learning_rate": 4.921455674867891e-05, "loss": 2.7794, "mean_token_accuracy": 0.3965517163276672, "step": 129205 }, { "epoch": 0.1301416843180328, "grad_norm": 12.049984574977294, "learning_rate": 4.921445860318269e-05, "loss": 2.7606, "mean_token_accuracy": 0.42758620977401735, "step": 129210 }, { "epoch": 0.13014672037113698, "grad_norm": 6.646942027711644, "learning_rate": 4.9214360451663885e-05, "loss": 2.0259, "mean_token_accuracy": 0.4862069010734558, "step": 129215 }, { "epoch": 0.13015175642424115, "grad_norm": 13.214860662286243, "learning_rate": 4.921426229412253e-05, "loss": 2.6543, "mean_token_accuracy": 0.358620685338974, "step": 129220 }, { "epoch": 0.13015679247734532, "grad_norm": 10.64313424656918, "learning_rate": 4.921416413055864e-05, "loss": 2.6381, "mean_token_accuracy": 0.4068965554237366, "step": 129225 }, { "epoch": 0.1301618285304495, "grad_norm": 10.069920428691292, "learning_rate": 4.9214065960972246e-05, "loss": 2.5657, "mean_token_accuracy": 0.42413793206214906, "step": 129230 }, { "epoch": 0.13016686458355367, "grad_norm": 11.161183774699586, "learning_rate": 4.921396778536337e-05, "loss": 2.2985, "mean_token_accuracy": 0.4413793087005615, "step": 129235 }, { "epoch": 0.13017190063665784, "grad_norm": 11.080384450565381, "learning_rate": 4.9213869603732046e-05, "loss": 2.5605, "mean_token_accuracy": 0.4034482717514038, "step": 129240 }, { "epoch": 0.13017693668976202, "grad_norm": 11.70509229179508, "learning_rate": 4.92137714160783e-05, "loss": 2.2962, "mean_token_accuracy": 0.4586206912994385, "step": 129245 }, { "epoch": 0.1301819727428662, "grad_norm": 12.776876263891452, "learning_rate": 4.921367322240216e-05, "loss": 2.758, "mean_token_accuracy": 0.37241379022598264, "step": 129250 }, { "epoch": 0.13018700879597034, "grad_norm": 9.298334534560004, "learning_rate": 4.9213575022703645e-05, "loss": 2.4465, "mean_token_accuracy": 0.45862069725990295, "step": 129255 }, { "epoch": 0.1301920448490745, "grad_norm": 11.343326549216577, "learning_rate": 4.92134768169828e-05, "loss": 2.3055, "mean_token_accuracy": 0.4551724135875702, "step": 129260 }, { "epoch": 0.13019708090217869, "grad_norm": 10.341818039911217, "learning_rate": 4.921337860523963e-05, "loss": 2.2726, "mean_token_accuracy": 0.3896551787853241, "step": 129265 }, { "epoch": 0.13020211695528286, "grad_norm": 9.315649677002215, "learning_rate": 4.921328038747418e-05, "loss": 2.5262, "mean_token_accuracy": 0.4482758641242981, "step": 129270 }, { "epoch": 0.13020715300838703, "grad_norm": 10.38477189795735, "learning_rate": 4.921318216368646e-05, "loss": 2.6443, "mean_token_accuracy": 0.37586206793785093, "step": 129275 }, { "epoch": 0.1302121890614912, "grad_norm": 10.051838724946624, "learning_rate": 4.921308393387653e-05, "loss": 2.4293, "mean_token_accuracy": 0.4241379201412201, "step": 129280 }, { "epoch": 0.13021722511459538, "grad_norm": 9.464562921840644, "learning_rate": 4.921298569804438e-05, "loss": 2.2438, "mean_token_accuracy": 0.42413792610168455, "step": 129285 }, { "epoch": 0.13022226116769955, "grad_norm": 12.47707481414817, "learning_rate": 4.9212887456190053e-05, "loss": 2.3116, "mean_token_accuracy": 0.41379310488700866, "step": 129290 }, { "epoch": 0.13022729722080373, "grad_norm": 8.405989613412654, "learning_rate": 4.921278920831358e-05, "loss": 2.2264, "mean_token_accuracy": 0.41379310488700866, "step": 129295 }, { "epoch": 0.1302323332739079, "grad_norm": 9.65346990567041, "learning_rate": 4.921269095441498e-05, "loss": 2.4937, "mean_token_accuracy": 0.4365396171808243, "step": 129300 }, { "epoch": 0.13023736932701208, "grad_norm": 9.624865426897893, "learning_rate": 4.921259269449429e-05, "loss": 2.4789, "mean_token_accuracy": 0.41857229471206664, "step": 129305 }, { "epoch": 0.13024240538011625, "grad_norm": 8.909263640046639, "learning_rate": 4.921249442855152e-05, "loss": 2.6475, "mean_token_accuracy": 0.42758620977401735, "step": 129310 }, { "epoch": 0.13024744143322042, "grad_norm": 18.5774035363204, "learning_rate": 4.921239615658672e-05, "loss": 2.7813, "mean_token_accuracy": 0.4551724076271057, "step": 129315 }, { "epoch": 0.1302524774863246, "grad_norm": 11.055270515815687, "learning_rate": 4.92122978785999e-05, "loss": 2.5187, "mean_token_accuracy": 0.47931033968925474, "step": 129320 }, { "epoch": 0.13025751353942877, "grad_norm": 9.245289731657552, "learning_rate": 4.9212199594591095e-05, "loss": 2.2389, "mean_token_accuracy": 0.43103447556495667, "step": 129325 }, { "epoch": 0.13026254959253294, "grad_norm": 12.670729568976498, "learning_rate": 4.921210130456033e-05, "loss": 2.9651, "mean_token_accuracy": 0.34827586114406583, "step": 129330 }, { "epoch": 0.13026758564563712, "grad_norm": 9.6831610186234, "learning_rate": 4.921200300850764e-05, "loss": 2.4141, "mean_token_accuracy": 0.42413793206214906, "step": 129335 }, { "epoch": 0.1302726216987413, "grad_norm": 11.13431813272363, "learning_rate": 4.921190470643303e-05, "loss": 2.2295, "mean_token_accuracy": 0.4482758641242981, "step": 129340 }, { "epoch": 0.13027765775184547, "grad_norm": 9.657109050767806, "learning_rate": 4.9211806398336554e-05, "loss": 2.0867, "mean_token_accuracy": 0.482758629322052, "step": 129345 }, { "epoch": 0.13028269380494964, "grad_norm": 9.460780956134226, "learning_rate": 4.921170808421822e-05, "loss": 2.4273, "mean_token_accuracy": 0.4257713258266449, "step": 129350 }, { "epoch": 0.1302877298580538, "grad_norm": 10.926793093908415, "learning_rate": 4.921160976407807e-05, "loss": 2.5411, "mean_token_accuracy": 0.41034483909606934, "step": 129355 }, { "epoch": 0.130292765911158, "grad_norm": 10.43138692876934, "learning_rate": 4.921151143791612e-05, "loss": 2.1316, "mean_token_accuracy": 0.4379310369491577, "step": 129360 }, { "epoch": 0.13029780196426216, "grad_norm": 10.042449375688033, "learning_rate": 4.92114131057324e-05, "loss": 2.1914, "mean_token_accuracy": 0.4689655125141144, "step": 129365 }, { "epoch": 0.13030283801736633, "grad_norm": 10.90227374136877, "learning_rate": 4.9211314767526944e-05, "loss": 2.0421, "mean_token_accuracy": 0.5054446518421173, "step": 129370 }, { "epoch": 0.1303078740704705, "grad_norm": 10.4911757872581, "learning_rate": 4.921121642329977e-05, "loss": 2.4023, "mean_token_accuracy": 0.4068965494632721, "step": 129375 }, { "epoch": 0.13031291012357468, "grad_norm": 10.41582666642685, "learning_rate": 4.92111180730509e-05, "loss": 2.075, "mean_token_accuracy": 0.510344821214676, "step": 129380 }, { "epoch": 0.13031794617667886, "grad_norm": 16.958841846099634, "learning_rate": 4.921101971678039e-05, "loss": 2.8366, "mean_token_accuracy": 0.4068965494632721, "step": 129385 }, { "epoch": 0.13032298222978303, "grad_norm": 10.44170471139477, "learning_rate": 4.921092135448824e-05, "loss": 2.4146, "mean_token_accuracy": 0.4034482777118683, "step": 129390 }, { "epoch": 0.13032801828288718, "grad_norm": 9.98467170556745, "learning_rate": 4.9210822986174474e-05, "loss": 2.4195, "mean_token_accuracy": 0.43793103098869324, "step": 129395 }, { "epoch": 0.13033305433599135, "grad_norm": 8.795114003306088, "learning_rate": 4.9210724611839144e-05, "loss": 2.2201, "mean_token_accuracy": 0.46206897497177124, "step": 129400 }, { "epoch": 0.13033809038909552, "grad_norm": 12.788349839012314, "learning_rate": 4.921062623148226e-05, "loss": 2.145, "mean_token_accuracy": 0.49655172824859617, "step": 129405 }, { "epoch": 0.1303431264421997, "grad_norm": 11.071766114295322, "learning_rate": 4.9210527845103846e-05, "loss": 2.484, "mean_token_accuracy": 0.4034482717514038, "step": 129410 }, { "epoch": 0.13034816249530387, "grad_norm": 9.672154944824332, "learning_rate": 4.9210429452703947e-05, "loss": 2.6607, "mean_token_accuracy": 0.40689654350280763, "step": 129415 }, { "epoch": 0.13035319854840804, "grad_norm": 10.651940626290392, "learning_rate": 4.921033105428257e-05, "loss": 2.5017, "mean_token_accuracy": 0.44137930274009707, "step": 129420 }, { "epoch": 0.13035823460151222, "grad_norm": 9.630757351472075, "learning_rate": 4.921023264983975e-05, "loss": 2.4826, "mean_token_accuracy": 0.41379311084747317, "step": 129425 }, { "epoch": 0.1303632706546164, "grad_norm": 14.629111292803264, "learning_rate": 4.921013423937553e-05, "loss": 2.8851, "mean_token_accuracy": 0.3862068891525269, "step": 129430 }, { "epoch": 0.13036830670772057, "grad_norm": 11.588360137664067, "learning_rate": 4.921003582288991e-05, "loss": 2.6196, "mean_token_accuracy": 0.3827586233615875, "step": 129435 }, { "epoch": 0.13037334276082474, "grad_norm": 7.840971141067827, "learning_rate": 4.920993740038293e-05, "loss": 2.1574, "mean_token_accuracy": 0.5344827592372894, "step": 129440 }, { "epoch": 0.1303783788139289, "grad_norm": 11.674542533545212, "learning_rate": 4.9209838971854624e-05, "loss": 2.1892, "mean_token_accuracy": 0.47586206793785096, "step": 129445 }, { "epoch": 0.1303834148670331, "grad_norm": 9.30807409605611, "learning_rate": 4.9209740537305017e-05, "loss": 2.1998, "mean_token_accuracy": 0.46551724672317507, "step": 129450 }, { "epoch": 0.13038845092013726, "grad_norm": 12.180083801125372, "learning_rate": 4.920964209673413e-05, "loss": 2.4786, "mean_token_accuracy": 0.4, "step": 129455 }, { "epoch": 0.13039348697324143, "grad_norm": 12.088921490412227, "learning_rate": 4.920954365014199e-05, "loss": 2.5126, "mean_token_accuracy": 0.4344827592372894, "step": 129460 }, { "epoch": 0.1303985230263456, "grad_norm": 9.06604965626044, "learning_rate": 4.920944519752863e-05, "loss": 2.6583, "mean_token_accuracy": 0.3896551728248596, "step": 129465 }, { "epoch": 0.13040355907944978, "grad_norm": 10.64590763756691, "learning_rate": 4.920934673889406e-05, "loss": 2.6717, "mean_token_accuracy": 0.4034482777118683, "step": 129470 }, { "epoch": 0.13040859513255396, "grad_norm": 10.184118682889142, "learning_rate": 4.920924827423834e-05, "loss": 2.3753, "mean_token_accuracy": 0.44482759237289426, "step": 129475 }, { "epoch": 0.13041363118565813, "grad_norm": 10.17231426013366, "learning_rate": 4.920914980356147e-05, "loss": 2.3596, "mean_token_accuracy": 0.4103448212146759, "step": 129480 }, { "epoch": 0.1304186672387623, "grad_norm": 12.910015412580528, "learning_rate": 4.92090513268635e-05, "loss": 2.4315, "mean_token_accuracy": 0.4068965554237366, "step": 129485 }, { "epoch": 0.13042370329186648, "grad_norm": 12.850317633386826, "learning_rate": 4.920895284414442e-05, "loss": 2.3701, "mean_token_accuracy": 0.4034482777118683, "step": 129490 }, { "epoch": 0.13042873934497065, "grad_norm": 8.44974869565826, "learning_rate": 4.92088543554043e-05, "loss": 2.1333, "mean_token_accuracy": 0.47586206197738645, "step": 129495 }, { "epoch": 0.13043377539807482, "grad_norm": 9.698224933233899, "learning_rate": 4.920875586064314e-05, "loss": 2.5498, "mean_token_accuracy": 0.42413793206214906, "step": 129500 }, { "epoch": 0.130438811451179, "grad_norm": 11.745868023296493, "learning_rate": 4.9208657359860976e-05, "loss": 2.1325, "mean_token_accuracy": 0.4862069010734558, "step": 129505 }, { "epoch": 0.13044384750428317, "grad_norm": 18.42893924893373, "learning_rate": 4.920855885305784e-05, "loss": 2.6479, "mean_token_accuracy": 0.42068964838981626, "step": 129510 }, { "epoch": 0.13044888355738735, "grad_norm": 8.951450479716442, "learning_rate": 4.920846034023375e-05, "loss": 2.6923, "mean_token_accuracy": 0.4172413766384125, "step": 129515 }, { "epoch": 0.13045391961049152, "grad_norm": 10.470889349126548, "learning_rate": 4.9208361821388735e-05, "loss": 2.088, "mean_token_accuracy": 0.5125831842422486, "step": 129520 }, { "epoch": 0.1304589556635957, "grad_norm": 11.878940122235473, "learning_rate": 4.920826329652283e-05, "loss": 2.5428, "mean_token_accuracy": 0.4454930365085602, "step": 129525 }, { "epoch": 0.13046399171669987, "grad_norm": 12.421954377669323, "learning_rate": 4.920816476563606e-05, "loss": 2.9375, "mean_token_accuracy": 0.37586206793785093, "step": 129530 }, { "epoch": 0.130469027769804, "grad_norm": 17.224319106689666, "learning_rate": 4.920806622872844e-05, "loss": 2.3543, "mean_token_accuracy": 0.47382464408874514, "step": 129535 }, { "epoch": 0.1304740638229082, "grad_norm": 10.859901545378836, "learning_rate": 4.920796768580001e-05, "loss": 2.5708, "mean_token_accuracy": 0.4103448212146759, "step": 129540 }, { "epoch": 0.13047909987601236, "grad_norm": 11.065226287882751, "learning_rate": 4.920786913685079e-05, "loss": 2.5147, "mean_token_accuracy": 0.37241379022598264, "step": 129545 }, { "epoch": 0.13048413592911653, "grad_norm": 15.267922582902608, "learning_rate": 4.920777058188082e-05, "loss": 2.4677, "mean_token_accuracy": 0.4137930989265442, "step": 129550 }, { "epoch": 0.1304891719822207, "grad_norm": 9.361456865913672, "learning_rate": 4.920767202089012e-05, "loss": 2.44, "mean_token_accuracy": 0.4310344815254211, "step": 129555 }, { "epoch": 0.13049420803532488, "grad_norm": 11.411162104666378, "learning_rate": 4.920757345387871e-05, "loss": 2.3057, "mean_token_accuracy": 0.4517241418361664, "step": 129560 }, { "epoch": 0.13049924408842906, "grad_norm": 13.428229385735968, "learning_rate": 4.9207474880846626e-05, "loss": 2.5223, "mean_token_accuracy": 0.41724138259887694, "step": 129565 }, { "epoch": 0.13050428014153323, "grad_norm": 11.694987075875211, "learning_rate": 4.9207376301793896e-05, "loss": 1.9535, "mean_token_accuracy": 0.5190886616706848, "step": 129570 }, { "epoch": 0.1305093161946374, "grad_norm": 10.633994817693539, "learning_rate": 4.920727771672054e-05, "loss": 2.1742, "mean_token_accuracy": 0.43448275327682495, "step": 129575 }, { "epoch": 0.13051435224774158, "grad_norm": 19.100883530220017, "learning_rate": 4.920717912562658e-05, "loss": 2.3931, "mean_token_accuracy": 0.4950393199920654, "step": 129580 }, { "epoch": 0.13051938830084575, "grad_norm": 9.3374759327297, "learning_rate": 4.920708052851207e-05, "loss": 2.0964, "mean_token_accuracy": 0.4689655065536499, "step": 129585 }, { "epoch": 0.13052442435394992, "grad_norm": 10.15109235166133, "learning_rate": 4.9206981925377016e-05, "loss": 2.1965, "mean_token_accuracy": 0.4482758641242981, "step": 129590 }, { "epoch": 0.1305294604070541, "grad_norm": 16.952555359938135, "learning_rate": 4.920688331622144e-05, "loss": 2.6903, "mean_token_accuracy": 0.4172413766384125, "step": 129595 }, { "epoch": 0.13053449646015827, "grad_norm": 11.76023223390273, "learning_rate": 4.920678470104539e-05, "loss": 2.3956, "mean_token_accuracy": 0.4517241358757019, "step": 129600 }, { "epoch": 0.13053953251326245, "grad_norm": 9.660428406496449, "learning_rate": 4.920668607984888e-05, "loss": 2.3671, "mean_token_accuracy": 0.43944343328475954, "step": 129605 }, { "epoch": 0.13054456856636662, "grad_norm": 12.941687181262658, "learning_rate": 4.920658745263194e-05, "loss": 2.436, "mean_token_accuracy": 0.39812461733818055, "step": 129610 }, { "epoch": 0.1305496046194708, "grad_norm": 12.624723468066707, "learning_rate": 4.9206488819394585e-05, "loss": 2.516, "mean_token_accuracy": 0.4034482717514038, "step": 129615 }, { "epoch": 0.13055464067257497, "grad_norm": 10.474768786916417, "learning_rate": 4.920639018013687e-05, "loss": 2.1695, "mean_token_accuracy": 0.4551724076271057, "step": 129620 }, { "epoch": 0.13055967672567914, "grad_norm": 14.267385471219535, "learning_rate": 4.92062915348588e-05, "loss": 2.2539, "mean_token_accuracy": 0.48965516686439514, "step": 129625 }, { "epoch": 0.13056471277878332, "grad_norm": 10.69116743341844, "learning_rate": 4.920619288356041e-05, "loss": 2.4402, "mean_token_accuracy": 0.4344827592372894, "step": 129630 }, { "epoch": 0.1305697488318875, "grad_norm": 11.170199085196069, "learning_rate": 4.920609422624173e-05, "loss": 2.321, "mean_token_accuracy": 0.46551724076271056, "step": 129635 }, { "epoch": 0.13057478488499166, "grad_norm": 9.977782510464031, "learning_rate": 4.920599556290278e-05, "loss": 2.0961, "mean_token_accuracy": 0.49854809045791626, "step": 129640 }, { "epoch": 0.13057982093809584, "grad_norm": 12.156757647579397, "learning_rate": 4.920589689354359e-05, "loss": 2.2209, "mean_token_accuracy": 0.4724137902259827, "step": 129645 }, { "epoch": 0.1305848569912, "grad_norm": 8.98511433843367, "learning_rate": 4.920579821816418e-05, "loss": 2.0213, "mean_token_accuracy": 0.4931034445762634, "step": 129650 }, { "epoch": 0.13058989304430418, "grad_norm": 14.459676711731003, "learning_rate": 4.92056995367646e-05, "loss": 2.6627, "mean_token_accuracy": 0.3965517282485962, "step": 129655 }, { "epoch": 0.13059492909740836, "grad_norm": 9.037490198170193, "learning_rate": 4.920560084934486e-05, "loss": 2.7547, "mean_token_accuracy": 0.3793103516101837, "step": 129660 }, { "epoch": 0.13059996515051253, "grad_norm": 7.480979262369426, "learning_rate": 4.920550215590499e-05, "loss": 2.4778, "mean_token_accuracy": 0.46031458377838136, "step": 129665 }, { "epoch": 0.1306050012036167, "grad_norm": 12.20311132509539, "learning_rate": 4.920540345644502e-05, "loss": 2.4602, "mean_token_accuracy": 0.4103448331356049, "step": 129670 }, { "epoch": 0.13061003725672085, "grad_norm": 9.041003906303553, "learning_rate": 4.9205304750964977e-05, "loss": 2.3402, "mean_token_accuracy": 0.3931034505367279, "step": 129675 }, { "epoch": 0.13061507330982502, "grad_norm": 13.422866558312972, "learning_rate": 4.920520603946488e-05, "loss": 2.3834, "mean_token_accuracy": 0.4053236603736877, "step": 129680 }, { "epoch": 0.1306201093629292, "grad_norm": 16.872525776779753, "learning_rate": 4.920510732194477e-05, "loss": 2.6042, "mean_token_accuracy": 0.4, "step": 129685 }, { "epoch": 0.13062514541603337, "grad_norm": 11.430652407173223, "learning_rate": 4.9205008598404656e-05, "loss": 2.5181, "mean_token_accuracy": 0.482758617401123, "step": 129690 }, { "epoch": 0.13063018146913755, "grad_norm": 10.1883215431587, "learning_rate": 4.920490986884459e-05, "loss": 1.9856, "mean_token_accuracy": 0.4973667740821838, "step": 129695 }, { "epoch": 0.13063521752224172, "grad_norm": 9.697056299902146, "learning_rate": 4.920481113326458e-05, "loss": 2.2099, "mean_token_accuracy": 0.4482758641242981, "step": 129700 }, { "epoch": 0.1306402535753459, "grad_norm": 12.189435485388746, "learning_rate": 4.920471239166466e-05, "loss": 2.4748, "mean_token_accuracy": 0.4620689630508423, "step": 129705 }, { "epoch": 0.13064528962845007, "grad_norm": 14.042526472262566, "learning_rate": 4.920461364404486e-05, "loss": 2.8522, "mean_token_accuracy": 0.38620689511299133, "step": 129710 }, { "epoch": 0.13065032568155424, "grad_norm": 9.390108999891527, "learning_rate": 4.920451489040519e-05, "loss": 2.4159, "mean_token_accuracy": 0.38965516686439516, "step": 129715 }, { "epoch": 0.13065536173465842, "grad_norm": 10.82204494570798, "learning_rate": 4.920441613074571e-05, "loss": 2.4565, "mean_token_accuracy": 0.4068965554237366, "step": 129720 }, { "epoch": 0.1306603977877626, "grad_norm": 10.495526618004677, "learning_rate": 4.920431736506643e-05, "loss": 2.5404, "mean_token_accuracy": 0.38275861740112305, "step": 129725 }, { "epoch": 0.13066543384086676, "grad_norm": 18.14833919057225, "learning_rate": 4.9204218593367364e-05, "loss": 3.1358, "mean_token_accuracy": 0.3827586233615875, "step": 129730 }, { "epoch": 0.13067046989397094, "grad_norm": 13.721473082093036, "learning_rate": 4.920411981564856e-05, "loss": 2.6488, "mean_token_accuracy": 0.3551724076271057, "step": 129735 }, { "epoch": 0.1306755059470751, "grad_norm": 8.712896202315477, "learning_rate": 4.920402103191003e-05, "loss": 2.2153, "mean_token_accuracy": 0.4620689570903778, "step": 129740 }, { "epoch": 0.13068054200017928, "grad_norm": 11.133789461992814, "learning_rate": 4.920392224215181e-05, "loss": 2.083, "mean_token_accuracy": 0.47931034564971925, "step": 129745 }, { "epoch": 0.13068557805328346, "grad_norm": 9.878926789994928, "learning_rate": 4.9203823446373937e-05, "loss": 2.0918, "mean_token_accuracy": 0.4931034505367279, "step": 129750 }, { "epoch": 0.13069061410638763, "grad_norm": 11.12451114204712, "learning_rate": 4.920372464457642e-05, "loss": 2.3229, "mean_token_accuracy": 0.4724137902259827, "step": 129755 }, { "epoch": 0.1306956501594918, "grad_norm": 9.630222230347455, "learning_rate": 4.920362583675929e-05, "loss": 2.7839, "mean_token_accuracy": 0.37586206793785093, "step": 129760 }, { "epoch": 0.13070068621259598, "grad_norm": 9.794447815635563, "learning_rate": 4.9203527022922587e-05, "loss": 2.1764, "mean_token_accuracy": 0.4517241299152374, "step": 129765 }, { "epoch": 0.13070572226570015, "grad_norm": 9.887450821671035, "learning_rate": 4.9203428203066324e-05, "loss": 2.2153, "mean_token_accuracy": 0.4586206912994385, "step": 129770 }, { "epoch": 0.13071075831880433, "grad_norm": 9.964614323649075, "learning_rate": 4.920332937719054e-05, "loss": 2.4447, "mean_token_accuracy": 0.44482758045196535, "step": 129775 }, { "epoch": 0.1307157943719085, "grad_norm": 16.12490333612492, "learning_rate": 4.920323054529525e-05, "loss": 2.0431, "mean_token_accuracy": 0.4954023063182831, "step": 129780 }, { "epoch": 0.13072083042501267, "grad_norm": 11.820050219581136, "learning_rate": 4.920313170738049e-05, "loss": 2.7879, "mean_token_accuracy": 0.4551724135875702, "step": 129785 }, { "epoch": 0.13072586647811685, "grad_norm": 10.997605829381802, "learning_rate": 4.920303286344628e-05, "loss": 2.2698, "mean_token_accuracy": 0.4034482777118683, "step": 129790 }, { "epoch": 0.13073090253122102, "grad_norm": 10.624101015425934, "learning_rate": 4.920293401349266e-05, "loss": 2.3461, "mean_token_accuracy": 0.4068965494632721, "step": 129795 }, { "epoch": 0.1307359385843252, "grad_norm": 11.01763601336403, "learning_rate": 4.920283515751965e-05, "loss": 2.3293, "mean_token_accuracy": 0.42758620977401735, "step": 129800 }, { "epoch": 0.13074097463742937, "grad_norm": 16.38627217214197, "learning_rate": 4.920273629552728e-05, "loss": 2.1937, "mean_token_accuracy": 0.46382335424423216, "step": 129805 }, { "epoch": 0.13074601069053354, "grad_norm": 9.930145538067137, "learning_rate": 4.920263742751557e-05, "loss": 2.1672, "mean_token_accuracy": 0.48275862336158754, "step": 129810 }, { "epoch": 0.1307510467436377, "grad_norm": 11.596086022056923, "learning_rate": 4.920253855348456e-05, "loss": 2.28, "mean_token_accuracy": 0.4482758641242981, "step": 129815 }, { "epoch": 0.13075608279674186, "grad_norm": 12.876303231028354, "learning_rate": 4.920243967343426e-05, "loss": 2.6879, "mean_token_accuracy": 0.43448275327682495, "step": 129820 }, { "epoch": 0.13076111884984604, "grad_norm": 9.588606886560406, "learning_rate": 4.920234078736471e-05, "loss": 2.44, "mean_token_accuracy": 0.4034482777118683, "step": 129825 }, { "epoch": 0.1307661549029502, "grad_norm": 12.695248496820787, "learning_rate": 4.920224189527594e-05, "loss": 2.6157, "mean_token_accuracy": 0.37241379618644715, "step": 129830 }, { "epoch": 0.13077119095605438, "grad_norm": 13.50772015646374, "learning_rate": 4.920214299716797e-05, "loss": 2.6908, "mean_token_accuracy": 0.35517241060733795, "step": 129835 }, { "epoch": 0.13077622700915856, "grad_norm": 12.512866530151031, "learning_rate": 4.920204409304083e-05, "loss": 2.396, "mean_token_accuracy": 0.43793103098869324, "step": 129840 }, { "epoch": 0.13078126306226273, "grad_norm": 9.551203519074571, "learning_rate": 4.920194518289454e-05, "loss": 2.4663, "mean_token_accuracy": 0.4310344815254211, "step": 129845 }, { "epoch": 0.1307862991153669, "grad_norm": 20.901340638137732, "learning_rate": 4.920184626672914e-05, "loss": 2.7596, "mean_token_accuracy": 0.41724138259887694, "step": 129850 }, { "epoch": 0.13079133516847108, "grad_norm": 15.215228871466978, "learning_rate": 4.920174734454465e-05, "loss": 2.6496, "mean_token_accuracy": 0.4068965554237366, "step": 129855 }, { "epoch": 0.13079637122157525, "grad_norm": 11.02966613076287, "learning_rate": 4.9201648416341106e-05, "loss": 2.4374, "mean_token_accuracy": 0.39310343861579894, "step": 129860 }, { "epoch": 0.13080140727467943, "grad_norm": 8.354521930561766, "learning_rate": 4.920154948211852e-05, "loss": 1.989, "mean_token_accuracy": 0.4931034445762634, "step": 129865 }, { "epoch": 0.1308064433277836, "grad_norm": 11.08835571405055, "learning_rate": 4.9201450541876944e-05, "loss": 2.5489, "mean_token_accuracy": 0.4103448331356049, "step": 129870 }, { "epoch": 0.13081147938088777, "grad_norm": 12.010531214067022, "learning_rate": 4.9201351595616376e-05, "loss": 2.269, "mean_token_accuracy": 0.44827587008476255, "step": 129875 }, { "epoch": 0.13081651543399195, "grad_norm": 15.212630984228232, "learning_rate": 4.920125264333685e-05, "loss": 2.5735, "mean_token_accuracy": 0.4137931078672409, "step": 129880 }, { "epoch": 0.13082155148709612, "grad_norm": 11.780594254963692, "learning_rate": 4.920115368503842e-05, "loss": 2.4857, "mean_token_accuracy": 0.42758620381355283, "step": 129885 }, { "epoch": 0.1308265875402003, "grad_norm": 12.696101235434234, "learning_rate": 4.9201054720721076e-05, "loss": 2.2825, "mean_token_accuracy": 0.48275862336158754, "step": 129890 }, { "epoch": 0.13083162359330447, "grad_norm": 13.458449655521985, "learning_rate": 4.920095575038488e-05, "loss": 2.2646, "mean_token_accuracy": 0.48275862336158754, "step": 129895 }, { "epoch": 0.13083665964640864, "grad_norm": 10.629783900166613, "learning_rate": 4.920085677402983e-05, "loss": 2.2409, "mean_token_accuracy": 0.47586206197738645, "step": 129900 }, { "epoch": 0.13084169569951282, "grad_norm": 7.100971966958274, "learning_rate": 4.9200757791655973e-05, "loss": 1.7576, "mean_token_accuracy": 0.5264367878437042, "step": 129905 }, { "epoch": 0.130846731752617, "grad_norm": 10.72771873207638, "learning_rate": 4.920065880326333e-05, "loss": 2.6802, "mean_token_accuracy": 0.42068964838981626, "step": 129910 }, { "epoch": 0.13085176780572116, "grad_norm": 14.13485711279724, "learning_rate": 4.920055980885192e-05, "loss": 2.2795, "mean_token_accuracy": 0.45517241954803467, "step": 129915 }, { "epoch": 0.13085680385882534, "grad_norm": 10.808975715900987, "learning_rate": 4.920046080842179e-05, "loss": 2.5262, "mean_token_accuracy": 0.4068965494632721, "step": 129920 }, { "epoch": 0.1308618399119295, "grad_norm": 9.810444991375585, "learning_rate": 4.9200361801972953e-05, "loss": 2.4169, "mean_token_accuracy": 0.43103447556495667, "step": 129925 }, { "epoch": 0.13086687596503369, "grad_norm": 10.527248837170186, "learning_rate": 4.920026278950544e-05, "loss": 2.2655, "mean_token_accuracy": 0.4620689690113068, "step": 129930 }, { "epoch": 0.13087191201813786, "grad_norm": 10.518438006204596, "learning_rate": 4.920016377101928e-05, "loss": 2.2862, "mean_token_accuracy": 0.4705989181995392, "step": 129935 }, { "epoch": 0.13087694807124203, "grad_norm": 9.189124120088655, "learning_rate": 4.920006474651448e-05, "loss": 2.549, "mean_token_accuracy": 0.38965516686439516, "step": 129940 }, { "epoch": 0.1308819841243462, "grad_norm": 11.018723028212438, "learning_rate": 4.919996571599111e-05, "loss": 2.0742, "mean_token_accuracy": 0.4413793087005615, "step": 129945 }, { "epoch": 0.13088702017745038, "grad_norm": 11.193656102346607, "learning_rate": 4.919986667944917e-05, "loss": 2.9236, "mean_token_accuracy": 0.3896551728248596, "step": 129950 }, { "epoch": 0.13089205623055453, "grad_norm": 9.719023521199578, "learning_rate": 4.919976763688868e-05, "loss": 2.1043, "mean_token_accuracy": 0.41379310488700866, "step": 129955 }, { "epoch": 0.1308970922836587, "grad_norm": 13.20713095654652, "learning_rate": 4.919966858830969e-05, "loss": 2.4281, "mean_token_accuracy": 0.458620685338974, "step": 129960 }, { "epoch": 0.13090212833676287, "grad_norm": 10.419874370579727, "learning_rate": 4.9199569533712205e-05, "loss": 2.7523, "mean_token_accuracy": 0.3827586144208908, "step": 129965 }, { "epoch": 0.13090716438986705, "grad_norm": 12.196057513270569, "learning_rate": 4.919947047309627e-05, "loss": 2.8233, "mean_token_accuracy": 0.3655172407627106, "step": 129970 }, { "epoch": 0.13091220044297122, "grad_norm": 9.877991174780144, "learning_rate": 4.919937140646191e-05, "loss": 2.1484, "mean_token_accuracy": 0.42758620381355283, "step": 129975 }, { "epoch": 0.1309172364960754, "grad_norm": 8.838912440528503, "learning_rate": 4.919927233380913e-05, "loss": 2.3966, "mean_token_accuracy": 0.4344827592372894, "step": 129980 }, { "epoch": 0.13092227254917957, "grad_norm": 9.436291529241307, "learning_rate": 4.9199173255138e-05, "loss": 2.1374, "mean_token_accuracy": 0.49546279907226565, "step": 129985 }, { "epoch": 0.13092730860228374, "grad_norm": 11.91555615954252, "learning_rate": 4.9199074170448505e-05, "loss": 2.4139, "mean_token_accuracy": 0.4018753796815872, "step": 129990 }, { "epoch": 0.13093234465538792, "grad_norm": 9.73781943289101, "learning_rate": 4.91989750797407e-05, "loss": 2.8005, "mean_token_accuracy": 0.417241370677948, "step": 129995 }, { "epoch": 0.1309373807084921, "grad_norm": 10.608270433801268, "learning_rate": 4.91988759830146e-05, "loss": 1.9521, "mean_token_accuracy": 0.5195402264595032, "step": 130000 }, { "epoch": 0.13094241676159626, "grad_norm": 10.102706112430658, "learning_rate": 4.919877688027024e-05, "loss": 2.2082, "mean_token_accuracy": 0.4379310369491577, "step": 130005 }, { "epoch": 0.13094745281470044, "grad_norm": 10.551981193913814, "learning_rate": 4.919867777150764e-05, "loss": 2.3321, "mean_token_accuracy": 0.4413793087005615, "step": 130010 }, { "epoch": 0.1309524888678046, "grad_norm": 11.997734580941797, "learning_rate": 4.919857865672683e-05, "loss": 2.4767, "mean_token_accuracy": 0.3931034505367279, "step": 130015 }, { "epoch": 0.13095752492090879, "grad_norm": 10.408937049720626, "learning_rate": 4.919847953592784e-05, "loss": 2.5107, "mean_token_accuracy": 0.4172413766384125, "step": 130020 }, { "epoch": 0.13096256097401296, "grad_norm": 14.602571703561534, "learning_rate": 4.91983804091107e-05, "loss": 2.2086, "mean_token_accuracy": 0.43793103098869324, "step": 130025 }, { "epoch": 0.13096759702711713, "grad_norm": 15.360068161103989, "learning_rate": 4.919828127627543e-05, "loss": 2.2283, "mean_token_accuracy": 0.47931034564971925, "step": 130030 }, { "epoch": 0.1309726330802213, "grad_norm": 16.08048085303682, "learning_rate": 4.919818213742205e-05, "loss": 2.4826, "mean_token_accuracy": 0.42758620977401735, "step": 130035 }, { "epoch": 0.13097766913332548, "grad_norm": 9.753755326232202, "learning_rate": 4.91980829925506e-05, "loss": 2.3319, "mean_token_accuracy": 0.42413793206214906, "step": 130040 }, { "epoch": 0.13098270518642965, "grad_norm": 10.062689479623907, "learning_rate": 4.9197983841661124e-05, "loss": 2.5425, "mean_token_accuracy": 0.4, "step": 130045 }, { "epoch": 0.13098774123953383, "grad_norm": 9.066139539157533, "learning_rate": 4.919788468475362e-05, "loss": 2.39, "mean_token_accuracy": 0.46206897497177124, "step": 130050 }, { "epoch": 0.130992777292638, "grad_norm": 8.655193501532251, "learning_rate": 4.919778552182813e-05, "loss": 2.606, "mean_token_accuracy": 0.45172414779663084, "step": 130055 }, { "epoch": 0.13099781334574218, "grad_norm": 9.63673942337131, "learning_rate": 4.919768635288467e-05, "loss": 2.4175, "mean_token_accuracy": 0.39147005677223207, "step": 130060 }, { "epoch": 0.13100284939884635, "grad_norm": 11.109108598482244, "learning_rate": 4.9197587177923285e-05, "loss": 3.0937, "mean_token_accuracy": 0.35172413289546967, "step": 130065 }, { "epoch": 0.13100788545195052, "grad_norm": 14.704660697402826, "learning_rate": 4.9197487996943985e-05, "loss": 2.161, "mean_token_accuracy": 0.49999999403953554, "step": 130070 }, { "epoch": 0.1310129215050547, "grad_norm": 7.641346943298125, "learning_rate": 4.919738880994681e-05, "loss": 2.129, "mean_token_accuracy": 0.44827585816383364, "step": 130075 }, { "epoch": 0.13101795755815887, "grad_norm": 9.729897206417885, "learning_rate": 4.9197289616931786e-05, "loss": 2.3801, "mean_token_accuracy": 0.4620689630508423, "step": 130080 }, { "epoch": 0.13102299361126304, "grad_norm": 11.985453882281405, "learning_rate": 4.9197190417898934e-05, "loss": 2.8645, "mean_token_accuracy": 0.38620689511299133, "step": 130085 }, { "epoch": 0.13102802966436722, "grad_norm": 10.392842210404202, "learning_rate": 4.9197091212848286e-05, "loss": 2.0525, "mean_token_accuracy": 0.45517241954803467, "step": 130090 }, { "epoch": 0.13103306571747136, "grad_norm": 11.203783979656587, "learning_rate": 4.9196992001779875e-05, "loss": 2.5856, "mean_token_accuracy": 0.4068965554237366, "step": 130095 }, { "epoch": 0.13103810177057554, "grad_norm": 9.506109214061613, "learning_rate": 4.9196892784693716e-05, "loss": 2.6634, "mean_token_accuracy": 0.3965517282485962, "step": 130100 }, { "epoch": 0.1310431378236797, "grad_norm": 9.016042048450963, "learning_rate": 4.919679356158984e-05, "loss": 2.1301, "mean_token_accuracy": 0.4310344815254211, "step": 130105 }, { "epoch": 0.13104817387678389, "grad_norm": 12.493404828163342, "learning_rate": 4.919669433246828e-05, "loss": 2.7075, "mean_token_accuracy": 0.4103448331356049, "step": 130110 }, { "epoch": 0.13105320992988806, "grad_norm": 11.324293391799031, "learning_rate": 4.919659509732906e-05, "loss": 2.3695, "mean_token_accuracy": 0.4448275864124298, "step": 130115 }, { "epoch": 0.13105824598299223, "grad_norm": 12.893193860867884, "learning_rate": 4.919649585617221e-05, "loss": 2.4074, "mean_token_accuracy": 0.39655172228813174, "step": 130120 }, { "epoch": 0.1310632820360964, "grad_norm": 11.579970701681532, "learning_rate": 4.9196396608997754e-05, "loss": 2.5701, "mean_token_accuracy": 0.4395644307136536, "step": 130125 }, { "epoch": 0.13106831808920058, "grad_norm": 9.39088508223922, "learning_rate": 4.919629735580573e-05, "loss": 2.2407, "mean_token_accuracy": 0.42758620381355283, "step": 130130 }, { "epoch": 0.13107335414230475, "grad_norm": 10.649754037464291, "learning_rate": 4.919619809659615e-05, "loss": 2.5375, "mean_token_accuracy": 0.4034482717514038, "step": 130135 }, { "epoch": 0.13107839019540893, "grad_norm": 11.553762091100667, "learning_rate": 4.919609883136905e-05, "loss": 2.4565, "mean_token_accuracy": 0.4034482717514038, "step": 130140 }, { "epoch": 0.1310834262485131, "grad_norm": 13.648557332774484, "learning_rate": 4.9195999560124455e-05, "loss": 2.5649, "mean_token_accuracy": 0.458620685338974, "step": 130145 }, { "epoch": 0.13108846230161728, "grad_norm": 11.224593578664763, "learning_rate": 4.91959002828624e-05, "loss": 2.3424, "mean_token_accuracy": 0.44827585816383364, "step": 130150 }, { "epoch": 0.13109349835472145, "grad_norm": 9.896415497485727, "learning_rate": 4.919580099958289e-05, "loss": 2.4924, "mean_token_accuracy": 0.38965516686439516, "step": 130155 }, { "epoch": 0.13109853440782562, "grad_norm": 9.984960096864834, "learning_rate": 4.919570171028598e-05, "loss": 2.4295, "mean_token_accuracy": 0.46551724672317507, "step": 130160 }, { "epoch": 0.1311035704609298, "grad_norm": 9.852320387332838, "learning_rate": 4.919560241497168e-05, "loss": 2.6723, "mean_token_accuracy": 0.41209921836853025, "step": 130165 }, { "epoch": 0.13110860651403397, "grad_norm": 10.516934470168565, "learning_rate": 4.9195503113640036e-05, "loss": 2.3592, "mean_token_accuracy": 0.4551724076271057, "step": 130170 }, { "epoch": 0.13111364256713814, "grad_norm": 10.079109944616036, "learning_rate": 4.919540380629105e-05, "loss": 2.3045, "mean_token_accuracy": 0.4482758641242981, "step": 130175 }, { "epoch": 0.13111867862024232, "grad_norm": 11.082892332421725, "learning_rate": 4.919530449292477e-05, "loss": 2.4649, "mean_token_accuracy": 0.3793103456497192, "step": 130180 }, { "epoch": 0.1311237146733465, "grad_norm": 11.958995615805087, "learning_rate": 4.919520517354121e-05, "loss": 2.6175, "mean_token_accuracy": 0.4, "step": 130185 }, { "epoch": 0.13112875072645067, "grad_norm": 9.698069856827663, "learning_rate": 4.919510584814042e-05, "loss": 2.5469, "mean_token_accuracy": 0.42413793206214906, "step": 130190 }, { "epoch": 0.13113378677955484, "grad_norm": 11.555025109812906, "learning_rate": 4.9195006516722396e-05, "loss": 2.5379, "mean_token_accuracy": 0.42413793206214906, "step": 130195 }, { "epoch": 0.131138822832659, "grad_norm": 10.856936120673042, "learning_rate": 4.919490717928718e-05, "loss": 2.7043, "mean_token_accuracy": 0.4344827473163605, "step": 130200 }, { "epoch": 0.1311438588857632, "grad_norm": 9.153556648880983, "learning_rate": 4.919480783583481e-05, "loss": 2.9457, "mean_token_accuracy": 0.44343616962432864, "step": 130205 }, { "epoch": 0.13114889493886736, "grad_norm": 10.072519646850195, "learning_rate": 4.9194708486365295e-05, "loss": 2.2067, "mean_token_accuracy": 0.458620685338974, "step": 130210 }, { "epoch": 0.13115393099197153, "grad_norm": 13.775635477351994, "learning_rate": 4.9194609130878675e-05, "loss": 3.1039, "mean_token_accuracy": 0.3379310369491577, "step": 130215 }, { "epoch": 0.1311589670450757, "grad_norm": 10.124892147619125, "learning_rate": 4.919450976937498e-05, "loss": 2.3458, "mean_token_accuracy": 0.3862069010734558, "step": 130220 }, { "epoch": 0.13116400309817988, "grad_norm": 8.653004527765933, "learning_rate": 4.919441040185423e-05, "loss": 2.0558, "mean_token_accuracy": 0.5, "step": 130225 }, { "epoch": 0.13116903915128406, "grad_norm": 10.051151348138543, "learning_rate": 4.919431102831645e-05, "loss": 2.3128, "mean_token_accuracy": 0.42583181858062746, "step": 130230 }, { "epoch": 0.1311740752043882, "grad_norm": 9.678135859494779, "learning_rate": 4.919421164876167e-05, "loss": 2.0366, "mean_token_accuracy": 0.5034482717514038, "step": 130235 }, { "epoch": 0.13117911125749238, "grad_norm": 13.76658112564712, "learning_rate": 4.919411226318993e-05, "loss": 2.9472, "mean_token_accuracy": 0.3620689630508423, "step": 130240 }, { "epoch": 0.13118414731059655, "grad_norm": 9.82820005691906, "learning_rate": 4.9194012871601236e-05, "loss": 2.4012, "mean_token_accuracy": 0.42413793206214906, "step": 130245 }, { "epoch": 0.13118918336370072, "grad_norm": 10.867473851137854, "learning_rate": 4.919391347399563e-05, "loss": 2.4646, "mean_token_accuracy": 0.4310344815254211, "step": 130250 }, { "epoch": 0.1311942194168049, "grad_norm": 8.560602894269996, "learning_rate": 4.9193814070373145e-05, "loss": 2.3708, "mean_token_accuracy": 0.4034482777118683, "step": 130255 }, { "epoch": 0.13119925546990907, "grad_norm": 13.70526708950858, "learning_rate": 4.919371466073378e-05, "loss": 2.6405, "mean_token_accuracy": 0.4, "step": 130260 }, { "epoch": 0.13120429152301324, "grad_norm": 11.987382850126089, "learning_rate": 4.91936152450776e-05, "loss": 2.6374, "mean_token_accuracy": 0.37586206793785093, "step": 130265 }, { "epoch": 0.13120932757611742, "grad_norm": 12.218585208855114, "learning_rate": 4.9193515823404614e-05, "loss": 2.7804, "mean_token_accuracy": 0.42068964838981626, "step": 130270 }, { "epoch": 0.1312143636292216, "grad_norm": 10.83111027273831, "learning_rate": 4.919341639571484e-05, "loss": 2.3029, "mean_token_accuracy": 0.4206896543502808, "step": 130275 }, { "epoch": 0.13121939968232577, "grad_norm": 10.80114190478455, "learning_rate": 4.9193316962008325e-05, "loss": 2.7782, "mean_token_accuracy": 0.3965517282485962, "step": 130280 }, { "epoch": 0.13122443573542994, "grad_norm": 11.532907977641091, "learning_rate": 4.919321752228508e-05, "loss": 2.3239, "mean_token_accuracy": 0.43103448748588563, "step": 130285 }, { "epoch": 0.1312294717885341, "grad_norm": 11.346798519587363, "learning_rate": 4.919311807654514e-05, "loss": 2.461, "mean_token_accuracy": 0.41379311084747317, "step": 130290 }, { "epoch": 0.1312345078416383, "grad_norm": 14.882312863508448, "learning_rate": 4.919301862478855e-05, "loss": 2.7366, "mean_token_accuracy": 0.4122202038764954, "step": 130295 }, { "epoch": 0.13123954389474246, "grad_norm": 11.56274694643577, "learning_rate": 4.91929191670153e-05, "loss": 2.457, "mean_token_accuracy": 0.4000000059604645, "step": 130300 }, { "epoch": 0.13124457994784663, "grad_norm": 9.154972583480959, "learning_rate": 4.919281970322545e-05, "loss": 2.1285, "mean_token_accuracy": 0.4448275864124298, "step": 130305 }, { "epoch": 0.1312496160009508, "grad_norm": 12.123011151756938, "learning_rate": 4.919272023341901e-05, "loss": 2.6897, "mean_token_accuracy": 0.4551724135875702, "step": 130310 }, { "epoch": 0.13125465205405498, "grad_norm": 14.504716702752743, "learning_rate": 4.919262075759601e-05, "loss": 2.4632, "mean_token_accuracy": 0.4172413766384125, "step": 130315 }, { "epoch": 0.13125968810715916, "grad_norm": 11.232959252033986, "learning_rate": 4.9192521275756487e-05, "loss": 2.4201, "mean_token_accuracy": 0.3931034505367279, "step": 130320 }, { "epoch": 0.13126472416026333, "grad_norm": 14.172836088682345, "learning_rate": 4.9192421787900463e-05, "loss": 2.932, "mean_token_accuracy": 0.3655172407627106, "step": 130325 }, { "epoch": 0.1312697602133675, "grad_norm": 11.092964364126177, "learning_rate": 4.919232229402796e-05, "loss": 2.3312, "mean_token_accuracy": 0.4517241358757019, "step": 130330 }, { "epoch": 0.13127479626647168, "grad_norm": 9.144583078035716, "learning_rate": 4.919222279413901e-05, "loss": 2.3342, "mean_token_accuracy": 0.4413793087005615, "step": 130335 }, { "epoch": 0.13127983231957585, "grad_norm": 11.247149482203813, "learning_rate": 4.919212328823365e-05, "loss": 2.5313, "mean_token_accuracy": 0.4206896543502808, "step": 130340 }, { "epoch": 0.13128486837268002, "grad_norm": 22.56215493686229, "learning_rate": 4.9192023776311885e-05, "loss": 2.6521, "mean_token_accuracy": 0.44827585816383364, "step": 130345 }, { "epoch": 0.1312899044257842, "grad_norm": 11.4417838074006, "learning_rate": 4.9191924258373765e-05, "loss": 2.2865, "mean_token_accuracy": 0.4586206912994385, "step": 130350 }, { "epoch": 0.13129494047888837, "grad_norm": 10.750042411820706, "learning_rate": 4.9191824734419305e-05, "loss": 2.936, "mean_token_accuracy": 0.38977832198143003, "step": 130355 }, { "epoch": 0.13129997653199255, "grad_norm": 8.960459162968913, "learning_rate": 4.919172520444854e-05, "loss": 2.3058, "mean_token_accuracy": 0.4568663060665131, "step": 130360 }, { "epoch": 0.13130501258509672, "grad_norm": 10.906366735402356, "learning_rate": 4.9191625668461495e-05, "loss": 2.4839, "mean_token_accuracy": 0.42577131986618044, "step": 130365 }, { "epoch": 0.1313100486382009, "grad_norm": 10.94864501521933, "learning_rate": 4.919152612645819e-05, "loss": 2.2908, "mean_token_accuracy": 0.4310344815254211, "step": 130370 }, { "epoch": 0.13131508469130504, "grad_norm": 13.608887585731715, "learning_rate": 4.919142657843867e-05, "loss": 2.7443, "mean_token_accuracy": 0.35862069129943847, "step": 130375 }, { "epoch": 0.1313201207444092, "grad_norm": 13.200972101879383, "learning_rate": 4.919132702440294e-05, "loss": 2.3799, "mean_token_accuracy": 0.43448275327682495, "step": 130380 }, { "epoch": 0.1313251567975134, "grad_norm": 10.049640929350957, "learning_rate": 4.919122746435105e-05, "loss": 2.1232, "mean_token_accuracy": 0.4344827592372894, "step": 130385 }, { "epoch": 0.13133019285061756, "grad_norm": 11.199253034482947, "learning_rate": 4.919112789828301e-05, "loss": 2.418, "mean_token_accuracy": 0.38620689511299133, "step": 130390 }, { "epoch": 0.13133522890372173, "grad_norm": 10.338739335677511, "learning_rate": 4.919102832619886e-05, "loss": 2.2746, "mean_token_accuracy": 0.5088929176330567, "step": 130395 }, { "epoch": 0.1313402649568259, "grad_norm": 10.99225191177493, "learning_rate": 4.919092874809862e-05, "loss": 2.355, "mean_token_accuracy": 0.46358135938644407, "step": 130400 }, { "epoch": 0.13134530100993008, "grad_norm": 8.993524183841702, "learning_rate": 4.9190829163982315e-05, "loss": 2.3243, "mean_token_accuracy": 0.42232305407524107, "step": 130405 }, { "epoch": 0.13135033706303426, "grad_norm": 16.099817940591038, "learning_rate": 4.9190729573849984e-05, "loss": 2.7001, "mean_token_accuracy": 0.3931034505367279, "step": 130410 }, { "epoch": 0.13135537311613843, "grad_norm": 12.36970136626534, "learning_rate": 4.9190629977701646e-05, "loss": 2.4489, "mean_token_accuracy": 0.40895341634750365, "step": 130415 }, { "epoch": 0.1313604091692426, "grad_norm": 12.385607328677422, "learning_rate": 4.919053037553733e-05, "loss": 2.2225, "mean_token_accuracy": 0.47447065711021424, "step": 130420 }, { "epoch": 0.13136544522234678, "grad_norm": 12.861950985129017, "learning_rate": 4.919043076735706e-05, "loss": 2.3408, "mean_token_accuracy": 0.4379310369491577, "step": 130425 }, { "epoch": 0.13137048127545095, "grad_norm": 11.269090545533686, "learning_rate": 4.9190331153160884e-05, "loss": 2.5385, "mean_token_accuracy": 0.4107142806053162, "step": 130430 }, { "epoch": 0.13137551732855512, "grad_norm": 11.35043243277724, "learning_rate": 4.919023153294881e-05, "loss": 2.3926, "mean_token_accuracy": 0.4586206912994385, "step": 130435 }, { "epoch": 0.1313805533816593, "grad_norm": 10.005446994077566, "learning_rate": 4.919013190672086e-05, "loss": 2.3473, "mean_token_accuracy": 0.4551724135875702, "step": 130440 }, { "epoch": 0.13138558943476347, "grad_norm": 11.845036976035948, "learning_rate": 4.919003227447707e-05, "loss": 2.247, "mean_token_accuracy": 0.4551724135875702, "step": 130445 }, { "epoch": 0.13139062548786765, "grad_norm": 9.652152332640988, "learning_rate": 4.918993263621747e-05, "loss": 2.4258, "mean_token_accuracy": 0.420689657330513, "step": 130450 }, { "epoch": 0.13139566154097182, "grad_norm": 17.78756715625396, "learning_rate": 4.9189832991942086e-05, "loss": 2.4968, "mean_token_accuracy": 0.42068966031074523, "step": 130455 }, { "epoch": 0.131400697594076, "grad_norm": 11.840060615482374, "learning_rate": 4.9189733341650954e-05, "loss": 2.2451, "mean_token_accuracy": 0.4689655125141144, "step": 130460 }, { "epoch": 0.13140573364718017, "grad_norm": 11.038203201361574, "learning_rate": 4.9189633685344086e-05, "loss": 2.3946, "mean_token_accuracy": 0.41379310488700866, "step": 130465 }, { "epoch": 0.13141076970028434, "grad_norm": 11.539909051254734, "learning_rate": 4.9189534023021516e-05, "loss": 2.507, "mean_token_accuracy": 0.4034482777118683, "step": 130470 }, { "epoch": 0.13141580575338851, "grad_norm": 10.396043621501276, "learning_rate": 4.918943435468328e-05, "loss": 2.5884, "mean_token_accuracy": 0.3896551728248596, "step": 130475 }, { "epoch": 0.1314208418064927, "grad_norm": 8.713508039669044, "learning_rate": 4.9189334680329396e-05, "loss": 2.0885, "mean_token_accuracy": 0.5068965435028077, "step": 130480 }, { "epoch": 0.13142587785959686, "grad_norm": 10.718424930014715, "learning_rate": 4.91892349999599e-05, "loss": 2.3036, "mean_token_accuracy": 0.43793103098869324, "step": 130485 }, { "epoch": 0.13143091391270104, "grad_norm": 10.499580053717974, "learning_rate": 4.9189135313574804e-05, "loss": 2.4824, "mean_token_accuracy": 0.3931034505367279, "step": 130490 }, { "epoch": 0.1314359499658052, "grad_norm": 11.643215205789456, "learning_rate": 4.918903562117415e-05, "loss": 2.4437, "mean_token_accuracy": 0.4241379380226135, "step": 130495 }, { "epoch": 0.13144098601890938, "grad_norm": 11.751402074726798, "learning_rate": 4.918893592275795e-05, "loss": 2.4656, "mean_token_accuracy": 0.4068965494632721, "step": 130500 }, { "epoch": 0.13144602207201356, "grad_norm": 10.315020227886011, "learning_rate": 4.9188836218326257e-05, "loss": 2.5402, "mean_token_accuracy": 0.4000000059604645, "step": 130505 }, { "epoch": 0.13145105812511773, "grad_norm": 8.686696753774982, "learning_rate": 4.918873650787908e-05, "loss": 1.9837, "mean_token_accuracy": 0.46073804795742035, "step": 130510 }, { "epoch": 0.13145609417822188, "grad_norm": 10.651359576102672, "learning_rate": 4.918863679141645e-05, "loss": 2.2277, "mean_token_accuracy": 0.47241379618644713, "step": 130515 }, { "epoch": 0.13146113023132605, "grad_norm": 9.191772021659512, "learning_rate": 4.91885370689384e-05, "loss": 2.1276, "mean_token_accuracy": 0.45662432312965395, "step": 130520 }, { "epoch": 0.13146616628443022, "grad_norm": 9.553214576997286, "learning_rate": 4.918843734044495e-05, "loss": 2.5742, "mean_token_accuracy": 0.417241370677948, "step": 130525 }, { "epoch": 0.1314712023375344, "grad_norm": 12.024252790485859, "learning_rate": 4.9188337605936124e-05, "loss": 2.6667, "mean_token_accuracy": 0.391349059343338, "step": 130530 }, { "epoch": 0.13147623839063857, "grad_norm": 9.581341588144074, "learning_rate": 4.918823786541197e-05, "loss": 2.6407, "mean_token_accuracy": 0.3965517282485962, "step": 130535 }, { "epoch": 0.13148127444374275, "grad_norm": 10.291594235321346, "learning_rate": 4.918813811887249e-05, "loss": 2.3599, "mean_token_accuracy": 0.4172413766384125, "step": 130540 }, { "epoch": 0.13148631049684692, "grad_norm": 8.77402733319208, "learning_rate": 4.918803836631773e-05, "loss": 2.2768, "mean_token_accuracy": 0.45735026597976686, "step": 130545 }, { "epoch": 0.1314913465499511, "grad_norm": 11.694059760426102, "learning_rate": 4.9187938607747706e-05, "loss": 2.471, "mean_token_accuracy": 0.4534180223941803, "step": 130550 }, { "epoch": 0.13149638260305527, "grad_norm": 9.272513535748267, "learning_rate": 4.918783884316246e-05, "loss": 2.3881, "mean_token_accuracy": 0.4620689690113068, "step": 130555 }, { "epoch": 0.13150141865615944, "grad_norm": 12.489392368658415, "learning_rate": 4.9187739072562e-05, "loss": 2.4079, "mean_token_accuracy": 0.43103448748588563, "step": 130560 }, { "epoch": 0.13150645470926361, "grad_norm": 10.428182844686333, "learning_rate": 4.9187639295946376e-05, "loss": 2.3662, "mean_token_accuracy": 0.4448275864124298, "step": 130565 }, { "epoch": 0.1315114907623678, "grad_norm": 14.591326633318399, "learning_rate": 4.91875395133156e-05, "loss": 2.4757, "mean_token_accuracy": 0.40550513863563536, "step": 130570 }, { "epoch": 0.13151652681547196, "grad_norm": 10.509598139622824, "learning_rate": 4.918743972466971e-05, "loss": 2.3434, "mean_token_accuracy": 0.44137930274009707, "step": 130575 }, { "epoch": 0.13152156286857614, "grad_norm": 10.638161909813379, "learning_rate": 4.918733993000872e-05, "loss": 2.4599, "mean_token_accuracy": 0.42413793206214906, "step": 130580 }, { "epoch": 0.1315265989216803, "grad_norm": 11.120027321832138, "learning_rate": 4.9187240129332666e-05, "loss": 2.1883, "mean_token_accuracy": 0.4626134276390076, "step": 130585 }, { "epoch": 0.13153163497478448, "grad_norm": 12.451789154309804, "learning_rate": 4.9187140322641577e-05, "loss": 2.568, "mean_token_accuracy": 0.43448275327682495, "step": 130590 }, { "epoch": 0.13153667102788866, "grad_norm": 10.354979661515424, "learning_rate": 4.9187040509935476e-05, "loss": 2.5616, "mean_token_accuracy": 0.42758620977401735, "step": 130595 }, { "epoch": 0.13154170708099283, "grad_norm": 9.640153981685936, "learning_rate": 4.91869406912144e-05, "loss": 2.6799, "mean_token_accuracy": 0.3999999940395355, "step": 130600 }, { "epoch": 0.131546743134097, "grad_norm": 8.968165629848603, "learning_rate": 4.918684086647836e-05, "loss": 2.4565, "mean_token_accuracy": 0.4137930989265442, "step": 130605 }, { "epoch": 0.13155177918720118, "grad_norm": 11.945135896989738, "learning_rate": 4.918674103572741e-05, "loss": 2.0696, "mean_token_accuracy": 0.4344827592372894, "step": 130610 }, { "epoch": 0.13155681524030535, "grad_norm": 10.251090533862238, "learning_rate": 4.918664119896154e-05, "loss": 2.384, "mean_token_accuracy": 0.42758620977401735, "step": 130615 }, { "epoch": 0.13156185129340953, "grad_norm": 10.679151359396819, "learning_rate": 4.9186541356180817e-05, "loss": 2.2952, "mean_token_accuracy": 0.4068965494632721, "step": 130620 }, { "epoch": 0.1315668873465137, "grad_norm": 10.267037817347953, "learning_rate": 4.918644150738524e-05, "loss": 2.3921, "mean_token_accuracy": 0.42068966031074523, "step": 130625 }, { "epoch": 0.13157192339961787, "grad_norm": 11.9522428317448, "learning_rate": 4.918634165257485e-05, "loss": 2.0382, "mean_token_accuracy": 0.46896551847457885, "step": 130630 }, { "epoch": 0.13157695945272205, "grad_norm": 10.786578199414956, "learning_rate": 4.9186241791749674e-05, "loss": 2.4337, "mean_token_accuracy": 0.4068965554237366, "step": 130635 }, { "epoch": 0.13158199550582622, "grad_norm": 9.12774338031861, "learning_rate": 4.918614192490974e-05, "loss": 2.2604, "mean_token_accuracy": 0.43103447556495667, "step": 130640 }, { "epoch": 0.1315870315589304, "grad_norm": 10.972530098819206, "learning_rate": 4.9186042052055075e-05, "loss": 2.3279, "mean_token_accuracy": 0.41724138259887694, "step": 130645 }, { "epoch": 0.13159206761203457, "grad_norm": 11.419850076009672, "learning_rate": 4.91859421731857e-05, "loss": 2.2267, "mean_token_accuracy": 0.4517241418361664, "step": 130650 }, { "epoch": 0.13159710366513871, "grad_norm": 9.51567772329566, "learning_rate": 4.9185842288301644e-05, "loss": 2.2613, "mean_token_accuracy": 0.46551724672317507, "step": 130655 }, { "epoch": 0.1316021397182429, "grad_norm": 9.419136121885083, "learning_rate": 4.918574239740294e-05, "loss": 2.2573, "mean_token_accuracy": 0.41034482717514037, "step": 130660 }, { "epoch": 0.13160717577134706, "grad_norm": 10.858073589711925, "learning_rate": 4.9185642500489624e-05, "loss": 2.4129, "mean_token_accuracy": 0.4, "step": 130665 }, { "epoch": 0.13161221182445124, "grad_norm": 10.099936893414954, "learning_rate": 4.9185542597561714e-05, "loss": 2.4664, "mean_token_accuracy": 0.38620689511299133, "step": 130670 }, { "epoch": 0.1316172478775554, "grad_norm": 8.753919941448983, "learning_rate": 4.918544268861923e-05, "loss": 2.4327, "mean_token_accuracy": 0.4586206912994385, "step": 130675 }, { "epoch": 0.13162228393065958, "grad_norm": 10.99113104929302, "learning_rate": 4.9185342773662214e-05, "loss": 2.262, "mean_token_accuracy": 0.4448275864124298, "step": 130680 }, { "epoch": 0.13162731998376376, "grad_norm": 9.908806389853094, "learning_rate": 4.918524285269068e-05, "loss": 2.5109, "mean_token_accuracy": 0.4689655125141144, "step": 130685 }, { "epoch": 0.13163235603686793, "grad_norm": 12.39326867247831, "learning_rate": 4.918514292570466e-05, "loss": 2.7155, "mean_token_accuracy": 0.38965516686439516, "step": 130690 }, { "epoch": 0.1316373920899721, "grad_norm": 10.27760616865546, "learning_rate": 4.9185042992704195e-05, "loss": 2.4212, "mean_token_accuracy": 0.40344828367233276, "step": 130695 }, { "epoch": 0.13164242814307628, "grad_norm": 9.075318294268499, "learning_rate": 4.91849430536893e-05, "loss": 2.0368, "mean_token_accuracy": 0.46896551847457885, "step": 130700 }, { "epoch": 0.13164746419618045, "grad_norm": 10.629530341777205, "learning_rate": 4.918484310866001e-05, "loss": 2.3232, "mean_token_accuracy": 0.4310344815254211, "step": 130705 }, { "epoch": 0.13165250024928463, "grad_norm": 12.129961913344362, "learning_rate": 4.918474315761634e-05, "loss": 2.5669, "mean_token_accuracy": 0.39655172526836396, "step": 130710 }, { "epoch": 0.1316575363023888, "grad_norm": 9.811054599151198, "learning_rate": 4.918464320055833e-05, "loss": 2.3931, "mean_token_accuracy": 0.48275862336158754, "step": 130715 }, { "epoch": 0.13166257235549297, "grad_norm": 10.855580974194785, "learning_rate": 4.9184543237485996e-05, "loss": 2.555, "mean_token_accuracy": 0.4034482717514038, "step": 130720 }, { "epoch": 0.13166760840859715, "grad_norm": 10.0511608861002, "learning_rate": 4.918444326839938e-05, "loss": 2.1263, "mean_token_accuracy": 0.43448275327682495, "step": 130725 }, { "epoch": 0.13167264446170132, "grad_norm": 11.743041098757228, "learning_rate": 4.91843432932985e-05, "loss": 2.9244, "mean_token_accuracy": 0.38620689511299133, "step": 130730 }, { "epoch": 0.1316776805148055, "grad_norm": 10.961361444324625, "learning_rate": 4.918424331218338e-05, "loss": 2.2351, "mean_token_accuracy": 0.4448275864124298, "step": 130735 }, { "epoch": 0.13168271656790967, "grad_norm": 11.201072956082047, "learning_rate": 4.9184143325054064e-05, "loss": 2.5289, "mean_token_accuracy": 0.3896551728248596, "step": 130740 }, { "epoch": 0.13168775262101384, "grad_norm": 11.143533927783068, "learning_rate": 4.918404333191057e-05, "loss": 2.4085, "mean_token_accuracy": 0.44482759237289426, "step": 130745 }, { "epoch": 0.13169278867411802, "grad_norm": 12.110253306948275, "learning_rate": 4.918394333275292e-05, "loss": 2.3231, "mean_token_accuracy": 0.4724137902259827, "step": 130750 }, { "epoch": 0.1316978247272222, "grad_norm": 31.580804448961075, "learning_rate": 4.9183843327581155e-05, "loss": 2.398, "mean_token_accuracy": 0.4482758641242981, "step": 130755 }, { "epoch": 0.13170286078032636, "grad_norm": 11.051335679790999, "learning_rate": 4.918374331639529e-05, "loss": 2.5759, "mean_token_accuracy": 0.4034482777118683, "step": 130760 }, { "epoch": 0.13170789683343054, "grad_norm": 11.582452527757654, "learning_rate": 4.918364329919536e-05, "loss": 2.3433, "mean_token_accuracy": 0.3999999940395355, "step": 130765 }, { "epoch": 0.1317129328865347, "grad_norm": 12.362602207543476, "learning_rate": 4.9183543275981385e-05, "loss": 2.4198, "mean_token_accuracy": 0.4172413766384125, "step": 130770 }, { "epoch": 0.13171796893963889, "grad_norm": 11.07896659049019, "learning_rate": 4.918344324675341e-05, "loss": 2.5332, "mean_token_accuracy": 0.4398669064044952, "step": 130775 }, { "epoch": 0.13172300499274306, "grad_norm": 9.869163444998218, "learning_rate": 4.918334321151144e-05, "loss": 2.0117, "mean_token_accuracy": 0.4344827592372894, "step": 130780 }, { "epoch": 0.13172804104584723, "grad_norm": 10.216398764595747, "learning_rate": 4.918324317025552e-05, "loss": 2.4519, "mean_token_accuracy": 0.4206896543502808, "step": 130785 }, { "epoch": 0.1317330770989514, "grad_norm": 10.647137697604052, "learning_rate": 4.9183143122985665e-05, "loss": 2.664, "mean_token_accuracy": 0.3689655065536499, "step": 130790 }, { "epoch": 0.13173811315205555, "grad_norm": 12.942946391372622, "learning_rate": 4.918304306970191e-05, "loss": 2.0517, "mean_token_accuracy": 0.48275861144065857, "step": 130795 }, { "epoch": 0.13174314920515973, "grad_norm": 12.145405589499571, "learning_rate": 4.9182943010404286e-05, "loss": 2.7871, "mean_token_accuracy": 0.37241379022598264, "step": 130800 }, { "epoch": 0.1317481852582639, "grad_norm": 10.703535198521816, "learning_rate": 4.9182842945092814e-05, "loss": 2.2691, "mean_token_accuracy": 0.43448275327682495, "step": 130805 }, { "epoch": 0.13175322131136807, "grad_norm": 9.716822014926146, "learning_rate": 4.918274287376753e-05, "loss": 2.5473, "mean_token_accuracy": 0.38965516686439516, "step": 130810 }, { "epoch": 0.13175825736447225, "grad_norm": 9.543300299718846, "learning_rate": 4.9182642796428455e-05, "loss": 2.2674, "mean_token_accuracy": 0.4294615864753723, "step": 130815 }, { "epoch": 0.13176329341757642, "grad_norm": 13.564384799143454, "learning_rate": 4.918254271307562e-05, "loss": 2.7759, "mean_token_accuracy": 0.4379310250282288, "step": 130820 }, { "epoch": 0.1317683294706806, "grad_norm": 10.11796654090193, "learning_rate": 4.9182442623709045e-05, "loss": 2.4708, "mean_token_accuracy": 0.493103438615799, "step": 130825 }, { "epoch": 0.13177336552378477, "grad_norm": 10.51085136920519, "learning_rate": 4.9182342528328764e-05, "loss": 2.3156, "mean_token_accuracy": 0.4068965494632721, "step": 130830 }, { "epoch": 0.13177840157688894, "grad_norm": 10.877044530946895, "learning_rate": 4.918224242693481e-05, "loss": 2.1231, "mean_token_accuracy": 0.4896551787853241, "step": 130835 }, { "epoch": 0.13178343762999312, "grad_norm": 9.954786739638214, "learning_rate": 4.9182142319527194e-05, "loss": 2.3727, "mean_token_accuracy": 0.43448275327682495, "step": 130840 }, { "epoch": 0.1317884736830973, "grad_norm": 10.956913633085835, "learning_rate": 4.918204220610597e-05, "loss": 2.2265, "mean_token_accuracy": 0.44482758045196535, "step": 130845 }, { "epoch": 0.13179350973620146, "grad_norm": 10.389404041662878, "learning_rate": 4.9181942086671134e-05, "loss": 2.1054, "mean_token_accuracy": 0.46896551847457885, "step": 130850 }, { "epoch": 0.13179854578930564, "grad_norm": 13.511752163760171, "learning_rate": 4.918184196122275e-05, "loss": 2.6279, "mean_token_accuracy": 0.38275861740112305, "step": 130855 }, { "epoch": 0.1318035818424098, "grad_norm": 10.996592771240188, "learning_rate": 4.9181741829760816e-05, "loss": 2.5114, "mean_token_accuracy": 0.36896551847457887, "step": 130860 }, { "epoch": 0.13180861789551399, "grad_norm": 12.723959178336107, "learning_rate": 4.918164169228536e-05, "loss": 2.2492, "mean_token_accuracy": 0.4241379201412201, "step": 130865 }, { "epoch": 0.13181365394861816, "grad_norm": 11.729989690647935, "learning_rate": 4.918154154879644e-05, "loss": 2.4429, "mean_token_accuracy": 0.4379310369491577, "step": 130870 }, { "epoch": 0.13181869000172233, "grad_norm": 10.219886363999361, "learning_rate": 4.918144139929405e-05, "loss": 2.6907, "mean_token_accuracy": 0.4396249294281006, "step": 130875 }, { "epoch": 0.1318237260548265, "grad_norm": 12.509321280105091, "learning_rate": 4.918134124377824e-05, "loss": 2.4371, "mean_token_accuracy": 0.4413793087005615, "step": 130880 }, { "epoch": 0.13182876210793068, "grad_norm": 10.991233959975574, "learning_rate": 4.918124108224902e-05, "loss": 2.8331, "mean_token_accuracy": 0.3896551728248596, "step": 130885 }, { "epoch": 0.13183379816103485, "grad_norm": 11.226964045819216, "learning_rate": 4.9181140914706426e-05, "loss": 2.6919, "mean_token_accuracy": 0.4361161530017853, "step": 130890 }, { "epoch": 0.13183883421413903, "grad_norm": 10.09364755610323, "learning_rate": 4.91810407411505e-05, "loss": 2.4786, "mean_token_accuracy": 0.38965516686439516, "step": 130895 }, { "epoch": 0.1318438702672432, "grad_norm": 18.228300204087997, "learning_rate": 4.918094056158124e-05, "loss": 3.0076, "mean_token_accuracy": 0.34482758641242983, "step": 130900 }, { "epoch": 0.13184890632034738, "grad_norm": 11.023487326657166, "learning_rate": 4.91808403759987e-05, "loss": 2.3864, "mean_token_accuracy": 0.3793103456497192, "step": 130905 }, { "epoch": 0.13185394237345155, "grad_norm": 10.180432916863133, "learning_rate": 4.91807401844029e-05, "loss": 2.3274, "mean_token_accuracy": 0.4379310369491577, "step": 130910 }, { "epoch": 0.13185897842655572, "grad_norm": 11.584152702809524, "learning_rate": 4.918063998679386e-05, "loss": 2.552, "mean_token_accuracy": 0.38965517580509185, "step": 130915 }, { "epoch": 0.1318640144796599, "grad_norm": 11.616318357888956, "learning_rate": 4.918053978317161e-05, "loss": 2.2036, "mean_token_accuracy": 0.4448275864124298, "step": 130920 }, { "epoch": 0.13186905053276407, "grad_norm": 10.43237190405584, "learning_rate": 4.9180439573536194e-05, "loss": 2.2806, "mean_token_accuracy": 0.4862068831920624, "step": 130925 }, { "epoch": 0.13187408658586824, "grad_norm": 11.56454662836667, "learning_rate": 4.9180339357887616e-05, "loss": 2.352, "mean_token_accuracy": 0.3862069010734558, "step": 130930 }, { "epoch": 0.1318791226389724, "grad_norm": 8.755932696871785, "learning_rate": 4.918023913622592e-05, "loss": 2.1958, "mean_token_accuracy": 0.4862068951129913, "step": 130935 }, { "epoch": 0.13188415869207656, "grad_norm": 11.669919603291996, "learning_rate": 4.9180138908551125e-05, "loss": 2.088, "mean_token_accuracy": 0.49999999403953554, "step": 130940 }, { "epoch": 0.13188919474518074, "grad_norm": 16.694921306766254, "learning_rate": 4.9180038674863266e-05, "loss": 3.0236, "mean_token_accuracy": 0.3827586114406586, "step": 130945 }, { "epoch": 0.1318942307982849, "grad_norm": 9.622264401455466, "learning_rate": 4.917993843516237e-05, "loss": 2.5885, "mean_token_accuracy": 0.3758620619773865, "step": 130950 }, { "epoch": 0.13189926685138909, "grad_norm": 10.367114962281956, "learning_rate": 4.9179838189448457e-05, "loss": 2.1693, "mean_token_accuracy": 0.48275862336158754, "step": 130955 }, { "epoch": 0.13190430290449326, "grad_norm": 8.965918179172675, "learning_rate": 4.917973793772157e-05, "loss": 1.8551, "mean_token_accuracy": 0.541379302740097, "step": 130960 }, { "epoch": 0.13190933895759743, "grad_norm": 9.200192226062848, "learning_rate": 4.917963767998171e-05, "loss": 2.4657, "mean_token_accuracy": 0.3827586114406586, "step": 130965 }, { "epoch": 0.1319143750107016, "grad_norm": 11.525048352643964, "learning_rate": 4.917953741622894e-05, "loss": 2.8565, "mean_token_accuracy": 0.39310344457626345, "step": 130970 }, { "epoch": 0.13191941106380578, "grad_norm": 11.234872839272798, "learning_rate": 4.917943714646326e-05, "loss": 2.5469, "mean_token_accuracy": 0.3275861978530884, "step": 130975 }, { "epoch": 0.13192444711690995, "grad_norm": 9.656233619424537, "learning_rate": 4.91793368706847e-05, "loss": 2.575, "mean_token_accuracy": 0.4206896543502808, "step": 130980 }, { "epoch": 0.13192948317001413, "grad_norm": 10.807322796840618, "learning_rate": 4.9179236588893304e-05, "loss": 2.1792, "mean_token_accuracy": 0.441379314661026, "step": 130985 }, { "epoch": 0.1319345192231183, "grad_norm": 10.264219868380904, "learning_rate": 4.917913630108909e-05, "loss": 2.1323, "mean_token_accuracy": 0.4586206912994385, "step": 130990 }, { "epoch": 0.13193955527622248, "grad_norm": 12.58639345930477, "learning_rate": 4.9179036007272086e-05, "loss": 2.6361, "mean_token_accuracy": 0.4, "step": 130995 }, { "epoch": 0.13194459132932665, "grad_norm": 13.160175907894914, "learning_rate": 4.917893570744232e-05, "loss": 2.5628, "mean_token_accuracy": 0.4103448212146759, "step": 131000 }, { "epoch": 0.13194962738243082, "grad_norm": 10.335783733791828, "learning_rate": 4.917883540159982e-05, "loss": 2.3135, "mean_token_accuracy": 0.45747126936912536, "step": 131005 }, { "epoch": 0.131954663435535, "grad_norm": 10.232142277331928, "learning_rate": 4.917873508974462e-05, "loss": 2.4727, "mean_token_accuracy": 0.44482758045196535, "step": 131010 }, { "epoch": 0.13195969948863917, "grad_norm": 13.111616940367243, "learning_rate": 4.9178634771876734e-05, "loss": 2.3704, "mean_token_accuracy": 0.34482757449150087, "step": 131015 }, { "epoch": 0.13196473554174334, "grad_norm": 10.46645009032725, "learning_rate": 4.9178534447996197e-05, "loss": 2.4401, "mean_token_accuracy": 0.4241379380226135, "step": 131020 }, { "epoch": 0.13196977159484752, "grad_norm": 22.755171977644668, "learning_rate": 4.917843411810305e-05, "loss": 2.8994, "mean_token_accuracy": 0.39310345649719236, "step": 131025 }, { "epoch": 0.1319748076479517, "grad_norm": 12.08302287552268, "learning_rate": 4.917833378219729e-05, "loss": 2.44, "mean_token_accuracy": 0.4034482777118683, "step": 131030 }, { "epoch": 0.13197984370105587, "grad_norm": 10.741653749099575, "learning_rate": 4.917823344027898e-05, "loss": 2.3395, "mean_token_accuracy": 0.44827585220336913, "step": 131035 }, { "epoch": 0.13198487975416004, "grad_norm": 12.285779944372209, "learning_rate": 4.917813309234812e-05, "loss": 2.4102, "mean_token_accuracy": 0.4206896543502808, "step": 131040 }, { "epoch": 0.1319899158072642, "grad_norm": 9.717539691535709, "learning_rate": 4.917803273840475e-05, "loss": 2.1824, "mean_token_accuracy": 0.47931033968925474, "step": 131045 }, { "epoch": 0.1319949518603684, "grad_norm": 10.720397785005064, "learning_rate": 4.91779323784489e-05, "loss": 2.0308, "mean_token_accuracy": 0.5103448271751404, "step": 131050 }, { "epoch": 0.13199998791347256, "grad_norm": 8.744973254593255, "learning_rate": 4.917783201248059e-05, "loss": 2.5705, "mean_token_accuracy": 0.41034482717514037, "step": 131055 }, { "epoch": 0.13200502396657673, "grad_norm": 11.02182176009723, "learning_rate": 4.917773164049986e-05, "loss": 2.4431, "mean_token_accuracy": 0.45287356376647947, "step": 131060 }, { "epoch": 0.1320100600196809, "grad_norm": 9.546201881254037, "learning_rate": 4.917763126250673e-05, "loss": 2.5843, "mean_token_accuracy": 0.4448275864124298, "step": 131065 }, { "epoch": 0.13201509607278508, "grad_norm": 11.119891023409128, "learning_rate": 4.917753087850123e-05, "loss": 2.7079, "mean_token_accuracy": 0.3482758581638336, "step": 131070 }, { "epoch": 0.13202013212588923, "grad_norm": 10.586922478488056, "learning_rate": 4.9177430488483375e-05, "loss": 2.2008, "mean_token_accuracy": 0.46896552443504336, "step": 131075 }, { "epoch": 0.1320251681789934, "grad_norm": 9.587490736970123, "learning_rate": 4.917733009245321e-05, "loss": 2.8213, "mean_token_accuracy": 0.3551724135875702, "step": 131080 }, { "epoch": 0.13203020423209758, "grad_norm": 11.598202276377595, "learning_rate": 4.917722969041076e-05, "loss": 2.8273, "mean_token_accuracy": 0.3793103456497192, "step": 131085 }, { "epoch": 0.13203524028520175, "grad_norm": 11.115493501480103, "learning_rate": 4.917712928235604e-05, "loss": 3.114, "mean_token_accuracy": 0.42413792610168455, "step": 131090 }, { "epoch": 0.13204027633830592, "grad_norm": 9.492574454602126, "learning_rate": 4.9177028868289094e-05, "loss": 2.3009, "mean_token_accuracy": 0.45517241954803467, "step": 131095 }, { "epoch": 0.1320453123914101, "grad_norm": 11.23168719622025, "learning_rate": 4.917692844820994e-05, "loss": 2.4937, "mean_token_accuracy": 0.417241370677948, "step": 131100 }, { "epoch": 0.13205034844451427, "grad_norm": 9.585405627488765, "learning_rate": 4.9176828022118615e-05, "loss": 2.2565, "mean_token_accuracy": 0.41034482717514037, "step": 131105 }, { "epoch": 0.13205538449761844, "grad_norm": 8.178961082310304, "learning_rate": 4.917672759001514e-05, "loss": 2.2924, "mean_token_accuracy": 0.4344827592372894, "step": 131110 }, { "epoch": 0.13206042055072262, "grad_norm": 9.714175817509103, "learning_rate": 4.917662715189954e-05, "loss": 2.2799, "mean_token_accuracy": 0.46551724672317507, "step": 131115 }, { "epoch": 0.1320654566038268, "grad_norm": 9.612685652369812, "learning_rate": 4.9176526707771844e-05, "loss": 1.8617, "mean_token_accuracy": 0.4724137902259827, "step": 131120 }, { "epoch": 0.13207049265693097, "grad_norm": 12.597026104451066, "learning_rate": 4.917642625763209e-05, "loss": 2.2525, "mean_token_accuracy": 0.39310344457626345, "step": 131125 }, { "epoch": 0.13207552871003514, "grad_norm": 8.632268085849248, "learning_rate": 4.9176325801480295e-05, "loss": 2.2023, "mean_token_accuracy": 0.46896551847457885, "step": 131130 }, { "epoch": 0.1320805647631393, "grad_norm": 10.383354543780289, "learning_rate": 4.9176225339316486e-05, "loss": 2.9107, "mean_token_accuracy": 0.3931034505367279, "step": 131135 }, { "epoch": 0.1320856008162435, "grad_norm": 11.693272698879378, "learning_rate": 4.91761248711407e-05, "loss": 2.6274, "mean_token_accuracy": 0.38965516686439516, "step": 131140 }, { "epoch": 0.13209063686934766, "grad_norm": 11.731994477902825, "learning_rate": 4.9176024396952955e-05, "loss": 2.4591, "mean_token_accuracy": 0.4018148839473724, "step": 131145 }, { "epoch": 0.13209567292245183, "grad_norm": 9.602340126816566, "learning_rate": 4.9175923916753294e-05, "loss": 2.2571, "mean_token_accuracy": 0.4517241299152374, "step": 131150 }, { "epoch": 0.132100708975556, "grad_norm": 11.349152587114629, "learning_rate": 4.9175823430541725e-05, "loss": 2.26, "mean_token_accuracy": 0.3931034505367279, "step": 131155 }, { "epoch": 0.13210574502866018, "grad_norm": 8.832727124403188, "learning_rate": 4.917572293831829e-05, "loss": 2.6766, "mean_token_accuracy": 0.42413793206214906, "step": 131160 }, { "epoch": 0.13211078108176436, "grad_norm": 11.520790035382664, "learning_rate": 4.9175622440083006e-05, "loss": 2.4084, "mean_token_accuracy": 0.42413793206214906, "step": 131165 }, { "epoch": 0.13211581713486853, "grad_norm": 11.453158788052189, "learning_rate": 4.9175521935835916e-05, "loss": 2.3295, "mean_token_accuracy": 0.4068965494632721, "step": 131170 }, { "epoch": 0.1321208531879727, "grad_norm": 11.026000468198363, "learning_rate": 4.9175421425577035e-05, "loss": 2.5167, "mean_token_accuracy": 0.3758620619773865, "step": 131175 }, { "epoch": 0.13212588924107688, "grad_norm": 9.229372105944885, "learning_rate": 4.9175320909306394e-05, "loss": 2.0608, "mean_token_accuracy": 0.4586206912994385, "step": 131180 }, { "epoch": 0.13213092529418105, "grad_norm": 15.977849725511692, "learning_rate": 4.917522038702402e-05, "loss": 3.0136, "mean_token_accuracy": 0.36206897497177126, "step": 131185 }, { "epoch": 0.13213596134728522, "grad_norm": 11.309395976283522, "learning_rate": 4.917511985872995e-05, "loss": 2.5063, "mean_token_accuracy": 0.3827586233615875, "step": 131190 }, { "epoch": 0.1321409974003894, "grad_norm": 10.662406744295948, "learning_rate": 4.91750193244242e-05, "loss": 2.7202, "mean_token_accuracy": 0.34827585220336915, "step": 131195 }, { "epoch": 0.13214603345349357, "grad_norm": 9.929448598498977, "learning_rate": 4.917491878410681e-05, "loss": 2.667, "mean_token_accuracy": 0.38275861740112305, "step": 131200 }, { "epoch": 0.13215106950659775, "grad_norm": 10.571578543823838, "learning_rate": 4.9174818237777786e-05, "loss": 2.584, "mean_token_accuracy": 0.3931034475564957, "step": 131205 }, { "epoch": 0.13215610555970192, "grad_norm": 14.077657892704604, "learning_rate": 4.9174717685437176e-05, "loss": 2.7506, "mean_token_accuracy": 0.37586206793785093, "step": 131210 }, { "epoch": 0.13216114161280607, "grad_norm": 9.96251433581295, "learning_rate": 4.9174617127085e-05, "loss": 2.1662, "mean_token_accuracy": 0.4517241418361664, "step": 131215 }, { "epoch": 0.13216617766591024, "grad_norm": 10.087131419183478, "learning_rate": 4.9174516562721296e-05, "loss": 2.2735, "mean_token_accuracy": 0.4294615864753723, "step": 131220 }, { "epoch": 0.1321712137190144, "grad_norm": 10.337666296090887, "learning_rate": 4.917441599234608e-05, "loss": 2.9218, "mean_token_accuracy": 0.4206896543502808, "step": 131225 }, { "epoch": 0.1321762497721186, "grad_norm": 11.760898393300893, "learning_rate": 4.917431541595939e-05, "loss": 2.3589, "mean_token_accuracy": 0.4569268047809601, "step": 131230 }, { "epoch": 0.13218128582522276, "grad_norm": 11.750066054010318, "learning_rate": 4.917421483356124e-05, "loss": 2.2726, "mean_token_accuracy": 0.4586206912994385, "step": 131235 }, { "epoch": 0.13218632187832693, "grad_norm": 11.684625591762964, "learning_rate": 4.917411424515166e-05, "loss": 2.6757, "mean_token_accuracy": 0.4000000059604645, "step": 131240 }, { "epoch": 0.1321913579314311, "grad_norm": 9.368007996693185, "learning_rate": 4.917401365073069e-05, "loss": 2.0041, "mean_token_accuracy": 0.44506957530975344, "step": 131245 }, { "epoch": 0.13219639398453528, "grad_norm": 12.579201731817781, "learning_rate": 4.917391305029836e-05, "loss": 2.1639, "mean_token_accuracy": 0.458620685338974, "step": 131250 }, { "epoch": 0.13220143003763946, "grad_norm": 10.211357839688405, "learning_rate": 4.917381244385468e-05, "loss": 2.6896, "mean_token_accuracy": 0.4, "step": 131255 }, { "epoch": 0.13220646609074363, "grad_norm": 13.016531313607894, "learning_rate": 4.9173711831399684e-05, "loss": 2.4494, "mean_token_accuracy": 0.4137930989265442, "step": 131260 }, { "epoch": 0.1322115021438478, "grad_norm": 11.52024239764897, "learning_rate": 4.9173611212933413e-05, "loss": 2.3147, "mean_token_accuracy": 0.38753780722618103, "step": 131265 }, { "epoch": 0.13221653819695198, "grad_norm": 10.485724964378825, "learning_rate": 4.917351058845587e-05, "loss": 2.6046, "mean_token_accuracy": 0.4379310429096222, "step": 131270 }, { "epoch": 0.13222157425005615, "grad_norm": 10.100919214957443, "learning_rate": 4.9173409957967115e-05, "loss": 2.3011, "mean_token_accuracy": 0.4344827592372894, "step": 131275 }, { "epoch": 0.13222661030316032, "grad_norm": 10.351577044309916, "learning_rate": 4.9173309321467154e-05, "loss": 2.3228, "mean_token_accuracy": 0.41034482717514037, "step": 131280 }, { "epoch": 0.1322316463562645, "grad_norm": 9.561083520264361, "learning_rate": 4.917320867895602e-05, "loss": 2.0107, "mean_token_accuracy": 0.4986085891723633, "step": 131285 }, { "epoch": 0.13223668240936867, "grad_norm": 13.970629077734946, "learning_rate": 4.917310803043374e-05, "loss": 2.14, "mean_token_accuracy": 0.49153055548667907, "step": 131290 }, { "epoch": 0.13224171846247285, "grad_norm": 9.138960642554114, "learning_rate": 4.9173007375900345e-05, "loss": 2.2237, "mean_token_accuracy": 0.42413793206214906, "step": 131295 }, { "epoch": 0.13224675451557702, "grad_norm": 10.315801795330843, "learning_rate": 4.917290671535585e-05, "loss": 2.5188, "mean_token_accuracy": 0.37586206793785093, "step": 131300 }, { "epoch": 0.1322517905686812, "grad_norm": 8.279586333882575, "learning_rate": 4.91728060488003e-05, "loss": 2.4168, "mean_token_accuracy": 0.42413793206214906, "step": 131305 }, { "epoch": 0.13225682662178537, "grad_norm": 10.277069416620028, "learning_rate": 4.917270537623372e-05, "loss": 1.9879, "mean_token_accuracy": 0.5172413647174835, "step": 131310 }, { "epoch": 0.13226186267488954, "grad_norm": 9.939770771114727, "learning_rate": 4.917260469765613e-05, "loss": 2.0263, "mean_token_accuracy": 0.4398064136505127, "step": 131315 }, { "epoch": 0.13226689872799371, "grad_norm": 9.271120978803843, "learning_rate": 4.917250401306756e-05, "loss": 2.2392, "mean_token_accuracy": 0.4620689570903778, "step": 131320 }, { "epoch": 0.1322719347810979, "grad_norm": 15.96267272538953, "learning_rate": 4.9172403322468044e-05, "loss": 2.3658, "mean_token_accuracy": 0.4379310369491577, "step": 131325 }, { "epoch": 0.13227697083420206, "grad_norm": 13.79683631607779, "learning_rate": 4.917230262585761e-05, "loss": 2.6164, "mean_token_accuracy": 0.4034482717514038, "step": 131330 }, { "epoch": 0.13228200688730624, "grad_norm": 10.343560728145343, "learning_rate": 4.917220192323628e-05, "loss": 2.3659, "mean_token_accuracy": 0.41724138259887694, "step": 131335 }, { "epoch": 0.1322870429404104, "grad_norm": 9.719834097177923, "learning_rate": 4.917210121460407e-05, "loss": 2.4235, "mean_token_accuracy": 0.4310344815254211, "step": 131340 }, { "epoch": 0.13229207899351458, "grad_norm": 21.46685489085881, "learning_rate": 4.917200049996104e-05, "loss": 2.8414, "mean_token_accuracy": 0.3931034505367279, "step": 131345 }, { "epoch": 0.13229711504661876, "grad_norm": 11.96503878559186, "learning_rate": 4.917189977930719e-05, "loss": 2.3628, "mean_token_accuracy": 0.4137930989265442, "step": 131350 }, { "epoch": 0.1323021510997229, "grad_norm": 12.497098217018177, "learning_rate": 4.917179905264256e-05, "loss": 2.6067, "mean_token_accuracy": 0.4206896543502808, "step": 131355 }, { "epoch": 0.13230718715282708, "grad_norm": 10.17702784969293, "learning_rate": 4.917169831996718e-05, "loss": 2.3237, "mean_token_accuracy": 0.4137930929660797, "step": 131360 }, { "epoch": 0.13231222320593125, "grad_norm": 12.070017585259182, "learning_rate": 4.917159758128106e-05, "loss": 2.3383, "mean_token_accuracy": 0.4655172348022461, "step": 131365 }, { "epoch": 0.13231725925903542, "grad_norm": 10.332457917331118, "learning_rate": 4.9171496836584265e-05, "loss": 2.0801, "mean_token_accuracy": 0.4724137902259827, "step": 131370 }, { "epoch": 0.1323222953121396, "grad_norm": 13.063408071261948, "learning_rate": 4.917139608587678e-05, "loss": 2.5882, "mean_token_accuracy": 0.41548699140548706, "step": 131375 }, { "epoch": 0.13232733136524377, "grad_norm": 10.314802861097107, "learning_rate": 4.917129532915866e-05, "loss": 2.3084, "mean_token_accuracy": 0.4, "step": 131380 }, { "epoch": 0.13233236741834795, "grad_norm": 10.439205836624764, "learning_rate": 4.9171194566429924e-05, "loss": 2.078, "mean_token_accuracy": 0.4551724076271057, "step": 131385 }, { "epoch": 0.13233740347145212, "grad_norm": 10.37218801985173, "learning_rate": 4.91710937976906e-05, "loss": 1.9862, "mean_token_accuracy": 0.42413792610168455, "step": 131390 }, { "epoch": 0.1323424395245563, "grad_norm": 36.31546879822128, "learning_rate": 4.917099302294072e-05, "loss": 2.5273, "mean_token_accuracy": 0.43980641961097716, "step": 131395 }, { "epoch": 0.13234747557766047, "grad_norm": 10.501689614303995, "learning_rate": 4.917089224218031e-05, "loss": 2.2748, "mean_token_accuracy": 0.42758620977401735, "step": 131400 }, { "epoch": 0.13235251163076464, "grad_norm": 8.661146613634248, "learning_rate": 4.9170791455409394e-05, "loss": 2.3483, "mean_token_accuracy": 0.42413792610168455, "step": 131405 }, { "epoch": 0.13235754768386881, "grad_norm": 11.843981967257625, "learning_rate": 4.9170690662628e-05, "loss": 2.0483, "mean_token_accuracy": 0.46551724076271056, "step": 131410 }, { "epoch": 0.132362583736973, "grad_norm": 12.1326545456674, "learning_rate": 4.9170589863836166e-05, "loss": 2.5534, "mean_token_accuracy": 0.42758620977401735, "step": 131415 }, { "epoch": 0.13236761979007716, "grad_norm": 8.759100106322363, "learning_rate": 4.917048905903391e-05, "loss": 2.3965, "mean_token_accuracy": 0.4137930989265442, "step": 131420 }, { "epoch": 0.13237265584318134, "grad_norm": 8.406203514615466, "learning_rate": 4.9170388248221264e-05, "loss": 1.8667, "mean_token_accuracy": 0.5459359467029572, "step": 131425 }, { "epoch": 0.1323776918962855, "grad_norm": 10.691088865635312, "learning_rate": 4.9170287431398254e-05, "loss": 2.3505, "mean_token_accuracy": 0.43103447556495667, "step": 131430 }, { "epoch": 0.13238272794938968, "grad_norm": 15.15634085761613, "learning_rate": 4.917018660856491e-05, "loss": 2.6865, "mean_token_accuracy": 0.3999999940395355, "step": 131435 }, { "epoch": 0.13238776400249386, "grad_norm": 10.948326879346327, "learning_rate": 4.9170085779721256e-05, "loss": 2.2749, "mean_token_accuracy": 0.43103448748588563, "step": 131440 }, { "epoch": 0.13239280005559803, "grad_norm": 11.414183825778988, "learning_rate": 4.9169984944867334e-05, "loss": 2.509, "mean_token_accuracy": 0.4206896543502808, "step": 131445 }, { "epoch": 0.1323978361087022, "grad_norm": 12.886774756888196, "learning_rate": 4.916988410400314e-05, "loss": 2.7913, "mean_token_accuracy": 0.4068965554237366, "step": 131450 }, { "epoch": 0.13240287216180638, "grad_norm": 10.089831696112617, "learning_rate": 4.9169783257128735e-05, "loss": 1.9596, "mean_token_accuracy": 0.5071428477764129, "step": 131455 }, { "epoch": 0.13240790821491055, "grad_norm": 10.369503562693835, "learning_rate": 4.916968240424414e-05, "loss": 2.4564, "mean_token_accuracy": 0.4034482777118683, "step": 131460 }, { "epoch": 0.13241294426801473, "grad_norm": 9.905948068893506, "learning_rate": 4.916958154534937e-05, "loss": 2.3357, "mean_token_accuracy": 0.4586207032203674, "step": 131465 }, { "epoch": 0.1324179803211189, "grad_norm": 10.701725071365345, "learning_rate": 4.9169480680444466e-05, "loss": 2.747, "mean_token_accuracy": 0.3862068891525269, "step": 131470 }, { "epoch": 0.13242301637422307, "grad_norm": 8.490209353266549, "learning_rate": 4.9169379809529445e-05, "loss": 2.2655, "mean_token_accuracy": 0.43793103098869324, "step": 131475 }, { "epoch": 0.13242805242732725, "grad_norm": 10.229179170868987, "learning_rate": 4.916927893260434e-05, "loss": 2.3276, "mean_token_accuracy": 0.4448275864124298, "step": 131480 }, { "epoch": 0.13243308848043142, "grad_norm": 10.496374428250446, "learning_rate": 4.916917804966918e-05, "loss": 2.4068, "mean_token_accuracy": 0.37241379022598264, "step": 131485 }, { "epoch": 0.1324381245335356, "grad_norm": 10.760596030963063, "learning_rate": 4.9169077160724e-05, "loss": 2.3649, "mean_token_accuracy": 0.46412583589553835, "step": 131490 }, { "epoch": 0.13244316058663974, "grad_norm": 9.846684539687933, "learning_rate": 4.9168976265768816e-05, "loss": 2.3506, "mean_token_accuracy": 0.4344827592372894, "step": 131495 }, { "epoch": 0.13244819663974391, "grad_norm": 9.88283171136494, "learning_rate": 4.916887536480366e-05, "loss": 2.6195, "mean_token_accuracy": 0.4068965554237366, "step": 131500 }, { "epoch": 0.1324532326928481, "grad_norm": 10.896891512713259, "learning_rate": 4.9168774457828563e-05, "loss": 2.5124, "mean_token_accuracy": 0.3999999940395355, "step": 131505 }, { "epoch": 0.13245826874595226, "grad_norm": 9.659845829241906, "learning_rate": 4.916867354484355e-05, "loss": 2.5078, "mean_token_accuracy": 0.43103447556495667, "step": 131510 }, { "epoch": 0.13246330479905644, "grad_norm": 11.485363420803353, "learning_rate": 4.9168572625848656e-05, "loss": 2.6061, "mean_token_accuracy": 0.3896551728248596, "step": 131515 }, { "epoch": 0.1324683408521606, "grad_norm": 11.92137294081313, "learning_rate": 4.916847170084389e-05, "loss": 2.6137, "mean_token_accuracy": 0.4068965554237366, "step": 131520 }, { "epoch": 0.13247337690526478, "grad_norm": 9.675054214550553, "learning_rate": 4.9168370769829295e-05, "loss": 2.5857, "mean_token_accuracy": 0.4034482717514038, "step": 131525 }, { "epoch": 0.13247841295836896, "grad_norm": 11.477417646564005, "learning_rate": 4.91682698328049e-05, "loss": 2.2241, "mean_token_accuracy": 0.47773743867874147, "step": 131530 }, { "epoch": 0.13248344901147313, "grad_norm": 9.78207054101505, "learning_rate": 4.916816888977073e-05, "loss": 2.2948, "mean_token_accuracy": 0.4811857283115387, "step": 131535 }, { "epoch": 0.1324884850645773, "grad_norm": 11.22467178490561, "learning_rate": 4.9168067940726817e-05, "loss": 2.4509, "mean_token_accuracy": 0.443254691362381, "step": 131540 }, { "epoch": 0.13249352111768148, "grad_norm": 13.435615470633008, "learning_rate": 4.9167966985673174e-05, "loss": 2.9686, "mean_token_accuracy": 0.34482758641242983, "step": 131545 }, { "epoch": 0.13249855717078565, "grad_norm": 8.849383088208135, "learning_rate": 4.9167866024609845e-05, "loss": 2.4739, "mean_token_accuracy": 0.40532365441322327, "step": 131550 }, { "epoch": 0.13250359322388983, "grad_norm": 11.395979086494675, "learning_rate": 4.916776505753685e-05, "loss": 2.2397, "mean_token_accuracy": 0.4620689630508423, "step": 131555 }, { "epoch": 0.132508629276994, "grad_norm": 9.820701868560276, "learning_rate": 4.916766408445422e-05, "loss": 2.1531, "mean_token_accuracy": 0.47586206197738645, "step": 131560 }, { "epoch": 0.13251366533009817, "grad_norm": 9.71007132106587, "learning_rate": 4.916756310536199e-05, "loss": 2.1673, "mean_token_accuracy": 0.4448275864124298, "step": 131565 }, { "epoch": 0.13251870138320235, "grad_norm": 11.118668292212806, "learning_rate": 4.916746212026017e-05, "loss": 2.0053, "mean_token_accuracy": 0.4896551728248596, "step": 131570 }, { "epoch": 0.13252373743630652, "grad_norm": 12.048046714378241, "learning_rate": 4.91673611291488e-05, "loss": 2.579, "mean_token_accuracy": 0.39310344457626345, "step": 131575 }, { "epoch": 0.1325287734894107, "grad_norm": 15.87882610048152, "learning_rate": 4.916726013202791e-05, "loss": 2.22, "mean_token_accuracy": 0.41379310488700866, "step": 131580 }, { "epoch": 0.13253380954251487, "grad_norm": 12.182246906187553, "learning_rate": 4.9167159128897524e-05, "loss": 2.6226, "mean_token_accuracy": 0.4034482777118683, "step": 131585 }, { "epoch": 0.13253884559561904, "grad_norm": 9.862150161993853, "learning_rate": 4.916705811975767e-05, "loss": 2.0312, "mean_token_accuracy": 0.4379310429096222, "step": 131590 }, { "epoch": 0.13254388164872322, "grad_norm": 19.43909647021108, "learning_rate": 4.916695710460838e-05, "loss": 2.6867, "mean_token_accuracy": 0.40689656138420105, "step": 131595 }, { "epoch": 0.1325489177018274, "grad_norm": 9.562658268984745, "learning_rate": 4.916685608344968e-05, "loss": 2.5226, "mean_token_accuracy": 0.39140955805778505, "step": 131600 }, { "epoch": 0.13255395375493156, "grad_norm": 10.78529389997785, "learning_rate": 4.916675505628159e-05, "loss": 2.0993, "mean_token_accuracy": 0.44827585220336913, "step": 131605 }, { "epoch": 0.13255898980803574, "grad_norm": 10.670135376599857, "learning_rate": 4.9166654023104144e-05, "loss": 2.2091, "mean_token_accuracy": 0.43793103098869324, "step": 131610 }, { "epoch": 0.1325640258611399, "grad_norm": 9.393311037102201, "learning_rate": 4.9166552983917375e-05, "loss": 2.5331, "mean_token_accuracy": 0.4068965494632721, "step": 131615 }, { "epoch": 0.13256906191424409, "grad_norm": 10.847580722863094, "learning_rate": 4.9166451938721306e-05, "loss": 2.3997, "mean_token_accuracy": 0.40689654350280763, "step": 131620 }, { "epoch": 0.13257409796734826, "grad_norm": 8.923688101012566, "learning_rate": 4.9166350887515966e-05, "loss": 1.9499, "mean_token_accuracy": 0.5206896483898162, "step": 131625 }, { "epoch": 0.13257913402045243, "grad_norm": 9.482095439516355, "learning_rate": 4.916624983030138e-05, "loss": 2.4504, "mean_token_accuracy": 0.4344827592372894, "step": 131630 }, { "epoch": 0.13258417007355658, "grad_norm": 8.73390809723528, "learning_rate": 4.916614876707758e-05, "loss": 2.4479, "mean_token_accuracy": 0.42413793206214906, "step": 131635 }, { "epoch": 0.13258920612666075, "grad_norm": 11.26250179756786, "learning_rate": 4.91660476978446e-05, "loss": 2.0022, "mean_token_accuracy": 0.5086509406566619, "step": 131640 }, { "epoch": 0.13259424217976493, "grad_norm": 11.449484641269287, "learning_rate": 4.9165946622602456e-05, "loss": 2.3139, "mean_token_accuracy": 0.44482758045196535, "step": 131645 }, { "epoch": 0.1325992782328691, "grad_norm": 8.657478646736775, "learning_rate": 4.916584554135118e-05, "loss": 2.367, "mean_token_accuracy": 0.44137930274009707, "step": 131650 }, { "epoch": 0.13260431428597327, "grad_norm": 11.247643761071732, "learning_rate": 4.916574445409079e-05, "loss": 2.2148, "mean_token_accuracy": 0.4172413766384125, "step": 131655 }, { "epoch": 0.13260935033907745, "grad_norm": 13.109814885357773, "learning_rate": 4.916564336082134e-05, "loss": 2.8525, "mean_token_accuracy": 0.3275861978530884, "step": 131660 }, { "epoch": 0.13261438639218162, "grad_norm": 11.0417158925555, "learning_rate": 4.916554226154283e-05, "loss": 2.4252, "mean_token_accuracy": 0.45862067937850953, "step": 131665 }, { "epoch": 0.1326194224452858, "grad_norm": 11.743411025304352, "learning_rate": 4.916544115625531e-05, "loss": 2.4164, "mean_token_accuracy": 0.4206896543502808, "step": 131670 }, { "epoch": 0.13262445849838997, "grad_norm": 15.62222504259958, "learning_rate": 4.9165340044958806e-05, "loss": 2.8422, "mean_token_accuracy": 0.3896551728248596, "step": 131675 }, { "epoch": 0.13262949455149414, "grad_norm": 9.31799944861975, "learning_rate": 4.916523892765333e-05, "loss": 2.4405, "mean_token_accuracy": 0.44827587008476255, "step": 131680 }, { "epoch": 0.13263453060459832, "grad_norm": 9.377359860755403, "learning_rate": 4.916513780433892e-05, "loss": 2.2141, "mean_token_accuracy": 0.49741379618644715, "step": 131685 }, { "epoch": 0.1326395666577025, "grad_norm": 10.512934130986027, "learning_rate": 4.9165036675015605e-05, "loss": 2.6758, "mean_token_accuracy": 0.3999999940395355, "step": 131690 }, { "epoch": 0.13264460271080666, "grad_norm": 10.28861878063264, "learning_rate": 4.9164935539683414e-05, "loss": 2.3657, "mean_token_accuracy": 0.43992740511894224, "step": 131695 }, { "epoch": 0.13264963876391084, "grad_norm": 11.21309032444019, "learning_rate": 4.916483439834236e-05, "loss": 2.6913, "mean_token_accuracy": 0.40000000298023225, "step": 131700 }, { "epoch": 0.132654674817015, "grad_norm": 10.510568610255131, "learning_rate": 4.9164733250992495e-05, "loss": 2.3078, "mean_token_accuracy": 0.41379310488700866, "step": 131705 }, { "epoch": 0.13265971087011919, "grad_norm": 14.342943222275748, "learning_rate": 4.9164632097633837e-05, "loss": 2.3003, "mean_token_accuracy": 0.48004926443099977, "step": 131710 }, { "epoch": 0.13266474692322336, "grad_norm": 10.761889931504381, "learning_rate": 4.91645309382664e-05, "loss": 2.1374, "mean_token_accuracy": 0.4551724076271057, "step": 131715 }, { "epoch": 0.13266978297632753, "grad_norm": 11.984663810654952, "learning_rate": 4.9164429772890235e-05, "loss": 2.4102, "mean_token_accuracy": 0.4655172348022461, "step": 131720 }, { "epoch": 0.1326748190294317, "grad_norm": 12.920931275325259, "learning_rate": 4.916432860150535e-05, "loss": 2.5328, "mean_token_accuracy": 0.46376285552978513, "step": 131725 }, { "epoch": 0.13267985508253588, "grad_norm": 9.875292559232992, "learning_rate": 4.916422742411179e-05, "loss": 2.4135, "mean_token_accuracy": 0.4482758641242981, "step": 131730 }, { "epoch": 0.13268489113564005, "grad_norm": 12.410993239805087, "learning_rate": 4.916412624070958e-05, "loss": 2.433, "mean_token_accuracy": 0.3896551728248596, "step": 131735 }, { "epoch": 0.13268992718874423, "grad_norm": 9.917737556210058, "learning_rate": 4.916402505129874e-05, "loss": 2.4898, "mean_token_accuracy": 0.43793103098869324, "step": 131740 }, { "epoch": 0.1326949632418484, "grad_norm": 12.867834643911216, "learning_rate": 4.91639238558793e-05, "loss": 2.4387, "mean_token_accuracy": 0.41034482717514037, "step": 131745 }, { "epoch": 0.13269999929495258, "grad_norm": 9.632127676682542, "learning_rate": 4.91638226544513e-05, "loss": 2.233, "mean_token_accuracy": 0.458620685338974, "step": 131750 }, { "epoch": 0.13270503534805675, "grad_norm": 9.573333116567206, "learning_rate": 4.916372144701474e-05, "loss": 2.0884, "mean_token_accuracy": 0.48965516686439514, "step": 131755 }, { "epoch": 0.13271007140116092, "grad_norm": 9.435793480784792, "learning_rate": 4.916362023356968e-05, "loss": 2.0749, "mean_token_accuracy": 0.4551724135875702, "step": 131760 }, { "epoch": 0.1327151074542651, "grad_norm": 10.638511432362334, "learning_rate": 4.916351901411612e-05, "loss": 2.365, "mean_token_accuracy": 0.4413793087005615, "step": 131765 }, { "epoch": 0.13272014350736927, "grad_norm": 19.767062646244298, "learning_rate": 4.916341778865411e-05, "loss": 2.4292, "mean_token_accuracy": 0.3999999940395355, "step": 131770 }, { "epoch": 0.13272517956047342, "grad_norm": 10.474225381550344, "learning_rate": 4.9163316557183675e-05, "loss": 2.11, "mean_token_accuracy": 0.5126436829566956, "step": 131775 }, { "epoch": 0.1327302156135776, "grad_norm": 11.657439438173649, "learning_rate": 4.916321531970484e-05, "loss": 2.7211, "mean_token_accuracy": 0.4068965494632721, "step": 131780 }, { "epoch": 0.13273525166668176, "grad_norm": 13.246943746822842, "learning_rate": 4.916311407621762e-05, "loss": 2.4879, "mean_token_accuracy": 0.44827585816383364, "step": 131785 }, { "epoch": 0.13274028771978594, "grad_norm": 8.023523552751579, "learning_rate": 4.916301282672206e-05, "loss": 2.049, "mean_token_accuracy": 0.5275862038135528, "step": 131790 }, { "epoch": 0.1327453237728901, "grad_norm": 8.637507836645831, "learning_rate": 4.916291157121819e-05, "loss": 2.228, "mean_token_accuracy": 0.4931034505367279, "step": 131795 }, { "epoch": 0.13275035982599429, "grad_norm": 11.564754129719343, "learning_rate": 4.916281030970602e-05, "loss": 2.9449, "mean_token_accuracy": 0.3862069010734558, "step": 131800 }, { "epoch": 0.13275539587909846, "grad_norm": 9.99793168000728, "learning_rate": 4.9162709042185596e-05, "loss": 2.1108, "mean_token_accuracy": 0.4689655125141144, "step": 131805 }, { "epoch": 0.13276043193220263, "grad_norm": 11.47606025338248, "learning_rate": 4.916260776865693e-05, "loss": 2.536, "mean_token_accuracy": 0.42413793206214906, "step": 131810 }, { "epoch": 0.1327654679853068, "grad_norm": 10.340388348556395, "learning_rate": 4.916250648912006e-05, "loss": 2.8799, "mean_token_accuracy": 0.34482758939266206, "step": 131815 }, { "epoch": 0.13277050403841098, "grad_norm": 11.270559860737144, "learning_rate": 4.916240520357503e-05, "loss": 2.5084, "mean_token_accuracy": 0.42068964838981626, "step": 131820 }, { "epoch": 0.13277554009151515, "grad_norm": 10.355598985787058, "learning_rate": 4.916230391202183e-05, "loss": 2.0765, "mean_token_accuracy": 0.5034482777118683, "step": 131825 }, { "epoch": 0.13278057614461933, "grad_norm": 10.490987947046968, "learning_rate": 4.9162202614460516e-05, "loss": 2.6774, "mean_token_accuracy": 0.3896551728248596, "step": 131830 }, { "epoch": 0.1327856121977235, "grad_norm": 11.359052761441204, "learning_rate": 4.9162101310891115e-05, "loss": 2.3137, "mean_token_accuracy": 0.46551724076271056, "step": 131835 }, { "epoch": 0.13279064825082768, "grad_norm": 8.805525345810198, "learning_rate": 4.916200000131364e-05, "loss": 2.418, "mean_token_accuracy": 0.42758620381355283, "step": 131840 }, { "epoch": 0.13279568430393185, "grad_norm": 11.485304325040717, "learning_rate": 4.9161898685728135e-05, "loss": 2.5012, "mean_token_accuracy": 0.41379311084747317, "step": 131845 }, { "epoch": 0.13280072035703602, "grad_norm": 11.85380995143541, "learning_rate": 4.916179736413462e-05, "loss": 2.7883, "mean_token_accuracy": 0.41724138259887694, "step": 131850 }, { "epoch": 0.1328057564101402, "grad_norm": 11.79408619798211, "learning_rate": 4.916169603653313e-05, "loss": 2.5852, "mean_token_accuracy": 0.43448275327682495, "step": 131855 }, { "epoch": 0.13281079246324437, "grad_norm": 10.814319003709935, "learning_rate": 4.916159470292368e-05, "loss": 2.3897, "mean_token_accuracy": 0.42758620381355283, "step": 131860 }, { "epoch": 0.13281582851634854, "grad_norm": 12.36461339934935, "learning_rate": 4.916149336330631e-05, "loss": 2.1125, "mean_token_accuracy": 0.4758620738983154, "step": 131865 }, { "epoch": 0.13282086456945272, "grad_norm": 10.251104530062156, "learning_rate": 4.916139201768104e-05, "loss": 2.4213, "mean_token_accuracy": 0.42413793206214906, "step": 131870 }, { "epoch": 0.1328259006225569, "grad_norm": 8.484113929065005, "learning_rate": 4.9161290666047904e-05, "loss": 2.647, "mean_token_accuracy": 0.4275861918926239, "step": 131875 }, { "epoch": 0.13283093667566107, "grad_norm": 10.543575917388496, "learning_rate": 4.916118930840693e-05, "loss": 2.9202, "mean_token_accuracy": 0.35862069129943847, "step": 131880 }, { "epoch": 0.13283597272876524, "grad_norm": 8.667227564300891, "learning_rate": 4.916108794475815e-05, "loss": 1.9443, "mean_token_accuracy": 0.4984271049499512, "step": 131885 }, { "epoch": 0.1328410087818694, "grad_norm": 8.661460774879252, "learning_rate": 4.916098657510157e-05, "loss": 2.2425, "mean_token_accuracy": 0.4896551728248596, "step": 131890 }, { "epoch": 0.1328460448349736, "grad_norm": 12.274384618143488, "learning_rate": 4.9160885199437246e-05, "loss": 2.6273, "mean_token_accuracy": 0.4137930989265442, "step": 131895 }, { "epoch": 0.13285108088807776, "grad_norm": 11.894362730075114, "learning_rate": 4.91607838177652e-05, "loss": 2.1414, "mean_token_accuracy": 0.4620689570903778, "step": 131900 }, { "epoch": 0.13285611694118193, "grad_norm": 13.058002187582895, "learning_rate": 4.916068243008544e-05, "loss": 2.5928, "mean_token_accuracy": 0.4413793206214905, "step": 131905 }, { "epoch": 0.13286115299428608, "grad_norm": 9.359054182187394, "learning_rate": 4.9160581036398015e-05, "loss": 2.0785, "mean_token_accuracy": 0.46551724076271056, "step": 131910 }, { "epoch": 0.13286618904739025, "grad_norm": 9.193580598433886, "learning_rate": 4.916047963670295e-05, "loss": 2.5178, "mean_token_accuracy": 0.44361767172813416, "step": 131915 }, { "epoch": 0.13287122510049443, "grad_norm": 9.438929849363703, "learning_rate": 4.9160378231000275e-05, "loss": 2.6763, "mean_token_accuracy": 0.3793103456497192, "step": 131920 }, { "epoch": 0.1328762611535986, "grad_norm": 10.195353714241149, "learning_rate": 4.916027681929001e-05, "loss": 2.5948, "mean_token_accuracy": 0.4390199601650238, "step": 131925 }, { "epoch": 0.13288129720670278, "grad_norm": 11.975394004266729, "learning_rate": 4.916017540157218e-05, "loss": 2.5113, "mean_token_accuracy": 0.4034482777118683, "step": 131930 }, { "epoch": 0.13288633325980695, "grad_norm": 10.119138938993203, "learning_rate": 4.916007397784682e-05, "loss": 2.6704, "mean_token_accuracy": 0.4, "step": 131935 }, { "epoch": 0.13289136931291112, "grad_norm": 11.243474446267884, "learning_rate": 4.915997254811396e-05, "loss": 2.1211, "mean_token_accuracy": 0.4780399203300476, "step": 131940 }, { "epoch": 0.1328964053660153, "grad_norm": 9.085515380993764, "learning_rate": 4.915987111237363e-05, "loss": 1.8056, "mean_token_accuracy": 0.5103448331356049, "step": 131945 }, { "epoch": 0.13290144141911947, "grad_norm": 9.815287074696679, "learning_rate": 4.915976967062585e-05, "loss": 2.2832, "mean_token_accuracy": 0.4534180223941803, "step": 131950 }, { "epoch": 0.13290647747222364, "grad_norm": 10.185282238549654, "learning_rate": 4.9159668222870654e-05, "loss": 2.7654, "mean_token_accuracy": 0.39655172526836396, "step": 131955 }, { "epoch": 0.13291151352532782, "grad_norm": 10.863188943194942, "learning_rate": 4.915956676910806e-05, "loss": 2.4454, "mean_token_accuracy": 0.4034482777118683, "step": 131960 }, { "epoch": 0.132916549578432, "grad_norm": 10.67823818133062, "learning_rate": 4.915946530933812e-05, "loss": 2.4855, "mean_token_accuracy": 0.4310344815254211, "step": 131965 }, { "epoch": 0.13292158563153617, "grad_norm": 15.1265342494139, "learning_rate": 4.915936384356084e-05, "loss": 2.3431, "mean_token_accuracy": 0.4103448212146759, "step": 131970 }, { "epoch": 0.13292662168464034, "grad_norm": 22.052109568977116, "learning_rate": 4.9159262371776243e-05, "loss": 2.46, "mean_token_accuracy": 0.441379314661026, "step": 131975 }, { "epoch": 0.1329316577377445, "grad_norm": 10.76489942420142, "learning_rate": 4.915916089398438e-05, "loss": 1.9348, "mean_token_accuracy": 0.5085299372673034, "step": 131980 }, { "epoch": 0.1329366937908487, "grad_norm": 8.798894477178559, "learning_rate": 4.915905941018527e-05, "loss": 2.3728, "mean_token_accuracy": 0.441379314661026, "step": 131985 }, { "epoch": 0.13294172984395286, "grad_norm": 12.647294383361563, "learning_rate": 4.915895792037893e-05, "loss": 2.7153, "mean_token_accuracy": 0.41034482717514037, "step": 131990 }, { "epoch": 0.13294676589705703, "grad_norm": 10.796091961172817, "learning_rate": 4.9158856424565406e-05, "loss": 2.7037, "mean_token_accuracy": 0.39310344457626345, "step": 131995 }, { "epoch": 0.1329518019501612, "grad_norm": 10.039218075877924, "learning_rate": 4.915875492274472e-05, "loss": 2.2736, "mean_token_accuracy": 0.46376286149024964, "step": 132000 }, { "epoch": 0.13295683800326538, "grad_norm": 11.396729238236093, "learning_rate": 4.9158653414916886e-05, "loss": 2.1541, "mean_token_accuracy": 0.482758629322052, "step": 132005 }, { "epoch": 0.13296187405636956, "grad_norm": 11.646207327810686, "learning_rate": 4.915855190108194e-05, "loss": 2.1585, "mean_token_accuracy": 0.4777374446392059, "step": 132010 }, { "epoch": 0.13296691010947373, "grad_norm": 10.653717123464158, "learning_rate": 4.915845038123993e-05, "loss": 2.3329, "mean_token_accuracy": 0.4275861978530884, "step": 132015 }, { "epoch": 0.1329719461625779, "grad_norm": 12.564776851315061, "learning_rate": 4.915834885539086e-05, "loss": 2.018, "mean_token_accuracy": 0.4620689690113068, "step": 132020 }, { "epoch": 0.13297698221568208, "grad_norm": 10.207974851854551, "learning_rate": 4.9158247323534766e-05, "loss": 2.4124, "mean_token_accuracy": 0.44827587008476255, "step": 132025 }, { "epoch": 0.13298201826878625, "grad_norm": 12.089901391488828, "learning_rate": 4.915814578567168e-05, "loss": 2.7537, "mean_token_accuracy": 0.44482758045196535, "step": 132030 }, { "epoch": 0.13298705432189042, "grad_norm": 8.938536101347493, "learning_rate": 4.915804424180163e-05, "loss": 2.4471, "mean_token_accuracy": 0.42068966031074523, "step": 132035 }, { "epoch": 0.1329920903749946, "grad_norm": 12.077924313112794, "learning_rate": 4.915794269192463e-05, "loss": 2.5535, "mean_token_accuracy": 0.4206896543502808, "step": 132040 }, { "epoch": 0.13299712642809877, "grad_norm": 11.852353616291612, "learning_rate": 4.915784113604072e-05, "loss": 2.2313, "mean_token_accuracy": 0.4655172348022461, "step": 132045 }, { "epoch": 0.13300216248120292, "grad_norm": 13.399834003605365, "learning_rate": 4.915773957414993e-05, "loss": 2.3031, "mean_token_accuracy": 0.4448275864124298, "step": 132050 }, { "epoch": 0.1330071985343071, "grad_norm": 11.966392271391191, "learning_rate": 4.9157638006252285e-05, "loss": 2.5341, "mean_token_accuracy": 0.38620689511299133, "step": 132055 }, { "epoch": 0.13301223458741127, "grad_norm": 9.349772230467618, "learning_rate": 4.915753643234782e-05, "loss": 2.7031, "mean_token_accuracy": 0.3965517282485962, "step": 132060 }, { "epoch": 0.13301727064051544, "grad_norm": 11.353400416253692, "learning_rate": 4.915743485243655e-05, "loss": 2.5622, "mean_token_accuracy": 0.40459770560264585, "step": 132065 }, { "epoch": 0.1330223066936196, "grad_norm": 9.116405140053319, "learning_rate": 4.9157333266518505e-05, "loss": 2.3034, "mean_token_accuracy": 0.4620689630508423, "step": 132070 }, { "epoch": 0.1330273427467238, "grad_norm": 11.168078417540226, "learning_rate": 4.9157231674593725e-05, "loss": 2.0119, "mean_token_accuracy": 0.5103448092937469, "step": 132075 }, { "epoch": 0.13303237879982796, "grad_norm": 10.151547722963047, "learning_rate": 4.915713007666223e-05, "loss": 2.2252, "mean_token_accuracy": 0.43793103098869324, "step": 132080 }, { "epoch": 0.13303741485293213, "grad_norm": 11.479746197267716, "learning_rate": 4.915702847272404e-05, "loss": 2.5537, "mean_token_accuracy": 0.4103448182344437, "step": 132085 }, { "epoch": 0.1330424509060363, "grad_norm": 7.962346591684328, "learning_rate": 4.915692686277921e-05, "loss": 2.6326, "mean_token_accuracy": 0.44827585220336913, "step": 132090 }, { "epoch": 0.13304748695914048, "grad_norm": 8.297299082817306, "learning_rate": 4.915682524682773e-05, "loss": 2.5865, "mean_token_accuracy": 0.4344827651977539, "step": 132095 }, { "epoch": 0.13305252301224466, "grad_norm": 11.443677323384353, "learning_rate": 4.915672362486967e-05, "loss": 2.9198, "mean_token_accuracy": 0.37931033968925476, "step": 132100 }, { "epoch": 0.13305755906534883, "grad_norm": 10.802605073644573, "learning_rate": 4.915662199690502e-05, "loss": 2.1064, "mean_token_accuracy": 0.46551724076271056, "step": 132105 }, { "epoch": 0.133062595118453, "grad_norm": 9.45052455665339, "learning_rate": 4.9156520362933835e-05, "loss": 2.7062, "mean_token_accuracy": 0.35172412991523744, "step": 132110 }, { "epoch": 0.13306763117155718, "grad_norm": 9.23547982077421, "learning_rate": 4.9156418722956124e-05, "loss": 2.1828, "mean_token_accuracy": 0.4586206912994385, "step": 132115 }, { "epoch": 0.13307266722466135, "grad_norm": 9.673633147922375, "learning_rate": 4.915631707697193e-05, "loss": 2.3341, "mean_token_accuracy": 0.4206896543502808, "step": 132120 }, { "epoch": 0.13307770327776552, "grad_norm": 13.885664281892273, "learning_rate": 4.9156215424981275e-05, "loss": 2.1458, "mean_token_accuracy": 0.47931033968925474, "step": 132125 }, { "epoch": 0.1330827393308697, "grad_norm": 11.176150266962713, "learning_rate": 4.915611376698419e-05, "loss": 2.6637, "mean_token_accuracy": 0.4034482777118683, "step": 132130 }, { "epoch": 0.13308777538397387, "grad_norm": 10.051840013391441, "learning_rate": 4.91560121029807e-05, "loss": 2.4634, "mean_token_accuracy": 0.38620689511299133, "step": 132135 }, { "epoch": 0.13309281143707805, "grad_norm": 11.134289399222892, "learning_rate": 4.9155910432970834e-05, "loss": 2.216, "mean_token_accuracy": 0.4310344815254211, "step": 132140 }, { "epoch": 0.13309784749018222, "grad_norm": 12.018013637770206, "learning_rate": 4.915580875695462e-05, "loss": 2.6949, "mean_token_accuracy": 0.42413793206214906, "step": 132145 }, { "epoch": 0.1331028835432864, "grad_norm": 9.42623565566413, "learning_rate": 4.915570707493209e-05, "loss": 2.1757, "mean_token_accuracy": 0.43103448748588563, "step": 132150 }, { "epoch": 0.13310791959639057, "grad_norm": 11.750737887213594, "learning_rate": 4.9155605386903266e-05, "loss": 3.0971, "mean_token_accuracy": 0.36551724672317504, "step": 132155 }, { "epoch": 0.13311295564949474, "grad_norm": 12.535646438558016, "learning_rate": 4.9155503692868176e-05, "loss": 2.2651, "mean_token_accuracy": 0.4034482777118683, "step": 132160 }, { "epoch": 0.13311799170259891, "grad_norm": 11.135125656488361, "learning_rate": 4.915540199282686e-05, "loss": 2.5473, "mean_token_accuracy": 0.4068965494632721, "step": 132165 }, { "epoch": 0.1331230277557031, "grad_norm": 10.803154192200502, "learning_rate": 4.915530028677933e-05, "loss": 2.3341, "mean_token_accuracy": 0.42758620977401735, "step": 132170 }, { "epoch": 0.13312806380880726, "grad_norm": 10.538160831708433, "learning_rate": 4.9155198574725625e-05, "loss": 2.7023, "mean_token_accuracy": 0.3896551728248596, "step": 132175 }, { "epoch": 0.13313309986191144, "grad_norm": 11.32589980018265, "learning_rate": 4.915509685666577e-05, "loss": 2.4213, "mean_token_accuracy": 0.43793103098869324, "step": 132180 }, { "epoch": 0.1331381359150156, "grad_norm": 11.263135811665544, "learning_rate": 4.915499513259979e-05, "loss": 2.3971, "mean_token_accuracy": 0.44379915595054625, "step": 132185 }, { "epoch": 0.13314317196811976, "grad_norm": 9.510455347950607, "learning_rate": 4.9154893402527716e-05, "loss": 2.3048, "mean_token_accuracy": 0.42413793206214906, "step": 132190 }, { "epoch": 0.13314820802122393, "grad_norm": 10.397219407345082, "learning_rate": 4.9154791666449576e-05, "loss": 2.8361, "mean_token_accuracy": 0.3482758581638336, "step": 132195 }, { "epoch": 0.1331532440743281, "grad_norm": 9.519707912151798, "learning_rate": 4.9154689924365404e-05, "loss": 2.1436, "mean_token_accuracy": 0.44482759237289426, "step": 132200 }, { "epoch": 0.13315828012743228, "grad_norm": 10.703701834386429, "learning_rate": 4.915458817627522e-05, "loss": 2.211, "mean_token_accuracy": 0.4620689630508423, "step": 132205 }, { "epoch": 0.13316331618053645, "grad_norm": 10.87082224789843, "learning_rate": 4.915448642217905e-05, "loss": 2.3981, "mean_token_accuracy": 0.4068965494632721, "step": 132210 }, { "epoch": 0.13316835223364062, "grad_norm": 10.222340345317583, "learning_rate": 4.9154384662076935e-05, "loss": 2.3752, "mean_token_accuracy": 0.4481548726558685, "step": 132215 }, { "epoch": 0.1331733882867448, "grad_norm": 9.966450625379919, "learning_rate": 4.9154282895968896e-05, "loss": 2.2086, "mean_token_accuracy": 0.4620689690113068, "step": 132220 }, { "epoch": 0.13317842433984897, "grad_norm": 9.901856998448434, "learning_rate": 4.915418112385495e-05, "loss": 2.1241, "mean_token_accuracy": 0.42068964838981626, "step": 132225 }, { "epoch": 0.13318346039295315, "grad_norm": 10.362412351414692, "learning_rate": 4.9154079345735146e-05, "loss": 2.1566, "mean_token_accuracy": 0.4570477962493896, "step": 132230 }, { "epoch": 0.13318849644605732, "grad_norm": 11.946812552390723, "learning_rate": 4.91539775616095e-05, "loss": 2.5042, "mean_token_accuracy": 0.4537205040454865, "step": 132235 }, { "epoch": 0.1331935324991615, "grad_norm": 8.963898442849015, "learning_rate": 4.9153875771478044e-05, "loss": 2.1656, "mean_token_accuracy": 0.4704779267311096, "step": 132240 }, { "epoch": 0.13319856855226567, "grad_norm": 11.230198414813245, "learning_rate": 4.91537739753408e-05, "loss": 2.992, "mean_token_accuracy": 0.334482753276825, "step": 132245 }, { "epoch": 0.13320360460536984, "grad_norm": 11.137909716423819, "learning_rate": 4.9153672173197814e-05, "loss": 2.6267, "mean_token_accuracy": 0.4034482777118683, "step": 132250 }, { "epoch": 0.13320864065847401, "grad_norm": 9.633426638705155, "learning_rate": 4.915357036504909e-05, "loss": 2.2067, "mean_token_accuracy": 0.45862069725990295, "step": 132255 }, { "epoch": 0.1332136767115782, "grad_norm": 10.24122116036434, "learning_rate": 4.915346855089467e-05, "loss": 2.0719, "mean_token_accuracy": 0.4774349570274353, "step": 132260 }, { "epoch": 0.13321871276468236, "grad_norm": 9.826265850620526, "learning_rate": 4.915336673073458e-05, "loss": 2.4983, "mean_token_accuracy": 0.4137930989265442, "step": 132265 }, { "epoch": 0.13322374881778654, "grad_norm": 9.7760541309109, "learning_rate": 4.915326490456884e-05, "loss": 2.2561, "mean_token_accuracy": 0.4620689570903778, "step": 132270 }, { "epoch": 0.1332287848708907, "grad_norm": 16.81265450181346, "learning_rate": 4.91531630723975e-05, "loss": 2.3172, "mean_token_accuracy": 0.5121921122074127, "step": 132275 }, { "epoch": 0.13323382092399488, "grad_norm": 10.115995813429034, "learning_rate": 4.915306123422056e-05, "loss": 2.4699, "mean_token_accuracy": 0.4344827592372894, "step": 132280 }, { "epoch": 0.13323885697709906, "grad_norm": 10.663772783600143, "learning_rate": 4.9152959390038075e-05, "loss": 2.4444, "mean_token_accuracy": 0.39310344457626345, "step": 132285 }, { "epoch": 0.13324389303020323, "grad_norm": 10.845725373986284, "learning_rate": 4.915285753985006e-05, "loss": 2.1197, "mean_token_accuracy": 0.4724137902259827, "step": 132290 }, { "epoch": 0.1332489290833074, "grad_norm": 10.506011958517517, "learning_rate": 4.915275568365654e-05, "loss": 2.608, "mean_token_accuracy": 0.4034482777118683, "step": 132295 }, { "epoch": 0.13325396513641158, "grad_norm": 8.927997535448005, "learning_rate": 4.9152653821457546e-05, "loss": 2.2032, "mean_token_accuracy": 0.47586206793785096, "step": 132300 }, { "epoch": 0.13325900118951575, "grad_norm": 9.539594807060489, "learning_rate": 4.9152551953253115e-05, "loss": 2.4096, "mean_token_accuracy": 0.4206896543502808, "step": 132305 }, { "epoch": 0.13326403724261993, "grad_norm": 10.19393566965759, "learning_rate": 4.9152450079043255e-05, "loss": 2.2502, "mean_token_accuracy": 0.4, "step": 132310 }, { "epoch": 0.1332690732957241, "grad_norm": 10.76437977440662, "learning_rate": 4.915234819882802e-05, "loss": 2.0744, "mean_token_accuracy": 0.4344827651977539, "step": 132315 }, { "epoch": 0.13327410934882827, "grad_norm": 12.426593182115818, "learning_rate": 4.915224631260742e-05, "loss": 2.3423, "mean_token_accuracy": 0.4482758641242981, "step": 132320 }, { "epoch": 0.13327914540193245, "grad_norm": 10.371151253711096, "learning_rate": 4.915214442038148e-05, "loss": 2.5384, "mean_token_accuracy": 0.4103448212146759, "step": 132325 }, { "epoch": 0.1332841814550366, "grad_norm": 12.081050002121174, "learning_rate": 4.9152042522150256e-05, "loss": 2.625, "mean_token_accuracy": 0.35517241060733795, "step": 132330 }, { "epoch": 0.13328921750814077, "grad_norm": 11.575247314112485, "learning_rate": 4.915194061791374e-05, "loss": 2.4546, "mean_token_accuracy": 0.4192377507686615, "step": 132335 }, { "epoch": 0.13329425356124494, "grad_norm": 12.513990260424526, "learning_rate": 4.915183870767199e-05, "loss": 2.6065, "mean_token_accuracy": 0.38620689511299133, "step": 132340 }, { "epoch": 0.13329928961434911, "grad_norm": 12.074422198381475, "learning_rate": 4.9151736791425015e-05, "loss": 2.7445, "mean_token_accuracy": 0.38275861740112305, "step": 132345 }, { "epoch": 0.1333043256674533, "grad_norm": 11.81352053095203, "learning_rate": 4.9151634869172856e-05, "loss": 2.2128, "mean_token_accuracy": 0.4413793087005615, "step": 132350 }, { "epoch": 0.13330936172055746, "grad_norm": 10.157109635385419, "learning_rate": 4.915153294091553e-05, "loss": 2.3494, "mean_token_accuracy": 0.46551724076271056, "step": 132355 }, { "epoch": 0.13331439777366164, "grad_norm": 11.08906866905451, "learning_rate": 4.9151431006653074e-05, "loss": 2.1999, "mean_token_accuracy": 0.4517241358757019, "step": 132360 }, { "epoch": 0.1333194338267658, "grad_norm": 9.874767953637017, "learning_rate": 4.915132906638551e-05, "loss": 2.1974, "mean_token_accuracy": 0.4551724135875702, "step": 132365 }, { "epoch": 0.13332446987986998, "grad_norm": 8.889791975857879, "learning_rate": 4.915122712011286e-05, "loss": 2.3457, "mean_token_accuracy": 0.4607380509376526, "step": 132370 }, { "epoch": 0.13332950593297416, "grad_norm": 9.579326306040423, "learning_rate": 4.915112516783518e-05, "loss": 2.2347, "mean_token_accuracy": 0.48620688915252686, "step": 132375 }, { "epoch": 0.13333454198607833, "grad_norm": 7.44213485909985, "learning_rate": 4.915102320955247e-05, "loss": 2.4027, "mean_token_accuracy": 0.47477314472198484, "step": 132380 }, { "epoch": 0.1333395780391825, "grad_norm": 11.016213690426166, "learning_rate": 4.915092124526476e-05, "loss": 2.2751, "mean_token_accuracy": 0.4413793206214905, "step": 132385 }, { "epoch": 0.13334461409228668, "grad_norm": 9.429713145680276, "learning_rate": 4.9150819274972105e-05, "loss": 2.4606, "mean_token_accuracy": 0.41034482717514037, "step": 132390 }, { "epoch": 0.13334965014539085, "grad_norm": 10.412893022096712, "learning_rate": 4.91507172986745e-05, "loss": 2.3192, "mean_token_accuracy": 0.4241379380226135, "step": 132395 }, { "epoch": 0.13335468619849503, "grad_norm": 12.683025104184283, "learning_rate": 4.9150615316371994e-05, "loss": 2.3743, "mean_token_accuracy": 0.4517241299152374, "step": 132400 }, { "epoch": 0.1333597222515992, "grad_norm": 10.08946744621511, "learning_rate": 4.915051332806461e-05, "loss": 2.3287, "mean_token_accuracy": 0.4379310369491577, "step": 132405 }, { "epoch": 0.13336475830470337, "grad_norm": 12.311138825663695, "learning_rate": 4.915041133375237e-05, "loss": 2.4812, "mean_token_accuracy": 0.47931033968925474, "step": 132410 }, { "epoch": 0.13336979435780755, "grad_norm": 9.841007098414352, "learning_rate": 4.9150309333435306e-05, "loss": 1.9557, "mean_token_accuracy": 0.5206896603107453, "step": 132415 }, { "epoch": 0.13337483041091172, "grad_norm": 10.44645064500164, "learning_rate": 4.915020732711345e-05, "loss": 2.2705, "mean_token_accuracy": 0.44313369393348695, "step": 132420 }, { "epoch": 0.1333798664640159, "grad_norm": 12.76287741334063, "learning_rate": 4.9150105314786834e-05, "loss": 2.5614, "mean_token_accuracy": 0.4379310369491577, "step": 132425 }, { "epoch": 0.13338490251712007, "grad_norm": 9.703420109395488, "learning_rate": 4.915000329645548e-05, "loss": 2.6288, "mean_token_accuracy": 0.4206896543502808, "step": 132430 }, { "epoch": 0.13338993857022424, "grad_norm": 11.359975340461137, "learning_rate": 4.914990127211941e-05, "loss": 2.3734, "mean_token_accuracy": 0.4586206912994385, "step": 132435 }, { "epoch": 0.13339497462332842, "grad_norm": 9.972858105994659, "learning_rate": 4.914979924177866e-05, "loss": 2.5992, "mean_token_accuracy": 0.3896551728248596, "step": 132440 }, { "epoch": 0.1334000106764326, "grad_norm": 10.273852217690463, "learning_rate": 4.914969720543326e-05, "loss": 2.5234, "mean_token_accuracy": 0.41724138259887694, "step": 132445 }, { "epoch": 0.13340504672953676, "grad_norm": 11.041777510228668, "learning_rate": 4.914959516308324e-05, "loss": 2.1707, "mean_token_accuracy": 0.4793103516101837, "step": 132450 }, { "epoch": 0.13341008278264094, "grad_norm": 10.355198465854153, "learning_rate": 4.9149493114728616e-05, "loss": 2.3933, "mean_token_accuracy": 0.46896552443504336, "step": 132455 }, { "epoch": 0.1334151188357451, "grad_norm": 9.969261327728704, "learning_rate": 4.914939106036943e-05, "loss": 2.2307, "mean_token_accuracy": 0.4902709424495697, "step": 132460 }, { "epoch": 0.13342015488884928, "grad_norm": 10.715074201412383, "learning_rate": 4.91492890000057e-05, "loss": 2.5986, "mean_token_accuracy": 0.3862069010734558, "step": 132465 }, { "epoch": 0.13342519094195343, "grad_norm": 8.290038760557643, "learning_rate": 4.914918693363747e-05, "loss": 1.934, "mean_token_accuracy": 0.48759830594062803, "step": 132470 }, { "epoch": 0.1334302269950576, "grad_norm": 9.115469342357398, "learning_rate": 4.9149084861264744e-05, "loss": 2.0923, "mean_token_accuracy": 0.47931033968925474, "step": 132475 }, { "epoch": 0.13343526304816178, "grad_norm": 9.930231112784956, "learning_rate": 4.9148982782887566e-05, "loss": 2.5001, "mean_token_accuracy": 0.41379311084747317, "step": 132480 }, { "epoch": 0.13344029910126595, "grad_norm": 12.289848910541782, "learning_rate": 4.914888069850596e-05, "loss": 2.6816, "mean_token_accuracy": 0.43448275327682495, "step": 132485 }, { "epoch": 0.13344533515437013, "grad_norm": 10.923444466219964, "learning_rate": 4.9148778608119965e-05, "loss": 2.1846, "mean_token_accuracy": 0.4137930989265442, "step": 132490 }, { "epoch": 0.1334503712074743, "grad_norm": 9.219428885511377, "learning_rate": 4.9148676511729595e-05, "loss": 2.6184, "mean_token_accuracy": 0.40344826579093934, "step": 132495 }, { "epoch": 0.13345540726057847, "grad_norm": 12.167766426389326, "learning_rate": 4.914857440933489e-05, "loss": 2.0861, "mean_token_accuracy": 0.5159709632396698, "step": 132500 }, { "epoch": 0.13346044331368265, "grad_norm": 8.495593411891136, "learning_rate": 4.914847230093586e-05, "loss": 2.3632, "mean_token_accuracy": 0.4310344815254211, "step": 132505 }, { "epoch": 0.13346547936678682, "grad_norm": 9.673650213929653, "learning_rate": 4.9148370186532555e-05, "loss": 2.2874, "mean_token_accuracy": 0.4172413766384125, "step": 132510 }, { "epoch": 0.133470515419891, "grad_norm": 9.903761315307902, "learning_rate": 4.914826806612499e-05, "loss": 2.5553, "mean_token_accuracy": 0.3896551787853241, "step": 132515 }, { "epoch": 0.13347555147299517, "grad_norm": 11.253200742936842, "learning_rate": 4.91481659397132e-05, "loss": 2.1125, "mean_token_accuracy": 0.4517241358757019, "step": 132520 }, { "epoch": 0.13348058752609934, "grad_norm": 9.606242510285352, "learning_rate": 4.9148063807297205e-05, "loss": 2.6121, "mean_token_accuracy": 0.4068965494632721, "step": 132525 }, { "epoch": 0.13348562357920352, "grad_norm": 11.074393237953196, "learning_rate": 4.914796166887705e-05, "loss": 2.5716, "mean_token_accuracy": 0.39310344457626345, "step": 132530 }, { "epoch": 0.1334906596323077, "grad_norm": 14.032823941949458, "learning_rate": 4.914785952445275e-05, "loss": 2.6413, "mean_token_accuracy": 0.37241379618644715, "step": 132535 }, { "epoch": 0.13349569568541186, "grad_norm": 10.692836874100095, "learning_rate": 4.9147757374024325e-05, "loss": 2.5335, "mean_token_accuracy": 0.4517241358757019, "step": 132540 }, { "epoch": 0.13350073173851604, "grad_norm": 11.480399136903326, "learning_rate": 4.9147655217591814e-05, "loss": 2.2775, "mean_token_accuracy": 0.4206896543502808, "step": 132545 }, { "epoch": 0.1335057677916202, "grad_norm": 9.498456804950347, "learning_rate": 4.914755305515526e-05, "loss": 2.791, "mean_token_accuracy": 0.36551723778247835, "step": 132550 }, { "epoch": 0.13351080384472439, "grad_norm": 17.17535357230016, "learning_rate": 4.914745088671465e-05, "loss": 2.6421, "mean_token_accuracy": 0.4206896543502808, "step": 132555 }, { "epoch": 0.13351583989782856, "grad_norm": 10.396980808087871, "learning_rate": 4.914734871227006e-05, "loss": 2.1902, "mean_token_accuracy": 0.44482757449150084, "step": 132560 }, { "epoch": 0.13352087595093273, "grad_norm": 10.728842336884252, "learning_rate": 4.914724653182149e-05, "loss": 2.1188, "mean_token_accuracy": 0.46896551847457885, "step": 132565 }, { "epoch": 0.1335259120040369, "grad_norm": 12.136619877507176, "learning_rate": 4.914714434536898e-05, "loss": 2.3394, "mean_token_accuracy": 0.4777374565601349, "step": 132570 }, { "epoch": 0.13353094805714108, "grad_norm": 11.312670255868811, "learning_rate": 4.914704215291255e-05, "loss": 2.6016, "mean_token_accuracy": 0.38620689511299133, "step": 132575 }, { "epoch": 0.13353598411024525, "grad_norm": 10.134706433020293, "learning_rate": 4.914693995445224e-05, "loss": 2.3442, "mean_token_accuracy": 0.4379310250282288, "step": 132580 }, { "epoch": 0.13354102016334943, "grad_norm": 9.175375443598146, "learning_rate": 4.9146837749988054e-05, "loss": 1.9364, "mean_token_accuracy": 0.43448275327682495, "step": 132585 }, { "epoch": 0.1335460562164536, "grad_norm": 10.585431626375978, "learning_rate": 4.914673553952005e-05, "loss": 2.3183, "mean_token_accuracy": 0.3965517282485962, "step": 132590 }, { "epoch": 0.13355109226955778, "grad_norm": 8.579939257555077, "learning_rate": 4.9146633323048234e-05, "loss": 2.265, "mean_token_accuracy": 0.44827585816383364, "step": 132595 }, { "epoch": 0.13355612832266195, "grad_norm": 10.832529687475509, "learning_rate": 4.914653110057265e-05, "loss": 2.3328, "mean_token_accuracy": 0.41034482717514037, "step": 132600 }, { "epoch": 0.13356116437576612, "grad_norm": 10.723135066863131, "learning_rate": 4.914642887209332e-05, "loss": 2.5505, "mean_token_accuracy": 0.41034482717514037, "step": 132605 }, { "epoch": 0.13356620042887027, "grad_norm": 11.354202567108759, "learning_rate": 4.914632663761028e-05, "loss": 2.184, "mean_token_accuracy": 0.4698275804519653, "step": 132610 }, { "epoch": 0.13357123648197444, "grad_norm": 13.082194537811986, "learning_rate": 4.9146224397123536e-05, "loss": 2.6288, "mean_token_accuracy": 0.3793103516101837, "step": 132615 }, { "epoch": 0.13357627253507862, "grad_norm": 10.610177877711255, "learning_rate": 4.9146122150633135e-05, "loss": 2.301, "mean_token_accuracy": 0.45172414779663084, "step": 132620 }, { "epoch": 0.1335813085881828, "grad_norm": 11.398095866296075, "learning_rate": 4.9146019898139106e-05, "loss": 2.278, "mean_token_accuracy": 0.4034482717514038, "step": 132625 }, { "epoch": 0.13358634464128696, "grad_norm": 9.617024180196179, "learning_rate": 4.914591763964147e-05, "loss": 2.4793, "mean_token_accuracy": 0.4517241358757019, "step": 132630 }, { "epoch": 0.13359138069439114, "grad_norm": 12.898190848755679, "learning_rate": 4.9145815375140254e-05, "loss": 2.7774, "mean_token_accuracy": 0.35862069129943847, "step": 132635 }, { "epoch": 0.1335964167474953, "grad_norm": 10.424689137327166, "learning_rate": 4.91457131046355e-05, "loss": 2.3163, "mean_token_accuracy": 0.4275861978530884, "step": 132640 }, { "epoch": 0.13360145280059949, "grad_norm": 10.616311205440159, "learning_rate": 4.914561082812721e-05, "loss": 2.6066, "mean_token_accuracy": 0.38965516686439516, "step": 132645 }, { "epoch": 0.13360648885370366, "grad_norm": 8.816876301880235, "learning_rate": 4.914550854561544e-05, "loss": 2.2237, "mean_token_accuracy": 0.4793103575706482, "step": 132650 }, { "epoch": 0.13361152490680783, "grad_norm": 10.065134558069312, "learning_rate": 4.914540625710021e-05, "loss": 2.4567, "mean_token_accuracy": 0.42758620977401735, "step": 132655 }, { "epoch": 0.133616560959912, "grad_norm": 17.385560365900922, "learning_rate": 4.914530396258154e-05, "loss": 2.6183, "mean_token_accuracy": 0.39655172526836396, "step": 132660 }, { "epoch": 0.13362159701301618, "grad_norm": 12.352481916083004, "learning_rate": 4.914520166205947e-05, "loss": 2.2549, "mean_token_accuracy": 0.4206896543502808, "step": 132665 }, { "epoch": 0.13362663306612035, "grad_norm": 12.818845359529771, "learning_rate": 4.914509935553402e-05, "loss": 2.496, "mean_token_accuracy": 0.43103448748588563, "step": 132670 }, { "epoch": 0.13363166911922453, "grad_norm": 11.51858791853334, "learning_rate": 4.914499704300522e-05, "loss": 2.1524, "mean_token_accuracy": 0.48275861144065857, "step": 132675 }, { "epoch": 0.1336367051723287, "grad_norm": 22.79409224571709, "learning_rate": 4.91448947244731e-05, "loss": 2.8767, "mean_token_accuracy": 0.42413793206214906, "step": 132680 }, { "epoch": 0.13364174122543288, "grad_norm": 10.669982224306079, "learning_rate": 4.914479239993769e-05, "loss": 2.4179, "mean_token_accuracy": 0.41379311084747317, "step": 132685 }, { "epoch": 0.13364677727853705, "grad_norm": 10.827332537692554, "learning_rate": 4.914469006939901e-05, "loss": 2.5588, "mean_token_accuracy": 0.43103448748588563, "step": 132690 }, { "epoch": 0.13365181333164122, "grad_norm": 12.321758019302035, "learning_rate": 4.91445877328571e-05, "loss": 2.7483, "mean_token_accuracy": 0.37586206793785093, "step": 132695 }, { "epoch": 0.1336568493847454, "grad_norm": 9.985538986564807, "learning_rate": 4.9144485390311985e-05, "loss": 2.5958, "mean_token_accuracy": 0.4, "step": 132700 }, { "epoch": 0.13366188543784957, "grad_norm": 12.781752086945525, "learning_rate": 4.914438304176369e-05, "loss": 2.8808, "mean_token_accuracy": 0.3551724016666412, "step": 132705 }, { "epoch": 0.13366692149095374, "grad_norm": 10.376259500940364, "learning_rate": 4.914428068721224e-05, "loss": 2.5545, "mean_token_accuracy": 0.4000000059604645, "step": 132710 }, { "epoch": 0.13367195754405792, "grad_norm": 8.191527774801951, "learning_rate": 4.914417832665767e-05, "loss": 2.1893, "mean_token_accuracy": 0.43866994976997375, "step": 132715 }, { "epoch": 0.1336769935971621, "grad_norm": 14.34974612829401, "learning_rate": 4.91440759601e-05, "loss": 2.6576, "mean_token_accuracy": 0.37586206793785093, "step": 132720 }, { "epoch": 0.13368202965026627, "grad_norm": 10.427514085603141, "learning_rate": 4.914397358753927e-05, "loss": 2.6231, "mean_token_accuracy": 0.41724138259887694, "step": 132725 }, { "epoch": 0.13368706570337044, "grad_norm": 11.5500148420489, "learning_rate": 4.914387120897551e-05, "loss": 2.2847, "mean_token_accuracy": 0.42413793206214906, "step": 132730 }, { "epoch": 0.1336921017564746, "grad_norm": 10.878118902282123, "learning_rate": 4.914376882440874e-05, "loss": 2.6356, "mean_token_accuracy": 0.4, "step": 132735 }, { "epoch": 0.1336971378095788, "grad_norm": 13.983115964926723, "learning_rate": 4.914366643383898e-05, "loss": 2.8121, "mean_token_accuracy": 0.3620689630508423, "step": 132740 }, { "epoch": 0.13370217386268296, "grad_norm": 10.155528925921091, "learning_rate": 4.9143564037266275e-05, "loss": 2.7115, "mean_token_accuracy": 0.41379310488700866, "step": 132745 }, { "epoch": 0.1337072099157871, "grad_norm": 11.784421946494257, "learning_rate": 4.9143461634690645e-05, "loss": 2.3591, "mean_token_accuracy": 0.43103448748588563, "step": 132750 }, { "epoch": 0.13371224596889128, "grad_norm": 12.584000791093423, "learning_rate": 4.914335922611212e-05, "loss": 2.17, "mean_token_accuracy": 0.458620685338974, "step": 132755 }, { "epoch": 0.13371728202199545, "grad_norm": 11.062782696755132, "learning_rate": 4.9143256811530734e-05, "loss": 3.0922, "mean_token_accuracy": 0.3551724076271057, "step": 132760 }, { "epoch": 0.13372231807509963, "grad_norm": 9.852571150636424, "learning_rate": 4.91431543909465e-05, "loss": 2.2072, "mean_token_accuracy": 0.4068965494632721, "step": 132765 }, { "epoch": 0.1337273541282038, "grad_norm": 9.508538450273738, "learning_rate": 4.914305196435947e-05, "loss": 2.3233, "mean_token_accuracy": 0.42413792610168455, "step": 132770 }, { "epoch": 0.13373239018130798, "grad_norm": 12.218598375822111, "learning_rate": 4.914294953176965e-05, "loss": 2.262, "mean_token_accuracy": 0.4551724076271057, "step": 132775 }, { "epoch": 0.13373742623441215, "grad_norm": 12.766986084677166, "learning_rate": 4.9142847093177075e-05, "loss": 1.9585, "mean_token_accuracy": 0.49848759174346924, "step": 132780 }, { "epoch": 0.13374246228751632, "grad_norm": 8.570092817951632, "learning_rate": 4.914274464858178e-05, "loss": 2.173, "mean_token_accuracy": 0.458620685338974, "step": 132785 }, { "epoch": 0.1337474983406205, "grad_norm": 10.904169385376413, "learning_rate": 4.914264219798379e-05, "loss": 1.9838, "mean_token_accuracy": 0.47931034564971925, "step": 132790 }, { "epoch": 0.13375253439372467, "grad_norm": 11.971272960783534, "learning_rate": 4.914253974138312e-05, "loss": 2.4261, "mean_token_accuracy": 0.41724138259887694, "step": 132795 }, { "epoch": 0.13375757044682884, "grad_norm": 11.458978331242289, "learning_rate": 4.914243727877983e-05, "loss": 2.3362, "mean_token_accuracy": 0.41724138259887694, "step": 132800 }, { "epoch": 0.13376260649993302, "grad_norm": 8.854863267870993, "learning_rate": 4.914233481017392e-05, "loss": 2.0155, "mean_token_accuracy": 0.45862067937850953, "step": 132805 }, { "epoch": 0.1337676425530372, "grad_norm": 17.31446553650992, "learning_rate": 4.9142232335565424e-05, "loss": 2.819, "mean_token_accuracy": 0.3689655065536499, "step": 132810 }, { "epoch": 0.13377267860614137, "grad_norm": 10.870273783181668, "learning_rate": 4.9142129854954383e-05, "loss": 2.321, "mean_token_accuracy": 0.4103448212146759, "step": 132815 }, { "epoch": 0.13377771465924554, "grad_norm": 10.124824159533858, "learning_rate": 4.914202736834081e-05, "loss": 2.4832, "mean_token_accuracy": 0.4413793087005615, "step": 132820 }, { "epoch": 0.1337827507123497, "grad_norm": 11.811036420889828, "learning_rate": 4.914192487572474e-05, "loss": 1.9942, "mean_token_accuracy": 0.47931033968925474, "step": 132825 }, { "epoch": 0.1337877867654539, "grad_norm": 15.224906291430523, "learning_rate": 4.914182237710621e-05, "loss": 2.5857, "mean_token_accuracy": 0.4379310429096222, "step": 132830 }, { "epoch": 0.13379282281855806, "grad_norm": 11.504207770620662, "learning_rate": 4.9141719872485226e-05, "loss": 2.2877, "mean_token_accuracy": 0.4379310369491577, "step": 132835 }, { "epoch": 0.13379785887166223, "grad_norm": 11.40332274990979, "learning_rate": 4.914161736186184e-05, "loss": 2.8495, "mean_token_accuracy": 0.3896551728248596, "step": 132840 }, { "epoch": 0.1338028949247664, "grad_norm": 8.67422066018454, "learning_rate": 4.914151484523607e-05, "loss": 2.4108, "mean_token_accuracy": 0.42413792610168455, "step": 132845 }, { "epoch": 0.13380793097787058, "grad_norm": 10.836801792395475, "learning_rate": 4.914141232260794e-05, "loss": 2.1388, "mean_token_accuracy": 0.44827587008476255, "step": 132850 }, { "epoch": 0.13381296703097476, "grad_norm": 9.0048891014409, "learning_rate": 4.914130979397749e-05, "loss": 2.3271, "mean_token_accuracy": 0.44827585816383364, "step": 132855 }, { "epoch": 0.13381800308407893, "grad_norm": 13.409097072022995, "learning_rate": 4.914120725934473e-05, "loss": 2.8783, "mean_token_accuracy": 0.4, "step": 132860 }, { "epoch": 0.1338230391371831, "grad_norm": 11.00959267113664, "learning_rate": 4.914110471870971e-05, "loss": 2.471, "mean_token_accuracy": 0.42068966031074523, "step": 132865 }, { "epoch": 0.13382807519028728, "grad_norm": 8.653477752527861, "learning_rate": 4.9141002172072454e-05, "loss": 2.0179, "mean_token_accuracy": 0.47586206197738645, "step": 132870 }, { "epoch": 0.13383311124339145, "grad_norm": 16.228999588675798, "learning_rate": 4.914089961943298e-05, "loss": 2.8427, "mean_token_accuracy": 0.38965516686439516, "step": 132875 }, { "epoch": 0.13383814729649562, "grad_norm": 8.061341592807446, "learning_rate": 4.914079706079132e-05, "loss": 1.9456, "mean_token_accuracy": 0.482103967666626, "step": 132880 }, { "epoch": 0.1338431833495998, "grad_norm": 10.451054794891656, "learning_rate": 4.91406944961475e-05, "loss": 2.3465, "mean_token_accuracy": 0.42068964838981626, "step": 132885 }, { "epoch": 0.13384821940270394, "grad_norm": 10.63925203095825, "learning_rate": 4.9140591925501555e-05, "loss": 2.558, "mean_token_accuracy": 0.39655172228813174, "step": 132890 }, { "epoch": 0.13385325545580812, "grad_norm": 8.208218346940836, "learning_rate": 4.9140489348853526e-05, "loss": 2.1901, "mean_token_accuracy": 0.420689657330513, "step": 132895 }, { "epoch": 0.1338582915089123, "grad_norm": 11.74804805769207, "learning_rate": 4.914038676620341e-05, "loss": 2.5114, "mean_token_accuracy": 0.42758620381355283, "step": 132900 }, { "epoch": 0.13386332756201647, "grad_norm": 10.075976236252403, "learning_rate": 4.9140284177551256e-05, "loss": 2.256, "mean_token_accuracy": 0.4758620738983154, "step": 132905 }, { "epoch": 0.13386836361512064, "grad_norm": 9.139071813359875, "learning_rate": 4.914018158289709e-05, "loss": 1.9682, "mean_token_accuracy": 0.4620689630508423, "step": 132910 }, { "epoch": 0.1338733996682248, "grad_norm": 10.810455610549674, "learning_rate": 4.9140078982240934e-05, "loss": 2.0084, "mean_token_accuracy": 0.47931033968925474, "step": 132915 }, { "epoch": 0.133878435721329, "grad_norm": 8.57428280495784, "learning_rate": 4.9139976375582826e-05, "loss": 2.1659, "mean_token_accuracy": 0.458620685338974, "step": 132920 }, { "epoch": 0.13388347177443316, "grad_norm": 11.581032813967136, "learning_rate": 4.913987376292279e-05, "loss": 2.9114, "mean_token_accuracy": 0.34137930274009703, "step": 132925 }, { "epoch": 0.13388850782753733, "grad_norm": 11.804490014022837, "learning_rate": 4.913977114426085e-05, "loss": 2.3912, "mean_token_accuracy": 0.43448275327682495, "step": 132930 }, { "epoch": 0.1338935438806415, "grad_norm": 11.689207118688163, "learning_rate": 4.9139668519597046e-05, "loss": 2.7253, "mean_token_accuracy": 0.4344827473163605, "step": 132935 }, { "epoch": 0.13389857993374568, "grad_norm": 8.85195806845111, "learning_rate": 4.9139565888931395e-05, "loss": 2.2928, "mean_token_accuracy": 0.4517241418361664, "step": 132940 }, { "epoch": 0.13390361598684986, "grad_norm": 10.658283975668535, "learning_rate": 4.9139463252263926e-05, "loss": 2.7722, "mean_token_accuracy": 0.3827586233615875, "step": 132945 }, { "epoch": 0.13390865203995403, "grad_norm": 9.51333075158609, "learning_rate": 4.9139360609594674e-05, "loss": 2.4664, "mean_token_accuracy": 0.41234118938446046, "step": 132950 }, { "epoch": 0.1339136880930582, "grad_norm": 9.145077594692667, "learning_rate": 4.913925796092367e-05, "loss": 2.4349, "mean_token_accuracy": 0.45311554670333865, "step": 132955 }, { "epoch": 0.13391872414616238, "grad_norm": 10.269115618329831, "learning_rate": 4.913915530625093e-05, "loss": 2.4945, "mean_token_accuracy": 0.4310344815254211, "step": 132960 }, { "epoch": 0.13392376019926655, "grad_norm": 10.771952804465455, "learning_rate": 4.9139052645576495e-05, "loss": 2.2464, "mean_token_accuracy": 0.3655172407627106, "step": 132965 }, { "epoch": 0.13392879625237072, "grad_norm": 12.203569634118466, "learning_rate": 4.9138949978900384e-05, "loss": 2.6665, "mean_token_accuracy": 0.3965517282485962, "step": 132970 }, { "epoch": 0.1339338323054749, "grad_norm": 9.92652646975939, "learning_rate": 4.913884730622263e-05, "loss": 2.7056, "mean_token_accuracy": 0.32413792610168457, "step": 132975 }, { "epoch": 0.13393886835857907, "grad_norm": 9.99964857963376, "learning_rate": 4.913874462754326e-05, "loss": 2.6125, "mean_token_accuracy": 0.42758620381355283, "step": 132980 }, { "epoch": 0.13394390441168325, "grad_norm": 10.101567371639872, "learning_rate": 4.913864194286231e-05, "loss": 1.7469, "mean_token_accuracy": 0.5586206972599029, "step": 132985 }, { "epoch": 0.13394894046478742, "grad_norm": 8.377959689537375, "learning_rate": 4.913853925217979e-05, "loss": 2.2224, "mean_token_accuracy": 0.48054187297821044, "step": 132990 }, { "epoch": 0.1339539765178916, "grad_norm": 9.44531952152318, "learning_rate": 4.9138436555495756e-05, "loss": 2.544, "mean_token_accuracy": 0.4034482777118683, "step": 132995 }, { "epoch": 0.13395901257099577, "grad_norm": 10.000169829196041, "learning_rate": 4.913833385281021e-05, "loss": 2.1524, "mean_token_accuracy": 0.458620685338974, "step": 133000 }, { "epoch": 0.13396404862409994, "grad_norm": 13.629602265393356, "learning_rate": 4.913823114412319e-05, "loss": 2.5425, "mean_token_accuracy": 0.36896551847457887, "step": 133005 }, { "epoch": 0.13396908467720411, "grad_norm": 10.51377362786883, "learning_rate": 4.913812842943473e-05, "loss": 2.2282, "mean_token_accuracy": 0.45517241954803467, "step": 133010 }, { "epoch": 0.1339741207303083, "grad_norm": 10.584606563915406, "learning_rate": 4.9138025708744855e-05, "loss": 2.4518, "mean_token_accuracy": 0.40689654350280763, "step": 133015 }, { "epoch": 0.13397915678341246, "grad_norm": 9.472146976716418, "learning_rate": 4.913792298205359e-05, "loss": 2.2788, "mean_token_accuracy": 0.40689654350280763, "step": 133020 }, { "epoch": 0.13398419283651664, "grad_norm": 8.030218433539074, "learning_rate": 4.9137820249360974e-05, "loss": 2.2345, "mean_token_accuracy": 0.4344827651977539, "step": 133025 }, { "epoch": 0.13398922888962078, "grad_norm": 9.976727152180056, "learning_rate": 4.913771751066702e-05, "loss": 2.6853, "mean_token_accuracy": 0.38275861740112305, "step": 133030 }, { "epoch": 0.13399426494272496, "grad_norm": 12.819014328147253, "learning_rate": 4.913761476597176e-05, "loss": 1.8869, "mean_token_accuracy": 0.5034482836723327, "step": 133035 }, { "epoch": 0.13399930099582913, "grad_norm": 11.847882179186286, "learning_rate": 4.9137512015275244e-05, "loss": 2.3391, "mean_token_accuracy": 0.42068964838981626, "step": 133040 }, { "epoch": 0.1340043370489333, "grad_norm": 11.053580448375211, "learning_rate": 4.9137409258577474e-05, "loss": 2.5285, "mean_token_accuracy": 0.4068965554237366, "step": 133045 }, { "epoch": 0.13400937310203748, "grad_norm": 11.451463147876439, "learning_rate": 4.913730649587848e-05, "loss": 2.2927, "mean_token_accuracy": 0.458620685338974, "step": 133050 }, { "epoch": 0.13401440915514165, "grad_norm": 9.146146665604276, "learning_rate": 4.91372037271783e-05, "loss": 2.6785, "mean_token_accuracy": 0.417241370677948, "step": 133055 }, { "epoch": 0.13401944520824582, "grad_norm": 10.681153017267379, "learning_rate": 4.913710095247697e-05, "loss": 2.3833, "mean_token_accuracy": 0.4620689630508423, "step": 133060 }, { "epoch": 0.13402448126135, "grad_norm": 10.369229621960262, "learning_rate": 4.91369981717745e-05, "loss": 2.2033, "mean_token_accuracy": 0.47586206197738645, "step": 133065 }, { "epoch": 0.13402951731445417, "grad_norm": 9.313391799992058, "learning_rate": 4.913689538507094e-05, "loss": 2.2578, "mean_token_accuracy": 0.43103447556495667, "step": 133070 }, { "epoch": 0.13403455336755835, "grad_norm": 12.514210450104544, "learning_rate": 4.9136792592366296e-05, "loss": 2.1865, "mean_token_accuracy": 0.5091954052448273, "step": 133075 }, { "epoch": 0.13403958942066252, "grad_norm": 9.37192116552471, "learning_rate": 4.913668979366061e-05, "loss": 2.309, "mean_token_accuracy": 0.4724137902259827, "step": 133080 }, { "epoch": 0.1340446254737667, "grad_norm": 11.55364010225787, "learning_rate": 4.913658698895391e-05, "loss": 2.2739, "mean_token_accuracy": 0.46811856627464293, "step": 133085 }, { "epoch": 0.13404966152687087, "grad_norm": 9.139603808094293, "learning_rate": 4.9136484178246214e-05, "loss": 2.3969, "mean_token_accuracy": 0.41724138855934145, "step": 133090 }, { "epoch": 0.13405469757997504, "grad_norm": 11.068345412109831, "learning_rate": 4.913638136153756e-05, "loss": 2.2119, "mean_token_accuracy": 0.4775559604167938, "step": 133095 }, { "epoch": 0.13405973363307921, "grad_norm": 11.89108967900816, "learning_rate": 4.913627853882799e-05, "loss": 2.5425, "mean_token_accuracy": 0.47931033968925474, "step": 133100 }, { "epoch": 0.1340647696861834, "grad_norm": 12.545874715113227, "learning_rate": 4.91361757101175e-05, "loss": 2.6121, "mean_token_accuracy": 0.3896551728248596, "step": 133105 }, { "epoch": 0.13406980573928756, "grad_norm": 8.566067221791846, "learning_rate": 4.913607287540615e-05, "loss": 2.3286, "mean_token_accuracy": 0.44827585816383364, "step": 133110 }, { "epoch": 0.13407484179239174, "grad_norm": 10.49852594089317, "learning_rate": 4.913597003469394e-05, "loss": 2.3743, "mean_token_accuracy": 0.42068966031074523, "step": 133115 }, { "epoch": 0.1340798778454959, "grad_norm": 13.509442755498323, "learning_rate": 4.913586718798092e-05, "loss": 2.9966, "mean_token_accuracy": 0.3758620709180832, "step": 133120 }, { "epoch": 0.13408491389860008, "grad_norm": 9.345885190793043, "learning_rate": 4.913576433526711e-05, "loss": 2.4534, "mean_token_accuracy": 0.36551723480224607, "step": 133125 }, { "epoch": 0.13408994995170426, "grad_norm": 9.824789875846834, "learning_rate": 4.913566147655254e-05, "loss": 2.459, "mean_token_accuracy": 0.44137930274009707, "step": 133130 }, { "epoch": 0.13409498600480843, "grad_norm": 9.11876444051404, "learning_rate": 4.913555861183724e-05, "loss": 2.2755, "mean_token_accuracy": 0.4879007875919342, "step": 133135 }, { "epoch": 0.1341000220579126, "grad_norm": 9.697784821845543, "learning_rate": 4.913545574112124e-05, "loss": 1.8646, "mean_token_accuracy": 0.5068965494632721, "step": 133140 }, { "epoch": 0.13410505811101678, "grad_norm": 11.780024562371858, "learning_rate": 4.913535286440456e-05, "loss": 2.3058, "mean_token_accuracy": 0.44827585816383364, "step": 133145 }, { "epoch": 0.13411009416412095, "grad_norm": 11.082344507478709, "learning_rate": 4.9135249981687236e-05, "loss": 2.3147, "mean_token_accuracy": 0.4189957737922668, "step": 133150 }, { "epoch": 0.13411513021722513, "grad_norm": 11.679361567906682, "learning_rate": 4.91351470929693e-05, "loss": 2.3547, "mean_token_accuracy": 0.4379310369491577, "step": 133155 }, { "epoch": 0.1341201662703293, "grad_norm": 12.86433525259886, "learning_rate": 4.913504419825077e-05, "loss": 2.5741, "mean_token_accuracy": 0.3827586114406586, "step": 133160 }, { "epoch": 0.13412520232343347, "grad_norm": 8.872623305573056, "learning_rate": 4.9134941297531675e-05, "loss": 2.3257, "mean_token_accuracy": 0.47241379618644713, "step": 133165 }, { "epoch": 0.13413023837653762, "grad_norm": 14.146418841040349, "learning_rate": 4.913483839081206e-05, "loss": 2.6141, "mean_token_accuracy": 0.4672111332416534, "step": 133170 }, { "epoch": 0.1341352744296418, "grad_norm": 10.232165953638365, "learning_rate": 4.913473547809193e-05, "loss": 2.2596, "mean_token_accuracy": 0.42413793206214906, "step": 133175 }, { "epoch": 0.13414031048274597, "grad_norm": 13.307303938397402, "learning_rate": 4.913463255937133e-05, "loss": 1.992, "mean_token_accuracy": 0.5207501649856567, "step": 133180 }, { "epoch": 0.13414534653585014, "grad_norm": 11.607179444353685, "learning_rate": 4.9134529634650286e-05, "loss": 2.4738, "mean_token_accuracy": 0.47586206197738645, "step": 133185 }, { "epoch": 0.13415038258895431, "grad_norm": 10.685020322988358, "learning_rate": 4.9134426703928824e-05, "loss": 2.5088, "mean_token_accuracy": 0.37241379618644715, "step": 133190 }, { "epoch": 0.1341554186420585, "grad_norm": 11.727129026799751, "learning_rate": 4.913432376720697e-05, "loss": 3.1191, "mean_token_accuracy": 0.35862068831920624, "step": 133195 }, { "epoch": 0.13416045469516266, "grad_norm": 10.339170376160103, "learning_rate": 4.9134220824484763e-05, "loss": 2.4782, "mean_token_accuracy": 0.4137930989265442, "step": 133200 }, { "epoch": 0.13416549074826684, "grad_norm": 9.180894110905147, "learning_rate": 4.913411787576221e-05, "loss": 2.4063, "mean_token_accuracy": 0.41034482717514037, "step": 133205 }, { "epoch": 0.134170526801371, "grad_norm": 12.360519081212592, "learning_rate": 4.913401492103937e-05, "loss": 2.3694, "mean_token_accuracy": 0.4502117395401001, "step": 133210 }, { "epoch": 0.13417556285447518, "grad_norm": 13.12219190977595, "learning_rate": 4.913391196031625e-05, "loss": 2.5681, "mean_token_accuracy": 0.3448275804519653, "step": 133215 }, { "epoch": 0.13418059890757936, "grad_norm": 9.795439270401314, "learning_rate": 4.913380899359288e-05, "loss": 2.0729, "mean_token_accuracy": 0.4551724076271057, "step": 133220 }, { "epoch": 0.13418563496068353, "grad_norm": 9.787602510419227, "learning_rate": 4.9133706020869296e-05, "loss": 2.1653, "mean_token_accuracy": 0.4586206912994385, "step": 133225 }, { "epoch": 0.1341906710137877, "grad_norm": 12.946466812453222, "learning_rate": 4.9133603042145515e-05, "loss": 2.8084, "mean_token_accuracy": 0.3896551728248596, "step": 133230 }, { "epoch": 0.13419570706689188, "grad_norm": 10.131712109489373, "learning_rate": 4.913350005742158e-05, "loss": 2.1511, "mean_token_accuracy": 0.4517241358757019, "step": 133235 }, { "epoch": 0.13420074311999605, "grad_norm": 12.390586016617839, "learning_rate": 4.9133397066697514e-05, "loss": 2.2503, "mean_token_accuracy": 0.4551724135875702, "step": 133240 }, { "epoch": 0.13420577917310023, "grad_norm": 13.206889327990059, "learning_rate": 4.913329406997335e-05, "loss": 2.6561, "mean_token_accuracy": 0.41149425506591797, "step": 133245 }, { "epoch": 0.1342108152262044, "grad_norm": 11.139734694336255, "learning_rate": 4.9133191067249105e-05, "loss": 2.4348, "mean_token_accuracy": 0.4448275864124298, "step": 133250 }, { "epoch": 0.13421585127930857, "grad_norm": 14.013888397976796, "learning_rate": 4.913308805852481e-05, "loss": 2.4082, "mean_token_accuracy": 0.42068966031074523, "step": 133255 }, { "epoch": 0.13422088733241275, "grad_norm": 11.405764248954883, "learning_rate": 4.91329850438005e-05, "loss": 2.2966, "mean_token_accuracy": 0.47241379618644713, "step": 133260 }, { "epoch": 0.13422592338551692, "grad_norm": 9.94753541247845, "learning_rate": 4.9132882023076196e-05, "loss": 2.3294, "mean_token_accuracy": 0.42758620977401735, "step": 133265 }, { "epoch": 0.1342309594386211, "grad_norm": 12.12423042332564, "learning_rate": 4.913277899635194e-05, "loss": 2.5722, "mean_token_accuracy": 0.42758620977401735, "step": 133270 }, { "epoch": 0.13423599549172527, "grad_norm": 8.784623124555536, "learning_rate": 4.913267596362775e-05, "loss": 2.1155, "mean_token_accuracy": 0.46551724076271056, "step": 133275 }, { "epoch": 0.13424103154482944, "grad_norm": 8.420739386644287, "learning_rate": 4.9132572924903655e-05, "loss": 2.1642, "mean_token_accuracy": 0.4586206912994385, "step": 133280 }, { "epoch": 0.13424606759793362, "grad_norm": 10.118032240411473, "learning_rate": 4.913246988017969e-05, "loss": 2.1197, "mean_token_accuracy": 0.4172413766384125, "step": 133285 }, { "epoch": 0.1342511036510378, "grad_norm": 17.003970341571257, "learning_rate": 4.913236682945587e-05, "loss": 2.64, "mean_token_accuracy": 0.41034482717514037, "step": 133290 }, { "epoch": 0.13425613970414196, "grad_norm": 9.666119406403576, "learning_rate": 4.913226377273224e-05, "loss": 2.0674, "mean_token_accuracy": 0.4689655125141144, "step": 133295 }, { "epoch": 0.13426117575724614, "grad_norm": 10.483624931438063, "learning_rate": 4.9132160710008815e-05, "loss": 2.3974, "mean_token_accuracy": 0.44827585816383364, "step": 133300 }, { "epoch": 0.1342662118103503, "grad_norm": 20.50268935468113, "learning_rate": 4.913205764128563e-05, "loss": 2.789, "mean_token_accuracy": 0.39310344457626345, "step": 133305 }, { "epoch": 0.13427124786345446, "grad_norm": 11.510033586954078, "learning_rate": 4.9131954566562724e-05, "loss": 2.3261, "mean_token_accuracy": 0.4622504532337189, "step": 133310 }, { "epoch": 0.13427628391655863, "grad_norm": 8.661591766418606, "learning_rate": 4.91318514858401e-05, "loss": 2.1883, "mean_token_accuracy": 0.441379314661026, "step": 133315 }, { "epoch": 0.1342813199696628, "grad_norm": 9.308058952299053, "learning_rate": 4.9131748399117805e-05, "loss": 2.7824, "mean_token_accuracy": 0.41379311084747317, "step": 133320 }, { "epoch": 0.13428635602276698, "grad_norm": 10.865454672814817, "learning_rate": 4.913164530639587e-05, "loss": 2.5983, "mean_token_accuracy": 0.3896551728248596, "step": 133325 }, { "epoch": 0.13429139207587115, "grad_norm": 9.760613271791762, "learning_rate": 4.9131542207674315e-05, "loss": 2.2572, "mean_token_accuracy": 0.4689655125141144, "step": 133330 }, { "epoch": 0.13429642812897533, "grad_norm": 10.06067540029772, "learning_rate": 4.913143910295317e-05, "loss": 2.3667, "mean_token_accuracy": 0.4413793087005615, "step": 133335 }, { "epoch": 0.1343014641820795, "grad_norm": 9.752870812481735, "learning_rate": 4.9131335992232464e-05, "loss": 2.3194, "mean_token_accuracy": 0.44343616664409635, "step": 133340 }, { "epoch": 0.13430650023518367, "grad_norm": 9.521570759914928, "learning_rate": 4.9131232875512223e-05, "loss": 2.2088, "mean_token_accuracy": 0.44137930274009707, "step": 133345 }, { "epoch": 0.13431153628828785, "grad_norm": 9.595652112515525, "learning_rate": 4.913112975279249e-05, "loss": 2.0212, "mean_token_accuracy": 0.4931034505367279, "step": 133350 }, { "epoch": 0.13431657234139202, "grad_norm": 11.017809863357463, "learning_rate": 4.9131026624073275e-05, "loss": 2.4975, "mean_token_accuracy": 0.41724138259887694, "step": 133355 }, { "epoch": 0.1343216083944962, "grad_norm": 12.118249733207724, "learning_rate": 4.913092348935461e-05, "loss": 2.2868, "mean_token_accuracy": 0.41724138259887694, "step": 133360 }, { "epoch": 0.13432664444760037, "grad_norm": 10.7971842295323, "learning_rate": 4.9130820348636534e-05, "loss": 2.6374, "mean_token_accuracy": 0.3827586233615875, "step": 133365 }, { "epoch": 0.13433168050070454, "grad_norm": 12.3888643519639, "learning_rate": 4.913071720191906e-05, "loss": 2.5238, "mean_token_accuracy": 0.3915305554866791, "step": 133370 }, { "epoch": 0.13433671655380872, "grad_norm": 10.022165617186296, "learning_rate": 4.913061404920223e-05, "loss": 2.2466, "mean_token_accuracy": 0.4965517222881317, "step": 133375 }, { "epoch": 0.1343417526069129, "grad_norm": 10.572371980290658, "learning_rate": 4.9130510890486084e-05, "loss": 2.0121, "mean_token_accuracy": 0.4655172348022461, "step": 133380 }, { "epoch": 0.13434678866001706, "grad_norm": 14.280286013821122, "learning_rate": 4.913040772577062e-05, "loss": 2.4849, "mean_token_accuracy": 0.41034482717514037, "step": 133385 }, { "epoch": 0.13435182471312124, "grad_norm": 12.001847560340789, "learning_rate": 4.913030455505588e-05, "loss": 2.8677, "mean_token_accuracy": 0.39310344457626345, "step": 133390 }, { "epoch": 0.1343568607662254, "grad_norm": 11.483911533121626, "learning_rate": 4.91302013783419e-05, "loss": 2.679, "mean_token_accuracy": 0.41724138259887694, "step": 133395 }, { "epoch": 0.13436189681932958, "grad_norm": 7.9937897646191685, "learning_rate": 4.91300981956287e-05, "loss": 2.335, "mean_token_accuracy": 0.43103448748588563, "step": 133400 }, { "epoch": 0.13436693287243376, "grad_norm": 10.207440022254842, "learning_rate": 4.912999500691632e-05, "loss": 2.0766, "mean_token_accuracy": 0.4931034445762634, "step": 133405 }, { "epoch": 0.13437196892553793, "grad_norm": 10.57464206836519, "learning_rate": 4.912989181220476e-05, "loss": 2.5347, "mean_token_accuracy": 0.4034482777118683, "step": 133410 }, { "epoch": 0.1343770049786421, "grad_norm": 11.706299070904699, "learning_rate": 4.912978861149409e-05, "loss": 2.4012, "mean_token_accuracy": 0.4206896543502808, "step": 133415 }, { "epoch": 0.13438204103174628, "grad_norm": 10.15425674813988, "learning_rate": 4.912968540478431e-05, "loss": 2.4497, "mean_token_accuracy": 0.41724138259887694, "step": 133420 }, { "epoch": 0.13438707708485045, "grad_norm": 13.635647965013494, "learning_rate": 4.9129582192075457e-05, "loss": 2.7606, "mean_token_accuracy": 0.3551724135875702, "step": 133425 }, { "epoch": 0.13439211313795463, "grad_norm": 10.139692543827582, "learning_rate": 4.9129478973367557e-05, "loss": 2.4617, "mean_token_accuracy": 0.358620685338974, "step": 133430 }, { "epoch": 0.1343971491910588, "grad_norm": 9.172812265900287, "learning_rate": 4.9129375748660644e-05, "loss": 2.1545, "mean_token_accuracy": 0.4551724135875702, "step": 133435 }, { "epoch": 0.13440218524416297, "grad_norm": 11.754630218774727, "learning_rate": 4.912927251795473e-05, "loss": 2.424, "mean_token_accuracy": 0.42413792610168455, "step": 133440 }, { "epoch": 0.13440722129726715, "grad_norm": 12.444782696408742, "learning_rate": 4.9129169281249876e-05, "loss": 2.2839, "mean_token_accuracy": 0.48275862336158754, "step": 133445 }, { "epoch": 0.1344122573503713, "grad_norm": 9.00634132468982, "learning_rate": 4.912906603854608e-05, "loss": 2.3557, "mean_token_accuracy": 0.47931033968925474, "step": 133450 }, { "epoch": 0.13441729340347547, "grad_norm": 12.523626858454488, "learning_rate": 4.912896278984338e-05, "loss": 2.6274, "mean_token_accuracy": 0.3999999940395355, "step": 133455 }, { "epoch": 0.13442232945657964, "grad_norm": 11.513513783155293, "learning_rate": 4.9128859535141814e-05, "loss": 2.3317, "mean_token_accuracy": 0.4586206912994385, "step": 133460 }, { "epoch": 0.13442736550968382, "grad_norm": 10.452896530261441, "learning_rate": 4.9128756274441395e-05, "loss": 2.2798, "mean_token_accuracy": 0.4551724135875702, "step": 133465 }, { "epoch": 0.134432401562788, "grad_norm": 10.323327562640388, "learning_rate": 4.9128653007742167e-05, "loss": 1.9003, "mean_token_accuracy": 0.5502117335796356, "step": 133470 }, { "epoch": 0.13443743761589216, "grad_norm": 9.41760246078116, "learning_rate": 4.912854973504414e-05, "loss": 2.6979, "mean_token_accuracy": 0.4172413766384125, "step": 133475 }, { "epoch": 0.13444247366899634, "grad_norm": 10.679260859463167, "learning_rate": 4.9128446456347364e-05, "loss": 2.586, "mean_token_accuracy": 0.4137930989265442, "step": 133480 }, { "epoch": 0.1344475097221005, "grad_norm": 9.938032536234672, "learning_rate": 4.9128343171651856e-05, "loss": 2.6954, "mean_token_accuracy": 0.42758620977401735, "step": 133485 }, { "epoch": 0.13445254577520468, "grad_norm": 10.792187770235449, "learning_rate": 4.9128239880957655e-05, "loss": 2.5092, "mean_token_accuracy": 0.37731397747993467, "step": 133490 }, { "epoch": 0.13445758182830886, "grad_norm": 11.27454788544269, "learning_rate": 4.912813658426477e-05, "loss": 2.216, "mean_token_accuracy": 0.4413793087005615, "step": 133495 }, { "epoch": 0.13446261788141303, "grad_norm": 8.325130653367347, "learning_rate": 4.9128033281573245e-05, "loss": 2.2849, "mean_token_accuracy": 0.4448275864124298, "step": 133500 }, { "epoch": 0.1344676539345172, "grad_norm": 19.603952016840513, "learning_rate": 4.9127929972883104e-05, "loss": 2.6118, "mean_token_accuracy": 0.4000000059604645, "step": 133505 }, { "epoch": 0.13447268998762138, "grad_norm": 9.258050918716755, "learning_rate": 4.9127826658194363e-05, "loss": 2.5046, "mean_token_accuracy": 0.4034482717514038, "step": 133510 }, { "epoch": 0.13447772604072555, "grad_norm": 14.102883500042001, "learning_rate": 4.912772333750708e-05, "loss": 2.6154, "mean_token_accuracy": 0.38620689511299133, "step": 133515 }, { "epoch": 0.13448276209382973, "grad_norm": 9.074882647934036, "learning_rate": 4.912762001082126e-05, "loss": 2.5697, "mean_token_accuracy": 0.43103447556495667, "step": 133520 }, { "epoch": 0.1344877981469339, "grad_norm": 12.398591908673087, "learning_rate": 4.912751667813694e-05, "loss": 2.4407, "mean_token_accuracy": 0.45517241954803467, "step": 133525 }, { "epoch": 0.13449283420003807, "grad_norm": 10.843185465278435, "learning_rate": 4.9127413339454155e-05, "loss": 2.8635, "mean_token_accuracy": 0.3379310339689255, "step": 133530 }, { "epoch": 0.13449787025314225, "grad_norm": 10.031724284400868, "learning_rate": 4.912730999477292e-05, "loss": 2.8421, "mean_token_accuracy": 0.3724137872457504, "step": 133535 }, { "epoch": 0.13450290630624642, "grad_norm": 10.031076282944703, "learning_rate": 4.912720664409327e-05, "loss": 2.3495, "mean_token_accuracy": 0.43309134244918823, "step": 133540 }, { "epoch": 0.1345079423593506, "grad_norm": 5.70606433663581, "learning_rate": 4.912710328741523e-05, "loss": 1.9283, "mean_token_accuracy": 0.5120689630508423, "step": 133545 }, { "epoch": 0.13451297841245477, "grad_norm": 10.097182019474523, "learning_rate": 4.912699992473883e-05, "loss": 2.2718, "mean_token_accuracy": 0.4482758641242981, "step": 133550 }, { "epoch": 0.13451801446555894, "grad_norm": 8.15183901205883, "learning_rate": 4.9126896556064114e-05, "loss": 2.2808, "mean_token_accuracy": 0.4517241358757019, "step": 133555 }, { "epoch": 0.13452305051866312, "grad_norm": 12.916751120344587, "learning_rate": 4.912679318139109e-05, "loss": 2.4991, "mean_token_accuracy": 0.41034482717514037, "step": 133560 }, { "epoch": 0.1345280865717673, "grad_norm": 10.76104074360416, "learning_rate": 4.912668980071979e-05, "loss": 2.2546, "mean_token_accuracy": 0.42068964838981626, "step": 133565 }, { "epoch": 0.13453312262487147, "grad_norm": 10.253601639588439, "learning_rate": 4.912658641405025e-05, "loss": 2.5368, "mean_token_accuracy": 0.38433151245117186, "step": 133570 }, { "epoch": 0.13453815867797564, "grad_norm": 12.134093752749203, "learning_rate": 4.91264830213825e-05, "loss": 2.4463, "mean_token_accuracy": 0.4344827651977539, "step": 133575 }, { "epoch": 0.1345431947310798, "grad_norm": 11.176038787645842, "learning_rate": 4.912637962271655e-05, "loss": 2.3772, "mean_token_accuracy": 0.4344827651977539, "step": 133580 }, { "epoch": 0.134548230784184, "grad_norm": 10.813119999477912, "learning_rate": 4.912627621805246e-05, "loss": 2.353, "mean_token_accuracy": 0.3793103456497192, "step": 133585 }, { "epoch": 0.13455326683728813, "grad_norm": 10.582459540899668, "learning_rate": 4.912617280739024e-05, "loss": 2.3769, "mean_token_accuracy": 0.42413793206214906, "step": 133590 }, { "epoch": 0.1345583028903923, "grad_norm": 9.138121368722187, "learning_rate": 4.9126069390729914e-05, "loss": 2.179, "mean_token_accuracy": 0.39310344457626345, "step": 133595 }, { "epoch": 0.13456333894349648, "grad_norm": 9.011560157753548, "learning_rate": 4.912596596807152e-05, "loss": 2.3317, "mean_token_accuracy": 0.4344827592372894, "step": 133600 }, { "epoch": 0.13456837499660065, "grad_norm": 9.607758436900355, "learning_rate": 4.9125862539415076e-05, "loss": 2.0898, "mean_token_accuracy": 0.46551724076271056, "step": 133605 }, { "epoch": 0.13457341104970483, "grad_norm": 11.648362120287345, "learning_rate": 4.9125759104760626e-05, "loss": 2.9579, "mean_token_accuracy": 0.37931033670902253, "step": 133610 }, { "epoch": 0.134578447102809, "grad_norm": 9.75640295240588, "learning_rate": 4.912565566410819e-05, "loss": 1.9985, "mean_token_accuracy": 0.4620689570903778, "step": 133615 }, { "epoch": 0.13458348315591318, "grad_norm": 10.37540080001233, "learning_rate": 4.91255522174578e-05, "loss": 2.3242, "mean_token_accuracy": 0.42413792610168455, "step": 133620 }, { "epoch": 0.13458851920901735, "grad_norm": 10.583097536956135, "learning_rate": 4.912544876480948e-05, "loss": 2.3466, "mean_token_accuracy": 0.4482758641242981, "step": 133625 }, { "epoch": 0.13459355526212152, "grad_norm": 13.876431571158918, "learning_rate": 4.9125345306163256e-05, "loss": 2.5999, "mean_token_accuracy": 0.43103448748588563, "step": 133630 }, { "epoch": 0.1345985913152257, "grad_norm": 6.694327059811454, "learning_rate": 4.912524184151917e-05, "loss": 2.4814, "mean_token_accuracy": 0.46049606800079346, "step": 133635 }, { "epoch": 0.13460362736832987, "grad_norm": 10.402479532906442, "learning_rate": 4.9125138370877244e-05, "loss": 2.892, "mean_token_accuracy": 0.3482758581638336, "step": 133640 }, { "epoch": 0.13460866342143404, "grad_norm": 12.186872258273448, "learning_rate": 4.912503489423749e-05, "loss": 2.3716, "mean_token_accuracy": 0.4379310369491577, "step": 133645 }, { "epoch": 0.13461369947453822, "grad_norm": 15.448822356552942, "learning_rate": 4.9124931411599966e-05, "loss": 2.5409, "mean_token_accuracy": 0.417241370677948, "step": 133650 }, { "epoch": 0.1346187355276424, "grad_norm": 10.154175173796041, "learning_rate": 4.912482792296468e-05, "loss": 2.9022, "mean_token_accuracy": 0.38275861740112305, "step": 133655 }, { "epoch": 0.13462377158074657, "grad_norm": 11.723209325468458, "learning_rate": 4.912472442833167e-05, "loss": 2.6422, "mean_token_accuracy": 0.4172413766384125, "step": 133660 }, { "epoch": 0.13462880763385074, "grad_norm": 12.18209481165935, "learning_rate": 4.9124620927700965e-05, "loss": 2.8385, "mean_token_accuracy": 0.3931034505367279, "step": 133665 }, { "epoch": 0.1346338436869549, "grad_norm": 8.411865562125689, "learning_rate": 4.9124517421072594e-05, "loss": 2.4184, "mean_token_accuracy": 0.46896551847457885, "step": 133670 }, { "epoch": 0.1346388797400591, "grad_norm": 9.677528228758433, "learning_rate": 4.912441390844658e-05, "loss": 2.277, "mean_token_accuracy": 0.4635813653469086, "step": 133675 }, { "epoch": 0.13464391579316326, "grad_norm": 14.922613398793022, "learning_rate": 4.912431038982294e-05, "loss": 2.5145, "mean_token_accuracy": 0.4310344815254211, "step": 133680 }, { "epoch": 0.13464895184626743, "grad_norm": 13.401940994145589, "learning_rate": 4.912420686520173e-05, "loss": 2.5887, "mean_token_accuracy": 0.3999999940395355, "step": 133685 }, { "epoch": 0.1346539878993716, "grad_norm": 11.99587020235577, "learning_rate": 4.912410333458297e-05, "loss": 2.3474, "mean_token_accuracy": 0.43103447556495667, "step": 133690 }, { "epoch": 0.13465902395247578, "grad_norm": 8.685505027949565, "learning_rate": 4.912399979796667e-05, "loss": 2.1243, "mean_token_accuracy": 0.5068965375423431, "step": 133695 }, { "epoch": 0.13466406000557996, "grad_norm": 9.51198468992629, "learning_rate": 4.912389625535288e-05, "loss": 2.4714, "mean_token_accuracy": 0.4517241418361664, "step": 133700 }, { "epoch": 0.13466909605868413, "grad_norm": 28.386604242370172, "learning_rate": 4.912379270674162e-05, "loss": 2.7739, "mean_token_accuracy": 0.4137930989265442, "step": 133705 }, { "epoch": 0.1346741321117883, "grad_norm": 10.066417902762476, "learning_rate": 4.9123689152132926e-05, "loss": 2.0011, "mean_token_accuracy": 0.458620685338974, "step": 133710 }, { "epoch": 0.13467916816489248, "grad_norm": 10.132296576308173, "learning_rate": 4.912358559152682e-05, "loss": 2.6321, "mean_token_accuracy": 0.3896551728248596, "step": 133715 }, { "epoch": 0.13468420421799665, "grad_norm": 13.253826024454707, "learning_rate": 4.912348202492333e-05, "loss": 2.4118, "mean_token_accuracy": 0.4724137902259827, "step": 133720 }, { "epoch": 0.13468924027110082, "grad_norm": 11.019678721869447, "learning_rate": 4.912337845232248e-05, "loss": 2.281, "mean_token_accuracy": 0.44482759237289426, "step": 133725 }, { "epoch": 0.13469427632420497, "grad_norm": 9.313991698671483, "learning_rate": 4.912327487372431e-05, "loss": 2.2266, "mean_token_accuracy": 0.47779794335365294, "step": 133730 }, { "epoch": 0.13469931237730914, "grad_norm": 12.398574198450524, "learning_rate": 4.912317128912885e-05, "loss": 2.4346, "mean_token_accuracy": 0.3810042321681976, "step": 133735 }, { "epoch": 0.13470434843041332, "grad_norm": 10.5935220502372, "learning_rate": 4.912306769853612e-05, "loss": 2.3504, "mean_token_accuracy": 0.4551724135875702, "step": 133740 }, { "epoch": 0.1347093844835175, "grad_norm": 11.175962196455592, "learning_rate": 4.9122964101946147e-05, "loss": 2.6502, "mean_token_accuracy": 0.4034482777118683, "step": 133745 }, { "epoch": 0.13471442053662167, "grad_norm": 12.101065987946038, "learning_rate": 4.9122860499358964e-05, "loss": 2.3451, "mean_token_accuracy": 0.4344827473163605, "step": 133750 }, { "epoch": 0.13471945658972584, "grad_norm": 9.007486248400172, "learning_rate": 4.91227568907746e-05, "loss": 2.3273, "mean_token_accuracy": 0.4413793087005615, "step": 133755 }, { "epoch": 0.13472449264283, "grad_norm": 9.892949173219353, "learning_rate": 4.912265327619309e-05, "loss": 2.2623, "mean_token_accuracy": 0.43103448748588563, "step": 133760 }, { "epoch": 0.1347295286959342, "grad_norm": 10.927627703182745, "learning_rate": 4.9122549655614455e-05, "loss": 2.4706, "mean_token_accuracy": 0.42413793206214906, "step": 133765 }, { "epoch": 0.13473456474903836, "grad_norm": 9.446514498577935, "learning_rate": 4.9122446029038724e-05, "loss": 2.3503, "mean_token_accuracy": 0.43793103098869324, "step": 133770 }, { "epoch": 0.13473960080214253, "grad_norm": 11.530480000539573, "learning_rate": 4.9122342396465925e-05, "loss": 3.0848, "mean_token_accuracy": 0.34482758641242983, "step": 133775 }, { "epoch": 0.1347446368552467, "grad_norm": 11.046254420366234, "learning_rate": 4.912223875789609e-05, "loss": 2.3711, "mean_token_accuracy": 0.41034482717514037, "step": 133780 }, { "epoch": 0.13474967290835088, "grad_norm": 8.752395082828878, "learning_rate": 4.9122135113329245e-05, "loss": 2.7309, "mean_token_accuracy": 0.3965517163276672, "step": 133785 }, { "epoch": 0.13475470896145506, "grad_norm": 10.541089007514666, "learning_rate": 4.912203146276542e-05, "loss": 2.53, "mean_token_accuracy": 0.43103447556495667, "step": 133790 }, { "epoch": 0.13475974501455923, "grad_norm": 9.581735740937182, "learning_rate": 4.912192780620464e-05, "loss": 2.485, "mean_token_accuracy": 0.4034482777118683, "step": 133795 }, { "epoch": 0.1347647810676634, "grad_norm": 9.41235586227464, "learning_rate": 4.912182414364694e-05, "loss": 2.2543, "mean_token_accuracy": 0.4413793087005615, "step": 133800 }, { "epoch": 0.13476981712076758, "grad_norm": 10.810401897623253, "learning_rate": 4.912172047509235e-05, "loss": 2.4189, "mean_token_accuracy": 0.43793103098869324, "step": 133805 }, { "epoch": 0.13477485317387175, "grad_norm": 9.53372088423735, "learning_rate": 4.9121616800540906e-05, "loss": 2.4075, "mean_token_accuracy": 0.42413793206214906, "step": 133810 }, { "epoch": 0.13477988922697592, "grad_norm": 9.738706491641206, "learning_rate": 4.912151311999261e-05, "loss": 1.8531, "mean_token_accuracy": 0.49999999403953554, "step": 133815 }, { "epoch": 0.1347849252800801, "grad_norm": 10.071647612939412, "learning_rate": 4.912140943344751e-05, "loss": 2.3993, "mean_token_accuracy": 0.4068965554237366, "step": 133820 }, { "epoch": 0.13478996133318427, "grad_norm": 9.60063380760036, "learning_rate": 4.9121305740905635e-05, "loss": 2.1425, "mean_token_accuracy": 0.4379310250282288, "step": 133825 }, { "epoch": 0.13479499738628845, "grad_norm": 11.734080148261107, "learning_rate": 4.9121202042367e-05, "loss": 2.7414, "mean_token_accuracy": 0.4034482717514038, "step": 133830 }, { "epoch": 0.13480003343939262, "grad_norm": 10.67872495327807, "learning_rate": 4.912109833783166e-05, "loss": 2.318, "mean_token_accuracy": 0.42758620977401735, "step": 133835 }, { "epoch": 0.1348050694924968, "grad_norm": 8.361095471035858, "learning_rate": 4.9120994627299614e-05, "loss": 2.0856, "mean_token_accuracy": 0.4793103516101837, "step": 133840 }, { "epoch": 0.13481010554560097, "grad_norm": 8.280312432846882, "learning_rate": 4.9120890910770915e-05, "loss": 2.0511, "mean_token_accuracy": 0.47241378426551817, "step": 133845 }, { "epoch": 0.13481514159870514, "grad_norm": 15.400048115235153, "learning_rate": 4.9120787188245575e-05, "loss": 2.5305, "mean_token_accuracy": 0.41724138259887694, "step": 133850 }, { "epoch": 0.13482017765180931, "grad_norm": 12.298536895995761, "learning_rate": 4.912068345972363e-05, "loss": 2.7783, "mean_token_accuracy": 0.36206896901130675, "step": 133855 }, { "epoch": 0.1348252137049135, "grad_norm": 11.97210348239016, "learning_rate": 4.912057972520511e-05, "loss": 2.6808, "mean_token_accuracy": 0.35862069129943847, "step": 133860 }, { "epoch": 0.13483024975801766, "grad_norm": 12.5364236790175, "learning_rate": 4.9120475984690035e-05, "loss": 2.4669, "mean_token_accuracy": 0.441379314661026, "step": 133865 }, { "epoch": 0.1348352858111218, "grad_norm": 10.551063878686245, "learning_rate": 4.912037223817844e-05, "loss": 2.4076, "mean_token_accuracy": 0.41034482717514037, "step": 133870 }, { "epoch": 0.13484032186422598, "grad_norm": 10.174315704925045, "learning_rate": 4.9120268485670366e-05, "loss": 3.0154, "mean_token_accuracy": 0.34482758939266206, "step": 133875 }, { "epoch": 0.13484535791733016, "grad_norm": 11.781305791097932, "learning_rate": 4.912016472716582e-05, "loss": 2.4743, "mean_token_accuracy": 0.41379310488700866, "step": 133880 }, { "epoch": 0.13485039397043433, "grad_norm": 11.96023562107951, "learning_rate": 4.912006096266485e-05, "loss": 2.2782, "mean_token_accuracy": 0.44827585816383364, "step": 133885 }, { "epoch": 0.1348554300235385, "grad_norm": 15.998734954490622, "learning_rate": 4.911995719216746e-05, "loss": 2.3395, "mean_token_accuracy": 0.4620689630508423, "step": 133890 }, { "epoch": 0.13486046607664268, "grad_norm": 11.012662040539741, "learning_rate": 4.91198534156737e-05, "loss": 2.3062, "mean_token_accuracy": 0.43103448748588563, "step": 133895 }, { "epoch": 0.13486550212974685, "grad_norm": 11.237657636424517, "learning_rate": 4.911974963318361e-05, "loss": 3.0287, "mean_token_accuracy": 0.3862068891525269, "step": 133900 }, { "epoch": 0.13487053818285102, "grad_norm": 10.118485586805253, "learning_rate": 4.9119645844697176e-05, "loss": 2.4654, "mean_token_accuracy": 0.4620689570903778, "step": 133905 }, { "epoch": 0.1348755742359552, "grad_norm": 10.27909446657505, "learning_rate": 4.911954205021447e-05, "loss": 2.4954, "mean_token_accuracy": 0.417241370677948, "step": 133910 }, { "epoch": 0.13488061028905937, "grad_norm": 16.774118091077757, "learning_rate": 4.91194382497355e-05, "loss": 2.6778, "mean_token_accuracy": 0.40344828069210054, "step": 133915 }, { "epoch": 0.13488564634216355, "grad_norm": 11.323490016084147, "learning_rate": 4.911933444326029e-05, "loss": 2.2893, "mean_token_accuracy": 0.4551724135875702, "step": 133920 }, { "epoch": 0.13489068239526772, "grad_norm": 9.348215625439757, "learning_rate": 4.911923063078889e-05, "loss": 2.4759, "mean_token_accuracy": 0.4034482777118683, "step": 133925 }, { "epoch": 0.1348957184483719, "grad_norm": 10.055446909814775, "learning_rate": 4.9119126812321307e-05, "loss": 2.1959, "mean_token_accuracy": 0.48620688915252686, "step": 133930 }, { "epoch": 0.13490075450147607, "grad_norm": 10.862648239989293, "learning_rate": 4.911902298785758e-05, "loss": 2.7677, "mean_token_accuracy": 0.4137930989265442, "step": 133935 }, { "epoch": 0.13490579055458024, "grad_norm": 10.300920681506383, "learning_rate": 4.9118919157397736e-05, "loss": 3.0812, "mean_token_accuracy": 0.39655172228813174, "step": 133940 }, { "epoch": 0.13491082660768441, "grad_norm": 10.489518210733586, "learning_rate": 4.91188153209418e-05, "loss": 2.5897, "mean_token_accuracy": 0.4, "step": 133945 }, { "epoch": 0.1349158626607886, "grad_norm": 16.287307495748056, "learning_rate": 4.911871147848982e-05, "loss": 2.4924, "mean_token_accuracy": 0.4206896543502808, "step": 133950 }, { "epoch": 0.13492089871389276, "grad_norm": 8.915601218879473, "learning_rate": 4.911860763004179e-05, "loss": 2.0285, "mean_token_accuracy": 0.4724137902259827, "step": 133955 }, { "epoch": 0.13492593476699694, "grad_norm": 10.430567495711408, "learning_rate": 4.911850377559777e-05, "loss": 2.0593, "mean_token_accuracy": 0.47241378426551817, "step": 133960 }, { "epoch": 0.1349309708201011, "grad_norm": 19.03416432676315, "learning_rate": 4.911839991515778e-05, "loss": 2.569, "mean_token_accuracy": 0.39310345649719236, "step": 133965 }, { "epoch": 0.13493600687320528, "grad_norm": 9.22705765416522, "learning_rate": 4.911829604872183e-05, "loss": 2.2654, "mean_token_accuracy": 0.4310344815254211, "step": 133970 }, { "epoch": 0.13494104292630946, "grad_norm": 12.047863230369853, "learning_rate": 4.911819217628998e-05, "loss": 2.3742, "mean_token_accuracy": 0.4, "step": 133975 }, { "epoch": 0.13494607897941363, "grad_norm": 10.8520032882447, "learning_rate": 4.911808829786224e-05, "loss": 2.1134, "mean_token_accuracy": 0.4896551787853241, "step": 133980 }, { "epoch": 0.1349511150325178, "grad_norm": 13.608625923537215, "learning_rate": 4.911798441343865e-05, "loss": 2.324, "mean_token_accuracy": 0.4496672749519348, "step": 133985 }, { "epoch": 0.13495615108562198, "grad_norm": 10.309724646163778, "learning_rate": 4.911788052301922e-05, "loss": 2.2157, "mean_token_accuracy": 0.44827585816383364, "step": 133990 }, { "epoch": 0.13496118713872615, "grad_norm": 12.901230017291423, "learning_rate": 4.911777662660399e-05, "loss": 2.3951, "mean_token_accuracy": 0.4310344815254211, "step": 133995 }, { "epoch": 0.13496622319183033, "grad_norm": 10.571001115092859, "learning_rate": 4.911767272419299e-05, "loss": 2.5293, "mean_token_accuracy": 0.4068965554237366, "step": 134000 }, { "epoch": 0.1349712592449345, "grad_norm": 12.118835239395823, "learning_rate": 4.911756881578626e-05, "loss": 2.5977, "mean_token_accuracy": 0.3862068891525269, "step": 134005 }, { "epoch": 0.13497629529803865, "grad_norm": 10.838426177390122, "learning_rate": 4.9117464901383806e-05, "loss": 2.218, "mean_token_accuracy": 0.5241379201412201, "step": 134010 }, { "epoch": 0.13498133135114282, "grad_norm": 11.946303971191362, "learning_rate": 4.911736098098567e-05, "loss": 2.3248, "mean_token_accuracy": 0.4689655125141144, "step": 134015 }, { "epoch": 0.134986367404247, "grad_norm": 10.835532176075716, "learning_rate": 4.911725705459188e-05, "loss": 2.3557, "mean_token_accuracy": 0.4689655065536499, "step": 134020 }, { "epoch": 0.13499140345735117, "grad_norm": 12.084261370210807, "learning_rate": 4.9117153122202456e-05, "loss": 2.6956, "mean_token_accuracy": 0.4379310250282288, "step": 134025 }, { "epoch": 0.13499643951045534, "grad_norm": 10.109992058669187, "learning_rate": 4.911704918381744e-05, "loss": 2.3221, "mean_token_accuracy": 0.47241379618644713, "step": 134030 }, { "epoch": 0.13500147556355951, "grad_norm": 11.427131594917999, "learning_rate": 4.911694523943685e-05, "loss": 2.4006, "mean_token_accuracy": 0.4068965494632721, "step": 134035 }, { "epoch": 0.1350065116166637, "grad_norm": 11.1950678910402, "learning_rate": 4.911684128906072e-05, "loss": 2.3739, "mean_token_accuracy": 0.42068964838981626, "step": 134040 }, { "epoch": 0.13501154766976786, "grad_norm": 9.522524461318765, "learning_rate": 4.911673733268908e-05, "loss": 2.0028, "mean_token_accuracy": 0.540411365032196, "step": 134045 }, { "epoch": 0.13501658372287204, "grad_norm": 8.764797279627862, "learning_rate": 4.911663337032197e-05, "loss": 2.2466, "mean_token_accuracy": 0.42413793206214906, "step": 134050 }, { "epoch": 0.1350216197759762, "grad_norm": 9.267684411076148, "learning_rate": 4.9116529401959386e-05, "loss": 2.0835, "mean_token_accuracy": 0.5034482657909394, "step": 134055 }, { "epoch": 0.13502665582908038, "grad_norm": 10.663144303020138, "learning_rate": 4.911642542760138e-05, "loss": 2.3304, "mean_token_accuracy": 0.43448275327682495, "step": 134060 }, { "epoch": 0.13503169188218456, "grad_norm": 11.30183683156277, "learning_rate": 4.9116321447247986e-05, "loss": 2.5074, "mean_token_accuracy": 0.42413793206214906, "step": 134065 }, { "epoch": 0.13503672793528873, "grad_norm": 14.325452294640524, "learning_rate": 4.9116217460899225e-05, "loss": 2.0719, "mean_token_accuracy": 0.47241380214691164, "step": 134070 }, { "epoch": 0.1350417639883929, "grad_norm": 12.3677009720883, "learning_rate": 4.9116113468555124e-05, "loss": 2.0885, "mean_token_accuracy": 0.48275862336158754, "step": 134075 }, { "epoch": 0.13504680004149708, "grad_norm": 10.453095009381897, "learning_rate": 4.9116009470215704e-05, "loss": 2.0955, "mean_token_accuracy": 0.4517241299152374, "step": 134080 }, { "epoch": 0.13505183609460125, "grad_norm": 10.177434958845236, "learning_rate": 4.911590546588101e-05, "loss": 2.1657, "mean_token_accuracy": 0.5034482717514038, "step": 134085 }, { "epoch": 0.13505687214770543, "grad_norm": 14.466640415489486, "learning_rate": 4.9115801455551074e-05, "loss": 2.449, "mean_token_accuracy": 0.441379314661026, "step": 134090 }, { "epoch": 0.1350619082008096, "grad_norm": 8.71599529256436, "learning_rate": 4.91156974392259e-05, "loss": 2.5054, "mean_token_accuracy": 0.4103448212146759, "step": 134095 }, { "epoch": 0.13506694425391377, "grad_norm": 9.964051426015743, "learning_rate": 4.911559341690554e-05, "loss": 2.4314, "mean_token_accuracy": 0.38965516686439516, "step": 134100 }, { "epoch": 0.13507198030701795, "grad_norm": 8.982877355933086, "learning_rate": 4.9115489388590016e-05, "loss": 2.3002, "mean_token_accuracy": 0.4551724076271057, "step": 134105 }, { "epoch": 0.13507701636012212, "grad_norm": 10.68853928492849, "learning_rate": 4.911538535427935e-05, "loss": 2.2288, "mean_token_accuracy": 0.4137930989265442, "step": 134110 }, { "epoch": 0.1350820524132263, "grad_norm": 12.064505048445348, "learning_rate": 4.9115281313973574e-05, "loss": 2.313, "mean_token_accuracy": 0.4758620738983154, "step": 134115 }, { "epoch": 0.13508708846633047, "grad_norm": 10.3681680363652, "learning_rate": 4.9115177267672715e-05, "loss": 2.04, "mean_token_accuracy": 0.48620688915252686, "step": 134120 }, { "epoch": 0.13509212451943464, "grad_norm": 15.009625634737692, "learning_rate": 4.911507321537682e-05, "loss": 2.3651, "mean_token_accuracy": 0.4103448331356049, "step": 134125 }, { "epoch": 0.13509716057253882, "grad_norm": 11.979159361815714, "learning_rate": 4.91149691570859e-05, "loss": 2.1158, "mean_token_accuracy": 0.4517241418361664, "step": 134130 }, { "epoch": 0.135102196625643, "grad_norm": 10.048725524509766, "learning_rate": 4.9114865092799985e-05, "loss": 2.3877, "mean_token_accuracy": 0.42413793206214906, "step": 134135 }, { "epoch": 0.13510723267874716, "grad_norm": 9.584548277364538, "learning_rate": 4.911476102251911e-05, "loss": 2.2622, "mean_token_accuracy": 0.43448275327682495, "step": 134140 }, { "epoch": 0.13511226873185134, "grad_norm": 10.146499888779887, "learning_rate": 4.9114656946243286e-05, "loss": 2.1218, "mean_token_accuracy": 0.4430732071399689, "step": 134145 }, { "epoch": 0.13511730478495548, "grad_norm": 8.822927197041379, "learning_rate": 4.911455286397257e-05, "loss": 2.6094, "mean_token_accuracy": 0.4068965494632721, "step": 134150 }, { "epoch": 0.13512234083805966, "grad_norm": 8.816105331622458, "learning_rate": 4.9114448775706976e-05, "loss": 2.1758, "mean_token_accuracy": 0.4551724076271057, "step": 134155 }, { "epoch": 0.13512737689116383, "grad_norm": 12.561072336316514, "learning_rate": 4.9114344681446536e-05, "loss": 2.6445, "mean_token_accuracy": 0.4172413766384125, "step": 134160 }, { "epoch": 0.135132412944268, "grad_norm": 11.068561780759016, "learning_rate": 4.911424058119127e-05, "loss": 2.6803, "mean_token_accuracy": 0.41379310488700866, "step": 134165 }, { "epoch": 0.13513744899737218, "grad_norm": 10.560662256423516, "learning_rate": 4.9114136474941216e-05, "loss": 2.4919, "mean_token_accuracy": 0.4551724135875702, "step": 134170 }, { "epoch": 0.13514248505047635, "grad_norm": 13.962793709724243, "learning_rate": 4.9114032362696396e-05, "loss": 2.7213, "mean_token_accuracy": 0.4, "step": 134175 }, { "epoch": 0.13514752110358053, "grad_norm": 9.784722099460133, "learning_rate": 4.911392824445686e-05, "loss": 2.288, "mean_token_accuracy": 0.5103448331356049, "step": 134180 }, { "epoch": 0.1351525571566847, "grad_norm": 11.529774238402076, "learning_rate": 4.91138241202226e-05, "loss": 2.4712, "mean_token_accuracy": 0.43793103098869324, "step": 134185 }, { "epoch": 0.13515759320978887, "grad_norm": 7.804580903429626, "learning_rate": 4.911371998999368e-05, "loss": 2.5191, "mean_token_accuracy": 0.41379310488700866, "step": 134190 }, { "epoch": 0.13516262926289305, "grad_norm": 11.342987471364093, "learning_rate": 4.911361585377011e-05, "loss": 1.9117, "mean_token_accuracy": 0.5310344815254211, "step": 134195 }, { "epoch": 0.13516766531599722, "grad_norm": 12.48097639164439, "learning_rate": 4.911351171155192e-05, "loss": 2.1229, "mean_token_accuracy": 0.4502117455005646, "step": 134200 }, { "epoch": 0.1351727013691014, "grad_norm": 14.526926243458428, "learning_rate": 4.9113407563339145e-05, "loss": 2.2314, "mean_token_accuracy": 0.4310344815254211, "step": 134205 }, { "epoch": 0.13517773742220557, "grad_norm": 10.12012098430208, "learning_rate": 4.9113303409131805e-05, "loss": 2.3848, "mean_token_accuracy": 0.4068965494632721, "step": 134210 }, { "epoch": 0.13518277347530974, "grad_norm": 10.63263857675329, "learning_rate": 4.911319924892994e-05, "loss": 2.6857, "mean_token_accuracy": 0.39655172228813174, "step": 134215 }, { "epoch": 0.13518780952841392, "grad_norm": 11.56514580180841, "learning_rate": 4.911309508273357e-05, "loss": 2.2822, "mean_token_accuracy": 0.41724138259887694, "step": 134220 }, { "epoch": 0.1351928455815181, "grad_norm": 11.57157120705022, "learning_rate": 4.911299091054273e-05, "loss": 2.307, "mean_token_accuracy": 0.48275862336158754, "step": 134225 }, { "epoch": 0.13519788163462226, "grad_norm": 11.579526536382897, "learning_rate": 4.911288673235745e-05, "loss": 2.4915, "mean_token_accuracy": 0.4379310369491577, "step": 134230 }, { "epoch": 0.13520291768772644, "grad_norm": 9.282754933043174, "learning_rate": 4.911278254817775e-05, "loss": 2.2027, "mean_token_accuracy": 0.4517241358757019, "step": 134235 }, { "epoch": 0.1352079537408306, "grad_norm": 11.68654468548344, "learning_rate": 4.911267835800367e-05, "loss": 2.5102, "mean_token_accuracy": 0.4413793087005615, "step": 134240 }, { "epoch": 0.13521298979393478, "grad_norm": 11.696958310195383, "learning_rate": 4.911257416183523e-05, "loss": 2.2067, "mean_token_accuracy": 0.44137930274009707, "step": 134245 }, { "epoch": 0.13521802584703896, "grad_norm": 7.7341016996194885, "learning_rate": 4.9112469959672456e-05, "loss": 2.285, "mean_token_accuracy": 0.5275862157344818, "step": 134250 }, { "epoch": 0.13522306190014313, "grad_norm": 8.26806514487324, "learning_rate": 4.9112365751515385e-05, "loss": 2.2244, "mean_token_accuracy": 0.45603448152542114, "step": 134255 }, { "epoch": 0.1352280979532473, "grad_norm": 9.052513778662904, "learning_rate": 4.9112261537364045e-05, "loss": 2.7097, "mean_token_accuracy": 0.37586206793785093, "step": 134260 }, { "epoch": 0.13523313400635148, "grad_norm": 10.358952667644225, "learning_rate": 4.911215731721846e-05, "loss": 2.6853, "mean_token_accuracy": 0.4103448331356049, "step": 134265 }, { "epoch": 0.13523817005945565, "grad_norm": 10.84652823674153, "learning_rate": 4.911205309107867e-05, "loss": 2.7389, "mean_token_accuracy": 0.38620689511299133, "step": 134270 }, { "epoch": 0.13524320611255983, "grad_norm": 9.661898795137935, "learning_rate": 4.91119488589447e-05, "loss": 2.4458, "mean_token_accuracy": 0.4137930989265442, "step": 134275 }, { "epoch": 0.135248242165664, "grad_norm": 10.578013512358094, "learning_rate": 4.911184462081657e-05, "loss": 1.57, "mean_token_accuracy": 0.5782819092273712, "step": 134280 }, { "epoch": 0.13525327821876817, "grad_norm": 9.214234654981272, "learning_rate": 4.911174037669431e-05, "loss": 2.4946, "mean_token_accuracy": 0.41379310190677643, "step": 134285 }, { "epoch": 0.13525831427187232, "grad_norm": 10.377155604179269, "learning_rate": 4.911163612657796e-05, "loss": 2.7323, "mean_token_accuracy": 0.35517241060733795, "step": 134290 }, { "epoch": 0.1352633503249765, "grad_norm": 11.791831056460001, "learning_rate": 4.9111531870467534e-05, "loss": 2.6479, "mean_token_accuracy": 0.4369026005268097, "step": 134295 }, { "epoch": 0.13526838637808067, "grad_norm": 8.321679894250863, "learning_rate": 4.911142760836308e-05, "loss": 2.3718, "mean_token_accuracy": 0.4482758641242981, "step": 134300 }, { "epoch": 0.13527342243118484, "grad_norm": 12.363095154415507, "learning_rate": 4.911132334026461e-05, "loss": 2.3877, "mean_token_accuracy": 0.4206896543502808, "step": 134305 }, { "epoch": 0.13527845848428902, "grad_norm": 10.960700157617692, "learning_rate": 4.9111219066172156e-05, "loss": 2.7451, "mean_token_accuracy": 0.3793103456497192, "step": 134310 }, { "epoch": 0.1352834945373932, "grad_norm": 9.551276207656139, "learning_rate": 4.911111478608575e-05, "loss": 2.1222, "mean_token_accuracy": 0.4448275864124298, "step": 134315 }, { "epoch": 0.13528853059049736, "grad_norm": 8.959557224852793, "learning_rate": 4.9111010500005424e-05, "loss": 2.1304, "mean_token_accuracy": 0.4758620738983154, "step": 134320 }, { "epoch": 0.13529356664360154, "grad_norm": 10.394042482921195, "learning_rate": 4.91109062079312e-05, "loss": 2.3457, "mean_token_accuracy": 0.45862067937850953, "step": 134325 }, { "epoch": 0.1352986026967057, "grad_norm": 10.347218849399173, "learning_rate": 4.9110801909863114e-05, "loss": 2.4632, "mean_token_accuracy": 0.37241379022598264, "step": 134330 }, { "epoch": 0.13530363874980988, "grad_norm": 9.816100206324814, "learning_rate": 4.911069760580119e-05, "loss": 2.5011, "mean_token_accuracy": 0.4322444021701813, "step": 134335 }, { "epoch": 0.13530867480291406, "grad_norm": 11.386703863016669, "learning_rate": 4.911059329574546e-05, "loss": 2.3656, "mean_token_accuracy": 0.4172413766384125, "step": 134340 }, { "epoch": 0.13531371085601823, "grad_norm": 9.30169926119461, "learning_rate": 4.911048897969595e-05, "loss": 2.3522, "mean_token_accuracy": 0.4172413766384125, "step": 134345 }, { "epoch": 0.1353187469091224, "grad_norm": 9.707239272392467, "learning_rate": 4.911038465765269e-05, "loss": 2.3368, "mean_token_accuracy": 0.47931034564971925, "step": 134350 }, { "epoch": 0.13532378296222658, "grad_norm": 9.892065496286596, "learning_rate": 4.911028032961571e-05, "loss": 2.347, "mean_token_accuracy": 0.4413793087005615, "step": 134355 }, { "epoch": 0.13532881901533075, "grad_norm": 14.069196258933996, "learning_rate": 4.9110175995585045e-05, "loss": 2.7075, "mean_token_accuracy": 0.42413793206214906, "step": 134360 }, { "epoch": 0.13533385506843493, "grad_norm": 11.455966748293225, "learning_rate": 4.91100716555607e-05, "loss": 2.2488, "mean_token_accuracy": 0.48511797189712524, "step": 134365 }, { "epoch": 0.1353388911215391, "grad_norm": 9.070914228591613, "learning_rate": 4.9109967309542734e-05, "loss": 2.7783, "mean_token_accuracy": 0.3965517163276672, "step": 134370 }, { "epoch": 0.13534392717464327, "grad_norm": 8.638662378730041, "learning_rate": 4.9109862957531165e-05, "loss": 2.1407, "mean_token_accuracy": 0.49999998807907103, "step": 134375 }, { "epoch": 0.13534896322774745, "grad_norm": 11.116117841010515, "learning_rate": 4.910975859952601e-05, "loss": 2.6601, "mean_token_accuracy": 0.3793103456497192, "step": 134380 }, { "epoch": 0.13535399928085162, "grad_norm": 12.160637588656488, "learning_rate": 4.910965423552731e-05, "loss": 2.4647, "mean_token_accuracy": 0.4068965554237366, "step": 134385 }, { "epoch": 0.1353590353339558, "grad_norm": 10.68550088546402, "learning_rate": 4.9109549865535097e-05, "loss": 2.6795, "mean_token_accuracy": 0.3965517282485962, "step": 134390 }, { "epoch": 0.13536407138705997, "grad_norm": 9.187802779646592, "learning_rate": 4.910944548954939e-05, "loss": 2.1286, "mean_token_accuracy": 0.5034482717514038, "step": 134395 }, { "epoch": 0.13536910744016414, "grad_norm": 9.571506840992566, "learning_rate": 4.910934110757023e-05, "loss": 2.3671, "mean_token_accuracy": 0.39310344457626345, "step": 134400 }, { "epoch": 0.13537414349326832, "grad_norm": 11.61446372423744, "learning_rate": 4.910923671959763e-05, "loss": 2.4063, "mean_token_accuracy": 0.4, "step": 134405 }, { "epoch": 0.1353791795463725, "grad_norm": 9.787973356697972, "learning_rate": 4.910913232563163e-05, "loss": 2.339, "mean_token_accuracy": 0.45765275359153745, "step": 134410 }, { "epoch": 0.13538421559947666, "grad_norm": 9.809484587417113, "learning_rate": 4.9109027925672266e-05, "loss": 1.9537, "mean_token_accuracy": 0.5571082711219788, "step": 134415 }, { "epoch": 0.13538925165258084, "grad_norm": 9.226670387197863, "learning_rate": 4.910892351971954e-05, "loss": 2.294, "mean_token_accuracy": 0.44482759237289426, "step": 134420 }, { "epoch": 0.135394287705685, "grad_norm": 10.579580986271335, "learning_rate": 4.910881910777351e-05, "loss": 2.6831, "mean_token_accuracy": 0.3896551728248596, "step": 134425 }, { "epoch": 0.13539932375878916, "grad_norm": 10.590151771080215, "learning_rate": 4.9108714689834194e-05, "loss": 2.1947, "mean_token_accuracy": 0.44827585816383364, "step": 134430 }, { "epoch": 0.13540435981189333, "grad_norm": 7.895620978827685, "learning_rate": 4.910861026590162e-05, "loss": 2.0078, "mean_token_accuracy": 0.4896551787853241, "step": 134435 }, { "epoch": 0.1354093958649975, "grad_norm": 10.37713905080901, "learning_rate": 4.910850583597581e-05, "loss": 2.9234, "mean_token_accuracy": 0.35172412991523744, "step": 134440 }, { "epoch": 0.13541443191810168, "grad_norm": 11.385826271765035, "learning_rate": 4.910840140005681e-05, "loss": 2.3887, "mean_token_accuracy": 0.47586206793785096, "step": 134445 }, { "epoch": 0.13541946797120585, "grad_norm": 14.109973597538097, "learning_rate": 4.910829695814463e-05, "loss": 2.613, "mean_token_accuracy": 0.35862069129943847, "step": 134450 }, { "epoch": 0.13542450402431003, "grad_norm": 6.602300073144044, "learning_rate": 4.9108192510239315e-05, "loss": 2.2262, "mean_token_accuracy": 0.4629310369491577, "step": 134455 }, { "epoch": 0.1354295400774142, "grad_norm": 12.075685160737626, "learning_rate": 4.910808805634088e-05, "loss": 2.2568, "mean_token_accuracy": 0.4517241358757019, "step": 134460 }, { "epoch": 0.13543457613051837, "grad_norm": 13.61166427562624, "learning_rate": 4.910798359644936e-05, "loss": 2.3764, "mean_token_accuracy": 0.4275862157344818, "step": 134465 }, { "epoch": 0.13543961218362255, "grad_norm": 10.295834218076493, "learning_rate": 4.91078791305648e-05, "loss": 2.2475, "mean_token_accuracy": 0.4724137902259827, "step": 134470 }, { "epoch": 0.13544464823672672, "grad_norm": 9.673106805131535, "learning_rate": 4.91077746586872e-05, "loss": 2.4547, "mean_token_accuracy": 0.39310344457626345, "step": 134475 }, { "epoch": 0.1354496842898309, "grad_norm": 14.141919511802351, "learning_rate": 4.910767018081661e-05, "loss": 2.8227, "mean_token_accuracy": 0.3862068891525269, "step": 134480 }, { "epoch": 0.13545472034293507, "grad_norm": 10.452408266692883, "learning_rate": 4.910756569695305e-05, "loss": 2.3405, "mean_token_accuracy": 0.4678765952587128, "step": 134485 }, { "epoch": 0.13545975639603924, "grad_norm": 10.644654044171334, "learning_rate": 4.910746120709656e-05, "loss": 2.3277, "mean_token_accuracy": 0.5192377507686615, "step": 134490 }, { "epoch": 0.13546479244914342, "grad_norm": 9.15269330776536, "learning_rate": 4.910735671124715e-05, "loss": 2.2452, "mean_token_accuracy": 0.4620689690113068, "step": 134495 }, { "epoch": 0.1354698285022476, "grad_norm": 9.76447611091786, "learning_rate": 4.910725220940486e-05, "loss": 2.3601, "mean_token_accuracy": 0.41724138259887694, "step": 134500 }, { "epoch": 0.13547486455535176, "grad_norm": 20.65541071571421, "learning_rate": 4.9107147701569714e-05, "loss": 2.5985, "mean_token_accuracy": 0.4206896543502808, "step": 134505 }, { "epoch": 0.13547990060845594, "grad_norm": 10.858618480034215, "learning_rate": 4.910704318774175e-05, "loss": 2.7066, "mean_token_accuracy": 0.39655172228813174, "step": 134510 }, { "epoch": 0.1354849366615601, "grad_norm": 11.148422083990829, "learning_rate": 4.910693866792099e-05, "loss": 2.4485, "mean_token_accuracy": 0.4172413766384125, "step": 134515 }, { "epoch": 0.1354899727146643, "grad_norm": 8.316215149826675, "learning_rate": 4.9106834142107465e-05, "loss": 2.2563, "mean_token_accuracy": 0.46206897497177124, "step": 134520 }, { "epoch": 0.13549500876776846, "grad_norm": 12.269140773848898, "learning_rate": 4.9106729610301206e-05, "loss": 2.237, "mean_token_accuracy": 0.4620689690113068, "step": 134525 }, { "epoch": 0.13550004482087263, "grad_norm": 13.925802520228363, "learning_rate": 4.9106625072502236e-05, "loss": 2.7348, "mean_token_accuracy": 0.4159104585647583, "step": 134530 }, { "epoch": 0.1355050808739768, "grad_norm": 13.488833469103525, "learning_rate": 4.9106520528710596e-05, "loss": 2.4526, "mean_token_accuracy": 0.39655172228813174, "step": 134535 }, { "epoch": 0.13551011692708098, "grad_norm": 9.790118483949719, "learning_rate": 4.9106415978926307e-05, "loss": 2.2722, "mean_token_accuracy": 0.48275862336158754, "step": 134540 }, { "epoch": 0.13551515298018516, "grad_norm": 9.33258444305644, "learning_rate": 4.910631142314939e-05, "loss": 2.0748, "mean_token_accuracy": 0.44827587008476255, "step": 134545 }, { "epoch": 0.13552018903328933, "grad_norm": 10.74609256888069, "learning_rate": 4.910620686137989e-05, "loss": 2.2249, "mean_token_accuracy": 0.46551724672317507, "step": 134550 }, { "epoch": 0.1355252250863935, "grad_norm": 9.812625818012902, "learning_rate": 4.910610229361782e-05, "loss": 2.2271, "mean_token_accuracy": 0.5034482717514038, "step": 134555 }, { "epoch": 0.13553026113949768, "grad_norm": 9.15525460646177, "learning_rate": 4.9105997719863214e-05, "loss": 2.0133, "mean_token_accuracy": 0.458620685338974, "step": 134560 }, { "epoch": 0.13553529719260185, "grad_norm": 9.560462453324432, "learning_rate": 4.910589314011611e-05, "loss": 2.2022, "mean_token_accuracy": 0.44137930274009707, "step": 134565 }, { "epoch": 0.135540333245706, "grad_norm": 11.123479346124654, "learning_rate": 4.910578855437653e-05, "loss": 2.2477, "mean_token_accuracy": 0.44482759237289426, "step": 134570 }, { "epoch": 0.13554536929881017, "grad_norm": 17.165124226894974, "learning_rate": 4.910568396264451e-05, "loss": 2.4816, "mean_token_accuracy": 0.41034482717514037, "step": 134575 }, { "epoch": 0.13555040535191434, "grad_norm": 10.294722633833748, "learning_rate": 4.9105579364920065e-05, "loss": 2.3182, "mean_token_accuracy": 0.4551724135875702, "step": 134580 }, { "epoch": 0.13555544140501852, "grad_norm": 12.547537538882024, "learning_rate": 4.9105474761203236e-05, "loss": 2.8974, "mean_token_accuracy": 0.41034482717514037, "step": 134585 }, { "epoch": 0.1355604774581227, "grad_norm": 9.824400181034228, "learning_rate": 4.910537015149405e-05, "loss": 2.3554, "mean_token_accuracy": 0.42413792610168455, "step": 134590 }, { "epoch": 0.13556551351122686, "grad_norm": 12.099075409569291, "learning_rate": 4.910526553579253e-05, "loss": 2.1965, "mean_token_accuracy": 0.45172412395477296, "step": 134595 }, { "epoch": 0.13557054956433104, "grad_norm": 17.399589723914314, "learning_rate": 4.91051609140987e-05, "loss": 2.7991, "mean_token_accuracy": 0.4467634618282318, "step": 134600 }, { "epoch": 0.1355755856174352, "grad_norm": 11.818633242135931, "learning_rate": 4.9105056286412615e-05, "loss": 2.4059, "mean_token_accuracy": 0.4310344815254211, "step": 134605 }, { "epoch": 0.1355806216705394, "grad_norm": 10.398285796101677, "learning_rate": 4.910495165273427e-05, "loss": 2.3367, "mean_token_accuracy": 0.4361766457557678, "step": 134610 }, { "epoch": 0.13558565772364356, "grad_norm": 10.334115957871711, "learning_rate": 4.910484701306373e-05, "loss": 2.3425, "mean_token_accuracy": 0.45172412395477296, "step": 134615 }, { "epoch": 0.13559069377674773, "grad_norm": 10.292044808807947, "learning_rate": 4.9104742367400994e-05, "loss": 2.6356, "mean_token_accuracy": 0.3965517282485962, "step": 134620 }, { "epoch": 0.1355957298298519, "grad_norm": 12.10726224563903, "learning_rate": 4.9104637715746105e-05, "loss": 2.0444, "mean_token_accuracy": 0.5, "step": 134625 }, { "epoch": 0.13560076588295608, "grad_norm": 11.887122461123523, "learning_rate": 4.910453305809908e-05, "loss": 2.3411, "mean_token_accuracy": 0.4344827592372894, "step": 134630 }, { "epoch": 0.13560580193606026, "grad_norm": 12.776616194512602, "learning_rate": 4.910442839445997e-05, "loss": 2.4799, "mean_token_accuracy": 0.4482758641242981, "step": 134635 }, { "epoch": 0.13561083798916443, "grad_norm": 9.073238643883903, "learning_rate": 4.9104323724828786e-05, "loss": 2.0792, "mean_token_accuracy": 0.5034482836723327, "step": 134640 }, { "epoch": 0.1356158740422686, "grad_norm": 12.28018019401208, "learning_rate": 4.910421904920556e-05, "loss": 2.4638, "mean_token_accuracy": 0.41379310488700866, "step": 134645 }, { "epoch": 0.13562091009537278, "grad_norm": 12.020921971274877, "learning_rate": 4.9104114367590325e-05, "loss": 2.5349, "mean_token_accuracy": 0.3909255862236023, "step": 134650 }, { "epoch": 0.13562594614847695, "grad_norm": 9.879299725274887, "learning_rate": 4.9104009679983116e-05, "loss": 2.3308, "mean_token_accuracy": 0.44827585816383364, "step": 134655 }, { "epoch": 0.13563098220158112, "grad_norm": 12.79554918928031, "learning_rate": 4.910390498638394e-05, "loss": 2.4381, "mean_token_accuracy": 0.42413793206214906, "step": 134660 }, { "epoch": 0.1356360182546853, "grad_norm": 10.033473108875548, "learning_rate": 4.910380028679285e-05, "loss": 2.384, "mean_token_accuracy": 0.4034482717514038, "step": 134665 }, { "epoch": 0.13564105430778947, "grad_norm": 11.407401730491923, "learning_rate": 4.9103695581209866e-05, "loss": 2.4792, "mean_token_accuracy": 0.41034482717514037, "step": 134670 }, { "epoch": 0.13564609036089365, "grad_norm": 10.269367932254575, "learning_rate": 4.910359086963501e-05, "loss": 2.5202, "mean_token_accuracy": 0.42413792610168455, "step": 134675 }, { "epoch": 0.13565112641399782, "grad_norm": 9.017545695758873, "learning_rate": 4.910348615206832e-05, "loss": 2.4904, "mean_token_accuracy": 0.458620685338974, "step": 134680 }, { "epoch": 0.135656162467102, "grad_norm": 11.758824284667863, "learning_rate": 4.9103381428509824e-05, "loss": 2.4863, "mean_token_accuracy": 0.36896551847457887, "step": 134685 }, { "epoch": 0.13566119852020617, "grad_norm": 10.324570422707733, "learning_rate": 4.910327669895955e-05, "loss": 2.2392, "mean_token_accuracy": 0.4607380509376526, "step": 134690 }, { "epoch": 0.13566623457331034, "grad_norm": 12.703443822856327, "learning_rate": 4.9103171963417526e-05, "loss": 2.3297, "mean_token_accuracy": 0.3931034505367279, "step": 134695 }, { "epoch": 0.13567127062641451, "grad_norm": 8.540405010803717, "learning_rate": 4.910306722188378e-05, "loss": 2.3059, "mean_token_accuracy": 0.4793103516101837, "step": 134700 }, { "epoch": 0.1356763066795187, "grad_norm": 7.75841584452314, "learning_rate": 4.910296247435835e-05, "loss": 2.4136, "mean_token_accuracy": 0.4068965554237366, "step": 134705 }, { "epoch": 0.13568134273262283, "grad_norm": 7.011809069499588, "learning_rate": 4.910285772084124e-05, "loss": 2.175, "mean_token_accuracy": 0.405686628818512, "step": 134710 }, { "epoch": 0.135686378785727, "grad_norm": 11.657648535725755, "learning_rate": 4.9102752961332524e-05, "loss": 2.2094, "mean_token_accuracy": 0.4551724135875702, "step": 134715 }, { "epoch": 0.13569141483883118, "grad_norm": 11.19370054381177, "learning_rate": 4.9102648195832186e-05, "loss": 2.0656, "mean_token_accuracy": 0.4620689690113068, "step": 134720 }, { "epoch": 0.13569645089193536, "grad_norm": 10.396463003001184, "learning_rate": 4.910254342434027e-05, "loss": 2.2045, "mean_token_accuracy": 0.4344827651977539, "step": 134725 }, { "epoch": 0.13570148694503953, "grad_norm": 11.166109500554299, "learning_rate": 4.910243864685681e-05, "loss": 2.2489, "mean_token_accuracy": 0.4517241358757019, "step": 134730 }, { "epoch": 0.1357065229981437, "grad_norm": 10.078022418014237, "learning_rate": 4.910233386338184e-05, "loss": 2.9287, "mean_token_accuracy": 0.35862069129943847, "step": 134735 }, { "epoch": 0.13571155905124788, "grad_norm": 11.472873720029453, "learning_rate": 4.910222907391538e-05, "loss": 2.4801, "mean_token_accuracy": 0.39655172228813174, "step": 134740 }, { "epoch": 0.13571659510435205, "grad_norm": 9.687737765961836, "learning_rate": 4.9102124278457464e-05, "loss": 2.2483, "mean_token_accuracy": 0.44137930274009707, "step": 134745 }, { "epoch": 0.13572163115745622, "grad_norm": 11.2223644952304, "learning_rate": 4.910201947700811e-05, "loss": 2.4151, "mean_token_accuracy": 0.42758620977401735, "step": 134750 }, { "epoch": 0.1357266672105604, "grad_norm": 10.330103789203898, "learning_rate": 4.9101914669567364e-05, "loss": 2.2052, "mean_token_accuracy": 0.4965517222881317, "step": 134755 }, { "epoch": 0.13573170326366457, "grad_norm": 11.792452838435404, "learning_rate": 4.9101809856135234e-05, "loss": 2.6672, "mean_token_accuracy": 0.4401088893413544, "step": 134760 }, { "epoch": 0.13573673931676875, "grad_norm": 10.416671559807702, "learning_rate": 4.910170503671177e-05, "loss": 2.6069, "mean_token_accuracy": 0.41724138259887694, "step": 134765 }, { "epoch": 0.13574177536987292, "grad_norm": 8.319861882176228, "learning_rate": 4.9101600211296994e-05, "loss": 2.2543, "mean_token_accuracy": 0.4275862157344818, "step": 134770 }, { "epoch": 0.1357468114229771, "grad_norm": 8.967706717234528, "learning_rate": 4.910149537989093e-05, "loss": 2.3003, "mean_token_accuracy": 0.4517241418361664, "step": 134775 }, { "epoch": 0.13575184747608127, "grad_norm": 10.727135679483629, "learning_rate": 4.910139054249361e-05, "loss": 2.5893, "mean_token_accuracy": 0.43793103098869324, "step": 134780 }, { "epoch": 0.13575688352918544, "grad_norm": 8.71511367396217, "learning_rate": 4.910128569910507e-05, "loss": 1.863, "mean_token_accuracy": 0.5172413766384125, "step": 134785 }, { "epoch": 0.13576191958228961, "grad_norm": 10.20220617606134, "learning_rate": 4.910118084972533e-05, "loss": 2.325, "mean_token_accuracy": 0.441379314661026, "step": 134790 }, { "epoch": 0.1357669556353938, "grad_norm": 14.654835845579635, "learning_rate": 4.9101075994354415e-05, "loss": 2.0335, "mean_token_accuracy": 0.4620689570903778, "step": 134795 }, { "epoch": 0.13577199168849796, "grad_norm": 9.107487045661019, "learning_rate": 4.910097113299237e-05, "loss": 2.5072, "mean_token_accuracy": 0.48620688915252686, "step": 134800 }, { "epoch": 0.13577702774160214, "grad_norm": 10.256392372117176, "learning_rate": 4.910086626563921e-05, "loss": 2.4318, "mean_token_accuracy": 0.42758620381355283, "step": 134805 }, { "epoch": 0.1357820637947063, "grad_norm": 12.465447984058482, "learning_rate": 4.910076139229498e-05, "loss": 2.4414, "mean_token_accuracy": 0.417241370677948, "step": 134810 }, { "epoch": 0.13578709984781048, "grad_norm": 20.737231958196922, "learning_rate": 4.910065651295967e-05, "loss": 2.5157, "mean_token_accuracy": 0.46551724672317507, "step": 134815 }, { "epoch": 0.13579213590091466, "grad_norm": 11.527708214182006, "learning_rate": 4.9100551627633363e-05, "loss": 2.4518, "mean_token_accuracy": 0.3931034505367279, "step": 134820 }, { "epoch": 0.13579717195401883, "grad_norm": 8.989011306114987, "learning_rate": 4.9100446736316065e-05, "loss": 2.0133, "mean_token_accuracy": 0.4948578476905823, "step": 134825 }, { "epoch": 0.135802208007123, "grad_norm": 8.810941986319946, "learning_rate": 4.9100341839007785e-05, "loss": 2.2906, "mean_token_accuracy": 0.4310344815254211, "step": 134830 }, { "epoch": 0.13580724406022718, "grad_norm": 11.451202896023153, "learning_rate": 4.9100236935708576e-05, "loss": 2.9063, "mean_token_accuracy": 0.42589232325553894, "step": 134835 }, { "epoch": 0.13581228011333135, "grad_norm": 11.448326912683768, "learning_rate": 4.910013202641847e-05, "loss": 2.4252, "mean_token_accuracy": 0.4724137902259827, "step": 134840 }, { "epoch": 0.13581731616643553, "grad_norm": 9.809057552147612, "learning_rate": 4.910002711113747e-05, "loss": 2.5392, "mean_token_accuracy": 0.3999999940395355, "step": 134845 }, { "epoch": 0.13582235221953967, "grad_norm": 10.625769521623344, "learning_rate": 4.909992218986564e-05, "loss": 2.2282, "mean_token_accuracy": 0.441379314661026, "step": 134850 }, { "epoch": 0.13582738827264385, "grad_norm": 9.318727028269022, "learning_rate": 4.9099817262602975e-05, "loss": 2.3223, "mean_token_accuracy": 0.4655172348022461, "step": 134855 }, { "epoch": 0.13583242432574802, "grad_norm": 9.37571888139062, "learning_rate": 4.909971232934953e-05, "loss": 1.9659, "mean_token_accuracy": 0.48620688915252686, "step": 134860 }, { "epoch": 0.1358374603788522, "grad_norm": 10.43704680094581, "learning_rate": 4.9099607390105326e-05, "loss": 2.3609, "mean_token_accuracy": 0.42068964838981626, "step": 134865 }, { "epoch": 0.13584249643195637, "grad_norm": 10.526898917299087, "learning_rate": 4.909950244487038e-05, "loss": 3.1729, "mean_token_accuracy": 0.3655172407627106, "step": 134870 }, { "epoch": 0.13584753248506054, "grad_norm": 9.628162157020952, "learning_rate": 4.909939749364474e-05, "loss": 2.034, "mean_token_accuracy": 0.5088929176330567, "step": 134875 }, { "epoch": 0.13585256853816471, "grad_norm": 10.378508311716494, "learning_rate": 4.909929253642843e-05, "loss": 2.8286, "mean_token_accuracy": 0.4, "step": 134880 }, { "epoch": 0.1358576045912689, "grad_norm": 10.18549868262699, "learning_rate": 4.909918757322146e-05, "loss": 2.0197, "mean_token_accuracy": 0.47586206793785096, "step": 134885 }, { "epoch": 0.13586264064437306, "grad_norm": 9.705192629733032, "learning_rate": 4.909908260402389e-05, "loss": 2.1531, "mean_token_accuracy": 0.4620689630508423, "step": 134890 }, { "epoch": 0.13586767669747724, "grad_norm": 10.650840882695144, "learning_rate": 4.9098977628835724e-05, "loss": 2.4431, "mean_token_accuracy": 0.46418632864952086, "step": 134895 }, { "epoch": 0.1358727127505814, "grad_norm": 12.034846807645062, "learning_rate": 4.9098872647657e-05, "loss": 3.1024, "mean_token_accuracy": 0.3206896513700485, "step": 134900 }, { "epoch": 0.13587774880368558, "grad_norm": 13.285080532283546, "learning_rate": 4.909876766048776e-05, "loss": 2.5358, "mean_token_accuracy": 0.4296430677175522, "step": 134905 }, { "epoch": 0.13588278485678976, "grad_norm": 9.350627589899336, "learning_rate": 4.909866266732802e-05, "loss": 2.7844, "mean_token_accuracy": 0.4068965554237366, "step": 134910 }, { "epoch": 0.13588782090989393, "grad_norm": 9.76162932119752, "learning_rate": 4.90985576681778e-05, "loss": 2.4129, "mean_token_accuracy": 0.4379310369491577, "step": 134915 }, { "epoch": 0.1358928569629981, "grad_norm": 10.339488160598401, "learning_rate": 4.909845266303714e-05, "loss": 2.5262, "mean_token_accuracy": 0.4068965494632721, "step": 134920 }, { "epoch": 0.13589789301610228, "grad_norm": 9.575103478744335, "learning_rate": 4.909834765190607e-05, "loss": 1.9268, "mean_token_accuracy": 0.48275862336158754, "step": 134925 }, { "epoch": 0.13590292906920645, "grad_norm": 10.300009171430993, "learning_rate": 4.909824263478462e-05, "loss": 2.6719, "mean_token_accuracy": 0.41379310488700866, "step": 134930 }, { "epoch": 0.13590796512231063, "grad_norm": 10.701821422309203, "learning_rate": 4.9098137611672826e-05, "loss": 2.385, "mean_token_accuracy": 0.4206896543502808, "step": 134935 }, { "epoch": 0.1359130011754148, "grad_norm": 10.26987192395177, "learning_rate": 4.9098032582570705e-05, "loss": 2.2756, "mean_token_accuracy": 0.46896552443504336, "step": 134940 }, { "epoch": 0.13591803722851897, "grad_norm": 10.122385395617751, "learning_rate": 4.909792754747828e-05, "loss": 2.3364, "mean_token_accuracy": 0.43448275327682495, "step": 134945 }, { "epoch": 0.13592307328162315, "grad_norm": 11.025132976003075, "learning_rate": 4.909782250639559e-05, "loss": 2.3811, "mean_token_accuracy": 0.4137930989265442, "step": 134950 }, { "epoch": 0.13592810933472732, "grad_norm": 10.602909905138443, "learning_rate": 4.909771745932266e-05, "loss": 2.1733, "mean_token_accuracy": 0.5083484590053559, "step": 134955 }, { "epoch": 0.1359331453878315, "grad_norm": 10.751343204639257, "learning_rate": 4.9097612406259534e-05, "loss": 2.4136, "mean_token_accuracy": 0.4172413766384125, "step": 134960 }, { "epoch": 0.13593818144093567, "grad_norm": 10.857986457624833, "learning_rate": 4.909750734720622e-05, "loss": 2.4537, "mean_token_accuracy": 0.44137930274009707, "step": 134965 }, { "epoch": 0.13594321749403984, "grad_norm": 12.320225899263486, "learning_rate": 4.909740228216276e-05, "loss": 2.7086, "mean_token_accuracy": 0.3655172407627106, "step": 134970 }, { "epoch": 0.13594825354714402, "grad_norm": 8.57234693096351, "learning_rate": 4.909729721112918e-05, "loss": 2.1371, "mean_token_accuracy": 0.47465215921401976, "step": 134975 }, { "epoch": 0.1359532896002482, "grad_norm": 8.60113866560614, "learning_rate": 4.9097192134105515e-05, "loss": 2.3393, "mean_token_accuracy": 0.42068966031074523, "step": 134980 }, { "epoch": 0.13595832565335236, "grad_norm": 11.203779607274168, "learning_rate": 4.9097087051091786e-05, "loss": 2.2498, "mean_token_accuracy": 0.4996975243091583, "step": 134985 }, { "epoch": 0.1359633617064565, "grad_norm": 10.384319456705752, "learning_rate": 4.909698196208802e-05, "loss": 2.4609, "mean_token_accuracy": 0.4537205040454865, "step": 134990 }, { "epoch": 0.13596839775956068, "grad_norm": 9.850889807999696, "learning_rate": 4.909687686709425e-05, "loss": 2.2459, "mean_token_accuracy": 0.42758620977401735, "step": 134995 }, { "epoch": 0.13597343381266486, "grad_norm": 9.919318033324638, "learning_rate": 4.9096771766110504e-05, "loss": 2.5861, "mean_token_accuracy": 0.44827585816383364, "step": 135000 }, { "epoch": 0.13597846986576903, "grad_norm": 10.834208506090366, "learning_rate": 4.909666665913682e-05, "loss": 2.5082, "mean_token_accuracy": 0.4275861978530884, "step": 135005 }, { "epoch": 0.1359835059188732, "grad_norm": 9.994035391413979, "learning_rate": 4.9096561546173214e-05, "loss": 2.3494, "mean_token_accuracy": 0.3999999940395355, "step": 135010 }, { "epoch": 0.13598854197197738, "grad_norm": 9.037913822541128, "learning_rate": 4.909645642721972e-05, "loss": 2.194, "mean_token_accuracy": 0.4344827651977539, "step": 135015 }, { "epoch": 0.13599357802508155, "grad_norm": 9.476318552434709, "learning_rate": 4.909635130227637e-05, "loss": 2.2532, "mean_token_accuracy": 0.4534180283546448, "step": 135020 }, { "epoch": 0.13599861407818573, "grad_norm": 9.534823880227412, "learning_rate": 4.9096246171343197e-05, "loss": 2.3333, "mean_token_accuracy": 0.42068964838981626, "step": 135025 }, { "epoch": 0.1360036501312899, "grad_norm": 8.61040708087451, "learning_rate": 4.909614103442022e-05, "loss": 2.3254, "mean_token_accuracy": 0.4620689690113068, "step": 135030 }, { "epoch": 0.13600868618439407, "grad_norm": 12.376448307067244, "learning_rate": 4.909603589150747e-05, "loss": 3.1811, "mean_token_accuracy": 0.3275862067937851, "step": 135035 }, { "epoch": 0.13601372223749825, "grad_norm": 11.254993594381054, "learning_rate": 4.909593074260499e-05, "loss": 2.5762, "mean_token_accuracy": 0.4379310369491577, "step": 135040 }, { "epoch": 0.13601875829060242, "grad_norm": 11.655562213876975, "learning_rate": 4.909582558771279e-05, "loss": 2.5759, "mean_token_accuracy": 0.38620689511299133, "step": 135045 }, { "epoch": 0.1360237943437066, "grad_norm": 9.426336308367542, "learning_rate": 4.90957204268309e-05, "loss": 2.3741, "mean_token_accuracy": 0.4068965494632721, "step": 135050 }, { "epoch": 0.13602883039681077, "grad_norm": 9.828631522931131, "learning_rate": 4.909561525995937e-05, "loss": 2.4961, "mean_token_accuracy": 0.3827586233615875, "step": 135055 }, { "epoch": 0.13603386644991494, "grad_norm": 10.165825869598894, "learning_rate": 4.909551008709821e-05, "loss": 2.3954, "mean_token_accuracy": 0.4310344815254211, "step": 135060 }, { "epoch": 0.13603890250301912, "grad_norm": 11.310326970354806, "learning_rate": 4.909540490824745e-05, "loss": 2.3369, "mean_token_accuracy": 0.41379310488700866, "step": 135065 }, { "epoch": 0.1360439385561233, "grad_norm": 11.554730150391544, "learning_rate": 4.909529972340713e-05, "loss": 2.2475, "mean_token_accuracy": 0.4813672065734863, "step": 135070 }, { "epoch": 0.13604897460922746, "grad_norm": 10.888892245926671, "learning_rate": 4.9095194532577265e-05, "loss": 2.7959, "mean_token_accuracy": 0.3655172407627106, "step": 135075 }, { "epoch": 0.13605401066233164, "grad_norm": 14.301049862934988, "learning_rate": 4.90950893357579e-05, "loss": 2.1327, "mean_token_accuracy": 0.46896552443504336, "step": 135080 }, { "epoch": 0.1360590467154358, "grad_norm": 9.637814314957314, "learning_rate": 4.909498413294906e-05, "loss": 2.0183, "mean_token_accuracy": 0.4744101583957672, "step": 135085 }, { "epoch": 0.13606408276853998, "grad_norm": 11.596876911292185, "learning_rate": 4.909487892415076e-05, "loss": 2.2781, "mean_token_accuracy": 0.43793103098869324, "step": 135090 }, { "epoch": 0.13606911882164416, "grad_norm": 10.564066701047258, "learning_rate": 4.9094773709363045e-05, "loss": 2.0091, "mean_token_accuracy": 0.45862067937850953, "step": 135095 }, { "epoch": 0.13607415487474833, "grad_norm": 10.893480891659905, "learning_rate": 4.909466848858594e-05, "loss": 2.7744, "mean_token_accuracy": 0.38620689511299133, "step": 135100 }, { "epoch": 0.1360791909278525, "grad_norm": 11.894073803883426, "learning_rate": 4.909456326181947e-05, "loss": 2.5722, "mean_token_accuracy": 0.42758620381355283, "step": 135105 }, { "epoch": 0.13608422698095668, "grad_norm": 11.103981380465429, "learning_rate": 4.9094458029063666e-05, "loss": 2.2399, "mean_token_accuracy": 0.4379310369491577, "step": 135110 }, { "epoch": 0.13608926303406085, "grad_norm": 11.7665019787653, "learning_rate": 4.9094352790318566e-05, "loss": 2.6011, "mean_token_accuracy": 0.38620689511299133, "step": 135115 }, { "epoch": 0.13609429908716503, "grad_norm": 10.386723034479058, "learning_rate": 4.909424754558419e-05, "loss": 2.4262, "mean_token_accuracy": 0.4103448331356049, "step": 135120 }, { "epoch": 0.1360993351402692, "grad_norm": 9.795442890473675, "learning_rate": 4.909414229486057e-05, "loss": 2.2112, "mean_token_accuracy": 0.49655171632766726, "step": 135125 }, { "epoch": 0.13610437119337335, "grad_norm": 10.543278520003819, "learning_rate": 4.909403703814773e-05, "loss": 2.7072, "mean_token_accuracy": 0.43448275327682495, "step": 135130 }, { "epoch": 0.13610940724647752, "grad_norm": 12.18317096866015, "learning_rate": 4.909393177544569e-05, "loss": 2.4097, "mean_token_accuracy": 0.4123411953449249, "step": 135135 }, { "epoch": 0.1361144432995817, "grad_norm": 11.567600636890907, "learning_rate": 4.9093826506754515e-05, "loss": 2.2595, "mean_token_accuracy": 0.4517241358757019, "step": 135140 }, { "epoch": 0.13611947935268587, "grad_norm": 10.271591629195022, "learning_rate": 4.9093721232074205e-05, "loss": 2.3277, "mean_token_accuracy": 0.46206897497177124, "step": 135145 }, { "epoch": 0.13612451540579004, "grad_norm": 11.858334251233378, "learning_rate": 4.909361595140479e-05, "loss": 2.5765, "mean_token_accuracy": 0.42758620381355283, "step": 135150 }, { "epoch": 0.13612955145889422, "grad_norm": 12.613837023524765, "learning_rate": 4.9093510664746315e-05, "loss": 2.1171, "mean_token_accuracy": 0.4758620738983154, "step": 135155 }, { "epoch": 0.1361345875119984, "grad_norm": 11.531968914418643, "learning_rate": 4.909340537209879e-05, "loss": 3.0951, "mean_token_accuracy": 0.41524500250816343, "step": 135160 }, { "epoch": 0.13613962356510256, "grad_norm": 8.685575388182794, "learning_rate": 4.909330007346226e-05, "loss": 2.1483, "mean_token_accuracy": 0.45662432312965395, "step": 135165 }, { "epoch": 0.13614465961820674, "grad_norm": 14.525316624975638, "learning_rate": 4.909319476883674e-05, "loss": 2.6061, "mean_token_accuracy": 0.37931033968925476, "step": 135170 }, { "epoch": 0.1361496956713109, "grad_norm": 8.766679044006592, "learning_rate": 4.909308945822227e-05, "loss": 2.0469, "mean_token_accuracy": 0.4871921181678772, "step": 135175 }, { "epoch": 0.13615473172441508, "grad_norm": 10.384038159544508, "learning_rate": 4.9092984141618876e-05, "loss": 2.3653, "mean_token_accuracy": 0.47586206793785096, "step": 135180 }, { "epoch": 0.13615976777751926, "grad_norm": 10.983669825097552, "learning_rate": 4.909287881902659e-05, "loss": 2.3395, "mean_token_accuracy": 0.3896551698446274, "step": 135185 }, { "epoch": 0.13616480383062343, "grad_norm": 8.911640785485202, "learning_rate": 4.909277349044543e-05, "loss": 2.0558, "mean_token_accuracy": 0.47931033968925474, "step": 135190 }, { "epoch": 0.1361698398837276, "grad_norm": 10.144545324285271, "learning_rate": 4.9092668155875446e-05, "loss": 2.414, "mean_token_accuracy": 0.46551724672317507, "step": 135195 }, { "epoch": 0.13617487593683178, "grad_norm": 10.71539302123256, "learning_rate": 4.909256281531665e-05, "loss": 2.5, "mean_token_accuracy": 0.4068965494632721, "step": 135200 }, { "epoch": 0.13617991198993595, "grad_norm": 8.199437387665153, "learning_rate": 4.9092457468769074e-05, "loss": 2.3983, "mean_token_accuracy": 0.4206896543502808, "step": 135205 }, { "epoch": 0.13618494804304013, "grad_norm": 7.531103572492438, "learning_rate": 4.909235211623275e-05, "loss": 2.4469, "mean_token_accuracy": 0.43793103098869324, "step": 135210 }, { "epoch": 0.1361899840961443, "grad_norm": 11.20982693060921, "learning_rate": 4.909224675770771e-05, "loss": 2.7481, "mean_token_accuracy": 0.38620689511299133, "step": 135215 }, { "epoch": 0.13619502014924847, "grad_norm": 11.903465358504354, "learning_rate": 4.909214139319398e-05, "loss": 2.2895, "mean_token_accuracy": 0.42758620977401735, "step": 135220 }, { "epoch": 0.13620005620235265, "grad_norm": 8.71958800570729, "learning_rate": 4.909203602269158e-05, "loss": 2.1193, "mean_token_accuracy": 0.4931034505367279, "step": 135225 }, { "epoch": 0.13620509225545682, "grad_norm": 8.298410295578147, "learning_rate": 4.909193064620056e-05, "loss": 2.0886, "mean_token_accuracy": 0.5103448331356049, "step": 135230 }, { "epoch": 0.136210128308561, "grad_norm": 13.469718728551308, "learning_rate": 4.909182526372093e-05, "loss": 2.6954, "mean_token_accuracy": 0.39655172228813174, "step": 135235 }, { "epoch": 0.13621516436166517, "grad_norm": 10.44142830007144, "learning_rate": 4.9091719875252726e-05, "loss": 2.4475, "mean_token_accuracy": 0.4655172348022461, "step": 135240 }, { "epoch": 0.13622020041476934, "grad_norm": 11.184724697943007, "learning_rate": 4.909161448079599e-05, "loss": 2.4728, "mean_token_accuracy": 0.4517241418361664, "step": 135245 }, { "epoch": 0.13622523646787352, "grad_norm": 11.599710357012317, "learning_rate": 4.9091509080350725e-05, "loss": 2.3116, "mean_token_accuracy": 0.44670296311378477, "step": 135250 }, { "epoch": 0.1362302725209777, "grad_norm": 10.982550594858601, "learning_rate": 4.909140367391698e-05, "loss": 2.3269, "mean_token_accuracy": 0.47586206793785096, "step": 135255 }, { "epoch": 0.13623530857408186, "grad_norm": 10.924649200313599, "learning_rate": 4.9091298261494775e-05, "loss": 2.7371, "mean_token_accuracy": 0.4172413766384125, "step": 135260 }, { "epoch": 0.13624034462718604, "grad_norm": 9.707937050177211, "learning_rate": 4.909119284308415e-05, "loss": 2.4901, "mean_token_accuracy": 0.42758620977401735, "step": 135265 }, { "epoch": 0.13624538068029018, "grad_norm": 11.195725706103861, "learning_rate": 4.909108741868512e-05, "loss": 2.586, "mean_token_accuracy": 0.4413793087005615, "step": 135270 }, { "epoch": 0.13625041673339436, "grad_norm": 11.198289722582283, "learning_rate": 4.909098198829773e-05, "loss": 2.3725, "mean_token_accuracy": 0.3965517282485962, "step": 135275 }, { "epoch": 0.13625545278649853, "grad_norm": 9.953590525112904, "learning_rate": 4.9090876551921995e-05, "loss": 2.5993, "mean_token_accuracy": 0.39655172228813174, "step": 135280 }, { "epoch": 0.1362604888396027, "grad_norm": 12.190293475508124, "learning_rate": 4.909077110955795e-05, "loss": 2.3804, "mean_token_accuracy": 0.46896551847457885, "step": 135285 }, { "epoch": 0.13626552489270688, "grad_norm": 10.203274644735158, "learning_rate": 4.909066566120562e-05, "loss": 2.3389, "mean_token_accuracy": 0.4206896424293518, "step": 135290 }, { "epoch": 0.13627056094581105, "grad_norm": 12.484476917394169, "learning_rate": 4.909056020686505e-05, "loss": 2.491, "mean_token_accuracy": 0.37586206793785093, "step": 135295 }, { "epoch": 0.13627559699891523, "grad_norm": 12.140425491162, "learning_rate": 4.909045474653624e-05, "loss": 2.056, "mean_token_accuracy": 0.5137930929660797, "step": 135300 }, { "epoch": 0.1362806330520194, "grad_norm": 13.025325655031022, "learning_rate": 4.9090349280219255e-05, "loss": 2.2648, "mean_token_accuracy": 0.4344827592372894, "step": 135305 }, { "epoch": 0.13628566910512357, "grad_norm": 8.521537603635023, "learning_rate": 4.90902438079141e-05, "loss": 2.1226, "mean_token_accuracy": 0.4758620738983154, "step": 135310 }, { "epoch": 0.13629070515822775, "grad_norm": 8.889512194189532, "learning_rate": 4.909013832962081e-05, "loss": 2.5257, "mean_token_accuracy": 0.42068964838981626, "step": 135315 }, { "epoch": 0.13629574121133192, "grad_norm": 11.299305237901116, "learning_rate": 4.90900328453394e-05, "loss": 2.5716, "mean_token_accuracy": 0.36896551251411436, "step": 135320 }, { "epoch": 0.1363007772644361, "grad_norm": 12.740960495417967, "learning_rate": 4.9089927355069936e-05, "loss": 2.5015, "mean_token_accuracy": 0.441379314661026, "step": 135325 }, { "epoch": 0.13630581331754027, "grad_norm": 8.901009574069738, "learning_rate": 4.9089821858812414e-05, "loss": 1.9809, "mean_token_accuracy": 0.47586206793785096, "step": 135330 }, { "epoch": 0.13631084937064444, "grad_norm": 10.156291374049633, "learning_rate": 4.908971635656687e-05, "loss": 2.5359, "mean_token_accuracy": 0.358620685338974, "step": 135335 }, { "epoch": 0.13631588542374862, "grad_norm": 10.876251908827907, "learning_rate": 4.9089610848333345e-05, "loss": 2.6118, "mean_token_accuracy": 0.37586206793785093, "step": 135340 }, { "epoch": 0.1363209214768528, "grad_norm": 11.496870807405713, "learning_rate": 4.908950533411186e-05, "loss": 2.4697, "mean_token_accuracy": 0.42413792610168455, "step": 135345 }, { "epoch": 0.13632595752995696, "grad_norm": 10.053602593276302, "learning_rate": 4.908939981390244e-05, "loss": 2.3262, "mean_token_accuracy": 0.44482758045196535, "step": 135350 }, { "epoch": 0.13633099358306114, "grad_norm": 10.651505026473655, "learning_rate": 4.908929428770514e-05, "loss": 2.125, "mean_token_accuracy": 0.49304295182228086, "step": 135355 }, { "epoch": 0.1363360296361653, "grad_norm": 8.486358133970912, "learning_rate": 4.9089188755519946e-05, "loss": 2.6041, "mean_token_accuracy": 0.3896551728248596, "step": 135360 }, { "epoch": 0.13634106568926949, "grad_norm": 10.595594961370118, "learning_rate": 4.908908321734692e-05, "loss": 2.2611, "mean_token_accuracy": 0.4689655065536499, "step": 135365 }, { "epoch": 0.13634610174237366, "grad_norm": 9.77888335773004, "learning_rate": 4.908897767318607e-05, "loss": 2.6426, "mean_token_accuracy": 0.40344826579093934, "step": 135370 }, { "epoch": 0.13635113779547783, "grad_norm": 14.029751904359943, "learning_rate": 4.908887212303745e-05, "loss": 2.0935, "mean_token_accuracy": 0.493103438615799, "step": 135375 }, { "epoch": 0.136356173848582, "grad_norm": 10.018733048680547, "learning_rate": 4.908876656690107e-05, "loss": 2.5488, "mean_token_accuracy": 0.42068964838981626, "step": 135380 }, { "epoch": 0.13636120990168618, "grad_norm": 11.48659306883941, "learning_rate": 4.9088661004776964e-05, "loss": 2.4371, "mean_token_accuracy": 0.42758620977401735, "step": 135385 }, { "epoch": 0.13636624595479035, "grad_norm": 12.345266104832206, "learning_rate": 4.908855543666516e-05, "loss": 2.345, "mean_token_accuracy": 0.47586206793785096, "step": 135390 }, { "epoch": 0.13637128200789453, "grad_norm": 11.708150446160678, "learning_rate": 4.908844986256569e-05, "loss": 2.5193, "mean_token_accuracy": 0.40689654350280763, "step": 135395 }, { "epoch": 0.1363763180609987, "grad_norm": 9.933684622114361, "learning_rate": 4.908834428247859e-05, "loss": 1.8625, "mean_token_accuracy": 0.4931034445762634, "step": 135400 }, { "epoch": 0.13638135411410288, "grad_norm": 10.180946165121135, "learning_rate": 4.9088238696403885e-05, "loss": 2.4931, "mean_token_accuracy": 0.4034482717514038, "step": 135405 }, { "epoch": 0.13638639016720702, "grad_norm": 12.944302460802872, "learning_rate": 4.9088133104341597e-05, "loss": 2.4866, "mean_token_accuracy": 0.4034482777118683, "step": 135410 }, { "epoch": 0.1363914262203112, "grad_norm": 11.160175260513713, "learning_rate": 4.908802750629175e-05, "loss": 3.2161, "mean_token_accuracy": 0.3068965464830399, "step": 135415 }, { "epoch": 0.13639646227341537, "grad_norm": 10.115774536303306, "learning_rate": 4.908792190225439e-05, "loss": 2.378, "mean_token_accuracy": 0.4534180283546448, "step": 135420 }, { "epoch": 0.13640149832651954, "grad_norm": 10.24903239010195, "learning_rate": 4.908781629222954e-05, "loss": 2.3047, "mean_token_accuracy": 0.41379310488700866, "step": 135425 }, { "epoch": 0.13640653437962372, "grad_norm": 12.24430658822229, "learning_rate": 4.908771067621723e-05, "loss": 2.7355, "mean_token_accuracy": 0.4448275864124298, "step": 135430 }, { "epoch": 0.1364115704327279, "grad_norm": 9.75682316947418, "learning_rate": 4.908760505421748e-05, "loss": 2.3864, "mean_token_accuracy": 0.3551724076271057, "step": 135435 }, { "epoch": 0.13641660648583206, "grad_norm": 10.84387583303431, "learning_rate": 4.9087499426230345e-05, "loss": 2.134, "mean_token_accuracy": 0.482758617401123, "step": 135440 }, { "epoch": 0.13642164253893624, "grad_norm": 7.781143301341315, "learning_rate": 4.908739379225582e-05, "loss": 2.7764, "mean_token_accuracy": 0.4103448331356049, "step": 135445 }, { "epoch": 0.1364266785920404, "grad_norm": 9.715167308356557, "learning_rate": 4.9087288152293956e-05, "loss": 2.4715, "mean_token_accuracy": 0.41034482419490814, "step": 135450 }, { "epoch": 0.1364317146451446, "grad_norm": 10.339507817014834, "learning_rate": 4.9087182506344773e-05, "loss": 2.1105, "mean_token_accuracy": 0.4620689690113068, "step": 135455 }, { "epoch": 0.13643675069824876, "grad_norm": 11.267792314345673, "learning_rate": 4.9087076854408315e-05, "loss": 2.3678, "mean_token_accuracy": 0.4344827651977539, "step": 135460 }, { "epoch": 0.13644178675135293, "grad_norm": 9.922119316205647, "learning_rate": 4.908697119648459e-05, "loss": 2.4497, "mean_token_accuracy": 0.4896551787853241, "step": 135465 }, { "epoch": 0.1364468228044571, "grad_norm": 15.993875423042248, "learning_rate": 4.908686553257364e-05, "loss": 2.8003, "mean_token_accuracy": 0.3827586144208908, "step": 135470 }, { "epoch": 0.13645185885756128, "grad_norm": 7.624076825591366, "learning_rate": 4.9086759862675505e-05, "loss": 2.1068, "mean_token_accuracy": 0.47586206793785096, "step": 135475 }, { "epoch": 0.13645689491066545, "grad_norm": 8.058550841209156, "learning_rate": 4.9086654186790185e-05, "loss": 2.2171, "mean_token_accuracy": 0.4344827651977539, "step": 135480 }, { "epoch": 0.13646193096376963, "grad_norm": 9.978191751112645, "learning_rate": 4.908654850491773e-05, "loss": 2.7062, "mean_token_accuracy": 0.3551724076271057, "step": 135485 }, { "epoch": 0.1364669670168738, "grad_norm": 11.453235491368275, "learning_rate": 4.908644281705817e-05, "loss": 2.9978, "mean_token_accuracy": 0.3703569233417511, "step": 135490 }, { "epoch": 0.13647200306997798, "grad_norm": 10.123248518917839, "learning_rate": 4.9086337123211526e-05, "loss": 2.6042, "mean_token_accuracy": 0.4103448212146759, "step": 135495 }, { "epoch": 0.13647703912308215, "grad_norm": 12.399090320080028, "learning_rate": 4.908623142337784e-05, "loss": 2.579, "mean_token_accuracy": 0.4137930989265442, "step": 135500 }, { "epoch": 0.13648207517618632, "grad_norm": 10.293412898312734, "learning_rate": 4.9086125717557116e-05, "loss": 2.3392, "mean_token_accuracy": 0.46896551847457885, "step": 135505 }, { "epoch": 0.1364871112292905, "grad_norm": 11.832399098214186, "learning_rate": 4.908602000574941e-05, "loss": 2.5785, "mean_token_accuracy": 0.4068965494632721, "step": 135510 }, { "epoch": 0.13649214728239467, "grad_norm": 10.30583612612166, "learning_rate": 4.9085914287954734e-05, "loss": 2.805, "mean_token_accuracy": 0.36896551847457887, "step": 135515 }, { "epoch": 0.13649718333549885, "grad_norm": 12.42619622049391, "learning_rate": 4.908580856417314e-05, "loss": 2.2246, "mean_token_accuracy": 0.482758629322052, "step": 135520 }, { "epoch": 0.13650221938860302, "grad_norm": 10.099012146219438, "learning_rate": 4.908570283440462e-05, "loss": 2.3806, "mean_token_accuracy": 0.4241379380226135, "step": 135525 }, { "epoch": 0.1365072554417072, "grad_norm": 21.67459570613669, "learning_rate": 4.908559709864924e-05, "loss": 2.7892, "mean_token_accuracy": 0.39655172228813174, "step": 135530 }, { "epoch": 0.13651229149481137, "grad_norm": 9.609873011820277, "learning_rate": 4.908549135690701e-05, "loss": 2.6407, "mean_token_accuracy": 0.4172413766384125, "step": 135535 }, { "epoch": 0.13651732754791554, "grad_norm": 10.385783878801465, "learning_rate": 4.9085385609177966e-05, "loss": 2.3383, "mean_token_accuracy": 0.46551724672317507, "step": 135540 }, { "epoch": 0.13652236360101971, "grad_norm": 9.631095652365897, "learning_rate": 4.908527985546213e-05, "loss": 2.3807, "mean_token_accuracy": 0.4689655065536499, "step": 135545 }, { "epoch": 0.13652739965412386, "grad_norm": 9.58935937001955, "learning_rate": 4.908517409575953e-05, "loss": 1.9288, "mean_token_accuracy": 0.4965517222881317, "step": 135550 }, { "epoch": 0.13653243570722803, "grad_norm": 9.380781398888034, "learning_rate": 4.908506833007021e-05, "loss": 2.2995, "mean_token_accuracy": 0.45517241954803467, "step": 135555 }, { "epoch": 0.1365374717603322, "grad_norm": 11.484186725657539, "learning_rate": 4.908496255839419e-05, "loss": 2.5599, "mean_token_accuracy": 0.4310344815254211, "step": 135560 }, { "epoch": 0.13654250781343638, "grad_norm": 8.893936903036693, "learning_rate": 4.90848567807315e-05, "loss": 2.0975, "mean_token_accuracy": 0.4517241358757019, "step": 135565 }, { "epoch": 0.13654754386654055, "grad_norm": 8.709901800691684, "learning_rate": 4.908475099708217e-05, "loss": 2.1193, "mean_token_accuracy": 0.4551724135875702, "step": 135570 }, { "epoch": 0.13655257991964473, "grad_norm": 10.24766166134284, "learning_rate": 4.9084645207446227e-05, "loss": 2.1881, "mean_token_accuracy": 0.47931034564971925, "step": 135575 }, { "epoch": 0.1365576159727489, "grad_norm": 10.756631531153516, "learning_rate": 4.9084539411823706e-05, "loss": 2.5237, "mean_token_accuracy": 0.3896551787853241, "step": 135580 }, { "epoch": 0.13656265202585308, "grad_norm": 9.200164999961094, "learning_rate": 4.908443361021463e-05, "loss": 2.1445, "mean_token_accuracy": 0.46896551847457885, "step": 135585 }, { "epoch": 0.13656768807895725, "grad_norm": 12.552491377522674, "learning_rate": 4.9084327802619026e-05, "loss": 2.219, "mean_token_accuracy": 0.441379314661026, "step": 135590 }, { "epoch": 0.13657272413206142, "grad_norm": 15.657541026704987, "learning_rate": 4.908422198903693e-05, "loss": 2.5273, "mean_token_accuracy": 0.4275862157344818, "step": 135595 }, { "epoch": 0.1365777601851656, "grad_norm": 11.511882973868785, "learning_rate": 4.908411616946837e-05, "loss": 2.5325, "mean_token_accuracy": 0.45680580735206605, "step": 135600 }, { "epoch": 0.13658279623826977, "grad_norm": 11.333998279274018, "learning_rate": 4.908401034391338e-05, "loss": 2.4372, "mean_token_accuracy": 0.42413792610168455, "step": 135605 }, { "epoch": 0.13658783229137395, "grad_norm": 11.38586177226549, "learning_rate": 4.908390451237198e-05, "loss": 2.1006, "mean_token_accuracy": 0.45517241954803467, "step": 135610 }, { "epoch": 0.13659286834447812, "grad_norm": 10.296699962451797, "learning_rate": 4.908379867484421e-05, "loss": 2.2691, "mean_token_accuracy": 0.4103448212146759, "step": 135615 }, { "epoch": 0.1365979043975823, "grad_norm": 14.543260602105354, "learning_rate": 4.9083692831330085e-05, "loss": 2.5569, "mean_token_accuracy": 0.45862067937850953, "step": 135620 }, { "epoch": 0.13660294045068647, "grad_norm": 12.509600024158017, "learning_rate": 4.908358698182964e-05, "loss": 2.5598, "mean_token_accuracy": 0.4332123339176178, "step": 135625 }, { "epoch": 0.13660797650379064, "grad_norm": 9.296517733305533, "learning_rate": 4.908348112634292e-05, "loss": 2.067, "mean_token_accuracy": 0.48481547832489014, "step": 135630 }, { "epoch": 0.13661301255689481, "grad_norm": 10.609535722223866, "learning_rate": 4.908337526486993e-05, "loss": 2.1491, "mean_token_accuracy": 0.4551724135875702, "step": 135635 }, { "epoch": 0.136618048609999, "grad_norm": 11.191355586371886, "learning_rate": 4.908326939741071e-05, "loss": 2.6168, "mean_token_accuracy": 0.3896551728248596, "step": 135640 }, { "epoch": 0.13662308466310316, "grad_norm": 9.410607713723122, "learning_rate": 4.908316352396529e-05, "loss": 2.1183, "mean_token_accuracy": 0.458620685338974, "step": 135645 }, { "epoch": 0.13662812071620734, "grad_norm": 11.713432375302173, "learning_rate": 4.908305764453371e-05, "loss": 2.4753, "mean_token_accuracy": 0.41724138259887694, "step": 135650 }, { "epoch": 0.1366331567693115, "grad_norm": 10.460759250891593, "learning_rate": 4.908295175911597e-05, "loss": 2.2784, "mean_token_accuracy": 0.4620689630508423, "step": 135655 }, { "epoch": 0.13663819282241568, "grad_norm": 10.518883913201156, "learning_rate": 4.908284586771213e-05, "loss": 2.647, "mean_token_accuracy": 0.36896551847457887, "step": 135660 }, { "epoch": 0.13664322887551986, "grad_norm": 10.219987220055971, "learning_rate": 4.908273997032221e-05, "loss": 2.3712, "mean_token_accuracy": 0.41379310488700866, "step": 135665 }, { "epoch": 0.13664826492862403, "grad_norm": 11.685848597574024, "learning_rate": 4.908263406694623e-05, "loss": 2.5529, "mean_token_accuracy": 0.4068965554237366, "step": 135670 }, { "epoch": 0.1366533009817282, "grad_norm": 12.678774976506544, "learning_rate": 4.9082528157584225e-05, "loss": 2.3296, "mean_token_accuracy": 0.4448275864124298, "step": 135675 }, { "epoch": 0.13665833703483238, "grad_norm": 10.059582428515721, "learning_rate": 4.9082422242236234e-05, "loss": 2.5278, "mean_token_accuracy": 0.4676950991153717, "step": 135680 }, { "epoch": 0.13666337308793652, "grad_norm": 11.103417334819177, "learning_rate": 4.908231632090227e-05, "loss": 2.5446, "mean_token_accuracy": 0.3988505780696869, "step": 135685 }, { "epoch": 0.1366684091410407, "grad_norm": 11.509387331554962, "learning_rate": 4.9082210393582366e-05, "loss": 2.5288, "mean_token_accuracy": 0.39655172228813174, "step": 135690 }, { "epoch": 0.13667344519414487, "grad_norm": 10.843360639643176, "learning_rate": 4.9082104460276565e-05, "loss": 2.5729, "mean_token_accuracy": 0.4068965494632721, "step": 135695 }, { "epoch": 0.13667848124724905, "grad_norm": 12.800319408649564, "learning_rate": 4.9081998520984877e-05, "loss": 2.5334, "mean_token_accuracy": 0.42413793206214906, "step": 135700 }, { "epoch": 0.13668351730035322, "grad_norm": 8.56434919795878, "learning_rate": 4.908189257570735e-05, "loss": 1.879, "mean_token_accuracy": 0.5379310309886932, "step": 135705 }, { "epoch": 0.1366885533534574, "grad_norm": 10.938561242395497, "learning_rate": 4.9081786624444e-05, "loss": 2.5053, "mean_token_accuracy": 0.3758620619773865, "step": 135710 }, { "epoch": 0.13669358940656157, "grad_norm": 10.368933791296365, "learning_rate": 4.908168066719486e-05, "loss": 2.6473, "mean_token_accuracy": 0.41149425506591797, "step": 135715 }, { "epoch": 0.13669862545966574, "grad_norm": 16.352253388168233, "learning_rate": 4.908157470395997e-05, "loss": 2.7023, "mean_token_accuracy": 0.43103447556495667, "step": 135720 }, { "epoch": 0.13670366151276991, "grad_norm": 8.160908149039999, "learning_rate": 4.9081468734739336e-05, "loss": 2.1192, "mean_token_accuracy": 0.44482758045196535, "step": 135725 }, { "epoch": 0.1367086975658741, "grad_norm": 10.729100064138116, "learning_rate": 4.908136275953301e-05, "loss": 1.7761, "mean_token_accuracy": 0.5401693999767303, "step": 135730 }, { "epoch": 0.13671373361897826, "grad_norm": 12.256194451201003, "learning_rate": 4.908125677834101e-05, "loss": 2.3885, "mean_token_accuracy": 0.4068965554237366, "step": 135735 }, { "epoch": 0.13671876967208244, "grad_norm": 10.080527200540951, "learning_rate": 4.908115079116337e-05, "loss": 2.2734, "mean_token_accuracy": 0.46394434571266174, "step": 135740 }, { "epoch": 0.1367238057251866, "grad_norm": 8.503494370737384, "learning_rate": 4.9081044798000114e-05, "loss": 2.1487, "mean_token_accuracy": 0.47931033968925474, "step": 135745 }, { "epoch": 0.13672884177829078, "grad_norm": 12.39321165001928, "learning_rate": 4.908093879885128e-05, "loss": 2.4709, "mean_token_accuracy": 0.41379311084747317, "step": 135750 }, { "epoch": 0.13673387783139496, "grad_norm": 10.89923452419791, "learning_rate": 4.908083279371688e-05, "loss": 2.3868, "mean_token_accuracy": 0.4137930929660797, "step": 135755 }, { "epoch": 0.13673891388449913, "grad_norm": 11.085570439455482, "learning_rate": 4.9080726782596966e-05, "loss": 2.0162, "mean_token_accuracy": 0.5034482717514038, "step": 135760 }, { "epoch": 0.1367439499376033, "grad_norm": 10.47726377049291, "learning_rate": 4.908062076549155e-05, "loss": 2.1781, "mean_token_accuracy": 0.4793103516101837, "step": 135765 }, { "epoch": 0.13674898599070748, "grad_norm": 12.95445687510898, "learning_rate": 4.908051474240068e-05, "loss": 2.4998, "mean_token_accuracy": 0.4241379380226135, "step": 135770 }, { "epoch": 0.13675402204381165, "grad_norm": 9.112381986491494, "learning_rate": 4.9080408713324365e-05, "loss": 1.922, "mean_token_accuracy": 0.5034482717514038, "step": 135775 }, { "epoch": 0.13675905809691583, "grad_norm": 8.486942946990604, "learning_rate": 4.9080302678262646e-05, "loss": 2.7234, "mean_token_accuracy": 0.477918928861618, "step": 135780 }, { "epoch": 0.13676409415002, "grad_norm": 10.81563756380048, "learning_rate": 4.9080196637215555e-05, "loss": 2.2595, "mean_token_accuracy": 0.4662561535835266, "step": 135785 }, { "epoch": 0.13676913020312417, "grad_norm": 12.023642180534015, "learning_rate": 4.9080090590183105e-05, "loss": 2.6414, "mean_token_accuracy": 0.43103448748588563, "step": 135790 }, { "epoch": 0.13677416625622835, "grad_norm": 11.336626388819141, "learning_rate": 4.907998453716534e-05, "loss": 2.1673, "mean_token_accuracy": 0.4551724135875702, "step": 135795 }, { "epoch": 0.13677920230933252, "grad_norm": 9.939559783102764, "learning_rate": 4.9079878478162286e-05, "loss": 2.3156, "mean_token_accuracy": 0.4189957737922668, "step": 135800 }, { "epoch": 0.1367842383624367, "grad_norm": 9.15824023565933, "learning_rate": 4.907977241317397e-05, "loss": 2.3298, "mean_token_accuracy": 0.38965516686439516, "step": 135805 }, { "epoch": 0.13678927441554087, "grad_norm": 10.574219774939547, "learning_rate": 4.9079666342200434e-05, "loss": 2.7353, "mean_token_accuracy": 0.4137930989265442, "step": 135810 }, { "epoch": 0.13679431046864504, "grad_norm": 10.913841034023124, "learning_rate": 4.907956026524169e-05, "loss": 2.9818, "mean_token_accuracy": 0.3862068891525269, "step": 135815 }, { "epoch": 0.13679934652174922, "grad_norm": 9.492111242102373, "learning_rate": 4.9079454182297764e-05, "loss": 2.1097, "mean_token_accuracy": 0.4896551728248596, "step": 135820 }, { "epoch": 0.13680438257485336, "grad_norm": 11.172785300326586, "learning_rate": 4.9079348093368714e-05, "loss": 2.2607, "mean_token_accuracy": 0.4655172348022461, "step": 135825 }, { "epoch": 0.13680941862795754, "grad_norm": 12.09329985211726, "learning_rate": 4.907924199845454e-05, "loss": 2.5087, "mean_token_accuracy": 0.4379310429096222, "step": 135830 }, { "epoch": 0.1368144546810617, "grad_norm": 12.114663388722757, "learning_rate": 4.907913589755528e-05, "loss": 2.4364, "mean_token_accuracy": 0.3620689630508423, "step": 135835 }, { "epoch": 0.13681949073416588, "grad_norm": 12.065124300662013, "learning_rate": 4.907902979067097e-05, "loss": 2.3438, "mean_token_accuracy": 0.4448275864124298, "step": 135840 }, { "epoch": 0.13682452678727006, "grad_norm": 9.494257844873646, "learning_rate": 4.9078923677801644e-05, "loss": 2.1547, "mean_token_accuracy": 0.441379314661026, "step": 135845 }, { "epoch": 0.13682956284037423, "grad_norm": 8.251131335075153, "learning_rate": 4.907881755894732e-05, "loss": 2.2371, "mean_token_accuracy": 0.44827585220336913, "step": 135850 }, { "epoch": 0.1368345988934784, "grad_norm": 10.496312156606233, "learning_rate": 4.907871143410802e-05, "loss": 2.45, "mean_token_accuracy": 0.4551724135875702, "step": 135855 }, { "epoch": 0.13683963494658258, "grad_norm": 10.704950881211056, "learning_rate": 4.9078605303283794e-05, "loss": 2.7004, "mean_token_accuracy": 0.41724138259887694, "step": 135860 }, { "epoch": 0.13684467099968675, "grad_norm": 11.511399896788891, "learning_rate": 4.907849916647466e-05, "loss": 2.4837, "mean_token_accuracy": 0.417241370677948, "step": 135865 }, { "epoch": 0.13684970705279093, "grad_norm": 15.016114437666047, "learning_rate": 4.9078393023680644e-05, "loss": 2.2655, "mean_token_accuracy": 0.4655172348022461, "step": 135870 }, { "epoch": 0.1368547431058951, "grad_norm": 10.0134265843284, "learning_rate": 4.907828687490178e-05, "loss": 2.4789, "mean_token_accuracy": 0.4206896543502808, "step": 135875 }, { "epoch": 0.13685977915899927, "grad_norm": 9.225585068947067, "learning_rate": 4.907818072013811e-05, "loss": 2.4856, "mean_token_accuracy": 0.4241379380226135, "step": 135880 }, { "epoch": 0.13686481521210345, "grad_norm": 9.947984305211662, "learning_rate": 4.907807455938964e-05, "loss": 2.3211, "mean_token_accuracy": 0.4551724135875702, "step": 135885 }, { "epoch": 0.13686985126520762, "grad_norm": 10.864506555676126, "learning_rate": 4.9077968392656406e-05, "loss": 2.1528, "mean_token_accuracy": 0.4620689630508423, "step": 135890 }, { "epoch": 0.1368748873183118, "grad_norm": 11.536300710192116, "learning_rate": 4.907786221993845e-05, "loss": 2.8137, "mean_token_accuracy": 0.3551724135875702, "step": 135895 }, { "epoch": 0.13687992337141597, "grad_norm": 11.514153017691001, "learning_rate": 4.907775604123579e-05, "loss": 2.2797, "mean_token_accuracy": 0.4517241418361664, "step": 135900 }, { "epoch": 0.13688495942452014, "grad_norm": 9.5683650319034, "learning_rate": 4.907764985654846e-05, "loss": 2.1909, "mean_token_accuracy": 0.4482758641242981, "step": 135905 }, { "epoch": 0.13688999547762432, "grad_norm": 11.143735614080876, "learning_rate": 4.907754366587649e-05, "loss": 2.4838, "mean_token_accuracy": 0.4344827651977539, "step": 135910 }, { "epoch": 0.1368950315307285, "grad_norm": 10.645569250971965, "learning_rate": 4.907743746921991e-05, "loss": 2.2982, "mean_token_accuracy": 0.4935269236564636, "step": 135915 }, { "epoch": 0.13690006758383266, "grad_norm": 13.127273468234945, "learning_rate": 4.907733126657874e-05, "loss": 2.4373, "mean_token_accuracy": 0.41379311084747317, "step": 135920 }, { "epoch": 0.13690510363693684, "grad_norm": 9.508404774728922, "learning_rate": 4.9077225057953025e-05, "loss": 2.5469, "mean_token_accuracy": 0.39310344457626345, "step": 135925 }, { "epoch": 0.136910139690041, "grad_norm": 11.144393319841221, "learning_rate": 4.9077118843342785e-05, "loss": 2.4026, "mean_token_accuracy": 0.44271021485328677, "step": 135930 }, { "epoch": 0.13691517574314518, "grad_norm": 13.097881625104053, "learning_rate": 4.9077012622748046e-05, "loss": 2.4426, "mean_token_accuracy": 0.4354679763317108, "step": 135935 }, { "epoch": 0.13692021179624936, "grad_norm": 9.62386253153449, "learning_rate": 4.9076906396168843e-05, "loss": 2.1662, "mean_token_accuracy": 0.39655172228813174, "step": 135940 }, { "epoch": 0.13692524784935353, "grad_norm": 9.067579986071935, "learning_rate": 4.9076800163605204e-05, "loss": 2.6508, "mean_token_accuracy": 0.4379310250282288, "step": 135945 }, { "epoch": 0.1369302839024577, "grad_norm": 9.645249132035138, "learning_rate": 4.907669392505716e-05, "loss": 2.3388, "mean_token_accuracy": 0.43103448748588563, "step": 135950 }, { "epoch": 0.13693531995556188, "grad_norm": 11.687398053373709, "learning_rate": 4.907658768052474e-05, "loss": 2.1456, "mean_token_accuracy": 0.5068965435028077, "step": 135955 }, { "epoch": 0.13694035600866605, "grad_norm": 9.432984148265975, "learning_rate": 4.9076481430007976e-05, "loss": 2.3607, "mean_token_accuracy": 0.4034482777118683, "step": 135960 }, { "epoch": 0.1369453920617702, "grad_norm": 8.841447470051627, "learning_rate": 4.907637517350688e-05, "loss": 2.1675, "mean_token_accuracy": 0.44482759237289426, "step": 135965 }, { "epoch": 0.13695042811487437, "grad_norm": 12.175639530744665, "learning_rate": 4.907626891102152e-05, "loss": 2.5576, "mean_token_accuracy": 0.4068965494632721, "step": 135970 }, { "epoch": 0.13695546416797855, "grad_norm": 14.827176017455397, "learning_rate": 4.907616264255188e-05, "loss": 3.1147, "mean_token_accuracy": 0.3758620619773865, "step": 135975 }, { "epoch": 0.13696050022108272, "grad_norm": 11.234839425148808, "learning_rate": 4.907605636809802e-05, "loss": 3.0248, "mean_token_accuracy": 0.3896551728248596, "step": 135980 }, { "epoch": 0.1369655362741869, "grad_norm": 12.574025625243491, "learning_rate": 4.9075950087659954e-05, "loss": 2.5381, "mean_token_accuracy": 0.37241379022598264, "step": 135985 }, { "epoch": 0.13697057232729107, "grad_norm": 7.804242184151293, "learning_rate": 4.907584380123773e-05, "loss": 2.0614, "mean_token_accuracy": 0.44482758045196535, "step": 135990 }, { "epoch": 0.13697560838039524, "grad_norm": 9.780675150070724, "learning_rate": 4.9075737508831356e-05, "loss": 2.1679, "mean_token_accuracy": 0.4448275864124298, "step": 135995 }, { "epoch": 0.13698064443349942, "grad_norm": 9.572254974878895, "learning_rate": 4.907563121044088e-05, "loss": 2.2885, "mean_token_accuracy": 0.47586206793785096, "step": 136000 }, { "epoch": 0.1369856804866036, "grad_norm": 10.042028799340086, "learning_rate": 4.9075524906066306e-05, "loss": 2.3757, "mean_token_accuracy": 0.4137930989265442, "step": 136005 }, { "epoch": 0.13699071653970776, "grad_norm": 11.119454788986793, "learning_rate": 4.907541859570769e-05, "loss": 2.1566, "mean_token_accuracy": 0.4551724135875702, "step": 136010 }, { "epoch": 0.13699575259281194, "grad_norm": 11.495457951447007, "learning_rate": 4.9075312279365046e-05, "loss": 2.1571, "mean_token_accuracy": 0.46551724076271056, "step": 136015 }, { "epoch": 0.1370007886459161, "grad_norm": 10.886745299393255, "learning_rate": 4.907520595703841e-05, "loss": 2.4077, "mean_token_accuracy": 0.4241379201412201, "step": 136020 }, { "epoch": 0.13700582469902028, "grad_norm": 11.340735266264652, "learning_rate": 4.9075099628727814e-05, "loss": 2.088, "mean_token_accuracy": 0.46382335424423216, "step": 136025 }, { "epoch": 0.13701086075212446, "grad_norm": 9.706314110154384, "learning_rate": 4.907499329443329e-05, "loss": 2.3392, "mean_token_accuracy": 0.3965517282485962, "step": 136030 }, { "epoch": 0.13701589680522863, "grad_norm": 8.527116252600417, "learning_rate": 4.907488695415485e-05, "loss": 2.4207, "mean_token_accuracy": 0.441379314661026, "step": 136035 }, { "epoch": 0.1370209328583328, "grad_norm": 11.010879816877619, "learning_rate": 4.907478060789253e-05, "loss": 2.4459, "mean_token_accuracy": 0.4517241299152374, "step": 136040 }, { "epoch": 0.13702596891143698, "grad_norm": 12.25871214004453, "learning_rate": 4.907467425564638e-05, "loss": 2.4374, "mean_token_accuracy": 0.4068965494632721, "step": 136045 }, { "epoch": 0.13703100496454115, "grad_norm": 9.928917428190028, "learning_rate": 4.90745678974164e-05, "loss": 2.2436, "mean_token_accuracy": 0.4172413766384125, "step": 136050 }, { "epoch": 0.13703604101764533, "grad_norm": 11.114040014995396, "learning_rate": 4.9074461533202645e-05, "loss": 2.2276, "mean_token_accuracy": 0.42413793206214906, "step": 136055 }, { "epoch": 0.1370410770707495, "grad_norm": 9.267364456394278, "learning_rate": 4.9074355163005124e-05, "loss": 2.2421, "mean_token_accuracy": 0.44482758045196535, "step": 136060 }, { "epoch": 0.13704611312385367, "grad_norm": 10.816579715589292, "learning_rate": 4.907424878682388e-05, "loss": 2.1085, "mean_token_accuracy": 0.4862068951129913, "step": 136065 }, { "epoch": 0.13705114917695785, "grad_norm": 9.816683275266303, "learning_rate": 4.907414240465893e-05, "loss": 2.5691, "mean_token_accuracy": 0.3517241358757019, "step": 136070 }, { "epoch": 0.13705618523006202, "grad_norm": 9.462703960918113, "learning_rate": 4.9074036016510315e-05, "loss": 2.2923, "mean_token_accuracy": 0.48620688915252686, "step": 136075 }, { "epoch": 0.1370612212831662, "grad_norm": 11.611679605605419, "learning_rate": 4.907392962237807e-05, "loss": 2.5785, "mean_token_accuracy": 0.39310344457626345, "step": 136080 }, { "epoch": 0.13706625733627037, "grad_norm": 10.557758545073588, "learning_rate": 4.9073823222262205e-05, "loss": 2.3001, "mean_token_accuracy": 0.44137930274009707, "step": 136085 }, { "epoch": 0.13707129338937454, "grad_norm": 9.433129599599724, "learning_rate": 4.9073716816162765e-05, "loss": 2.2114, "mean_token_accuracy": 0.441379314661026, "step": 136090 }, { "epoch": 0.13707632944247872, "grad_norm": 13.690244724965458, "learning_rate": 4.907361040407977e-05, "loss": 2.6409, "mean_token_accuracy": 0.4206896543502808, "step": 136095 }, { "epoch": 0.1370813654955829, "grad_norm": 8.524433838375618, "learning_rate": 4.9073503986013244e-05, "loss": 2.3354, "mean_token_accuracy": 0.4620689630508423, "step": 136100 }, { "epoch": 0.13708640154868704, "grad_norm": 11.07018864312266, "learning_rate": 4.9073397561963245e-05, "loss": 2.3782, "mean_token_accuracy": 0.4413793087005615, "step": 136105 }, { "epoch": 0.1370914376017912, "grad_norm": 12.046695789768876, "learning_rate": 4.9073291131929764e-05, "loss": 2.3822, "mean_token_accuracy": 0.42068966031074523, "step": 136110 }, { "epoch": 0.13709647365489538, "grad_norm": 12.816600202262455, "learning_rate": 4.907318469591286e-05, "loss": 2.4926, "mean_token_accuracy": 0.41724137365818026, "step": 136115 }, { "epoch": 0.13710150970799956, "grad_norm": 10.342952437291279, "learning_rate": 4.907307825391256e-05, "loss": 2.5264, "mean_token_accuracy": 0.42413793206214906, "step": 136120 }, { "epoch": 0.13710654576110373, "grad_norm": 11.30941781606829, "learning_rate": 4.907297180592889e-05, "loss": 2.3541, "mean_token_accuracy": 0.4344827592372894, "step": 136125 }, { "epoch": 0.1371115818142079, "grad_norm": 9.542453894523126, "learning_rate": 4.907286535196186e-05, "loss": 2.0848, "mean_token_accuracy": 0.458620685338974, "step": 136130 }, { "epoch": 0.13711661786731208, "grad_norm": 9.272627075370915, "learning_rate": 4.907275889201152e-05, "loss": 2.2037, "mean_token_accuracy": 0.4620689690113068, "step": 136135 }, { "epoch": 0.13712165392041625, "grad_norm": 12.109606398340768, "learning_rate": 4.9072652426077905e-05, "loss": 2.7207, "mean_token_accuracy": 0.38965516090393065, "step": 136140 }, { "epoch": 0.13712668997352043, "grad_norm": 12.705608967016135, "learning_rate": 4.907254595416102e-05, "loss": 2.4437, "mean_token_accuracy": 0.40344828367233276, "step": 136145 }, { "epoch": 0.1371317260266246, "grad_norm": 10.686500966298913, "learning_rate": 4.907243947626092e-05, "loss": 2.2968, "mean_token_accuracy": 0.44827585220336913, "step": 136150 }, { "epoch": 0.13713676207972877, "grad_norm": 11.47754341045721, "learning_rate": 4.907233299237761e-05, "loss": 2.501, "mean_token_accuracy": 0.38965516686439516, "step": 136155 }, { "epoch": 0.13714179813283295, "grad_norm": 8.142258236729274, "learning_rate": 4.9072226502511144e-05, "loss": 2.2272, "mean_token_accuracy": 0.4931034505367279, "step": 136160 }, { "epoch": 0.13714683418593712, "grad_norm": 10.340973576528922, "learning_rate": 4.907212000666154e-05, "loss": 2.1338, "mean_token_accuracy": 0.4482758641242981, "step": 136165 }, { "epoch": 0.1371518702390413, "grad_norm": 9.353108202392926, "learning_rate": 4.907201350482882e-05, "loss": 2.15, "mean_token_accuracy": 0.4517241299152374, "step": 136170 }, { "epoch": 0.13715690629214547, "grad_norm": 10.242239453726352, "learning_rate": 4.907190699701303e-05, "loss": 2.0592, "mean_token_accuracy": 0.49655172824859617, "step": 136175 }, { "epoch": 0.13716194234524964, "grad_norm": 9.42138167629758, "learning_rate": 4.9071800483214183e-05, "loss": 2.404, "mean_token_accuracy": 0.4206896424293518, "step": 136180 }, { "epoch": 0.13716697839835382, "grad_norm": 9.708814390341226, "learning_rate": 4.907169396343233e-05, "loss": 2.5426, "mean_token_accuracy": 0.4068965494632721, "step": 136185 }, { "epoch": 0.137172014451458, "grad_norm": 16.2530589417657, "learning_rate": 4.9071587437667484e-05, "loss": 2.5089, "mean_token_accuracy": 0.403448286652565, "step": 136190 }, { "epoch": 0.13717705050456216, "grad_norm": 14.518727612001474, "learning_rate": 4.907148090591967e-05, "loss": 2.3871, "mean_token_accuracy": 0.4689655065536499, "step": 136195 }, { "epoch": 0.13718208655766634, "grad_norm": 9.524263509919313, "learning_rate": 4.907137436818893e-05, "loss": 2.6311, "mean_token_accuracy": 0.42238354682922363, "step": 136200 }, { "epoch": 0.1371871226107705, "grad_norm": 9.362826799482935, "learning_rate": 4.907126782447528e-05, "loss": 2.2835, "mean_token_accuracy": 0.38275861740112305, "step": 136205 }, { "epoch": 0.13719215866387469, "grad_norm": 7.482705876092103, "learning_rate": 4.907116127477877e-05, "loss": 2.2736, "mean_token_accuracy": 0.4745916426181793, "step": 136210 }, { "epoch": 0.13719719471697886, "grad_norm": 8.38619159968426, "learning_rate": 4.9071054719099414e-05, "loss": 2.2208, "mean_token_accuracy": 0.4620689630508423, "step": 136215 }, { "epoch": 0.13720223077008303, "grad_norm": 12.394507897619473, "learning_rate": 4.9070948157437244e-05, "loss": 2.4786, "mean_token_accuracy": 0.3793103516101837, "step": 136220 }, { "epoch": 0.1372072668231872, "grad_norm": 9.408778307993797, "learning_rate": 4.907084158979229e-05, "loss": 2.4651, "mean_token_accuracy": 0.41724138855934145, "step": 136225 }, { "epoch": 0.13721230287629138, "grad_norm": 10.934104177259798, "learning_rate": 4.907073501616459e-05, "loss": 2.4992, "mean_token_accuracy": 0.41034482717514037, "step": 136230 }, { "epoch": 0.13721733892939555, "grad_norm": 11.972905825989562, "learning_rate": 4.9070628436554155e-05, "loss": 2.2579, "mean_token_accuracy": 0.4482758641242981, "step": 136235 }, { "epoch": 0.13722237498249973, "grad_norm": 9.550701547007698, "learning_rate": 4.9070521850961027e-05, "loss": 2.1434, "mean_token_accuracy": 0.49570477604866026, "step": 136240 }, { "epoch": 0.13722741103560387, "grad_norm": 9.009356377795768, "learning_rate": 4.9070415259385244e-05, "loss": 2.4399, "mean_token_accuracy": 0.43103447556495667, "step": 136245 }, { "epoch": 0.13723244708870805, "grad_norm": 10.337365230336474, "learning_rate": 4.907030866182683e-05, "loss": 2.6053, "mean_token_accuracy": 0.38275861740112305, "step": 136250 }, { "epoch": 0.13723748314181222, "grad_norm": 12.734087246568043, "learning_rate": 4.907020205828579e-05, "loss": 2.4927, "mean_token_accuracy": 0.43623715043067934, "step": 136255 }, { "epoch": 0.1372425191949164, "grad_norm": 12.035540202230168, "learning_rate": 4.9070095448762185e-05, "loss": 2.5144, "mean_token_accuracy": 0.41034482717514037, "step": 136260 }, { "epoch": 0.13724755524802057, "grad_norm": 11.692116725575898, "learning_rate": 4.9069988833256034e-05, "loss": 2.5743, "mean_token_accuracy": 0.44482759237289426, "step": 136265 }, { "epoch": 0.13725259130112474, "grad_norm": 10.151920373961394, "learning_rate": 4.9069882211767364e-05, "loss": 3.0067, "mean_token_accuracy": 0.3620689630508423, "step": 136270 }, { "epoch": 0.13725762735422892, "grad_norm": 11.77063418645305, "learning_rate": 4.906977558429621e-05, "loss": 2.5996, "mean_token_accuracy": 0.41379310488700866, "step": 136275 }, { "epoch": 0.1372626634073331, "grad_norm": 14.13691016072988, "learning_rate": 4.9069668950842596e-05, "loss": 2.5965, "mean_token_accuracy": 0.43448275327682495, "step": 136280 }, { "epoch": 0.13726769946043726, "grad_norm": 11.029586573691278, "learning_rate": 4.906956231140656e-05, "loss": 2.4327, "mean_token_accuracy": 0.38275861740112305, "step": 136285 }, { "epoch": 0.13727273551354144, "grad_norm": 8.751896540793833, "learning_rate": 4.906945566598812e-05, "loss": 2.2932, "mean_token_accuracy": 0.44827585816383364, "step": 136290 }, { "epoch": 0.1372777715666456, "grad_norm": 10.977420043212653, "learning_rate": 4.9069349014587305e-05, "loss": 1.9976, "mean_token_accuracy": 0.5034482717514038, "step": 136295 }, { "epoch": 0.13728280761974979, "grad_norm": 13.143539726540567, "learning_rate": 4.9069242357204164e-05, "loss": 2.8354, "mean_token_accuracy": 0.41034482717514037, "step": 136300 }, { "epoch": 0.13728784367285396, "grad_norm": 13.561305320310327, "learning_rate": 4.90691356938387e-05, "loss": 2.6335, "mean_token_accuracy": 0.43103448748588563, "step": 136305 }, { "epoch": 0.13729287972595813, "grad_norm": 11.021052340221079, "learning_rate": 4.906902902449096e-05, "loss": 2.0098, "mean_token_accuracy": 0.5124016880989075, "step": 136310 }, { "epoch": 0.1372979157790623, "grad_norm": 8.86168839029277, "learning_rate": 4.9068922349160974e-05, "loss": 2.021, "mean_token_accuracy": 0.4620689630508423, "step": 136315 }, { "epoch": 0.13730295183216648, "grad_norm": 11.391912999731778, "learning_rate": 4.9068815667848766e-05, "loss": 2.2412, "mean_token_accuracy": 0.493103438615799, "step": 136320 }, { "epoch": 0.13730798788527065, "grad_norm": 14.714220771487728, "learning_rate": 4.9068708980554364e-05, "loss": 2.6496, "mean_token_accuracy": 0.3985480904579163, "step": 136325 }, { "epoch": 0.13731302393837483, "grad_norm": 9.94891545761921, "learning_rate": 4.90686022872778e-05, "loss": 2.3421, "mean_token_accuracy": 0.44827585816383364, "step": 136330 }, { "epoch": 0.137318059991479, "grad_norm": 12.7171115352992, "learning_rate": 4.90684955880191e-05, "loss": 2.46, "mean_token_accuracy": 0.4137930989265442, "step": 136335 }, { "epoch": 0.13732309604458318, "grad_norm": 10.990955599204508, "learning_rate": 4.906838888277831e-05, "loss": 2.5245, "mean_token_accuracy": 0.4034482777118683, "step": 136340 }, { "epoch": 0.13732813209768735, "grad_norm": 10.387655015171548, "learning_rate": 4.9068282171555434e-05, "loss": 2.5266, "mean_token_accuracy": 0.3965517282485962, "step": 136345 }, { "epoch": 0.13733316815079152, "grad_norm": 12.952437233026501, "learning_rate": 4.9068175454350516e-05, "loss": 2.0946, "mean_token_accuracy": 0.43793103098869324, "step": 136350 }, { "epoch": 0.1373382042038957, "grad_norm": 8.274698151445026, "learning_rate": 4.906806873116359e-05, "loss": 2.3992, "mean_token_accuracy": 0.41724138259887694, "step": 136355 }, { "epoch": 0.13734324025699987, "grad_norm": 10.989499560781473, "learning_rate": 4.9067962001994676e-05, "loss": 2.0467, "mean_token_accuracy": 0.49999999403953554, "step": 136360 }, { "epoch": 0.13734827631010404, "grad_norm": 12.520432452312585, "learning_rate": 4.906785526684381e-05, "loss": 2.8514, "mean_token_accuracy": 0.379310342669487, "step": 136365 }, { "epoch": 0.13735331236320822, "grad_norm": 11.070790068735144, "learning_rate": 4.9067748525711015e-05, "loss": 2.4084, "mean_token_accuracy": 0.39655172228813174, "step": 136370 }, { "epoch": 0.1373583484163124, "grad_norm": 9.792638919538321, "learning_rate": 4.906764177859633e-05, "loss": 2.3621, "mean_token_accuracy": 0.4775559604167938, "step": 136375 }, { "epoch": 0.13736338446941657, "grad_norm": 10.601379319490368, "learning_rate": 4.9067535025499764e-05, "loss": 2.405, "mean_token_accuracy": 0.4310344815254211, "step": 136380 }, { "epoch": 0.1373684205225207, "grad_norm": 12.32227299803304, "learning_rate": 4.9067428266421387e-05, "loss": 2.7521, "mean_token_accuracy": 0.36896551847457887, "step": 136385 }, { "epoch": 0.13737345657562489, "grad_norm": 12.86883440409392, "learning_rate": 4.906732150136119e-05, "loss": 2.359, "mean_token_accuracy": 0.42068966031074523, "step": 136390 }, { "epoch": 0.13737849262872906, "grad_norm": 10.581922677303655, "learning_rate": 4.906721473031921e-05, "loss": 2.4041, "mean_token_accuracy": 0.38620689511299133, "step": 136395 }, { "epoch": 0.13738352868183323, "grad_norm": 12.51682623500794, "learning_rate": 4.906710795329549e-05, "loss": 2.2267, "mean_token_accuracy": 0.47586206793785096, "step": 136400 }, { "epoch": 0.1373885647349374, "grad_norm": 11.540444447718269, "learning_rate": 4.906700117029005e-05, "loss": 2.614, "mean_token_accuracy": 0.42565032839775085, "step": 136405 }, { "epoch": 0.13739360078804158, "grad_norm": 12.363266223349033, "learning_rate": 4.906689438130292e-05, "loss": 2.5614, "mean_token_accuracy": 0.42413793206214906, "step": 136410 }, { "epoch": 0.13739863684114575, "grad_norm": 9.81090070323206, "learning_rate": 4.906678758633413e-05, "loss": 2.922, "mean_token_accuracy": 0.35862069129943847, "step": 136415 }, { "epoch": 0.13740367289424993, "grad_norm": 11.301183879033823, "learning_rate": 4.9066680785383715e-05, "loss": 2.1516, "mean_token_accuracy": 0.46551724672317507, "step": 136420 }, { "epoch": 0.1374087089473541, "grad_norm": 10.273330544556877, "learning_rate": 4.90665739784517e-05, "loss": 2.8287, "mean_token_accuracy": 0.3965517282485962, "step": 136425 }, { "epoch": 0.13741374500045828, "grad_norm": 10.776364942481337, "learning_rate": 4.906646716553811e-05, "loss": 2.5586, "mean_token_accuracy": 0.3862068891525269, "step": 136430 }, { "epoch": 0.13741878105356245, "grad_norm": 11.984528575711225, "learning_rate": 4.9066360346642995e-05, "loss": 2.3482, "mean_token_accuracy": 0.4, "step": 136435 }, { "epoch": 0.13742381710666662, "grad_norm": 11.540234106329256, "learning_rate": 4.906625352176635e-05, "loss": 2.3558, "mean_token_accuracy": 0.45862067937850953, "step": 136440 }, { "epoch": 0.1374288531597708, "grad_norm": 10.722915835629925, "learning_rate": 4.906614669090824e-05, "loss": 2.28, "mean_token_accuracy": 0.47586206793785096, "step": 136445 }, { "epoch": 0.13743388921287497, "grad_norm": 9.645872253154693, "learning_rate": 4.9066039854068675e-05, "loss": 2.63, "mean_token_accuracy": 0.43103448748588563, "step": 136450 }, { "epoch": 0.13743892526597914, "grad_norm": 11.53571113142397, "learning_rate": 4.906593301124768e-05, "loss": 2.3693, "mean_token_accuracy": 0.42413793206214906, "step": 136455 }, { "epoch": 0.13744396131908332, "grad_norm": 11.719508133002138, "learning_rate": 4.90658261624453e-05, "loss": 2.3318, "mean_token_accuracy": 0.4758620738983154, "step": 136460 }, { "epoch": 0.1374489973721875, "grad_norm": 11.831339310424726, "learning_rate": 4.9065719307661556e-05, "loss": 2.4024, "mean_token_accuracy": 0.42232305407524107, "step": 136465 }, { "epoch": 0.13745403342529167, "grad_norm": 12.740673870858636, "learning_rate": 4.906561244689648e-05, "loss": 2.6623, "mean_token_accuracy": 0.4310344815254211, "step": 136470 }, { "epoch": 0.13745906947839584, "grad_norm": 8.703729263294038, "learning_rate": 4.90655055801501e-05, "loss": 2.0985, "mean_token_accuracy": 0.47241379618644713, "step": 136475 }, { "epoch": 0.1374641055315, "grad_norm": 10.575471902962864, "learning_rate": 4.906539870742245e-05, "loss": 2.0685, "mean_token_accuracy": 0.482758617401123, "step": 136480 }, { "epoch": 0.1374691415846042, "grad_norm": 9.629168613369925, "learning_rate": 4.906529182871355e-05, "loss": 1.9937, "mean_token_accuracy": 0.482758617401123, "step": 136485 }, { "epoch": 0.13747417763770836, "grad_norm": 9.411305289169983, "learning_rate": 4.906518494402344e-05, "loss": 1.9733, "mean_token_accuracy": 0.4912885665893555, "step": 136490 }, { "epoch": 0.13747921369081254, "grad_norm": 10.55569999939673, "learning_rate": 4.9065078053352144e-05, "loss": 2.2655, "mean_token_accuracy": 0.4551724135875702, "step": 136495 }, { "epoch": 0.1374842497439167, "grad_norm": 10.880735978588028, "learning_rate": 4.90649711566997e-05, "loss": 2.4775, "mean_token_accuracy": 0.4034482777118683, "step": 136500 }, { "epoch": 0.13748928579702088, "grad_norm": 11.323541153133842, "learning_rate": 4.9064864254066116e-05, "loss": 2.3076, "mean_token_accuracy": 0.4275861978530884, "step": 136505 }, { "epoch": 0.13749432185012506, "grad_norm": 14.17729734090147, "learning_rate": 4.9064757345451444e-05, "loss": 2.5106, "mean_token_accuracy": 0.41034482717514037, "step": 136510 }, { "epoch": 0.13749935790322923, "grad_norm": 8.401545497778573, "learning_rate": 4.906465043085571e-05, "loss": 2.2, "mean_token_accuracy": 0.42413792610168455, "step": 136515 }, { "epoch": 0.1375043939563334, "grad_norm": 10.427659484019577, "learning_rate": 4.9064543510278934e-05, "loss": 2.6843, "mean_token_accuracy": 0.36551724672317504, "step": 136520 }, { "epoch": 0.13750943000943755, "grad_norm": 10.422607359600912, "learning_rate": 4.9064436583721164e-05, "loss": 2.3948, "mean_token_accuracy": 0.42758620977401735, "step": 136525 }, { "epoch": 0.13751446606254172, "grad_norm": 12.272840518022374, "learning_rate": 4.9064329651182406e-05, "loss": 2.7378, "mean_token_accuracy": 0.37241379022598264, "step": 136530 }, { "epoch": 0.1375195021156459, "grad_norm": 9.081743187149343, "learning_rate": 4.90642227126627e-05, "loss": 2.2047, "mean_token_accuracy": 0.46551724076271056, "step": 136535 }, { "epoch": 0.13752453816875007, "grad_norm": 10.462039705229992, "learning_rate": 4.906411576816208e-05, "loss": 2.2901, "mean_token_accuracy": 0.43605564832687377, "step": 136540 }, { "epoch": 0.13752957422185424, "grad_norm": 12.466515237867482, "learning_rate": 4.906400881768057e-05, "loss": 2.562, "mean_token_accuracy": 0.4103448331356049, "step": 136545 }, { "epoch": 0.13753461027495842, "grad_norm": 9.902118063692782, "learning_rate": 4.90639018612182e-05, "loss": 2.2668, "mean_token_accuracy": 0.43103448748588563, "step": 136550 }, { "epoch": 0.1375396463280626, "grad_norm": 13.30530957149223, "learning_rate": 4.9063794898775e-05, "loss": 2.5321, "mean_token_accuracy": 0.43103448748588563, "step": 136555 }, { "epoch": 0.13754468238116677, "grad_norm": 9.003939930118364, "learning_rate": 4.906368793035102e-05, "loss": 2.8386, "mean_token_accuracy": 0.41724138259887694, "step": 136560 }, { "epoch": 0.13754971843427094, "grad_norm": 12.764952823437417, "learning_rate": 4.906358095594625e-05, "loss": 2.4504, "mean_token_accuracy": 0.4379310369491577, "step": 136565 }, { "epoch": 0.1375547544873751, "grad_norm": 9.731918624674412, "learning_rate": 4.906347397556074e-05, "loss": 2.2048, "mean_token_accuracy": 0.46551724076271056, "step": 136570 }, { "epoch": 0.1375597905404793, "grad_norm": 14.460179150725013, "learning_rate": 4.906336698919453e-05, "loss": 2.5673, "mean_token_accuracy": 0.4068965494632721, "step": 136575 }, { "epoch": 0.13756482659358346, "grad_norm": 10.504726128188349, "learning_rate": 4.9063259996847634e-05, "loss": 2.7979, "mean_token_accuracy": 0.36206896007061007, "step": 136580 }, { "epoch": 0.13756986264668764, "grad_norm": 10.493296448473458, "learning_rate": 4.906315299852009e-05, "loss": 3.068, "mean_token_accuracy": 0.36896551847457887, "step": 136585 }, { "epoch": 0.1375748986997918, "grad_norm": 10.148421421011829, "learning_rate": 4.906304599421192e-05, "loss": 2.6677, "mean_token_accuracy": 0.3896551728248596, "step": 136590 }, { "epoch": 0.13757993475289598, "grad_norm": 8.9807198983443, "learning_rate": 4.9062938983923166e-05, "loss": 2.1835, "mean_token_accuracy": 0.49134907126426697, "step": 136595 }, { "epoch": 0.13758497080600016, "grad_norm": 10.801109995988671, "learning_rate": 4.9062831967653846e-05, "loss": 1.8324, "mean_token_accuracy": 0.5334543228149414, "step": 136600 }, { "epoch": 0.13759000685910433, "grad_norm": 9.405679141414362, "learning_rate": 4.9062724945404e-05, "loss": 2.3966, "mean_token_accuracy": 0.44827587008476255, "step": 136605 }, { "epoch": 0.1375950429122085, "grad_norm": 9.845748829689436, "learning_rate": 4.9062617917173645e-05, "loss": 2.2054, "mean_token_accuracy": 0.4655172348022461, "step": 136610 }, { "epoch": 0.13760007896531268, "grad_norm": 11.187357914483874, "learning_rate": 4.906251088296281e-05, "loss": 2.7507, "mean_token_accuracy": 0.3758620619773865, "step": 136615 }, { "epoch": 0.13760511501841685, "grad_norm": 10.445948104660134, "learning_rate": 4.9062403842771546e-05, "loss": 2.7209, "mean_token_accuracy": 0.44827585220336913, "step": 136620 }, { "epoch": 0.13761015107152103, "grad_norm": 11.01636669908, "learning_rate": 4.906229679659987e-05, "loss": 2.2845, "mean_token_accuracy": 0.3931034505367279, "step": 136625 }, { "epoch": 0.1376151871246252, "grad_norm": 15.111603290554621, "learning_rate": 4.906218974444781e-05, "loss": 2.4377, "mean_token_accuracy": 0.4551724076271057, "step": 136630 }, { "epoch": 0.13762022317772937, "grad_norm": 8.375800745313047, "learning_rate": 4.906208268631539e-05, "loss": 2.1765, "mean_token_accuracy": 0.43448275327682495, "step": 136635 }, { "epoch": 0.13762525923083355, "grad_norm": 11.653389826858136, "learning_rate": 4.9061975622202645e-05, "loss": 2.3983, "mean_token_accuracy": 0.3793103516101837, "step": 136640 }, { "epoch": 0.13763029528393772, "grad_norm": 11.276637213463118, "learning_rate": 4.9061868552109605e-05, "loss": 2.0715, "mean_token_accuracy": 0.4931034505367279, "step": 136645 }, { "epoch": 0.1376353313370419, "grad_norm": 10.104326211257732, "learning_rate": 4.90617614760363e-05, "loss": 2.4408, "mean_token_accuracy": 0.4137930989265442, "step": 136650 }, { "epoch": 0.13764036739014607, "grad_norm": 11.396492953972677, "learning_rate": 4.906165439398277e-05, "loss": 2.6152, "mean_token_accuracy": 0.4620689570903778, "step": 136655 }, { "epoch": 0.13764540344325024, "grad_norm": 11.79919644382028, "learning_rate": 4.906154730594903e-05, "loss": 2.2641, "mean_token_accuracy": 0.42413793206214906, "step": 136660 }, { "epoch": 0.1376504394963544, "grad_norm": 12.052208431850582, "learning_rate": 4.906144021193511e-05, "loss": 2.6911, "mean_token_accuracy": 0.4310344815254211, "step": 136665 }, { "epoch": 0.13765547554945856, "grad_norm": 10.716860197718113, "learning_rate": 4.906133311194105e-05, "loss": 2.2462, "mean_token_accuracy": 0.4713853597640991, "step": 136670 }, { "epoch": 0.13766051160256274, "grad_norm": 8.845636419514264, "learning_rate": 4.906122600596687e-05, "loss": 2.4097, "mean_token_accuracy": 0.45172412395477296, "step": 136675 }, { "epoch": 0.1376655476556669, "grad_norm": 9.832655404320112, "learning_rate": 4.9061118894012606e-05, "loss": 2.3503, "mean_token_accuracy": 0.4034482777118683, "step": 136680 }, { "epoch": 0.13767058370877108, "grad_norm": 10.656940961050061, "learning_rate": 4.906101177607828e-05, "loss": 2.5194, "mean_token_accuracy": 0.4103448331356049, "step": 136685 }, { "epoch": 0.13767561976187526, "grad_norm": 13.850997173002858, "learning_rate": 4.906090465216393e-05, "loss": 2.0968, "mean_token_accuracy": 0.45722927451133727, "step": 136690 }, { "epoch": 0.13768065581497943, "grad_norm": 12.083859191690681, "learning_rate": 4.9060797522269587e-05, "loss": 2.3586, "mean_token_accuracy": 0.4413793087005615, "step": 136695 }, { "epoch": 0.1376856918680836, "grad_norm": 10.55002487815266, "learning_rate": 4.9060690386395264e-05, "loss": 2.2371, "mean_token_accuracy": 0.44137930274009707, "step": 136700 }, { "epoch": 0.13769072792118778, "grad_norm": 7.593834996570759, "learning_rate": 4.906058324454102e-05, "loss": 1.8884, "mean_token_accuracy": 0.4896551728248596, "step": 136705 }, { "epoch": 0.13769576397429195, "grad_norm": 9.774646881517143, "learning_rate": 4.906047609670685e-05, "loss": 1.947, "mean_token_accuracy": 0.48275862336158754, "step": 136710 }, { "epoch": 0.13770080002739613, "grad_norm": 9.924265967868068, "learning_rate": 4.906036894289281e-05, "loss": 2.6218, "mean_token_accuracy": 0.334482753276825, "step": 136715 }, { "epoch": 0.1377058360805003, "grad_norm": 10.602625830938189, "learning_rate": 4.906026178309893e-05, "loss": 2.5839, "mean_token_accuracy": 0.37241379022598264, "step": 136720 }, { "epoch": 0.13771087213360447, "grad_norm": 8.495630254069749, "learning_rate": 4.906015461732522e-05, "loss": 2.3646, "mean_token_accuracy": 0.45015124678611756, "step": 136725 }, { "epoch": 0.13771590818670865, "grad_norm": 9.61102893210028, "learning_rate": 4.9060047445571724e-05, "loss": 2.379, "mean_token_accuracy": 0.42413792610168455, "step": 136730 }, { "epoch": 0.13772094423981282, "grad_norm": 12.124575524206698, "learning_rate": 4.905994026783846e-05, "loss": 2.4611, "mean_token_accuracy": 0.4379310250282288, "step": 136735 }, { "epoch": 0.137725980292917, "grad_norm": 11.709566039554073, "learning_rate": 4.905983308412548e-05, "loss": 2.5627, "mean_token_accuracy": 0.41034482717514037, "step": 136740 }, { "epoch": 0.13773101634602117, "grad_norm": 9.630544866211713, "learning_rate": 4.90597258944328e-05, "loss": 2.2565, "mean_token_accuracy": 0.441379314661026, "step": 136745 }, { "epoch": 0.13773605239912534, "grad_norm": 13.858902192140379, "learning_rate": 4.9059618698760436e-05, "loss": 2.4368, "mean_token_accuracy": 0.44482758045196535, "step": 136750 }, { "epoch": 0.13774108845222952, "grad_norm": 9.291180180520113, "learning_rate": 4.905951149710844e-05, "loss": 2.1934, "mean_token_accuracy": 0.4310344815254211, "step": 136755 }, { "epoch": 0.1377461245053337, "grad_norm": 14.002157040737972, "learning_rate": 4.905940428947683e-05, "loss": 2.1689, "mean_token_accuracy": 0.47434967160224917, "step": 136760 }, { "epoch": 0.13775116055843786, "grad_norm": 8.820057836259314, "learning_rate": 4.905929707586564e-05, "loss": 2.6486, "mean_token_accuracy": 0.45051422119140627, "step": 136765 }, { "epoch": 0.13775619661154204, "grad_norm": 12.042484893992597, "learning_rate": 4.9059189856274906e-05, "loss": 2.5097, "mean_token_accuracy": 0.4310344815254211, "step": 136770 }, { "epoch": 0.1377612326646462, "grad_norm": 9.591091031643348, "learning_rate": 4.905908263070464e-05, "loss": 1.8931, "mean_token_accuracy": 0.5090139091014863, "step": 136775 }, { "epoch": 0.13776626871775038, "grad_norm": 13.975887059718948, "learning_rate": 4.905897539915488e-05, "loss": 2.7371, "mean_token_accuracy": 0.38620689511299133, "step": 136780 }, { "epoch": 0.13777130477085456, "grad_norm": 11.277756565453252, "learning_rate": 4.905886816162567e-05, "loss": 2.5793, "mean_token_accuracy": 0.38275861740112305, "step": 136785 }, { "epoch": 0.13777634082395873, "grad_norm": 10.185410510737633, "learning_rate": 4.905876091811702e-05, "loss": 2.6091, "mean_token_accuracy": 0.3931034505367279, "step": 136790 }, { "epoch": 0.1377813768770629, "grad_norm": 8.0069155507688, "learning_rate": 4.905865366862897e-05, "loss": 2.2221, "mean_token_accuracy": 0.458620685338974, "step": 136795 }, { "epoch": 0.13778641293016708, "grad_norm": 10.834240940441203, "learning_rate": 4.905854641316154e-05, "loss": 2.2695, "mean_token_accuracy": 0.4413793087005615, "step": 136800 }, { "epoch": 0.13779144898327123, "grad_norm": 12.09209203205288, "learning_rate": 4.905843915171477e-05, "loss": 2.3944, "mean_token_accuracy": 0.44137930274009707, "step": 136805 }, { "epoch": 0.1377964850363754, "grad_norm": 10.38534993805814, "learning_rate": 4.905833188428869e-05, "loss": 2.1349, "mean_token_accuracy": 0.47241379618644713, "step": 136810 }, { "epoch": 0.13780152108947957, "grad_norm": 7.594934501084621, "learning_rate": 4.905822461088333e-05, "loss": 2.2327, "mean_token_accuracy": 0.4862069010734558, "step": 136815 }, { "epoch": 0.13780655714258375, "grad_norm": 14.194466932647156, "learning_rate": 4.905811733149871e-05, "loss": 2.6584, "mean_token_accuracy": 0.3999999940395355, "step": 136820 }, { "epoch": 0.13781159319568792, "grad_norm": 12.965053641913252, "learning_rate": 4.905801004613486e-05, "loss": 2.4713, "mean_token_accuracy": 0.42413792610168455, "step": 136825 }, { "epoch": 0.1378166292487921, "grad_norm": 12.280817252737299, "learning_rate": 4.905790275479182e-05, "loss": 2.377, "mean_token_accuracy": 0.4413793087005615, "step": 136830 }, { "epoch": 0.13782166530189627, "grad_norm": 11.878873899025091, "learning_rate": 4.905779545746962e-05, "loss": 2.5405, "mean_token_accuracy": 0.3896551728248596, "step": 136835 }, { "epoch": 0.13782670135500044, "grad_norm": 9.630215387534745, "learning_rate": 4.905768815416829e-05, "loss": 2.1913, "mean_token_accuracy": 0.4448275864124298, "step": 136840 }, { "epoch": 0.13783173740810462, "grad_norm": 10.875049910242822, "learning_rate": 4.9057580844887834e-05, "loss": 2.4449, "mean_token_accuracy": 0.4448275864124298, "step": 136845 }, { "epoch": 0.1378367734612088, "grad_norm": 11.19522485683445, "learning_rate": 4.905747352962832e-05, "loss": 2.5162, "mean_token_accuracy": 0.4254688322544098, "step": 136850 }, { "epoch": 0.13784180951431296, "grad_norm": 8.350857277435006, "learning_rate": 4.905736620838975e-05, "loss": 2.4651, "mean_token_accuracy": 0.39655172526836396, "step": 136855 }, { "epoch": 0.13784684556741714, "grad_norm": 10.284784564587305, "learning_rate": 4.905725888117218e-05, "loss": 2.4822, "mean_token_accuracy": 0.4172413766384125, "step": 136860 }, { "epoch": 0.1378518816205213, "grad_norm": 10.669307444808718, "learning_rate": 4.905715154797561e-05, "loss": 2.064, "mean_token_accuracy": 0.5, "step": 136865 }, { "epoch": 0.13785691767362548, "grad_norm": 13.866455836440284, "learning_rate": 4.905704420880009e-05, "loss": 2.5525, "mean_token_accuracy": 0.4137930989265442, "step": 136870 }, { "epoch": 0.13786195372672966, "grad_norm": 12.248044318795838, "learning_rate": 4.9056936863645635e-05, "loss": 2.1595, "mean_token_accuracy": 0.48411330580711365, "step": 136875 }, { "epoch": 0.13786698977983383, "grad_norm": 8.668135042266153, "learning_rate": 4.9056829512512294e-05, "loss": 2.4577, "mean_token_accuracy": 0.429824560880661, "step": 136880 }, { "epoch": 0.137872025832938, "grad_norm": 11.294755298022483, "learning_rate": 4.9056722155400074e-05, "loss": 2.8683, "mean_token_accuracy": 0.33793103992938994, "step": 136885 }, { "epoch": 0.13787706188604218, "grad_norm": 10.386652912984934, "learning_rate": 4.905661479230903e-05, "loss": 2.1511, "mean_token_accuracy": 0.49655172824859617, "step": 136890 }, { "epoch": 0.13788209793914635, "grad_norm": 9.792388121877957, "learning_rate": 4.905650742323917e-05, "loss": 2.2602, "mean_token_accuracy": 0.44482759237289426, "step": 136895 }, { "epoch": 0.13788713399225053, "grad_norm": 15.028538542117635, "learning_rate": 4.905640004819053e-05, "loss": 2.3416, "mean_token_accuracy": 0.4586206912994385, "step": 136900 }, { "epoch": 0.1378921700453547, "grad_norm": 13.425048102203833, "learning_rate": 4.905629266716315e-05, "loss": 2.5431, "mean_token_accuracy": 0.39310344457626345, "step": 136905 }, { "epoch": 0.13789720609845887, "grad_norm": 10.908821719862164, "learning_rate": 4.905618528015705e-05, "loss": 2.3338, "mean_token_accuracy": 0.4344827651977539, "step": 136910 }, { "epoch": 0.13790224215156305, "grad_norm": 9.89787586598569, "learning_rate": 4.905607788717225e-05, "loss": 2.5354, "mean_token_accuracy": 0.39310344457626345, "step": 136915 }, { "epoch": 0.13790727820466722, "grad_norm": 12.073112708228308, "learning_rate": 4.90559704882088e-05, "loss": 2.2725, "mean_token_accuracy": 0.4034482717514038, "step": 136920 }, { "epoch": 0.1379123142577714, "grad_norm": 13.285635492773977, "learning_rate": 4.905586308326673e-05, "loss": 3.2935, "mean_token_accuracy": 0.3241379290819168, "step": 136925 }, { "epoch": 0.13791735031087557, "grad_norm": 14.378967800372177, "learning_rate": 4.905575567234605e-05, "loss": 2.6202, "mean_token_accuracy": 0.3862069010734558, "step": 136930 }, { "epoch": 0.13792238636397974, "grad_norm": 10.900127823284995, "learning_rate": 4.9055648255446803e-05, "loss": 1.9515, "mean_token_accuracy": 0.482758617401123, "step": 136935 }, { "epoch": 0.13792742241708392, "grad_norm": 9.970344111401257, "learning_rate": 4.9055540832569014e-05, "loss": 2.0349, "mean_token_accuracy": 0.49655172824859617, "step": 136940 }, { "epoch": 0.13793245847018806, "grad_norm": 10.066737277256141, "learning_rate": 4.905543340371272e-05, "loss": 2.0653, "mean_token_accuracy": 0.4551724135875702, "step": 136945 }, { "epoch": 0.13793749452329224, "grad_norm": 15.82429436231584, "learning_rate": 4.9055325968877945e-05, "loss": 1.9315, "mean_token_accuracy": 0.4724137783050537, "step": 136950 }, { "epoch": 0.1379425305763964, "grad_norm": 11.176959868858997, "learning_rate": 4.905521852806472e-05, "loss": 2.3429, "mean_token_accuracy": 0.47241379618644713, "step": 136955 }, { "epoch": 0.13794756662950058, "grad_norm": 10.920548319080318, "learning_rate": 4.9055111081273076e-05, "loss": 2.7859, "mean_token_accuracy": 0.3827586233615875, "step": 136960 }, { "epoch": 0.13795260268260476, "grad_norm": 9.071779718795359, "learning_rate": 4.905500362850304e-05, "loss": 2.3313, "mean_token_accuracy": 0.43793103098869324, "step": 136965 }, { "epoch": 0.13795763873570893, "grad_norm": 14.804829078159091, "learning_rate": 4.9054896169754645e-05, "loss": 2.3547, "mean_token_accuracy": 0.4206896543502808, "step": 136970 }, { "epoch": 0.1379626747888131, "grad_norm": 9.867255189960712, "learning_rate": 4.905478870502792e-05, "loss": 2.4791, "mean_token_accuracy": 0.38275861740112305, "step": 136975 }, { "epoch": 0.13796771084191728, "grad_norm": 11.315643617224785, "learning_rate": 4.905468123432288e-05, "loss": 2.4736, "mean_token_accuracy": 0.4137930989265442, "step": 136980 }, { "epoch": 0.13797274689502145, "grad_norm": 8.824749949082534, "learning_rate": 4.9054573757639586e-05, "loss": 2.9322, "mean_token_accuracy": 0.38052027225494384, "step": 136985 }, { "epoch": 0.13797778294812563, "grad_norm": 12.474583745039066, "learning_rate": 4.905446627497804e-05, "loss": 2.484, "mean_token_accuracy": 0.3670296400785446, "step": 136990 }, { "epoch": 0.1379828190012298, "grad_norm": 10.503463242314194, "learning_rate": 4.905435878633829e-05, "loss": 2.258, "mean_token_accuracy": 0.42068964838981626, "step": 136995 }, { "epoch": 0.13798785505433397, "grad_norm": 14.520077099238978, "learning_rate": 4.905425129172036e-05, "loss": 2.5189, "mean_token_accuracy": 0.4379310369491577, "step": 137000 }, { "epoch": 0.13799289110743815, "grad_norm": 12.288976383106434, "learning_rate": 4.9054143791124266e-05, "loss": 2.3049, "mean_token_accuracy": 0.4517241299152374, "step": 137005 }, { "epoch": 0.13799792716054232, "grad_norm": 10.870148067752572, "learning_rate": 4.905403628455006e-05, "loss": 2.0616, "mean_token_accuracy": 0.5401088893413544, "step": 137010 }, { "epoch": 0.1380029632136465, "grad_norm": 11.381628355192376, "learning_rate": 4.905392877199776e-05, "loss": 2.1135, "mean_token_accuracy": 0.49999999403953554, "step": 137015 }, { "epoch": 0.13800799926675067, "grad_norm": 12.398975859255172, "learning_rate": 4.90538212534674e-05, "loss": 2.266, "mean_token_accuracy": 0.441379314661026, "step": 137020 }, { "epoch": 0.13801303531985484, "grad_norm": 8.669295632679196, "learning_rate": 4.9053713728959e-05, "loss": 2.14, "mean_token_accuracy": 0.47241379618644713, "step": 137025 }, { "epoch": 0.13801807137295902, "grad_norm": 11.444708069676741, "learning_rate": 4.90536061984726e-05, "loss": 2.3678, "mean_token_accuracy": 0.43793103098869324, "step": 137030 }, { "epoch": 0.1380231074260632, "grad_norm": 14.32177627152805, "learning_rate": 4.9053498662008224e-05, "loss": 2.7152, "mean_token_accuracy": 0.4344827592372894, "step": 137035 }, { "epoch": 0.13802814347916736, "grad_norm": 10.144860758809681, "learning_rate": 4.9053391119565906e-05, "loss": 2.5075, "mean_token_accuracy": 0.4206896543502808, "step": 137040 }, { "epoch": 0.13803317953227154, "grad_norm": 11.131856876310675, "learning_rate": 4.9053283571145684e-05, "loss": 2.4843, "mean_token_accuracy": 0.3793103456497192, "step": 137045 }, { "epoch": 0.1380382155853757, "grad_norm": 12.656420751038938, "learning_rate": 4.9053176016747563e-05, "loss": 2.4822, "mean_token_accuracy": 0.4517241358757019, "step": 137050 }, { "epoch": 0.13804325163847989, "grad_norm": 9.106570207860083, "learning_rate": 4.90530684563716e-05, "loss": 2.1144, "mean_token_accuracy": 0.4931034445762634, "step": 137055 }, { "epoch": 0.13804828769158406, "grad_norm": 10.8556618101491, "learning_rate": 4.905296089001781e-05, "loss": 2.3589, "mean_token_accuracy": 0.4604355812072754, "step": 137060 }, { "epoch": 0.13805332374468823, "grad_norm": 13.90595170214158, "learning_rate": 4.905285331768622e-05, "loss": 2.4118, "mean_token_accuracy": 0.43448275327682495, "step": 137065 }, { "epoch": 0.1380583597977924, "grad_norm": 10.359932887432198, "learning_rate": 4.905274573937688e-05, "loss": 2.809, "mean_token_accuracy": 0.3758620619773865, "step": 137070 }, { "epoch": 0.13806339585089658, "grad_norm": 11.624453058993755, "learning_rate": 4.9052638155089784e-05, "loss": 2.3946, "mean_token_accuracy": 0.46551724076271056, "step": 137075 }, { "epoch": 0.13806843190400075, "grad_norm": 10.360263526165383, "learning_rate": 4.9052530564825e-05, "loss": 2.1491, "mean_token_accuracy": 0.4620689690113068, "step": 137080 }, { "epoch": 0.1380734679571049, "grad_norm": 9.680510268385348, "learning_rate": 4.905242296858254e-05, "loss": 2.1831, "mean_token_accuracy": 0.39655172228813174, "step": 137085 }, { "epoch": 0.13807850401020907, "grad_norm": 10.689559689993407, "learning_rate": 4.9052315366362437e-05, "loss": 2.4747, "mean_token_accuracy": 0.42758620381355283, "step": 137090 }, { "epoch": 0.13808354006331325, "grad_norm": 9.599386529997076, "learning_rate": 4.905220775816471e-05, "loss": 2.247, "mean_token_accuracy": 0.4602540791034698, "step": 137095 }, { "epoch": 0.13808857611641742, "grad_norm": 9.433341354868118, "learning_rate": 4.90521001439894e-05, "loss": 2.4138, "mean_token_accuracy": 0.41917724609375, "step": 137100 }, { "epoch": 0.1380936121695216, "grad_norm": 11.290225540078543, "learning_rate": 4.905199252383654e-05, "loss": 2.4215, "mean_token_accuracy": 0.41034482717514037, "step": 137105 }, { "epoch": 0.13809864822262577, "grad_norm": 9.879474403676081, "learning_rate": 4.905188489770615e-05, "loss": 2.2323, "mean_token_accuracy": 0.44482758045196535, "step": 137110 }, { "epoch": 0.13810368427572994, "grad_norm": 12.747891207018581, "learning_rate": 4.905177726559827e-05, "loss": 2.41, "mean_token_accuracy": 0.45517240166664125, "step": 137115 }, { "epoch": 0.13810872032883412, "grad_norm": 12.10589362761177, "learning_rate": 4.9051669627512906e-05, "loss": 2.2764, "mean_token_accuracy": 0.441379314661026, "step": 137120 }, { "epoch": 0.1381137563819383, "grad_norm": 10.312404845477976, "learning_rate": 4.905156198345013e-05, "loss": 2.3761, "mean_token_accuracy": 0.44016939401626587, "step": 137125 }, { "epoch": 0.13811879243504246, "grad_norm": 9.25616159943411, "learning_rate": 4.905145433340993e-05, "loss": 2.0753, "mean_token_accuracy": 0.4448275864124298, "step": 137130 }, { "epoch": 0.13812382848814664, "grad_norm": 15.540410244376028, "learning_rate": 4.905134667739236e-05, "loss": 3.0296, "mean_token_accuracy": 0.34482758641242983, "step": 137135 }, { "epoch": 0.1381288645412508, "grad_norm": 10.735319805341923, "learning_rate": 4.905123901539745e-05, "loss": 2.2277, "mean_token_accuracy": 0.47816091775894165, "step": 137140 }, { "epoch": 0.13813390059435499, "grad_norm": 12.175245379993324, "learning_rate": 4.9051131347425214e-05, "loss": 2.4507, "mean_token_accuracy": 0.41724138259887694, "step": 137145 }, { "epoch": 0.13813893664745916, "grad_norm": 18.14650242142119, "learning_rate": 4.90510236734757e-05, "loss": 2.5692, "mean_token_accuracy": 0.441379314661026, "step": 137150 }, { "epoch": 0.13814397270056333, "grad_norm": 10.868349348383948, "learning_rate": 4.905091599354892e-05, "loss": 2.3217, "mean_token_accuracy": 0.43103447556495667, "step": 137155 }, { "epoch": 0.1381490087536675, "grad_norm": 13.34380480243648, "learning_rate": 4.905080830764492e-05, "loss": 2.6578, "mean_token_accuracy": 0.3862069010734558, "step": 137160 }, { "epoch": 0.13815404480677168, "grad_norm": 8.348855998744924, "learning_rate": 4.9050700615763714e-05, "loss": 2.2951, "mean_token_accuracy": 0.45674530863761903, "step": 137165 }, { "epoch": 0.13815908085987585, "grad_norm": 10.714009487202686, "learning_rate": 4.905059291790536e-05, "loss": 2.0102, "mean_token_accuracy": 0.4931034505367279, "step": 137170 }, { "epoch": 0.13816411691298003, "grad_norm": 14.481448321359927, "learning_rate": 4.905048521406985e-05, "loss": 2.1652, "mean_token_accuracy": 0.4730295598506927, "step": 137175 }, { "epoch": 0.1381691529660842, "grad_norm": 10.231914992411706, "learning_rate": 4.905037750425724e-05, "loss": 2.026, "mean_token_accuracy": 0.4620689690113068, "step": 137180 }, { "epoch": 0.13817418901918838, "grad_norm": 15.689019094250472, "learning_rate": 4.905026978846755e-05, "loss": 2.6332, "mean_token_accuracy": 0.46896552443504336, "step": 137185 }, { "epoch": 0.13817922507229255, "grad_norm": 9.469104299702643, "learning_rate": 4.905016206670081e-05, "loss": 2.3308, "mean_token_accuracy": 0.4413793087005615, "step": 137190 }, { "epoch": 0.13818426112539672, "grad_norm": 10.221052666376501, "learning_rate": 4.905005433895706e-05, "loss": 2.2697, "mean_token_accuracy": 0.4551724076271057, "step": 137195 }, { "epoch": 0.1381892971785009, "grad_norm": 9.591216125438017, "learning_rate": 4.904994660523631e-05, "loss": 2.4653, "mean_token_accuracy": 0.4068965494632721, "step": 137200 }, { "epoch": 0.13819433323160507, "grad_norm": 10.852272926656907, "learning_rate": 4.904983886553862e-05, "loss": 2.2275, "mean_token_accuracy": 0.40000000298023225, "step": 137205 }, { "epoch": 0.13819936928470924, "grad_norm": 10.475173500781013, "learning_rate": 4.904973111986399e-05, "loss": 2.2973, "mean_token_accuracy": 0.41034482717514037, "step": 137210 }, { "epoch": 0.13820440533781342, "grad_norm": 11.55150213426642, "learning_rate": 4.904962336821246e-05, "loss": 2.3126, "mean_token_accuracy": 0.4482758641242981, "step": 137215 }, { "epoch": 0.1382094413909176, "grad_norm": 11.281230794346442, "learning_rate": 4.9049515610584066e-05, "loss": 2.707, "mean_token_accuracy": 0.42758620381355283, "step": 137220 }, { "epoch": 0.13821447744402174, "grad_norm": 11.021125983677178, "learning_rate": 4.904940784697883e-05, "loss": 2.3097, "mean_token_accuracy": 0.4551724135875702, "step": 137225 }, { "epoch": 0.1382195134971259, "grad_norm": 11.119004590427968, "learning_rate": 4.904930007739679e-05, "loss": 2.6087, "mean_token_accuracy": 0.33793103098869326, "step": 137230 }, { "epoch": 0.13822454955023009, "grad_norm": 10.573175022213283, "learning_rate": 4.9049192301837964e-05, "loss": 2.1897, "mean_token_accuracy": 0.45172414779663084, "step": 137235 }, { "epoch": 0.13822958560333426, "grad_norm": 16.90987496133507, "learning_rate": 4.9049084520302396e-05, "loss": 2.9156, "mean_token_accuracy": 0.36896551847457887, "step": 137240 }, { "epoch": 0.13823462165643843, "grad_norm": 16.466814730736886, "learning_rate": 4.904897673279011e-05, "loss": 2.767, "mean_token_accuracy": 0.4085299432277679, "step": 137245 }, { "epoch": 0.1382396577095426, "grad_norm": 10.317154033298207, "learning_rate": 4.904886893930113e-05, "loss": 2.353, "mean_token_accuracy": 0.4586206912994385, "step": 137250 }, { "epoch": 0.13824469376264678, "grad_norm": 14.731924504778434, "learning_rate": 4.9048761139835494e-05, "loss": 2.5058, "mean_token_accuracy": 0.4310344815254211, "step": 137255 }, { "epoch": 0.13824972981575095, "grad_norm": 9.495680848055532, "learning_rate": 4.904865333439323e-05, "loss": 2.2217, "mean_token_accuracy": 0.36896551251411436, "step": 137260 }, { "epoch": 0.13825476586885513, "grad_norm": 10.395401397049287, "learning_rate": 4.904854552297436e-05, "loss": 2.2706, "mean_token_accuracy": 0.4482758641242981, "step": 137265 }, { "epoch": 0.1382598019219593, "grad_norm": 9.271198117956585, "learning_rate": 4.9048437705578924e-05, "loss": 1.9969, "mean_token_accuracy": 0.48275861144065857, "step": 137270 }, { "epoch": 0.13826483797506348, "grad_norm": 10.250941744286214, "learning_rate": 4.904832988220695e-05, "loss": 2.3267, "mean_token_accuracy": 0.4151845097541809, "step": 137275 }, { "epoch": 0.13826987402816765, "grad_norm": 9.250033277470592, "learning_rate": 4.9048222052858464e-05, "loss": 2.4131, "mean_token_accuracy": 0.42758620381355283, "step": 137280 }, { "epoch": 0.13827491008127182, "grad_norm": 11.554099944772243, "learning_rate": 4.90481142175335e-05, "loss": 2.3868, "mean_token_accuracy": 0.4551724076271057, "step": 137285 }, { "epoch": 0.138279946134376, "grad_norm": 9.451505387463778, "learning_rate": 4.904800637623208e-05, "loss": 2.217, "mean_token_accuracy": 0.40344828367233276, "step": 137290 }, { "epoch": 0.13828498218748017, "grad_norm": 7.950810557745291, "learning_rate": 4.9047898528954243e-05, "loss": 2.0858, "mean_token_accuracy": 0.5000000059604645, "step": 137295 }, { "epoch": 0.13829001824058434, "grad_norm": 11.068630256193153, "learning_rate": 4.904779067570002e-05, "loss": 2.3589, "mean_token_accuracy": 0.41034482717514037, "step": 137300 }, { "epoch": 0.13829505429368852, "grad_norm": 11.382220404431587, "learning_rate": 4.904768281646944e-05, "loss": 2.5385, "mean_token_accuracy": 0.42758620977401735, "step": 137305 }, { "epoch": 0.1383000903467927, "grad_norm": 12.88338565696784, "learning_rate": 4.904757495126252e-05, "loss": 2.5518, "mean_token_accuracy": 0.4034482777118683, "step": 137310 }, { "epoch": 0.13830512639989687, "grad_norm": 12.480782662638976, "learning_rate": 4.9047467080079305e-05, "loss": 2.8154, "mean_token_accuracy": 0.39310344457626345, "step": 137315 }, { "epoch": 0.13831016245300104, "grad_norm": 11.82498137813083, "learning_rate": 4.904735920291982e-05, "loss": 2.4048, "mean_token_accuracy": 0.41724138259887694, "step": 137320 }, { "epoch": 0.1383151985061052, "grad_norm": 9.58618007083539, "learning_rate": 4.9047251319784096e-05, "loss": 2.5648, "mean_token_accuracy": 0.403448274731636, "step": 137325 }, { "epoch": 0.1383202345592094, "grad_norm": 9.01771748780737, "learning_rate": 4.904714343067216e-05, "loss": 2.1572, "mean_token_accuracy": 0.46751360297203065, "step": 137330 }, { "epoch": 0.13832527061231356, "grad_norm": 9.15640355929264, "learning_rate": 4.904703553558404e-05, "loss": 3.0106, "mean_token_accuracy": 0.3827586114406586, "step": 137335 }, { "epoch": 0.13833030666541773, "grad_norm": 11.72502150114695, "learning_rate": 4.9046927634519774e-05, "loss": 2.3864, "mean_token_accuracy": 0.40344828367233276, "step": 137340 }, { "epoch": 0.1383353427185219, "grad_norm": 9.52159232524416, "learning_rate": 4.904681972747938e-05, "loss": 2.1858, "mean_token_accuracy": 0.4413793087005615, "step": 137345 }, { "epoch": 0.13834037877162608, "grad_norm": 9.218164867696823, "learning_rate": 4.904671181446291e-05, "loss": 2.3578, "mean_token_accuracy": 0.4482758641242981, "step": 137350 }, { "epoch": 0.13834541482473026, "grad_norm": 11.812019112137193, "learning_rate": 4.9046603895470364e-05, "loss": 2.6194, "mean_token_accuracy": 0.3931034505367279, "step": 137355 }, { "epoch": 0.13835045087783443, "grad_norm": 11.781836200897756, "learning_rate": 4.9046495970501794e-05, "loss": 2.6645, "mean_token_accuracy": 0.42558982968330383, "step": 137360 }, { "epoch": 0.13835548693093858, "grad_norm": 11.276375512533276, "learning_rate": 4.904638803955722e-05, "loss": 2.1741, "mean_token_accuracy": 0.43061100840568545, "step": 137365 }, { "epoch": 0.13836052298404275, "grad_norm": 10.405169205644825, "learning_rate": 4.904628010263668e-05, "loss": 2.6281, "mean_token_accuracy": 0.39310344457626345, "step": 137370 }, { "epoch": 0.13836555903714692, "grad_norm": 12.194100580645351, "learning_rate": 4.904617215974019e-05, "loss": 2.1313, "mean_token_accuracy": 0.4482758641242981, "step": 137375 }, { "epoch": 0.1383705950902511, "grad_norm": 10.833087115932075, "learning_rate": 4.9046064210867795e-05, "loss": 2.3863, "mean_token_accuracy": 0.4379310250282288, "step": 137380 }, { "epoch": 0.13837563114335527, "grad_norm": 7.916665057757423, "learning_rate": 4.904595625601952e-05, "loss": 2.2082, "mean_token_accuracy": 0.47447065711021424, "step": 137385 }, { "epoch": 0.13838066719645944, "grad_norm": 8.663216830766238, "learning_rate": 4.904584829519539e-05, "loss": 1.9562, "mean_token_accuracy": 0.4965517222881317, "step": 137390 }, { "epoch": 0.13838570324956362, "grad_norm": 11.622604955985764, "learning_rate": 4.904574032839544e-05, "loss": 2.5407, "mean_token_accuracy": 0.39310344457626345, "step": 137395 }, { "epoch": 0.1383907393026678, "grad_norm": 11.697562640288808, "learning_rate": 4.90456323556197e-05, "loss": 3.1473, "mean_token_accuracy": 0.34827586114406583, "step": 137400 }, { "epoch": 0.13839577535577197, "grad_norm": 10.577042874043594, "learning_rate": 4.9045524376868196e-05, "loss": 3.3001, "mean_token_accuracy": 0.34137930870056155, "step": 137405 }, { "epoch": 0.13840081140887614, "grad_norm": 12.746065946383407, "learning_rate": 4.904541639214096e-05, "loss": 2.4203, "mean_token_accuracy": 0.43448275327682495, "step": 137410 }, { "epoch": 0.1384058474619803, "grad_norm": 10.261736686311684, "learning_rate": 4.904530840143803e-05, "loss": 2.3571, "mean_token_accuracy": 0.4606170654296875, "step": 137415 }, { "epoch": 0.1384108835150845, "grad_norm": 13.541318582829687, "learning_rate": 4.904520040475942e-05, "loss": 2.6215, "mean_token_accuracy": 0.4206896543502808, "step": 137420 }, { "epoch": 0.13841591956818866, "grad_norm": 10.280934054611498, "learning_rate": 4.904509240210517e-05, "loss": 2.1093, "mean_token_accuracy": 0.43103447556495667, "step": 137425 }, { "epoch": 0.13842095562129283, "grad_norm": 11.157989834070339, "learning_rate": 4.904498439347531e-05, "loss": 2.2858, "mean_token_accuracy": 0.4379310369491577, "step": 137430 }, { "epoch": 0.138425991674397, "grad_norm": 11.620784354362506, "learning_rate": 4.9044876378869865e-05, "loss": 2.7407, "mean_token_accuracy": 0.42068966031074523, "step": 137435 }, { "epoch": 0.13843102772750118, "grad_norm": 11.273182659546192, "learning_rate": 4.9044768358288866e-05, "loss": 2.6857, "mean_token_accuracy": 0.37586206793785093, "step": 137440 }, { "epoch": 0.13843606378060536, "grad_norm": 10.910585631493031, "learning_rate": 4.904466033173235e-05, "loss": 2.4133, "mean_token_accuracy": 0.4433151870965958, "step": 137445 }, { "epoch": 0.13844109983370953, "grad_norm": 10.726100672630558, "learning_rate": 4.9044552299200345e-05, "loss": 2.4185, "mean_token_accuracy": 0.4206896543502808, "step": 137450 }, { "epoch": 0.1384461358868137, "grad_norm": 10.660147194562361, "learning_rate": 4.9044444260692876e-05, "loss": 2.3437, "mean_token_accuracy": 0.43793103098869324, "step": 137455 }, { "epoch": 0.13845117193991788, "grad_norm": 10.308886504176655, "learning_rate": 4.904433621620997e-05, "loss": 2.1821, "mean_token_accuracy": 0.44627949595451355, "step": 137460 }, { "epoch": 0.13845620799302205, "grad_norm": 11.023020700375163, "learning_rate": 4.9044228165751666e-05, "loss": 1.9921, "mean_token_accuracy": 0.5089534163475037, "step": 137465 }, { "epoch": 0.13846124404612623, "grad_norm": 9.615908699486756, "learning_rate": 4.9044120109317986e-05, "loss": 2.471, "mean_token_accuracy": 0.3999999940395355, "step": 137470 }, { "epoch": 0.1384662800992304, "grad_norm": 14.119461677050118, "learning_rate": 4.904401204690897e-05, "loss": 2.4644, "mean_token_accuracy": 0.44827585816383364, "step": 137475 }, { "epoch": 0.13847131615233457, "grad_norm": 10.701577690105434, "learning_rate": 4.9043903978524636e-05, "loss": 2.3724, "mean_token_accuracy": 0.44827587008476255, "step": 137480 }, { "epoch": 0.13847635220543875, "grad_norm": 11.080186665236422, "learning_rate": 4.904379590416503e-05, "loss": 2.5163, "mean_token_accuracy": 0.45862067937850953, "step": 137485 }, { "epoch": 0.13848138825854292, "grad_norm": 9.603316987386908, "learning_rate": 4.904368782383016e-05, "loss": 2.8909, "mean_token_accuracy": 0.34482758343219755, "step": 137490 }, { "epoch": 0.1384864243116471, "grad_norm": 12.258062794740598, "learning_rate": 4.904357973752007e-05, "loss": 2.7883, "mean_token_accuracy": 0.3103448212146759, "step": 137495 }, { "epoch": 0.13849146036475127, "grad_norm": 12.035741627762773, "learning_rate": 4.9043471645234794e-05, "loss": 2.4714, "mean_token_accuracy": 0.4379310250282288, "step": 137500 }, { "epoch": 0.1384964964178554, "grad_norm": 10.244816802233077, "learning_rate": 4.9043363546974356e-05, "loss": 2.4702, "mean_token_accuracy": 0.4551724076271057, "step": 137505 }, { "epoch": 0.1385015324709596, "grad_norm": 11.16001390284774, "learning_rate": 4.904325544273878e-05, "loss": 2.1344, "mean_token_accuracy": 0.4551724135875702, "step": 137510 }, { "epoch": 0.13850656852406376, "grad_norm": 11.881230684968784, "learning_rate": 4.90431473325281e-05, "loss": 2.1808, "mean_token_accuracy": 0.44827585220336913, "step": 137515 }, { "epoch": 0.13851160457716793, "grad_norm": 10.628101455439015, "learning_rate": 4.904303921634235e-05, "loss": 2.2778, "mean_token_accuracy": 0.4448275864124298, "step": 137520 }, { "epoch": 0.1385166406302721, "grad_norm": 11.438640988404217, "learning_rate": 4.9042931094181555e-05, "loss": 2.5684, "mean_token_accuracy": 0.37586206793785093, "step": 137525 }, { "epoch": 0.13852167668337628, "grad_norm": 10.576739010811345, "learning_rate": 4.9042822966045756e-05, "loss": 2.5198, "mean_token_accuracy": 0.41379310488700866, "step": 137530 }, { "epoch": 0.13852671273648046, "grad_norm": 10.152419746810168, "learning_rate": 4.9042714831934973e-05, "loss": 2.4466, "mean_token_accuracy": 0.44482759237289426, "step": 137535 }, { "epoch": 0.13853174878958463, "grad_norm": 12.276746517424135, "learning_rate": 4.904260669184923e-05, "loss": 2.3249, "mean_token_accuracy": 0.4551724135875702, "step": 137540 }, { "epoch": 0.1385367848426888, "grad_norm": 10.528237674010498, "learning_rate": 4.904249854578857e-05, "loss": 2.5224, "mean_token_accuracy": 0.4609800338745117, "step": 137545 }, { "epoch": 0.13854182089579298, "grad_norm": 11.036238445347475, "learning_rate": 4.904239039375302e-05, "loss": 2.8143, "mean_token_accuracy": 0.36896551847457887, "step": 137550 }, { "epoch": 0.13854685694889715, "grad_norm": 9.231108976476456, "learning_rate": 4.904228223574261e-05, "loss": 2.3346, "mean_token_accuracy": 0.4413793087005615, "step": 137555 }, { "epoch": 0.13855189300200133, "grad_norm": 10.60910131750133, "learning_rate": 4.904217407175736e-05, "loss": 2.0112, "mean_token_accuracy": 0.48965516686439514, "step": 137560 }, { "epoch": 0.1385569290551055, "grad_norm": 11.169699435349395, "learning_rate": 4.9042065901797314e-05, "loss": 2.2149, "mean_token_accuracy": 0.44827585816383364, "step": 137565 }, { "epoch": 0.13856196510820967, "grad_norm": 8.9183196385182, "learning_rate": 4.904195772586249e-05, "loss": 2.4079, "mean_token_accuracy": 0.40344826579093934, "step": 137570 }, { "epoch": 0.13856700116131385, "grad_norm": 11.64262218747763, "learning_rate": 4.904184954395292e-05, "loss": 2.3683, "mean_token_accuracy": 0.4862068951129913, "step": 137575 }, { "epoch": 0.13857203721441802, "grad_norm": 9.93335293716104, "learning_rate": 4.904174135606865e-05, "loss": 2.1072, "mean_token_accuracy": 0.47241379618644713, "step": 137580 }, { "epoch": 0.1385770732675222, "grad_norm": 8.922987126211607, "learning_rate": 4.904163316220969e-05, "loss": 2.1256, "mean_token_accuracy": 0.4793103516101837, "step": 137585 }, { "epoch": 0.13858210932062637, "grad_norm": 11.512434339345575, "learning_rate": 4.9041524962376073e-05, "loss": 2.3249, "mean_token_accuracy": 0.47586206793785096, "step": 137590 }, { "epoch": 0.13858714537373054, "grad_norm": 11.645284226868005, "learning_rate": 4.904141675656784e-05, "loss": 2.503, "mean_token_accuracy": 0.4172413766384125, "step": 137595 }, { "epoch": 0.13859218142683472, "grad_norm": 9.744035686352326, "learning_rate": 4.904130854478501e-05, "loss": 2.1334, "mean_token_accuracy": 0.458620685338974, "step": 137600 }, { "epoch": 0.1385972174799389, "grad_norm": 13.01820589789826, "learning_rate": 4.904120032702763e-05, "loss": 2.4171, "mean_token_accuracy": 0.40689654350280763, "step": 137605 }, { "epoch": 0.13860225353304306, "grad_norm": 9.849513515092672, "learning_rate": 4.904109210329571e-05, "loss": 2.2435, "mean_token_accuracy": 0.458620685338974, "step": 137610 }, { "epoch": 0.13860728958614724, "grad_norm": 11.121458827373726, "learning_rate": 4.904098387358929e-05, "loss": 2.4628, "mean_token_accuracy": 0.4034482777118683, "step": 137615 }, { "epoch": 0.1386123256392514, "grad_norm": 15.226103610918562, "learning_rate": 4.904087563790839e-05, "loss": 2.4234, "mean_token_accuracy": 0.47931033968925474, "step": 137620 }, { "epoch": 0.13861736169235558, "grad_norm": 9.868087293165326, "learning_rate": 4.9040767396253054e-05, "loss": 2.6737, "mean_token_accuracy": 0.3999999940395355, "step": 137625 }, { "epoch": 0.13862239774545976, "grad_norm": 9.515130093003481, "learning_rate": 4.90406591486233e-05, "loss": 1.9356, "mean_token_accuracy": 0.48965516686439514, "step": 137630 }, { "epoch": 0.13862743379856393, "grad_norm": 9.175991263659336, "learning_rate": 4.904055089501917e-05, "loss": 2.2071, "mean_token_accuracy": 0.5, "step": 137635 }, { "epoch": 0.1386324698516681, "grad_norm": 11.396941962001137, "learning_rate": 4.904044263544069e-05, "loss": 2.3473, "mean_token_accuracy": 0.4862068951129913, "step": 137640 }, { "epoch": 0.13863750590477225, "grad_norm": 9.99291140903006, "learning_rate": 4.904033436988789e-05, "loss": 2.4109, "mean_token_accuracy": 0.4034482717514038, "step": 137645 }, { "epoch": 0.13864254195787643, "grad_norm": 10.707973807977968, "learning_rate": 4.904022609836078e-05, "loss": 2.3892, "mean_token_accuracy": 0.4310344815254211, "step": 137650 }, { "epoch": 0.1386475780109806, "grad_norm": 10.665935932947045, "learning_rate": 4.9040117820859424e-05, "loss": 2.5403, "mean_token_accuracy": 0.404718691110611, "step": 137655 }, { "epoch": 0.13865261406408477, "grad_norm": 10.163328041771319, "learning_rate": 4.904000953738383e-05, "loss": 2.4614, "mean_token_accuracy": 0.4206896543502808, "step": 137660 }, { "epoch": 0.13865765011718895, "grad_norm": 9.364856610750643, "learning_rate": 4.9039901247934046e-05, "loss": 2.204, "mean_token_accuracy": 0.4620689570903778, "step": 137665 }, { "epoch": 0.13866268617029312, "grad_norm": 9.30278864850231, "learning_rate": 4.903979295251008e-05, "loss": 2.3598, "mean_token_accuracy": 0.3758620709180832, "step": 137670 }, { "epoch": 0.1386677222233973, "grad_norm": 11.83659438420772, "learning_rate": 4.9039684651111964e-05, "loss": 2.3189, "mean_token_accuracy": 0.4551724076271057, "step": 137675 }, { "epoch": 0.13867275827650147, "grad_norm": 12.75266470224908, "learning_rate": 4.9039576343739744e-05, "loss": 2.5158, "mean_token_accuracy": 0.4310344815254211, "step": 137680 }, { "epoch": 0.13867779432960564, "grad_norm": 12.959357670909712, "learning_rate": 4.903946803039344e-05, "loss": 2.2067, "mean_token_accuracy": 0.4534180283546448, "step": 137685 }, { "epoch": 0.13868283038270982, "grad_norm": 13.650449006551257, "learning_rate": 4.903935971107309e-05, "loss": 2.7519, "mean_token_accuracy": 0.4137930989265442, "step": 137690 }, { "epoch": 0.138687866435814, "grad_norm": 10.036240247186441, "learning_rate": 4.903925138577872e-05, "loss": 2.1192, "mean_token_accuracy": 0.4793103337287903, "step": 137695 }, { "epoch": 0.13869290248891816, "grad_norm": 10.471331265088, "learning_rate": 4.903914305451035e-05, "loss": 2.4165, "mean_token_accuracy": 0.4482758641242981, "step": 137700 }, { "epoch": 0.13869793854202234, "grad_norm": 9.794905224518436, "learning_rate": 4.9039034717268024e-05, "loss": 2.2063, "mean_token_accuracy": 0.4586206912994385, "step": 137705 }, { "epoch": 0.1387029745951265, "grad_norm": 8.86632227156554, "learning_rate": 4.903892637405176e-05, "loss": 2.5154, "mean_token_accuracy": 0.4344827592372894, "step": 137710 }, { "epoch": 0.13870801064823068, "grad_norm": 9.762438746553263, "learning_rate": 4.90388180248616e-05, "loss": 2.411, "mean_token_accuracy": 0.39655172228813174, "step": 137715 }, { "epoch": 0.13871304670133486, "grad_norm": 10.925161537819086, "learning_rate": 4.903870966969756e-05, "loss": 2.251, "mean_token_accuracy": 0.42758620977401735, "step": 137720 }, { "epoch": 0.13871808275443903, "grad_norm": 14.417647549775682, "learning_rate": 4.903860130855969e-05, "loss": 2.8337, "mean_token_accuracy": 0.4000000059604645, "step": 137725 }, { "epoch": 0.1387231188075432, "grad_norm": 15.668986893551715, "learning_rate": 4.9038492941448e-05, "loss": 2.7359, "mean_token_accuracy": 0.3482758581638336, "step": 137730 }, { "epoch": 0.13872815486064738, "grad_norm": 8.754833916991052, "learning_rate": 4.903838456836254e-05, "loss": 2.4601, "mean_token_accuracy": 0.4448275864124298, "step": 137735 }, { "epoch": 0.13873319091375155, "grad_norm": 10.782162986464106, "learning_rate": 4.9038276189303314e-05, "loss": 2.6377, "mean_token_accuracy": 0.38965516686439516, "step": 137740 }, { "epoch": 0.13873822696685573, "grad_norm": 13.224611530697665, "learning_rate": 4.903816780427038e-05, "loss": 2.2805, "mean_token_accuracy": 0.4586207032203674, "step": 137745 }, { "epoch": 0.1387432630199599, "grad_norm": 10.385233817571228, "learning_rate": 4.9038059413263745e-05, "loss": 2.5225, "mean_token_accuracy": 0.4310344815254211, "step": 137750 }, { "epoch": 0.13874829907306407, "grad_norm": 7.90224873690731, "learning_rate": 4.903795101628345e-05, "loss": 2.4298, "mean_token_accuracy": 0.3986085832118988, "step": 137755 }, { "epoch": 0.13875333512616825, "grad_norm": 8.550517658032902, "learning_rate": 4.9037842613329525e-05, "loss": 1.7479, "mean_token_accuracy": 0.5620689570903779, "step": 137760 }, { "epoch": 0.13875837117927242, "grad_norm": 9.039573614767058, "learning_rate": 4.9037734204402006e-05, "loss": 2.4587, "mean_token_accuracy": 0.37586206793785093, "step": 137765 }, { "epoch": 0.1387634072323766, "grad_norm": 10.29357716208612, "learning_rate": 4.9037625789500905e-05, "loss": 2.3634, "mean_token_accuracy": 0.42565032839775085, "step": 137770 }, { "epoch": 0.13876844328548077, "grad_norm": 11.245347297824466, "learning_rate": 4.903751736862627e-05, "loss": 2.4103, "mean_token_accuracy": 0.4034482777118683, "step": 137775 }, { "epoch": 0.13877347933858494, "grad_norm": 10.203285206720572, "learning_rate": 4.903740894177813e-05, "loss": 2.6202, "mean_token_accuracy": 0.4206896543502808, "step": 137780 }, { "epoch": 0.1387785153916891, "grad_norm": 12.423792938302471, "learning_rate": 4.9037300508956496e-05, "loss": 2.2398, "mean_token_accuracy": 0.46896552443504336, "step": 137785 }, { "epoch": 0.13878355144479326, "grad_norm": 10.2837849655862, "learning_rate": 4.903719207016142e-05, "loss": 2.4378, "mean_token_accuracy": 0.4517241418361664, "step": 137790 }, { "epoch": 0.13878858749789744, "grad_norm": 15.958778176556656, "learning_rate": 4.903708362539292e-05, "loss": 2.2345, "mean_token_accuracy": 0.4923775017261505, "step": 137795 }, { "epoch": 0.1387936235510016, "grad_norm": 10.346261953761873, "learning_rate": 4.9036975174651034e-05, "loss": 2.5646, "mean_token_accuracy": 0.4413793087005615, "step": 137800 }, { "epoch": 0.13879865960410578, "grad_norm": 11.895805338116347, "learning_rate": 4.903686671793578e-05, "loss": 2.459, "mean_token_accuracy": 0.3896551728248596, "step": 137805 }, { "epoch": 0.13880369565720996, "grad_norm": 9.555228511309338, "learning_rate": 4.90367582552472e-05, "loss": 2.3317, "mean_token_accuracy": 0.4482758641242981, "step": 137810 }, { "epoch": 0.13880873171031413, "grad_norm": 13.278667190565079, "learning_rate": 4.903664978658532e-05, "loss": 2.7452, "mean_token_accuracy": 0.3896551728248596, "step": 137815 }, { "epoch": 0.1388137677634183, "grad_norm": 10.344080922683894, "learning_rate": 4.903654131195017e-05, "loss": 2.5725, "mean_token_accuracy": 0.43103448748588563, "step": 137820 }, { "epoch": 0.13881880381652248, "grad_norm": 11.180681779655693, "learning_rate": 4.9036432831341775e-05, "loss": 2.4851, "mean_token_accuracy": 0.4379310369491577, "step": 137825 }, { "epoch": 0.13882383986962665, "grad_norm": 11.311234262064488, "learning_rate": 4.903632434476018e-05, "loss": 2.2905, "mean_token_accuracy": 0.44827587008476255, "step": 137830 }, { "epoch": 0.13882887592273083, "grad_norm": 10.161867191477869, "learning_rate": 4.90362158522054e-05, "loss": 2.4287, "mean_token_accuracy": 0.42413793206214906, "step": 137835 }, { "epoch": 0.138833911975835, "grad_norm": 9.990398835478857, "learning_rate": 4.903610735367747e-05, "loss": 2.1889, "mean_token_accuracy": 0.4517241418361664, "step": 137840 }, { "epoch": 0.13883894802893917, "grad_norm": 12.112970488862109, "learning_rate": 4.903599884917642e-05, "loss": 2.4917, "mean_token_accuracy": 0.3931034475564957, "step": 137845 }, { "epoch": 0.13884398408204335, "grad_norm": 10.24034146128821, "learning_rate": 4.903589033870228e-05, "loss": 2.1452, "mean_token_accuracy": 0.4628078818321228, "step": 137850 }, { "epoch": 0.13884902013514752, "grad_norm": 14.382979694102247, "learning_rate": 4.903578182225508e-05, "loss": 2.3549, "mean_token_accuracy": 0.4517241418361664, "step": 137855 }, { "epoch": 0.1388540561882517, "grad_norm": 11.666553817110822, "learning_rate": 4.903567329983485e-05, "loss": 2.6477, "mean_token_accuracy": 0.3999999940395355, "step": 137860 }, { "epoch": 0.13885909224135587, "grad_norm": 10.193437754289564, "learning_rate": 4.903556477144162e-05, "loss": 2.2737, "mean_token_accuracy": 0.4448275864124298, "step": 137865 }, { "epoch": 0.13886412829446004, "grad_norm": 10.590477856530862, "learning_rate": 4.903545623707542e-05, "loss": 3.1103, "mean_token_accuracy": 0.3482758581638336, "step": 137870 }, { "epoch": 0.13886916434756422, "grad_norm": 9.432163206937203, "learning_rate": 4.903534769673629e-05, "loss": 2.4242, "mean_token_accuracy": 0.3793103456497192, "step": 137875 }, { "epoch": 0.1388742004006684, "grad_norm": 11.306132357714635, "learning_rate": 4.903523915042424e-05, "loss": 2.2195, "mean_token_accuracy": 0.4778584361076355, "step": 137880 }, { "epoch": 0.13887923645377256, "grad_norm": 9.86446661995922, "learning_rate": 4.903513059813932e-05, "loss": 2.3377, "mean_token_accuracy": 0.4586206912994385, "step": 137885 }, { "epoch": 0.13888427250687674, "grad_norm": 10.780617319385092, "learning_rate": 4.9035022039881544e-05, "loss": 2.3903, "mean_token_accuracy": 0.43793103098869324, "step": 137890 }, { "epoch": 0.1388893085599809, "grad_norm": 9.198797385726362, "learning_rate": 4.9034913475650954e-05, "loss": 2.0886, "mean_token_accuracy": 0.4758620738983154, "step": 137895 }, { "epoch": 0.13889434461308509, "grad_norm": 8.012888687964251, "learning_rate": 4.9034804905447576e-05, "loss": 2.7181, "mean_token_accuracy": 0.40344828367233276, "step": 137900 }, { "epoch": 0.13889938066618926, "grad_norm": 12.746106942797661, "learning_rate": 4.9034696329271436e-05, "loss": 2.8771, "mean_token_accuracy": 0.37241379022598264, "step": 137905 }, { "epoch": 0.13890441671929343, "grad_norm": 9.153222914600978, "learning_rate": 4.903458774712257e-05, "loss": 2.4741, "mean_token_accuracy": 0.4413793087005615, "step": 137910 }, { "epoch": 0.1389094527723976, "grad_norm": 9.589653414713865, "learning_rate": 4.903447915900101e-05, "loss": 2.3311, "mean_token_accuracy": 0.4103448331356049, "step": 137915 }, { "epoch": 0.13891448882550178, "grad_norm": 12.465493931272226, "learning_rate": 4.903437056490677e-05, "loss": 2.7182, "mean_token_accuracy": 0.3862069010734558, "step": 137920 }, { "epoch": 0.13891952487860593, "grad_norm": 12.194614443620884, "learning_rate": 4.90342619648399e-05, "loss": 2.8387, "mean_token_accuracy": 0.38620689511299133, "step": 137925 }, { "epoch": 0.1389245609317101, "grad_norm": 12.483580817677456, "learning_rate": 4.9034153358800416e-05, "loss": 2.7213, "mean_token_accuracy": 0.34137930870056155, "step": 137930 }, { "epoch": 0.13892959698481427, "grad_norm": 11.274087490412493, "learning_rate": 4.9034044746788365e-05, "loss": 2.1154, "mean_token_accuracy": 0.5019963681697845, "step": 137935 }, { "epoch": 0.13893463303791845, "grad_norm": 9.183380510443397, "learning_rate": 4.903393612880376e-05, "loss": 2.2159, "mean_token_accuracy": 0.45862067937850953, "step": 137940 }, { "epoch": 0.13893966909102262, "grad_norm": 12.420203105639043, "learning_rate": 4.903382750484663e-05, "loss": 2.4579, "mean_token_accuracy": 0.41034482717514037, "step": 137945 }, { "epoch": 0.1389447051441268, "grad_norm": 8.71773347303741, "learning_rate": 4.903371887491703e-05, "loss": 2.5388, "mean_token_accuracy": 0.4034482777118683, "step": 137950 }, { "epoch": 0.13894974119723097, "grad_norm": 12.522384658642308, "learning_rate": 4.9033610239014956e-05, "loss": 2.5564, "mean_token_accuracy": 0.38965516686439516, "step": 137955 }, { "epoch": 0.13895477725033514, "grad_norm": 7.66552044937926, "learning_rate": 4.903350159714046e-05, "loss": 2.0543, "mean_token_accuracy": 0.48275861144065857, "step": 137960 }, { "epoch": 0.13895981330343932, "grad_norm": 11.261656770615696, "learning_rate": 4.903339294929357e-05, "loss": 2.5308, "mean_token_accuracy": 0.41034482717514037, "step": 137965 }, { "epoch": 0.1389648493565435, "grad_norm": 12.070572764705537, "learning_rate": 4.903328429547431e-05, "loss": 2.314, "mean_token_accuracy": 0.46400484442710876, "step": 137970 }, { "epoch": 0.13896988540964766, "grad_norm": 9.969717366675551, "learning_rate": 4.903317563568272e-05, "loss": 2.4393, "mean_token_accuracy": 0.4275861978530884, "step": 137975 }, { "epoch": 0.13897492146275184, "grad_norm": 10.244218385183077, "learning_rate": 4.903306696991882e-05, "loss": 2.103, "mean_token_accuracy": 0.4413793087005615, "step": 137980 }, { "epoch": 0.138979957515856, "grad_norm": 9.91108807615884, "learning_rate": 4.903295829818264e-05, "loss": 2.4426, "mean_token_accuracy": 0.42413793206214906, "step": 137985 }, { "epoch": 0.13898499356896019, "grad_norm": 9.70166862965431, "learning_rate": 4.9032849620474216e-05, "loss": 2.2445, "mean_token_accuracy": 0.4206896543502808, "step": 137990 }, { "epoch": 0.13899002962206436, "grad_norm": 12.324521566578316, "learning_rate": 4.9032740936793576e-05, "loss": 2.2155, "mean_token_accuracy": 0.4344827473163605, "step": 137995 }, { "epoch": 0.13899506567516853, "grad_norm": 8.910904160651258, "learning_rate": 4.9032632247140744e-05, "loss": 2.3252, "mean_token_accuracy": 0.4000000059604645, "step": 138000 }, { "epoch": 0.1390001017282727, "grad_norm": 10.912184101473503, "learning_rate": 4.903252355151577e-05, "loss": 2.4278, "mean_token_accuracy": 0.37241379022598264, "step": 138005 }, { "epoch": 0.13900513778137688, "grad_norm": 8.401878709833023, "learning_rate": 4.903241484991866e-05, "loss": 2.6433, "mean_token_accuracy": 0.4241379380226135, "step": 138010 }, { "epoch": 0.13901017383448105, "grad_norm": 11.411948926553926, "learning_rate": 4.9032306142349455e-05, "loss": 2.1714, "mean_token_accuracy": 0.4655172288417816, "step": 138015 }, { "epoch": 0.13901520988758523, "grad_norm": 9.002902173648769, "learning_rate": 4.9032197428808186e-05, "loss": 2.2608, "mean_token_accuracy": 0.44827585220336913, "step": 138020 }, { "epoch": 0.1390202459406894, "grad_norm": 11.468546986488565, "learning_rate": 4.903208870929488e-05, "loss": 2.1164, "mean_token_accuracy": 0.4689655125141144, "step": 138025 }, { "epoch": 0.13902528199379358, "grad_norm": 10.21250185102687, "learning_rate": 4.9031979983809574e-05, "loss": 2.523, "mean_token_accuracy": 0.44827585816383364, "step": 138030 }, { "epoch": 0.13903031804689775, "grad_norm": 9.20307640474596, "learning_rate": 4.9031871252352285e-05, "loss": 2.2648, "mean_token_accuracy": 0.458620685338974, "step": 138035 }, { "epoch": 0.13903535410000192, "grad_norm": 11.237508977536352, "learning_rate": 4.903176251492306e-05, "loss": 2.689, "mean_token_accuracy": 0.3999999940395355, "step": 138040 }, { "epoch": 0.1390403901531061, "grad_norm": 10.444415930032232, "learning_rate": 4.9031653771521917e-05, "loss": 2.1158, "mean_token_accuracy": 0.4586206912994385, "step": 138045 }, { "epoch": 0.13904542620621027, "grad_norm": 12.991128060419946, "learning_rate": 4.9031545022148884e-05, "loss": 2.1717, "mean_token_accuracy": 0.42413793206214906, "step": 138050 }, { "epoch": 0.13905046225931444, "grad_norm": 10.21366718347926, "learning_rate": 4.9031436266804006e-05, "loss": 2.5931, "mean_token_accuracy": 0.4551724135875702, "step": 138055 }, { "epoch": 0.13905549831241862, "grad_norm": 9.507030179771352, "learning_rate": 4.90313275054873e-05, "loss": 2.3662, "mean_token_accuracy": 0.42928009629249575, "step": 138060 }, { "epoch": 0.13906053436552276, "grad_norm": 9.136395429138256, "learning_rate": 4.903121873819879e-05, "loss": 2.6565, "mean_token_accuracy": 0.39310345649719236, "step": 138065 }, { "epoch": 0.13906557041862694, "grad_norm": 11.72924407157271, "learning_rate": 4.9031109964938534e-05, "loss": 2.1958, "mean_token_accuracy": 0.48517847061157227, "step": 138070 }, { "epoch": 0.1390706064717311, "grad_norm": 9.94878574891501, "learning_rate": 4.9031001185706534e-05, "loss": 1.9967, "mean_token_accuracy": 0.4896551728248596, "step": 138075 }, { "epoch": 0.13907564252483529, "grad_norm": 10.061181207790982, "learning_rate": 4.903089240050283e-05, "loss": 2.5354, "mean_token_accuracy": 0.4206896543502808, "step": 138080 }, { "epoch": 0.13908067857793946, "grad_norm": 10.56597023375056, "learning_rate": 4.903078360932746e-05, "loss": 2.3996, "mean_token_accuracy": 0.39655173420906065, "step": 138085 }, { "epoch": 0.13908571463104363, "grad_norm": 10.378029170450773, "learning_rate": 4.9030674812180435e-05, "loss": 2.3387, "mean_token_accuracy": 0.4448275864124298, "step": 138090 }, { "epoch": 0.1390907506841478, "grad_norm": 8.75736642418929, "learning_rate": 4.90305660090618e-05, "loss": 2.0634, "mean_token_accuracy": 0.4896551787853241, "step": 138095 }, { "epoch": 0.13909578673725198, "grad_norm": 9.659099767311268, "learning_rate": 4.90304571999716e-05, "loss": 2.5228, "mean_token_accuracy": 0.42758620977401735, "step": 138100 }, { "epoch": 0.13910082279035615, "grad_norm": 11.432933599256998, "learning_rate": 4.9030348384909825e-05, "loss": 2.6788, "mean_token_accuracy": 0.38965516686439516, "step": 138105 }, { "epoch": 0.13910585884346033, "grad_norm": 11.906547700122951, "learning_rate": 4.903023956387654e-05, "loss": 2.2792, "mean_token_accuracy": 0.45009074807167054, "step": 138110 }, { "epoch": 0.1391108948965645, "grad_norm": 13.552862035027253, "learning_rate": 4.903013073687175e-05, "loss": 2.3946, "mean_token_accuracy": 0.4448275864124298, "step": 138115 }, { "epoch": 0.13911593094966868, "grad_norm": 10.987791871745515, "learning_rate": 4.903002190389551e-05, "loss": 2.622, "mean_token_accuracy": 0.4413793087005615, "step": 138120 }, { "epoch": 0.13912096700277285, "grad_norm": 11.633542299620593, "learning_rate": 4.9029913064947835e-05, "loss": 2.5031, "mean_token_accuracy": 0.41034482717514037, "step": 138125 }, { "epoch": 0.13912600305587702, "grad_norm": 8.137984642986344, "learning_rate": 4.9029804220028755e-05, "loss": 2.1178, "mean_token_accuracy": 0.5, "step": 138130 }, { "epoch": 0.1391310391089812, "grad_norm": 13.88857811975663, "learning_rate": 4.902969536913831e-05, "loss": 2.452, "mean_token_accuracy": 0.4034482777118683, "step": 138135 }, { "epoch": 0.13913607516208537, "grad_norm": 10.689321482944598, "learning_rate": 4.902958651227652e-05, "loss": 2.4192, "mean_token_accuracy": 0.4448275864124298, "step": 138140 }, { "epoch": 0.13914111121518954, "grad_norm": 15.883226788609294, "learning_rate": 4.9029477649443416e-05, "loss": 2.857, "mean_token_accuracy": 0.38620689511299133, "step": 138145 }, { "epoch": 0.13914614726829372, "grad_norm": 10.444189513819644, "learning_rate": 4.9029368780639036e-05, "loss": 2.3962, "mean_token_accuracy": 0.44827587008476255, "step": 138150 }, { "epoch": 0.1391511833213979, "grad_norm": 10.125392677487993, "learning_rate": 4.90292599058634e-05, "loss": 2.3208, "mean_token_accuracy": 0.44319420456886294, "step": 138155 }, { "epoch": 0.13915621937450207, "grad_norm": 9.904978871022061, "learning_rate": 4.902915102511655e-05, "loss": 2.899, "mean_token_accuracy": 0.36896551251411436, "step": 138160 }, { "epoch": 0.13916125542760624, "grad_norm": 9.515434606826695, "learning_rate": 4.9029042138398505e-05, "loss": 2.2028, "mean_token_accuracy": 0.44482758045196535, "step": 138165 }, { "epoch": 0.1391662914807104, "grad_norm": 9.369317643402482, "learning_rate": 4.90289332457093e-05, "loss": 2.2061, "mean_token_accuracy": 0.44948577880859375, "step": 138170 }, { "epoch": 0.1391713275338146, "grad_norm": 9.407218242209533, "learning_rate": 4.9028824347048975e-05, "loss": 2.4273, "mean_token_accuracy": 0.44137930274009707, "step": 138175 }, { "epoch": 0.13917636358691876, "grad_norm": 11.928753865683907, "learning_rate": 4.902871544241754e-05, "loss": 2.2869, "mean_token_accuracy": 0.4471869349479675, "step": 138180 }, { "epoch": 0.13918139964002293, "grad_norm": 9.101652122933654, "learning_rate": 4.902860653181503e-05, "loss": 2.1599, "mean_token_accuracy": 0.49999999403953554, "step": 138185 }, { "epoch": 0.1391864356931271, "grad_norm": 10.007759430917048, "learning_rate": 4.902849761524149e-05, "loss": 2.659, "mean_token_accuracy": 0.4275861978530884, "step": 138190 }, { "epoch": 0.13919147174623128, "grad_norm": 11.164013256194261, "learning_rate": 4.902838869269694e-05, "loss": 2.46, "mean_token_accuracy": 0.41379310488700866, "step": 138195 }, { "epoch": 0.13919650779933546, "grad_norm": 8.214313629134498, "learning_rate": 4.902827976418141e-05, "loss": 2.1199, "mean_token_accuracy": 0.5068965554237366, "step": 138200 }, { "epoch": 0.1392015438524396, "grad_norm": 10.420779353360977, "learning_rate": 4.902817082969493e-05, "loss": 2.2839, "mean_token_accuracy": 0.4689655125141144, "step": 138205 }, { "epoch": 0.13920657990554378, "grad_norm": 13.41093416659383, "learning_rate": 4.902806188923753e-05, "loss": 2.7504, "mean_token_accuracy": 0.3379310369491577, "step": 138210 }, { "epoch": 0.13921161595864795, "grad_norm": 8.564914610140596, "learning_rate": 4.902795294280925e-05, "loss": 2.6871, "mean_token_accuracy": 0.4398669064044952, "step": 138215 }, { "epoch": 0.13921665201175212, "grad_norm": 10.542582119819189, "learning_rate": 4.90278439904101e-05, "loss": 2.2545, "mean_token_accuracy": 0.43103447556495667, "step": 138220 }, { "epoch": 0.1392216880648563, "grad_norm": 8.494785645191097, "learning_rate": 4.9027735032040126e-05, "loss": 2.1487, "mean_token_accuracy": 0.4379310369491577, "step": 138225 }, { "epoch": 0.13922672411796047, "grad_norm": 9.554576544302856, "learning_rate": 4.9027626067699355e-05, "loss": 2.3332, "mean_token_accuracy": 0.4379310369491577, "step": 138230 }, { "epoch": 0.13923176017106464, "grad_norm": 8.897587687713937, "learning_rate": 4.902751709738782e-05, "loss": 2.6564, "mean_token_accuracy": 0.44137930274009707, "step": 138235 }, { "epoch": 0.13923679622416882, "grad_norm": 10.74437103532797, "learning_rate": 4.902740812110555e-05, "loss": 2.7377, "mean_token_accuracy": 0.37931033968925476, "step": 138240 }, { "epoch": 0.139241832277273, "grad_norm": 13.989450030941116, "learning_rate": 4.902729913885257e-05, "loss": 2.8632, "mean_token_accuracy": 0.3275862067937851, "step": 138245 }, { "epoch": 0.13924686833037717, "grad_norm": 10.747075860146865, "learning_rate": 4.902719015062891e-05, "loss": 2.219, "mean_token_accuracy": 0.447126442193985, "step": 138250 }, { "epoch": 0.13925190438348134, "grad_norm": 8.964149179309548, "learning_rate": 4.90270811564346e-05, "loss": 2.2549, "mean_token_accuracy": 0.4535995125770569, "step": 138255 }, { "epoch": 0.1392569404365855, "grad_norm": 11.30206386376445, "learning_rate": 4.902697215626968e-05, "loss": 2.3434, "mean_token_accuracy": 0.4344827562570572, "step": 138260 }, { "epoch": 0.1392619764896897, "grad_norm": 13.915997808145846, "learning_rate": 4.902686315013418e-05, "loss": 2.2891, "mean_token_accuracy": 0.47241378426551817, "step": 138265 }, { "epoch": 0.13926701254279386, "grad_norm": 13.101239805499006, "learning_rate": 4.9026754138028115e-05, "loss": 2.5622, "mean_token_accuracy": 0.4448275864124298, "step": 138270 }, { "epoch": 0.13927204859589803, "grad_norm": 10.882545307473345, "learning_rate": 4.9026645119951526e-05, "loss": 2.0387, "mean_token_accuracy": 0.4507561981678009, "step": 138275 }, { "epoch": 0.1392770846490022, "grad_norm": 8.803099259557577, "learning_rate": 4.9026536095904446e-05, "loss": 2.3447, "mean_token_accuracy": 0.4206896543502808, "step": 138280 }, { "epoch": 0.13928212070210638, "grad_norm": 11.073421275329984, "learning_rate": 4.9026427065886894e-05, "loss": 2.1487, "mean_token_accuracy": 0.5068965554237366, "step": 138285 }, { "epoch": 0.13928715675521056, "grad_norm": 10.946873592636605, "learning_rate": 4.902631802989891e-05, "loss": 2.3964, "mean_token_accuracy": 0.42758620977401735, "step": 138290 }, { "epoch": 0.13929219280831473, "grad_norm": 11.61481953076501, "learning_rate": 4.902620898794053e-05, "loss": 2.5463, "mean_token_accuracy": 0.4517241358757019, "step": 138295 }, { "epoch": 0.1392972288614189, "grad_norm": 8.827576466973946, "learning_rate": 4.902609994001176e-05, "loss": 2.186, "mean_token_accuracy": 0.4324258863925934, "step": 138300 }, { "epoch": 0.13930226491452308, "grad_norm": 11.596319817919353, "learning_rate": 4.9025990886112656e-05, "loss": 2.6384, "mean_token_accuracy": 0.4034482777118683, "step": 138305 }, { "epoch": 0.13930730096762725, "grad_norm": 11.188737093216742, "learning_rate": 4.902588182624324e-05, "loss": 2.4005, "mean_token_accuracy": 0.4379310369491577, "step": 138310 }, { "epoch": 0.13931233702073142, "grad_norm": 10.877356751000981, "learning_rate": 4.902577276040354e-05, "loss": 2.1877, "mean_token_accuracy": 0.4811857223510742, "step": 138315 }, { "epoch": 0.1393173730738356, "grad_norm": 9.001370454213625, "learning_rate": 4.902566368859357e-05, "loss": 2.1878, "mean_token_accuracy": 0.46896551847457885, "step": 138320 }, { "epoch": 0.13932240912693977, "grad_norm": 11.585411080556945, "learning_rate": 4.9025554610813396e-05, "loss": 2.2194, "mean_token_accuracy": 0.42758620977401735, "step": 138325 }, { "epoch": 0.13932744518004395, "grad_norm": 9.698754410718333, "learning_rate": 4.902544552706302e-05, "loss": 2.0983, "mean_token_accuracy": 0.47931033968925474, "step": 138330 }, { "epoch": 0.13933248123314812, "grad_norm": 9.79765868913099, "learning_rate": 4.902533643734248e-05, "loss": 2.3342, "mean_token_accuracy": 0.4482758641242981, "step": 138335 }, { "epoch": 0.1393375172862523, "grad_norm": 11.372303806391345, "learning_rate": 4.902522734165182e-05, "loss": 2.6344, "mean_token_accuracy": 0.37931033968925476, "step": 138340 }, { "epoch": 0.13934255333935644, "grad_norm": 10.733291449885284, "learning_rate": 4.9025118239991046e-05, "loss": 2.282, "mean_token_accuracy": 0.4620689570903778, "step": 138345 }, { "epoch": 0.1393475893924606, "grad_norm": 11.032773316002121, "learning_rate": 4.9025009132360205e-05, "loss": 2.4531, "mean_token_accuracy": 0.43793103098869324, "step": 138350 }, { "epoch": 0.1393526254455648, "grad_norm": 10.410188478437263, "learning_rate": 4.9024900018759326e-05, "loss": 2.5367, "mean_token_accuracy": 0.3965517282485962, "step": 138355 }, { "epoch": 0.13935766149866896, "grad_norm": 10.81220914775972, "learning_rate": 4.902479089918842e-05, "loss": 2.7992, "mean_token_accuracy": 0.3793103456497192, "step": 138360 }, { "epoch": 0.13936269755177313, "grad_norm": 10.646893498768199, "learning_rate": 4.902468177364755e-05, "loss": 2.5031, "mean_token_accuracy": 0.4259528160095215, "step": 138365 }, { "epoch": 0.1393677336048773, "grad_norm": 10.169692944861206, "learning_rate": 4.902457264213672e-05, "loss": 2.8302, "mean_token_accuracy": 0.41034482717514037, "step": 138370 }, { "epoch": 0.13937276965798148, "grad_norm": 11.67320683246617, "learning_rate": 4.902446350465598e-05, "loss": 2.3626, "mean_token_accuracy": 0.41724138259887694, "step": 138375 }, { "epoch": 0.13937780571108566, "grad_norm": 12.382190024465723, "learning_rate": 4.902435436120534e-05, "loss": 2.2152, "mean_token_accuracy": 0.42561576664447787, "step": 138380 }, { "epoch": 0.13938284176418983, "grad_norm": 10.763645388359468, "learning_rate": 4.902424521178484e-05, "loss": 2.2482, "mean_token_accuracy": 0.46206897497177124, "step": 138385 }, { "epoch": 0.139387877817294, "grad_norm": 12.41934744040569, "learning_rate": 4.9024136056394514e-05, "loss": 2.4457, "mean_token_accuracy": 0.44482758045196535, "step": 138390 }, { "epoch": 0.13939291387039818, "grad_norm": 12.094651792852375, "learning_rate": 4.902402689503439e-05, "loss": 2.1624, "mean_token_accuracy": 0.482758617401123, "step": 138395 }, { "epoch": 0.13939794992350235, "grad_norm": 10.181590765726154, "learning_rate": 4.9023917727704494e-05, "loss": 2.4502, "mean_token_accuracy": 0.4103448331356049, "step": 138400 }, { "epoch": 0.13940298597660652, "grad_norm": 10.98334490657734, "learning_rate": 4.9023808554404855e-05, "loss": 2.3567, "mean_token_accuracy": 0.458620685338974, "step": 138405 }, { "epoch": 0.1394080220297107, "grad_norm": 11.351928047402858, "learning_rate": 4.902369937513551e-05, "loss": 2.4708, "mean_token_accuracy": 0.4551724135875702, "step": 138410 }, { "epoch": 0.13941305808281487, "grad_norm": 11.2812501949377, "learning_rate": 4.902359018989649e-05, "loss": 2.3831, "mean_token_accuracy": 0.4344827651977539, "step": 138415 }, { "epoch": 0.13941809413591905, "grad_norm": 11.043802477745736, "learning_rate": 4.902348099868783e-05, "loss": 2.8233, "mean_token_accuracy": 0.3931034475564957, "step": 138420 }, { "epoch": 0.13942313018902322, "grad_norm": 9.935437741146034, "learning_rate": 4.902337180150954e-05, "loss": 2.1206, "mean_token_accuracy": 0.48275862336158754, "step": 138425 }, { "epoch": 0.1394281662421274, "grad_norm": 10.799092022856922, "learning_rate": 4.902326259836166e-05, "loss": 2.3676, "mean_token_accuracy": 0.4517241358757019, "step": 138430 }, { "epoch": 0.13943320229523157, "grad_norm": 9.067445162586733, "learning_rate": 4.9023153389244226e-05, "loss": 2.5468, "mean_token_accuracy": 0.4000000059604645, "step": 138435 }, { "epoch": 0.13943823834833574, "grad_norm": 9.11153063017215, "learning_rate": 4.902304417415727e-05, "loss": 2.4023, "mean_token_accuracy": 0.4172413766384125, "step": 138440 }, { "epoch": 0.13944327440143992, "grad_norm": 12.161098585142575, "learning_rate": 4.902293495310081e-05, "loss": 2.2996, "mean_token_accuracy": 0.4517241299152374, "step": 138445 }, { "epoch": 0.1394483104545441, "grad_norm": 11.321213945096785, "learning_rate": 4.90228257260749e-05, "loss": 2.5568, "mean_token_accuracy": 0.37931033968925476, "step": 138450 }, { "epoch": 0.13945334650764826, "grad_norm": 8.778706998273167, "learning_rate": 4.902271649307953e-05, "loss": 2.493, "mean_token_accuracy": 0.4275861978530884, "step": 138455 }, { "epoch": 0.13945838256075244, "grad_norm": 13.006123335979375, "learning_rate": 4.902260725411477e-05, "loss": 2.2237, "mean_token_accuracy": 0.4724137902259827, "step": 138460 }, { "epoch": 0.1394634186138566, "grad_norm": 11.11006556785583, "learning_rate": 4.902249800918063e-05, "loss": 2.5023, "mean_token_accuracy": 0.3827586144208908, "step": 138465 }, { "epoch": 0.13946845466696078, "grad_norm": 9.32610739204872, "learning_rate": 4.902238875827715e-05, "loss": 2.1111, "mean_token_accuracy": 0.4879007875919342, "step": 138470 }, { "epoch": 0.13947349072006496, "grad_norm": 12.701999385316975, "learning_rate": 4.902227950140435e-05, "loss": 2.3897, "mean_token_accuracy": 0.4103448212146759, "step": 138475 }, { "epoch": 0.13947852677316913, "grad_norm": 18.756392609940214, "learning_rate": 4.902217023856227e-05, "loss": 2.3465, "mean_token_accuracy": 0.47586207985877993, "step": 138480 }, { "epoch": 0.13948356282627328, "grad_norm": 8.76499009037372, "learning_rate": 4.902206096975093e-05, "loss": 2.2433, "mean_token_accuracy": 0.47241379618644713, "step": 138485 }, { "epoch": 0.13948859887937745, "grad_norm": 10.424806807924217, "learning_rate": 4.9021951694970366e-05, "loss": 2.2282, "mean_token_accuracy": 0.4366001129150391, "step": 138490 }, { "epoch": 0.13949363493248162, "grad_norm": 9.530232678047204, "learning_rate": 4.902184241422061e-05, "loss": 2.3043, "mean_token_accuracy": 0.4241379380226135, "step": 138495 }, { "epoch": 0.1394986709855858, "grad_norm": 14.184248034622462, "learning_rate": 4.902173312750169e-05, "loss": 3.0517, "mean_token_accuracy": 0.3896551728248596, "step": 138500 }, { "epoch": 0.13950370703868997, "grad_norm": 8.646129628138572, "learning_rate": 4.902162383481364e-05, "loss": 2.1395, "mean_token_accuracy": 0.482758617401123, "step": 138505 }, { "epoch": 0.13950874309179415, "grad_norm": 8.954026470766216, "learning_rate": 4.902151453615649e-05, "loss": 2.5184, "mean_token_accuracy": 0.4310344815254211, "step": 138510 }, { "epoch": 0.13951377914489832, "grad_norm": 9.92052424405013, "learning_rate": 4.902140523153026e-05, "loss": 2.3178, "mean_token_accuracy": 0.43103448748588563, "step": 138515 }, { "epoch": 0.1395188151980025, "grad_norm": 10.608029590972818, "learning_rate": 4.902129592093499e-05, "loss": 2.4826, "mean_token_accuracy": 0.3620689630508423, "step": 138520 }, { "epoch": 0.13952385125110667, "grad_norm": 9.38824613635307, "learning_rate": 4.902118660437071e-05, "loss": 2.5035, "mean_token_accuracy": 0.38275861740112305, "step": 138525 }, { "epoch": 0.13952888730421084, "grad_norm": 11.012334879282674, "learning_rate": 4.9021077281837455e-05, "loss": 3.1635, "mean_token_accuracy": 0.34137930870056155, "step": 138530 }, { "epoch": 0.13953392335731502, "grad_norm": 9.412759144174226, "learning_rate": 4.902096795333524e-05, "loss": 2.1925, "mean_token_accuracy": 0.48802177906036376, "step": 138535 }, { "epoch": 0.1395389594104192, "grad_norm": 10.633446858388059, "learning_rate": 4.9020858618864106e-05, "loss": 2.2427, "mean_token_accuracy": 0.43103448748588563, "step": 138540 }, { "epoch": 0.13954399546352336, "grad_norm": 8.375857380693052, "learning_rate": 4.9020749278424084e-05, "loss": 2.3735, "mean_token_accuracy": 0.46763460636138915, "step": 138545 }, { "epoch": 0.13954903151662754, "grad_norm": 11.612892044901223, "learning_rate": 4.9020639932015197e-05, "loss": 2.1501, "mean_token_accuracy": 0.4620689690113068, "step": 138550 }, { "epoch": 0.1395540675697317, "grad_norm": 9.97141359396388, "learning_rate": 4.902053057963748e-05, "loss": 1.9055, "mean_token_accuracy": 0.5135511159896851, "step": 138555 }, { "epoch": 0.13955910362283588, "grad_norm": 13.345035026753036, "learning_rate": 4.9020421221290976e-05, "loss": 3.0173, "mean_token_accuracy": 0.34482758939266206, "step": 138560 }, { "epoch": 0.13956413967594006, "grad_norm": 11.707339745097064, "learning_rate": 4.902031185697569e-05, "loss": 2.4901, "mean_token_accuracy": 0.4034482777118683, "step": 138565 }, { "epoch": 0.13956917572904423, "grad_norm": 11.758542923246482, "learning_rate": 4.902020248669167e-05, "loss": 2.0067, "mean_token_accuracy": 0.4517241418361664, "step": 138570 }, { "epoch": 0.1395742117821484, "grad_norm": 11.196569265510686, "learning_rate": 4.902009311043894e-05, "loss": 2.2605, "mean_token_accuracy": 0.43448275327682495, "step": 138575 }, { "epoch": 0.13957924783525258, "grad_norm": 10.052913751080675, "learning_rate": 4.901998372821753e-05, "loss": 2.2127, "mean_token_accuracy": 0.4793103575706482, "step": 138580 }, { "epoch": 0.13958428388835675, "grad_norm": 10.325772128783614, "learning_rate": 4.9019874340027475e-05, "loss": 2.7954, "mean_token_accuracy": 0.4068965494632721, "step": 138585 }, { "epoch": 0.13958931994146093, "grad_norm": 10.84032970806072, "learning_rate": 4.9019764945868806e-05, "loss": 2.5035, "mean_token_accuracy": 0.4172413766384125, "step": 138590 }, { "epoch": 0.1395943559945651, "grad_norm": 10.6851604910485, "learning_rate": 4.901965554574154e-05, "loss": 2.2395, "mean_token_accuracy": 0.47931033968925474, "step": 138595 }, { "epoch": 0.13959939204766927, "grad_norm": 13.432105152064153, "learning_rate": 4.901954613964573e-05, "loss": 2.3655, "mean_token_accuracy": 0.4413793087005615, "step": 138600 }, { "epoch": 0.13960442810077345, "grad_norm": 10.057474841407988, "learning_rate": 4.9019436727581385e-05, "loss": 2.5073, "mean_token_accuracy": 0.4206896543502808, "step": 138605 }, { "epoch": 0.13960946415387762, "grad_norm": 9.650911306282936, "learning_rate": 4.9019327309548544e-05, "loss": 2.3488, "mean_token_accuracy": 0.4599515974521637, "step": 138610 }, { "epoch": 0.1396145002069818, "grad_norm": 11.248147183678558, "learning_rate": 4.901921788554725e-05, "loss": 2.3044, "mean_token_accuracy": 0.42413793206214906, "step": 138615 }, { "epoch": 0.13961953626008597, "grad_norm": 10.12306111273883, "learning_rate": 4.90191084555775e-05, "loss": 2.1577, "mean_token_accuracy": 0.43103448748588563, "step": 138620 }, { "epoch": 0.13962457231319012, "grad_norm": 13.757092797154547, "learning_rate": 4.901899901963936e-05, "loss": 2.786, "mean_token_accuracy": 0.3931034505367279, "step": 138625 }, { "epoch": 0.1396296083662943, "grad_norm": 9.97554070946242, "learning_rate": 4.901888957773284e-05, "loss": 2.6034, "mean_token_accuracy": 0.34482758641242983, "step": 138630 }, { "epoch": 0.13963464441939846, "grad_norm": 16.11802176669795, "learning_rate": 4.9018780129857975e-05, "loss": 3.1878, "mean_token_accuracy": 0.32413793802261354, "step": 138635 }, { "epoch": 0.13963968047250264, "grad_norm": 12.025073633421036, "learning_rate": 4.9018670676014796e-05, "loss": 2.8853, "mean_token_accuracy": 0.40000000298023225, "step": 138640 }, { "epoch": 0.1396447165256068, "grad_norm": 9.608657618001468, "learning_rate": 4.9018561216203335e-05, "loss": 2.4033, "mean_token_accuracy": 0.3965517282485962, "step": 138645 }, { "epoch": 0.13964975257871098, "grad_norm": 10.923076100713418, "learning_rate": 4.901845175042362e-05, "loss": 2.6882, "mean_token_accuracy": 0.36896551847457887, "step": 138650 }, { "epoch": 0.13965478863181516, "grad_norm": 9.891167464380397, "learning_rate": 4.901834227867568e-05, "loss": 2.1082, "mean_token_accuracy": 0.4501512348651886, "step": 138655 }, { "epoch": 0.13965982468491933, "grad_norm": 11.83539559042224, "learning_rate": 4.901823280095955e-05, "loss": 2.3404, "mean_token_accuracy": 0.4401088893413544, "step": 138660 }, { "epoch": 0.1396648607380235, "grad_norm": 9.50602837051987, "learning_rate": 4.901812331727525e-05, "loss": 2.4626, "mean_token_accuracy": 0.47241379618644713, "step": 138665 }, { "epoch": 0.13966989679112768, "grad_norm": 9.991968010804602, "learning_rate": 4.9018013827622834e-05, "loss": 2.4565, "mean_token_accuracy": 0.41034482717514037, "step": 138670 }, { "epoch": 0.13967493284423185, "grad_norm": 12.534533423459216, "learning_rate": 4.901790433200231e-05, "loss": 2.2622, "mean_token_accuracy": 0.47241378426551817, "step": 138675 }, { "epoch": 0.13967996889733603, "grad_norm": 6.9494080248452645, "learning_rate": 4.9017794830413714e-05, "loss": 2.339, "mean_token_accuracy": 0.4748336374759674, "step": 138680 }, { "epoch": 0.1396850049504402, "grad_norm": 8.24727363396836, "learning_rate": 4.901768532285708e-05, "loss": 2.259, "mean_token_accuracy": 0.4498487591743469, "step": 138685 }, { "epoch": 0.13969004100354437, "grad_norm": 10.413743122868981, "learning_rate": 4.9017575809332434e-05, "loss": 2.3511, "mean_token_accuracy": 0.4482758641242981, "step": 138690 }, { "epoch": 0.13969507705664855, "grad_norm": 9.514875476680462, "learning_rate": 4.9017466289839804e-05, "loss": 1.8696, "mean_token_accuracy": 0.49999999403953554, "step": 138695 }, { "epoch": 0.13970011310975272, "grad_norm": 15.61218891489284, "learning_rate": 4.9017356764379224e-05, "loss": 2.1779, "mean_token_accuracy": 0.44137930274009707, "step": 138700 }, { "epoch": 0.1397051491628569, "grad_norm": 11.5958430694819, "learning_rate": 4.901724723295073e-05, "loss": 2.7642, "mean_token_accuracy": 0.3965517282485962, "step": 138705 }, { "epoch": 0.13971018521596107, "grad_norm": 10.743450446058946, "learning_rate": 4.901713769555434e-05, "loss": 2.0948, "mean_token_accuracy": 0.4620689690113068, "step": 138710 }, { "epoch": 0.13971522126906524, "grad_norm": 11.391153197258028, "learning_rate": 4.90170281521901e-05, "loss": 2.4385, "mean_token_accuracy": 0.46896551847457885, "step": 138715 }, { "epoch": 0.13972025732216942, "grad_norm": 8.123152879525938, "learning_rate": 4.901691860285803e-05, "loss": 2.4433, "mean_token_accuracy": 0.4379310369491577, "step": 138720 }, { "epoch": 0.1397252933752736, "grad_norm": 9.04099305298166, "learning_rate": 4.901680904755816e-05, "loss": 2.5904, "mean_token_accuracy": 0.3517241358757019, "step": 138725 }, { "epoch": 0.13973032942837776, "grad_norm": 11.197689852313406, "learning_rate": 4.901669948629052e-05, "loss": 2.7276, "mean_token_accuracy": 0.41379310488700866, "step": 138730 }, { "epoch": 0.13973536548148194, "grad_norm": 8.543191133511975, "learning_rate": 4.901658991905515e-05, "loss": 2.1756, "mean_token_accuracy": 0.4413793087005615, "step": 138735 }, { "epoch": 0.1397404015345861, "grad_norm": 9.259519840888386, "learning_rate": 4.901648034585207e-05, "loss": 2.2343, "mean_token_accuracy": 0.4413793087005615, "step": 138740 }, { "epoch": 0.13974543758769029, "grad_norm": 10.094875051719807, "learning_rate": 4.901637076668132e-05, "loss": 2.8997, "mean_token_accuracy": 0.3482758551836014, "step": 138745 }, { "epoch": 0.13975047364079446, "grad_norm": 11.52518806306133, "learning_rate": 4.901626118154292e-05, "loss": 2.4085, "mean_token_accuracy": 0.4068965554237366, "step": 138750 }, { "epoch": 0.13975550969389863, "grad_norm": 9.924997927173873, "learning_rate": 4.90161515904369e-05, "loss": 2.5045, "mean_token_accuracy": 0.41034482717514037, "step": 138755 }, { "epoch": 0.1397605457470028, "grad_norm": 10.322435487405864, "learning_rate": 4.90160419933633e-05, "loss": 2.5755, "mean_token_accuracy": 0.41034482717514037, "step": 138760 }, { "epoch": 0.13976558180010695, "grad_norm": 10.984497541881616, "learning_rate": 4.901593239032214e-05, "loss": 2.5877, "mean_token_accuracy": 0.44482758045196535, "step": 138765 }, { "epoch": 0.13977061785321113, "grad_norm": 9.985500438638363, "learning_rate": 4.9015822781313456e-05, "loss": 2.6733, "mean_token_accuracy": 0.3862068891525269, "step": 138770 }, { "epoch": 0.1397756539063153, "grad_norm": 9.839280189869713, "learning_rate": 4.901571316633729e-05, "loss": 2.227, "mean_token_accuracy": 0.41034482717514037, "step": 138775 }, { "epoch": 0.13978068995941947, "grad_norm": 11.619182377787622, "learning_rate": 4.9015603545393646e-05, "loss": 2.7114, "mean_token_accuracy": 0.37241379618644715, "step": 138780 }, { "epoch": 0.13978572601252365, "grad_norm": 9.720204505650978, "learning_rate": 4.901549391848258e-05, "loss": 2.1535, "mean_token_accuracy": 0.4344827592372894, "step": 138785 }, { "epoch": 0.13979076206562782, "grad_norm": 8.880793317753692, "learning_rate": 4.90153842856041e-05, "loss": 2.4879, "mean_token_accuracy": 0.41724138259887694, "step": 138790 }, { "epoch": 0.139795798118732, "grad_norm": 9.848995127445994, "learning_rate": 4.9015274646758255e-05, "loss": 2.1515, "mean_token_accuracy": 0.45517241954803467, "step": 138795 }, { "epoch": 0.13980083417183617, "grad_norm": 9.391197121756159, "learning_rate": 4.9015165001945074e-05, "loss": 2.8079, "mean_token_accuracy": 0.3379310369491577, "step": 138800 }, { "epoch": 0.13980587022494034, "grad_norm": 12.512222154983283, "learning_rate": 4.9015055351164574e-05, "loss": 2.4856, "mean_token_accuracy": 0.4206896543502808, "step": 138805 }, { "epoch": 0.13981090627804452, "grad_norm": 13.156955501515759, "learning_rate": 4.9014945694416794e-05, "loss": 2.4305, "mean_token_accuracy": 0.43103447556495667, "step": 138810 }, { "epoch": 0.1398159423311487, "grad_norm": 14.232943009717248, "learning_rate": 4.901483603170177e-05, "loss": 2.7672, "mean_token_accuracy": 0.3931034505367279, "step": 138815 }, { "epoch": 0.13982097838425286, "grad_norm": 10.095733539290586, "learning_rate": 4.901472636301952e-05, "loss": 2.3585, "mean_token_accuracy": 0.41724138259887694, "step": 138820 }, { "epoch": 0.13982601443735704, "grad_norm": 11.012987496322, "learning_rate": 4.901461668837008e-05, "loss": 2.3807, "mean_token_accuracy": 0.42601331472396853, "step": 138825 }, { "epoch": 0.1398310504904612, "grad_norm": 12.154427601217643, "learning_rate": 4.901450700775349e-05, "loss": 2.7411, "mean_token_accuracy": 0.4068965494632721, "step": 138830 }, { "epoch": 0.13983608654356539, "grad_norm": 9.369549210602742, "learning_rate": 4.9014397321169755e-05, "loss": 2.4032, "mean_token_accuracy": 0.4724137902259827, "step": 138835 }, { "epoch": 0.13984112259666956, "grad_norm": 11.779058860201115, "learning_rate": 4.901428762861893e-05, "loss": 2.5919, "mean_token_accuracy": 0.44482758045196535, "step": 138840 }, { "epoch": 0.13984615864977373, "grad_norm": 13.923338633110752, "learning_rate": 4.901417793010104e-05, "loss": 2.4291, "mean_token_accuracy": 0.4137930989265442, "step": 138845 }, { "epoch": 0.1398511947028779, "grad_norm": 10.72745288436101, "learning_rate": 4.901406822561611e-05, "loss": 2.7415, "mean_token_accuracy": 0.38457350730895995, "step": 138850 }, { "epoch": 0.13985623075598208, "grad_norm": 9.111101787912652, "learning_rate": 4.901395851516417e-05, "loss": 2.1251, "mean_token_accuracy": 0.46896552443504336, "step": 138855 }, { "epoch": 0.13986126680908625, "grad_norm": 11.183073884929263, "learning_rate": 4.901384879874526e-05, "loss": 2.465, "mean_token_accuracy": 0.37586206793785093, "step": 138860 }, { "epoch": 0.13986630286219043, "grad_norm": 10.552658930336557, "learning_rate": 4.90137390763594e-05, "loss": 2.4291, "mean_token_accuracy": 0.42758620977401735, "step": 138865 }, { "epoch": 0.1398713389152946, "grad_norm": 10.24059635533299, "learning_rate": 4.901362934800662e-05, "loss": 2.5229, "mean_token_accuracy": 0.41034482717514037, "step": 138870 }, { "epoch": 0.13987637496839878, "grad_norm": 18.121937558094515, "learning_rate": 4.901351961368696e-05, "loss": 3.0135, "mean_token_accuracy": 0.39655172228813174, "step": 138875 }, { "epoch": 0.13988141102150295, "grad_norm": 12.192624920447315, "learning_rate": 4.901340987340044e-05, "loss": 2.7795, "mean_token_accuracy": 0.382758629322052, "step": 138880 }, { "epoch": 0.13988644707460712, "grad_norm": 14.448744215219861, "learning_rate": 4.9013300127147107e-05, "loss": 2.4568, "mean_token_accuracy": 0.3793103456497192, "step": 138885 }, { "epoch": 0.1398914831277113, "grad_norm": 8.602885960596431, "learning_rate": 4.901319037492697e-05, "loss": 2.9684, "mean_token_accuracy": 0.3965517282485962, "step": 138890 }, { "epoch": 0.13989651918081547, "grad_norm": 14.829126514133309, "learning_rate": 4.901308061674007e-05, "loss": 2.7345, "mean_token_accuracy": 0.39310344457626345, "step": 138895 }, { "epoch": 0.13990155523391964, "grad_norm": 12.299415496261407, "learning_rate": 4.901297085258644e-05, "loss": 2.3936, "mean_token_accuracy": 0.5000000119209289, "step": 138900 }, { "epoch": 0.1399065912870238, "grad_norm": 9.759059418694848, "learning_rate": 4.9012861082466104e-05, "loss": 2.4379, "mean_token_accuracy": 0.38620689511299133, "step": 138905 }, { "epoch": 0.13991162734012796, "grad_norm": 10.638240956294572, "learning_rate": 4.9012751306379093e-05, "loss": 2.4646, "mean_token_accuracy": 0.4620689690113068, "step": 138910 }, { "epoch": 0.13991666339323214, "grad_norm": 9.398956662430583, "learning_rate": 4.901264152432545e-05, "loss": 2.5795, "mean_token_accuracy": 0.43103448748588563, "step": 138915 }, { "epoch": 0.1399216994463363, "grad_norm": 12.626573288276347, "learning_rate": 4.901253173630519e-05, "loss": 2.4614, "mean_token_accuracy": 0.46551724076271056, "step": 138920 }, { "epoch": 0.13992673549944049, "grad_norm": 12.691360376781272, "learning_rate": 4.901242194231835e-05, "loss": 2.2126, "mean_token_accuracy": 0.4724137902259827, "step": 138925 }, { "epoch": 0.13993177155254466, "grad_norm": 8.703075945727914, "learning_rate": 4.901231214236495e-05, "loss": 2.3335, "mean_token_accuracy": 0.4172413766384125, "step": 138930 }, { "epoch": 0.13993680760564883, "grad_norm": 9.258698468487308, "learning_rate": 4.901220233644505e-05, "loss": 2.0679, "mean_token_accuracy": 0.4931034445762634, "step": 138935 }, { "epoch": 0.139941843658753, "grad_norm": 11.939088530866028, "learning_rate": 4.9012092524558644e-05, "loss": 2.1829, "mean_token_accuracy": 0.467332124710083, "step": 138940 }, { "epoch": 0.13994687971185718, "grad_norm": 9.44370795039286, "learning_rate": 4.901198270670579e-05, "loss": 1.9512, "mean_token_accuracy": 0.47241379618644713, "step": 138945 }, { "epoch": 0.13995191576496135, "grad_norm": 10.626942016479017, "learning_rate": 4.90118728828865e-05, "loss": 2.4886, "mean_token_accuracy": 0.41724138259887694, "step": 138950 }, { "epoch": 0.13995695181806553, "grad_norm": 11.54772783675832, "learning_rate": 4.901176305310081e-05, "loss": 2.2523, "mean_token_accuracy": 0.4793103516101837, "step": 138955 }, { "epoch": 0.1399619878711697, "grad_norm": 10.63693418340612, "learning_rate": 4.901165321734875e-05, "loss": 2.5565, "mean_token_accuracy": 0.4551724135875702, "step": 138960 }, { "epoch": 0.13996702392427388, "grad_norm": 10.503411659562344, "learning_rate": 4.901154337563036e-05, "loss": 2.4678, "mean_token_accuracy": 0.4344827592372894, "step": 138965 }, { "epoch": 0.13997205997737805, "grad_norm": 11.670286438156529, "learning_rate": 4.901143352794566e-05, "loss": 2.4988, "mean_token_accuracy": 0.4137930989265442, "step": 138970 }, { "epoch": 0.13997709603048222, "grad_norm": 10.191622304656846, "learning_rate": 4.9011323674294685e-05, "loss": 1.8963, "mean_token_accuracy": 0.47586206197738645, "step": 138975 }, { "epoch": 0.1399821320835864, "grad_norm": 9.512826429027188, "learning_rate": 4.9011213814677466e-05, "loss": 3.29, "mean_token_accuracy": 0.35172413289546967, "step": 138980 }, { "epoch": 0.13998716813669057, "grad_norm": 19.334430252992227, "learning_rate": 4.901110394909403e-05, "loss": 2.5925, "mean_token_accuracy": 0.4586206942796707, "step": 138985 }, { "epoch": 0.13999220418979474, "grad_norm": 8.090313246728336, "learning_rate": 4.9010994077544415e-05, "loss": 2.2273, "mean_token_accuracy": 0.46551724076271056, "step": 138990 }, { "epoch": 0.13999724024289892, "grad_norm": 10.2247295561546, "learning_rate": 4.901088420002863e-05, "loss": 2.4429, "mean_token_accuracy": 0.4290381193161011, "step": 138995 }, { "epoch": 0.1400022762960031, "grad_norm": 10.51347882502283, "learning_rate": 4.901077431654673e-05, "loss": 2.5892, "mean_token_accuracy": 0.4206896543502808, "step": 139000 }, { "epoch": 0.14000731234910727, "grad_norm": 19.928081727414696, "learning_rate": 4.901066442709874e-05, "loss": 2.3488, "mean_token_accuracy": 0.46787658929824827, "step": 139005 }, { "epoch": 0.14001234840221144, "grad_norm": 13.073458969460921, "learning_rate": 4.9010554531684684e-05, "loss": 2.8096, "mean_token_accuracy": 0.3827586114406586, "step": 139010 }, { "epoch": 0.1400173844553156, "grad_norm": 13.611862322043228, "learning_rate": 4.90104446303046e-05, "loss": 2.4835, "mean_token_accuracy": 0.37586207389831544, "step": 139015 }, { "epoch": 0.1400224205084198, "grad_norm": 9.21132323967406, "learning_rate": 4.90103347229585e-05, "loss": 2.1327, "mean_token_accuracy": 0.4862069010734558, "step": 139020 }, { "epoch": 0.14002745656152396, "grad_norm": 7.535234153586982, "learning_rate": 4.901022480964644e-05, "loss": 2.2996, "mean_token_accuracy": 0.4712038815021515, "step": 139025 }, { "epoch": 0.14003249261462813, "grad_norm": 15.50829161788053, "learning_rate": 4.901011489036844e-05, "loss": 2.6368, "mean_token_accuracy": 0.35862069129943847, "step": 139030 }, { "epoch": 0.1400375286677323, "grad_norm": 15.411876217041295, "learning_rate": 4.901000496512452e-05, "loss": 2.4964, "mean_token_accuracy": 0.43103448748588563, "step": 139035 }, { "epoch": 0.14004256472083648, "grad_norm": 9.1592915126872, "learning_rate": 4.9009895033914724e-05, "loss": 2.4246, "mean_token_accuracy": 0.4172413766384125, "step": 139040 }, { "epoch": 0.14004760077394063, "grad_norm": 14.251832508038001, "learning_rate": 4.900978509673908e-05, "loss": 2.7483, "mean_token_accuracy": 0.4172413796186447, "step": 139045 }, { "epoch": 0.1400526368270448, "grad_norm": 10.521493549543232, "learning_rate": 4.9009675153597625e-05, "loss": 2.3302, "mean_token_accuracy": 0.4103448212146759, "step": 139050 }, { "epoch": 0.14005767288014898, "grad_norm": 9.139566987242326, "learning_rate": 4.900956520449038e-05, "loss": 2.3065, "mean_token_accuracy": 0.4467634618282318, "step": 139055 }, { "epoch": 0.14006270893325315, "grad_norm": 9.848211533087522, "learning_rate": 4.9009455249417365e-05, "loss": 2.3878, "mean_token_accuracy": 0.43448275327682495, "step": 139060 }, { "epoch": 0.14006774498635732, "grad_norm": 9.027539234516688, "learning_rate": 4.900934528837862e-05, "loss": 2.2721, "mean_token_accuracy": 0.44349666833877566, "step": 139065 }, { "epoch": 0.1400727810394615, "grad_norm": 12.458266542839068, "learning_rate": 4.9009235321374194e-05, "loss": 2.9422, "mean_token_accuracy": 0.3758620619773865, "step": 139070 }, { "epoch": 0.14007781709256567, "grad_norm": 9.372321142066122, "learning_rate": 4.900912534840409e-05, "loss": 2.5158, "mean_token_accuracy": 0.41379310488700866, "step": 139075 }, { "epoch": 0.14008285314566984, "grad_norm": 10.2786625992754, "learning_rate": 4.900901536946835e-05, "loss": 2.7206, "mean_token_accuracy": 0.38620689511299133, "step": 139080 }, { "epoch": 0.14008788919877402, "grad_norm": 8.855288560786711, "learning_rate": 4.9008905384567e-05, "loss": 1.855, "mean_token_accuracy": 0.4965517222881317, "step": 139085 }, { "epoch": 0.1400929252518782, "grad_norm": 12.105421089854316, "learning_rate": 4.900879539370009e-05, "loss": 2.5964, "mean_token_accuracy": 0.4206896543502808, "step": 139090 }, { "epoch": 0.14009796130498237, "grad_norm": 10.585312496330332, "learning_rate": 4.9008685396867634e-05, "loss": 2.0317, "mean_token_accuracy": 0.5206896603107453, "step": 139095 }, { "epoch": 0.14010299735808654, "grad_norm": 8.981563536695814, "learning_rate": 4.900857539406965e-05, "loss": 2.2813, "mean_token_accuracy": 0.4586206912994385, "step": 139100 }, { "epoch": 0.1401080334111907, "grad_norm": 13.201874204328215, "learning_rate": 4.900846538530619e-05, "loss": 2.3025, "mean_token_accuracy": 0.415366005897522, "step": 139105 }, { "epoch": 0.1401130694642949, "grad_norm": 9.933358970549161, "learning_rate": 4.9008355370577274e-05, "loss": 2.1694, "mean_token_accuracy": 0.4517241358757019, "step": 139110 }, { "epoch": 0.14011810551739906, "grad_norm": 12.062571233864, "learning_rate": 4.9008245349882937e-05, "loss": 2.6266, "mean_token_accuracy": 0.3793103486299515, "step": 139115 }, { "epoch": 0.14012314157050323, "grad_norm": 10.41651913250454, "learning_rate": 4.900813532322322e-05, "loss": 2.0123, "mean_token_accuracy": 0.43448275327682495, "step": 139120 }, { "epoch": 0.1401281776236074, "grad_norm": 16.591361418564762, "learning_rate": 4.900802529059812e-05, "loss": 2.1933, "mean_token_accuracy": 0.48275862336158754, "step": 139125 }, { "epoch": 0.14013321367671158, "grad_norm": 12.727940082014657, "learning_rate": 4.90079152520077e-05, "loss": 2.2459, "mean_token_accuracy": 0.42758620381355283, "step": 139130 }, { "epoch": 0.14013824972981576, "grad_norm": 9.073764428447051, "learning_rate": 4.900780520745198e-05, "loss": 2.1091, "mean_token_accuracy": 0.4517241358757019, "step": 139135 }, { "epoch": 0.14014328578291993, "grad_norm": 12.034877253069277, "learning_rate": 4.900769515693099e-05, "loss": 2.09, "mean_token_accuracy": 0.5241379201412201, "step": 139140 }, { "epoch": 0.1401483218360241, "grad_norm": 9.312586739437604, "learning_rate": 4.900758510044476e-05, "loss": 2.1223, "mean_token_accuracy": 0.4689655065536499, "step": 139145 }, { "epoch": 0.14015335788912828, "grad_norm": 16.291483471752958, "learning_rate": 4.900747503799332e-05, "loss": 1.8691, "mean_token_accuracy": 0.5263762891292572, "step": 139150 }, { "epoch": 0.14015839394223245, "grad_norm": 11.517378736431523, "learning_rate": 4.90073649695767e-05, "loss": 2.3893, "mean_token_accuracy": 0.4, "step": 139155 }, { "epoch": 0.14016342999533662, "grad_norm": 12.058350532068006, "learning_rate": 4.900725489519493e-05, "loss": 2.3427, "mean_token_accuracy": 0.4724137902259827, "step": 139160 }, { "epoch": 0.1401684660484408, "grad_norm": 12.247367859836316, "learning_rate": 4.900714481484805e-05, "loss": 2.4289, "mean_token_accuracy": 0.42068964540958403, "step": 139165 }, { "epoch": 0.14017350210154497, "grad_norm": 11.664116711394353, "learning_rate": 4.900703472853608e-05, "loss": 2.0739, "mean_token_accuracy": 0.46551724076271056, "step": 139170 }, { "epoch": 0.14017853815464915, "grad_norm": 8.84188175726541, "learning_rate": 4.9006924636259057e-05, "loss": 2.4292, "mean_token_accuracy": 0.45517241954803467, "step": 139175 }, { "epoch": 0.14018357420775332, "grad_norm": 11.66205435050764, "learning_rate": 4.9006814538016996e-05, "loss": 2.42, "mean_token_accuracy": 0.4206896543502808, "step": 139180 }, { "epoch": 0.14018861026085747, "grad_norm": 15.69878509604065, "learning_rate": 4.900670443380995e-05, "loss": 2.3147, "mean_token_accuracy": 0.41034482717514037, "step": 139185 }, { "epoch": 0.14019364631396164, "grad_norm": 10.017662401771489, "learning_rate": 4.9006594323637936e-05, "loss": 2.1387, "mean_token_accuracy": 0.4655172348022461, "step": 139190 }, { "epoch": 0.1401986823670658, "grad_norm": 10.124190116692123, "learning_rate": 4.900648420750099e-05, "loss": 2.7362, "mean_token_accuracy": 0.4103448331356049, "step": 139195 }, { "epoch": 0.14020371842017, "grad_norm": 9.662283841152613, "learning_rate": 4.9006374085399144e-05, "loss": 2.2973, "mean_token_accuracy": 0.4344827651977539, "step": 139200 }, { "epoch": 0.14020875447327416, "grad_norm": 11.179549673853787, "learning_rate": 4.9006263957332416e-05, "loss": 2.4211, "mean_token_accuracy": 0.42413792610168455, "step": 139205 }, { "epoch": 0.14021379052637833, "grad_norm": 10.283234753001329, "learning_rate": 4.9006153823300856e-05, "loss": 2.391, "mean_token_accuracy": 0.42758620977401735, "step": 139210 }, { "epoch": 0.1402188265794825, "grad_norm": 11.171105789881556, "learning_rate": 4.9006043683304476e-05, "loss": 2.4757, "mean_token_accuracy": 0.42413792610168455, "step": 139215 }, { "epoch": 0.14022386263258668, "grad_norm": 8.097784619175115, "learning_rate": 4.900593353734332e-05, "loss": 2.3885, "mean_token_accuracy": 0.4448275864124298, "step": 139220 }, { "epoch": 0.14022889868569086, "grad_norm": 11.955975766076095, "learning_rate": 4.9005823385417407e-05, "loss": 2.5614, "mean_token_accuracy": 0.41724138259887694, "step": 139225 }, { "epoch": 0.14023393473879503, "grad_norm": 10.223153127774294, "learning_rate": 4.900571322752677e-05, "loss": 2.4043, "mean_token_accuracy": 0.4241379380226135, "step": 139230 }, { "epoch": 0.1402389707918992, "grad_norm": 11.52589749687505, "learning_rate": 4.900560306367145e-05, "loss": 2.2384, "mean_token_accuracy": 0.4517241358757019, "step": 139235 }, { "epoch": 0.14024400684500338, "grad_norm": 10.124743661857542, "learning_rate": 4.9005492893851475e-05, "loss": 2.3875, "mean_token_accuracy": 0.43793103098869324, "step": 139240 }, { "epoch": 0.14024904289810755, "grad_norm": 10.44462942717473, "learning_rate": 4.900538271806687e-05, "loss": 2.3663, "mean_token_accuracy": 0.42758620977401735, "step": 139245 }, { "epoch": 0.14025407895121172, "grad_norm": 9.322598172616475, "learning_rate": 4.900527253631766e-05, "loss": 2.16, "mean_token_accuracy": 0.47586206793785096, "step": 139250 }, { "epoch": 0.1402591150043159, "grad_norm": 9.584306030148369, "learning_rate": 4.900516234860388e-05, "loss": 2.3789, "mean_token_accuracy": 0.46442831158638, "step": 139255 }, { "epoch": 0.14026415105742007, "grad_norm": 9.659192377417122, "learning_rate": 4.900505215492558e-05, "loss": 2.8023, "mean_token_accuracy": 0.3965517282485962, "step": 139260 }, { "epoch": 0.14026918711052425, "grad_norm": 12.828217970272565, "learning_rate": 4.9004941955282756e-05, "loss": 2.9303, "mean_token_accuracy": 0.3241379290819168, "step": 139265 }, { "epoch": 0.14027422316362842, "grad_norm": 12.259214896672813, "learning_rate": 4.900483174967546e-05, "loss": 2.6622, "mean_token_accuracy": 0.4137930989265442, "step": 139270 }, { "epoch": 0.1402792592167326, "grad_norm": 11.350557603293797, "learning_rate": 4.900472153810372e-05, "loss": 1.9712, "mean_token_accuracy": 0.48965516686439514, "step": 139275 }, { "epoch": 0.14028429526983677, "grad_norm": 10.581251705602003, "learning_rate": 4.9004611320567566e-05, "loss": 2.017, "mean_token_accuracy": 0.47586206793785096, "step": 139280 }, { "epoch": 0.14028933132294094, "grad_norm": 10.82034167271544, "learning_rate": 4.900450109706703e-05, "loss": 2.1689, "mean_token_accuracy": 0.4068965494632721, "step": 139285 }, { "epoch": 0.14029436737604511, "grad_norm": 13.203422437323873, "learning_rate": 4.900439086760214e-05, "loss": 2.1883, "mean_token_accuracy": 0.4689655125141144, "step": 139290 }, { "epoch": 0.1402994034291493, "grad_norm": 9.205727895805992, "learning_rate": 4.900428063217293e-05, "loss": 2.3245, "mean_token_accuracy": 0.45172414779663084, "step": 139295 }, { "epoch": 0.14030443948225346, "grad_norm": 19.21318897273395, "learning_rate": 4.900417039077942e-05, "loss": 2.9463, "mean_token_accuracy": 0.3344827562570572, "step": 139300 }, { "epoch": 0.14030947553535764, "grad_norm": 16.47565831909942, "learning_rate": 4.9004060143421656e-05, "loss": 2.9311, "mean_token_accuracy": 0.3896551698446274, "step": 139305 }, { "epoch": 0.1403145115884618, "grad_norm": 9.05375693288172, "learning_rate": 4.9003949890099654e-05, "loss": 2.4132, "mean_token_accuracy": 0.4156079888343811, "step": 139310 }, { "epoch": 0.14031954764156598, "grad_norm": 11.980634945575275, "learning_rate": 4.9003839630813456e-05, "loss": 2.2819, "mean_token_accuracy": 0.4517241418361664, "step": 139315 }, { "epoch": 0.14032458369467013, "grad_norm": 9.041239592318718, "learning_rate": 4.900372936556308e-05, "loss": 2.5154, "mean_token_accuracy": 0.41034482717514037, "step": 139320 }, { "epoch": 0.1403296197477743, "grad_norm": 10.722671620746567, "learning_rate": 4.900361909434857e-05, "loss": 2.5416, "mean_token_accuracy": 0.39655172228813174, "step": 139325 }, { "epoch": 0.14033465580087848, "grad_norm": 9.231609839942674, "learning_rate": 4.9003508817169955e-05, "loss": 2.1098, "mean_token_accuracy": 0.4206896543502808, "step": 139330 }, { "epoch": 0.14033969185398265, "grad_norm": 10.24275957869904, "learning_rate": 4.9003398534027257e-05, "loss": 2.2875, "mean_token_accuracy": 0.441379314661026, "step": 139335 }, { "epoch": 0.14034472790708682, "grad_norm": 11.761517503972227, "learning_rate": 4.900328824492051e-05, "loss": 2.4554, "mean_token_accuracy": 0.4482758641242981, "step": 139340 }, { "epoch": 0.140349763960191, "grad_norm": 11.761358809799896, "learning_rate": 4.9003177949849746e-05, "loss": 2.6772, "mean_token_accuracy": 0.4000000059604645, "step": 139345 }, { "epoch": 0.14035480001329517, "grad_norm": 10.137150080451988, "learning_rate": 4.9003067648814994e-05, "loss": 2.6645, "mean_token_accuracy": 0.40865094065666197, "step": 139350 }, { "epoch": 0.14035983606639935, "grad_norm": 9.896264781417404, "learning_rate": 4.900295734181629e-05, "loss": 2.4077, "mean_token_accuracy": 0.44827587008476255, "step": 139355 }, { "epoch": 0.14036487211950352, "grad_norm": 8.832964560311275, "learning_rate": 4.900284702885366e-05, "loss": 2.2254, "mean_token_accuracy": 0.4812462151050568, "step": 139360 }, { "epoch": 0.1403699081726077, "grad_norm": 11.209053106571305, "learning_rate": 4.9002736709927135e-05, "loss": 2.2229, "mean_token_accuracy": 0.45862069725990295, "step": 139365 }, { "epoch": 0.14037494422571187, "grad_norm": 8.675804603529198, "learning_rate": 4.9002626385036744e-05, "loss": 2.2057, "mean_token_accuracy": 0.47931034564971925, "step": 139370 }, { "epoch": 0.14037998027881604, "grad_norm": 9.656935528038789, "learning_rate": 4.900251605418252e-05, "loss": 2.5236, "mean_token_accuracy": 0.43103448748588563, "step": 139375 }, { "epoch": 0.14038501633192021, "grad_norm": 10.335045491477471, "learning_rate": 4.900240571736449e-05, "loss": 2.42, "mean_token_accuracy": 0.3793103456497192, "step": 139380 }, { "epoch": 0.1403900523850244, "grad_norm": 10.545376150839555, "learning_rate": 4.900229537458269e-05, "loss": 2.7233, "mean_token_accuracy": 0.4103448212146759, "step": 139385 }, { "epoch": 0.14039508843812856, "grad_norm": 10.054170697550973, "learning_rate": 4.900218502583715e-05, "loss": 2.0616, "mean_token_accuracy": 0.46551724076271056, "step": 139390 }, { "epoch": 0.14040012449123274, "grad_norm": 9.992570489486635, "learning_rate": 4.90020746711279e-05, "loss": 2.295, "mean_token_accuracy": 0.45517241954803467, "step": 139395 }, { "epoch": 0.1404051605443369, "grad_norm": 12.087619958634065, "learning_rate": 4.900196431045497e-05, "loss": 2.5257, "mean_token_accuracy": 0.41379310488700866, "step": 139400 }, { "epoch": 0.14041019659744108, "grad_norm": 11.297483629008509, "learning_rate": 4.900185394381839e-05, "loss": 2.7386, "mean_token_accuracy": 0.3862069010734558, "step": 139405 }, { "epoch": 0.14041523265054526, "grad_norm": 12.593812614576546, "learning_rate": 4.900174357121818e-05, "loss": 2.4246, "mean_token_accuracy": 0.37586206793785093, "step": 139410 }, { "epoch": 0.14042026870364943, "grad_norm": 8.909897318422171, "learning_rate": 4.900163319265439e-05, "loss": 2.4036, "mean_token_accuracy": 0.4398064136505127, "step": 139415 }, { "epoch": 0.1404253047567536, "grad_norm": 10.338116893786037, "learning_rate": 4.9001522808127045e-05, "loss": 2.3046, "mean_token_accuracy": 0.48275861144065857, "step": 139420 }, { "epoch": 0.14043034080985778, "grad_norm": 10.942972624665288, "learning_rate": 4.900141241763617e-05, "loss": 2.4669, "mean_token_accuracy": 0.43103447556495667, "step": 139425 }, { "epoch": 0.14043537686296195, "grad_norm": 11.438862790844516, "learning_rate": 4.9001302021181794e-05, "loss": 2.258, "mean_token_accuracy": 0.40514216423034666, "step": 139430 }, { "epoch": 0.14044041291606613, "grad_norm": 10.664454673071734, "learning_rate": 4.9001191618763954e-05, "loss": 2.1797, "mean_token_accuracy": 0.4984271049499512, "step": 139435 }, { "epoch": 0.1404454489691703, "grad_norm": 9.828506200140152, "learning_rate": 4.9001081210382685e-05, "loss": 2.0698, "mean_token_accuracy": 0.4396854221820831, "step": 139440 }, { "epoch": 0.14045048502227447, "grad_norm": 11.173908697940576, "learning_rate": 4.9000970796038e-05, "loss": 2.2084, "mean_token_accuracy": 0.4517241299152374, "step": 139445 }, { "epoch": 0.14045552107537865, "grad_norm": 11.476234922586118, "learning_rate": 4.900086037572995e-05, "loss": 2.4118, "mean_token_accuracy": 0.4344827651977539, "step": 139450 }, { "epoch": 0.14046055712848282, "grad_norm": 10.761930333336347, "learning_rate": 4.9000749949458544e-05, "loss": 2.5639, "mean_token_accuracy": 0.4310344815254211, "step": 139455 }, { "epoch": 0.14046559318158697, "grad_norm": 12.552787006342987, "learning_rate": 4.900063951722384e-05, "loss": 2.6124, "mean_token_accuracy": 0.4344827592372894, "step": 139460 }, { "epoch": 0.14047062923469114, "grad_norm": 10.376070024977015, "learning_rate": 4.9000529079025835e-05, "loss": 2.3629, "mean_token_accuracy": 0.4793103516101837, "step": 139465 }, { "epoch": 0.14047566528779531, "grad_norm": 11.142843012201803, "learning_rate": 4.900041863486459e-05, "loss": 1.9052, "mean_token_accuracy": 0.5137930929660797, "step": 139470 }, { "epoch": 0.1404807013408995, "grad_norm": 14.60091052510591, "learning_rate": 4.900030818474012e-05, "loss": 2.979, "mean_token_accuracy": 0.4172413796186447, "step": 139475 }, { "epoch": 0.14048573739400366, "grad_norm": 12.897988935225726, "learning_rate": 4.900019772865247e-05, "loss": 2.7057, "mean_token_accuracy": 0.3862068921327591, "step": 139480 }, { "epoch": 0.14049077344710784, "grad_norm": 10.324823458658424, "learning_rate": 4.9000087266601645e-05, "loss": 2.502, "mean_token_accuracy": 0.4068965494632721, "step": 139485 }, { "epoch": 0.140495809500212, "grad_norm": 10.953638221385665, "learning_rate": 4.899997679858769e-05, "loss": 2.27, "mean_token_accuracy": 0.4275862157344818, "step": 139490 }, { "epoch": 0.14050084555331618, "grad_norm": 11.062164752672775, "learning_rate": 4.899986632461065e-05, "loss": 2.0976, "mean_token_accuracy": 0.4758620738983154, "step": 139495 }, { "epoch": 0.14050588160642036, "grad_norm": 10.402952569081437, "learning_rate": 4.899975584467053e-05, "loss": 2.6015, "mean_token_accuracy": 0.4206896543502808, "step": 139500 }, { "epoch": 0.14051091765952453, "grad_norm": 9.00585432002654, "learning_rate": 4.8999645358767385e-05, "loss": 2.0235, "mean_token_accuracy": 0.4931034505367279, "step": 139505 }, { "epoch": 0.1405159537126287, "grad_norm": 11.48408208839091, "learning_rate": 4.899953486690123e-05, "loss": 2.6746, "mean_token_accuracy": 0.3931034475564957, "step": 139510 }, { "epoch": 0.14052098976573288, "grad_norm": 8.903935284368469, "learning_rate": 4.899942436907209e-05, "loss": 2.4534, "mean_token_accuracy": 0.43793103098869324, "step": 139515 }, { "epoch": 0.14052602581883705, "grad_norm": 10.829057253173971, "learning_rate": 4.899931386528001e-05, "loss": 2.1615, "mean_token_accuracy": 0.4862068951129913, "step": 139520 }, { "epoch": 0.14053106187194123, "grad_norm": 8.982386851741925, "learning_rate": 4.899920335552501e-05, "loss": 2.0607, "mean_token_accuracy": 0.4862069010734558, "step": 139525 }, { "epoch": 0.1405360979250454, "grad_norm": 10.897197379718845, "learning_rate": 4.8999092839807134e-05, "loss": 2.345, "mean_token_accuracy": 0.4517241358757019, "step": 139530 }, { "epoch": 0.14054113397814957, "grad_norm": 11.031823700677897, "learning_rate": 4.89989823181264e-05, "loss": 2.2833, "mean_token_accuracy": 0.45366001725196836, "step": 139535 }, { "epoch": 0.14054617003125375, "grad_norm": 12.70552698639148, "learning_rate": 4.8998871790482836e-05, "loss": 2.5248, "mean_token_accuracy": 0.4137930989265442, "step": 139540 }, { "epoch": 0.14055120608435792, "grad_norm": 9.040965860315774, "learning_rate": 4.899876125687649e-05, "loss": 2.1248, "mean_token_accuracy": 0.4517241358757019, "step": 139545 }, { "epoch": 0.1405562421374621, "grad_norm": 9.875195562904292, "learning_rate": 4.8998650717307384e-05, "loss": 2.4734, "mean_token_accuracy": 0.4206896543502808, "step": 139550 }, { "epoch": 0.14056127819056627, "grad_norm": 10.370181418695568, "learning_rate": 4.8998540171775535e-05, "loss": 2.2798, "mean_token_accuracy": 0.44827585816383364, "step": 139555 }, { "epoch": 0.14056631424367044, "grad_norm": 9.325861809152086, "learning_rate": 4.8998429620280995e-05, "loss": 2.1956, "mean_token_accuracy": 0.4724137902259827, "step": 139560 }, { "epoch": 0.14057135029677462, "grad_norm": 9.799924933096444, "learning_rate": 4.899831906282378e-05, "loss": 2.6542, "mean_token_accuracy": 0.4068965554237366, "step": 139565 }, { "epoch": 0.1405763863498788, "grad_norm": 10.422712060934366, "learning_rate": 4.8998208499403934e-05, "loss": 2.3175, "mean_token_accuracy": 0.4689655125141144, "step": 139570 }, { "epoch": 0.14058142240298296, "grad_norm": 9.22770587468454, "learning_rate": 4.8998097930021474e-05, "loss": 2.3997, "mean_token_accuracy": 0.4206896543502808, "step": 139575 }, { "epoch": 0.14058645845608714, "grad_norm": 9.77450462307289, "learning_rate": 4.899798735467643e-05, "loss": 2.2906, "mean_token_accuracy": 0.4, "step": 139580 }, { "epoch": 0.1405914945091913, "grad_norm": 14.29418619099334, "learning_rate": 4.899787677336886e-05, "loss": 2.6012, "mean_token_accuracy": 0.4344827592372894, "step": 139585 }, { "epoch": 0.14059653056229549, "grad_norm": 10.946760185547165, "learning_rate": 4.899776618609876e-05, "loss": 2.751, "mean_token_accuracy": 0.44137930274009707, "step": 139590 }, { "epoch": 0.14060156661539966, "grad_norm": 9.114994358870264, "learning_rate": 4.899765559286617e-05, "loss": 2.3317, "mean_token_accuracy": 0.4517241418361664, "step": 139595 }, { "epoch": 0.1406066026685038, "grad_norm": 10.06857655927627, "learning_rate": 4.899754499367113e-05, "loss": 2.4197, "mean_token_accuracy": 0.4430127084255219, "step": 139600 }, { "epoch": 0.14061163872160798, "grad_norm": 12.347033308199482, "learning_rate": 4.899743438851366e-05, "loss": 2.2558, "mean_token_accuracy": 0.4413793087005615, "step": 139605 }, { "epoch": 0.14061667477471215, "grad_norm": 12.026445723120974, "learning_rate": 4.89973237773938e-05, "loss": 2.2505, "mean_token_accuracy": 0.4413793087005615, "step": 139610 }, { "epoch": 0.14062171082781633, "grad_norm": 9.958994714436637, "learning_rate": 4.8997213160311585e-05, "loss": 2.2818, "mean_token_accuracy": 0.4620689570903778, "step": 139615 }, { "epoch": 0.1406267468809205, "grad_norm": 8.968294497628191, "learning_rate": 4.8997102537267026e-05, "loss": 2.2862, "mean_token_accuracy": 0.4103448331356049, "step": 139620 }, { "epoch": 0.14063178293402467, "grad_norm": 10.250974065133866, "learning_rate": 4.899699190826017e-05, "loss": 2.1489, "mean_token_accuracy": 0.44827585220336913, "step": 139625 }, { "epoch": 0.14063681898712885, "grad_norm": 11.022246682922955, "learning_rate": 4.899688127329105e-05, "loss": 2.4209, "mean_token_accuracy": 0.4620689690113068, "step": 139630 }, { "epoch": 0.14064185504023302, "grad_norm": 11.751815057017549, "learning_rate": 4.899677063235968e-05, "loss": 2.3832, "mean_token_accuracy": 0.4103448331356049, "step": 139635 }, { "epoch": 0.1406468910933372, "grad_norm": 9.93093762813725, "learning_rate": 4.89966599854661e-05, "loss": 2.3763, "mean_token_accuracy": 0.46551724672317507, "step": 139640 }, { "epoch": 0.14065192714644137, "grad_norm": 12.325785950215145, "learning_rate": 4.899654933261035e-05, "loss": 2.5596, "mean_token_accuracy": 0.4103448212146759, "step": 139645 }, { "epoch": 0.14065696319954554, "grad_norm": 10.450418428944218, "learning_rate": 4.8996438673792444e-05, "loss": 2.3352, "mean_token_accuracy": 0.42068964838981626, "step": 139650 }, { "epoch": 0.14066199925264972, "grad_norm": 11.979308607118154, "learning_rate": 4.8996328009012424e-05, "loss": 2.2246, "mean_token_accuracy": 0.4655172348022461, "step": 139655 }, { "epoch": 0.1406670353057539, "grad_norm": 9.971127001869693, "learning_rate": 4.899621733827032e-05, "loss": 2.5945, "mean_token_accuracy": 0.39655171930789945, "step": 139660 }, { "epoch": 0.14067207135885806, "grad_norm": 10.396666104581882, "learning_rate": 4.899610666156615e-05, "loss": 2.1845, "mean_token_accuracy": 0.4689655125141144, "step": 139665 }, { "epoch": 0.14067710741196224, "grad_norm": 9.985233472204856, "learning_rate": 4.899599597889997e-05, "loss": 2.1423, "mean_token_accuracy": 0.4862068951129913, "step": 139670 }, { "epoch": 0.1406821434650664, "grad_norm": 12.209251818893916, "learning_rate": 4.899588529027178e-05, "loss": 2.2087, "mean_token_accuracy": 0.43793103098869324, "step": 139675 }, { "epoch": 0.14068717951817059, "grad_norm": 10.066478584763802, "learning_rate": 4.899577459568164e-05, "loss": 2.1785, "mean_token_accuracy": 0.45015124082565305, "step": 139680 }, { "epoch": 0.14069221557127476, "grad_norm": 14.365214932951366, "learning_rate": 4.899566389512956e-05, "loss": 2.0712, "mean_token_accuracy": 0.4896551728248596, "step": 139685 }, { "epoch": 0.14069725162437893, "grad_norm": 12.85979836097937, "learning_rate": 4.899555318861557e-05, "loss": 1.9809, "mean_token_accuracy": 0.4724137902259827, "step": 139690 }, { "epoch": 0.1407022876774831, "grad_norm": 11.075975719662246, "learning_rate": 4.8995442476139714e-05, "loss": 2.5468, "mean_token_accuracy": 0.44827585816383364, "step": 139695 }, { "epoch": 0.14070732373058728, "grad_norm": 9.748092925762462, "learning_rate": 4.899533175770202e-05, "loss": 2.6427, "mean_token_accuracy": 0.38275861740112305, "step": 139700 }, { "epoch": 0.14071235978369145, "grad_norm": 10.502854651784915, "learning_rate": 4.8995221033302515e-05, "loss": 2.3565, "mean_token_accuracy": 0.43103448748588563, "step": 139705 }, { "epoch": 0.14071739583679563, "grad_norm": 19.017710038931856, "learning_rate": 4.899511030294123e-05, "loss": 2.3597, "mean_token_accuracy": 0.458620685338974, "step": 139710 }, { "epoch": 0.1407224318898998, "grad_norm": 9.037820447294438, "learning_rate": 4.8994999566618196e-05, "loss": 2.1866, "mean_token_accuracy": 0.4965517222881317, "step": 139715 }, { "epoch": 0.14072746794300398, "grad_norm": 9.972292385846975, "learning_rate": 4.899488882433344e-05, "loss": 2.193, "mean_token_accuracy": 0.4517241418361664, "step": 139720 }, { "epoch": 0.14073250399610815, "grad_norm": 11.05430110814426, "learning_rate": 4.899477807608699e-05, "loss": 2.7626, "mean_token_accuracy": 0.36896551847457887, "step": 139725 }, { "epoch": 0.14073754004921232, "grad_norm": 10.639225865633867, "learning_rate": 4.89946673218789e-05, "loss": 2.323, "mean_token_accuracy": 0.45862069725990295, "step": 139730 }, { "epoch": 0.1407425761023165, "grad_norm": 8.9008135600385, "learning_rate": 4.8994556561709174e-05, "loss": 2.833, "mean_token_accuracy": 0.3965517282485962, "step": 139735 }, { "epoch": 0.14074761215542064, "grad_norm": 11.262624325821289, "learning_rate": 4.8994445795577853e-05, "loss": 2.5571, "mean_token_accuracy": 0.42068966031074523, "step": 139740 }, { "epoch": 0.14075264820852482, "grad_norm": 8.759702841218077, "learning_rate": 4.899433502348497e-05, "loss": 2.3394, "mean_token_accuracy": 0.43103447556495667, "step": 139745 }, { "epoch": 0.140757684261629, "grad_norm": 9.990617824041623, "learning_rate": 4.899422424543055e-05, "loss": 2.0778, "mean_token_accuracy": 0.4676345944404602, "step": 139750 }, { "epoch": 0.14076272031473316, "grad_norm": 11.990691204399099, "learning_rate": 4.899411346141463e-05, "loss": 2.411, "mean_token_accuracy": 0.441379314661026, "step": 139755 }, { "epoch": 0.14076775636783734, "grad_norm": 8.14239853760547, "learning_rate": 4.899400267143724e-05, "loss": 2.2878, "mean_token_accuracy": 0.5000000059604645, "step": 139760 }, { "epoch": 0.1407727924209415, "grad_norm": 7.304990894467067, "learning_rate": 4.89938918754984e-05, "loss": 2.3353, "mean_token_accuracy": 0.4560344874858856, "step": 139765 }, { "epoch": 0.14077782847404569, "grad_norm": 9.711098969676604, "learning_rate": 4.899378107359815e-05, "loss": 2.4662, "mean_token_accuracy": 0.47931033968925474, "step": 139770 }, { "epoch": 0.14078286452714986, "grad_norm": 11.452736443222044, "learning_rate": 4.8993670265736526e-05, "loss": 2.7586, "mean_token_accuracy": 0.41923774480819703, "step": 139775 }, { "epoch": 0.14078790058025403, "grad_norm": 11.65883598095452, "learning_rate": 4.8993559451913544e-05, "loss": 2.6439, "mean_token_accuracy": 0.417241370677948, "step": 139780 }, { "epoch": 0.1407929366333582, "grad_norm": 8.965376572137226, "learning_rate": 4.899344863212925e-05, "loss": 2.1841, "mean_token_accuracy": 0.47586206793785096, "step": 139785 }, { "epoch": 0.14079797268646238, "grad_norm": 10.979349000585401, "learning_rate": 4.8993337806383665e-05, "loss": 2.337, "mean_token_accuracy": 0.47586206793785096, "step": 139790 }, { "epoch": 0.14080300873956655, "grad_norm": 10.040133198237875, "learning_rate": 4.899322697467682e-05, "loss": 2.5308, "mean_token_accuracy": 0.4034482717514038, "step": 139795 }, { "epoch": 0.14080804479267073, "grad_norm": 12.413311189740595, "learning_rate": 4.899311613700875e-05, "loss": 2.5288, "mean_token_accuracy": 0.36551723480224607, "step": 139800 }, { "epoch": 0.1408130808457749, "grad_norm": 9.630840695022773, "learning_rate": 4.8993005293379494e-05, "loss": 2.7143, "mean_token_accuracy": 0.4448275864124298, "step": 139805 }, { "epoch": 0.14081811689887908, "grad_norm": 11.555177692253574, "learning_rate": 4.899289444378906e-05, "loss": 2.3341, "mean_token_accuracy": 0.3862068891525269, "step": 139810 }, { "epoch": 0.14082315295198325, "grad_norm": 9.979713772713549, "learning_rate": 4.8992783588237504e-05, "loss": 2.3791, "mean_token_accuracy": 0.39310344457626345, "step": 139815 }, { "epoch": 0.14082818900508742, "grad_norm": 9.296548387463963, "learning_rate": 4.899267272672483e-05, "loss": 2.7434, "mean_token_accuracy": 0.4068965494632721, "step": 139820 }, { "epoch": 0.1408332250581916, "grad_norm": 10.655820474670318, "learning_rate": 4.899256185925109e-05, "loss": 2.3232, "mean_token_accuracy": 0.45862069725990295, "step": 139825 }, { "epoch": 0.14083826111129577, "grad_norm": 13.862068093895052, "learning_rate": 4.8992450985816306e-05, "loss": 2.8521, "mean_token_accuracy": 0.3793103516101837, "step": 139830 }, { "epoch": 0.14084329716439994, "grad_norm": 8.981484648042969, "learning_rate": 4.8992340106420506e-05, "loss": 1.9815, "mean_token_accuracy": 0.49999999403953554, "step": 139835 }, { "epoch": 0.14084833321750412, "grad_norm": 10.667547079564688, "learning_rate": 4.8992229221063735e-05, "loss": 2.6164, "mean_token_accuracy": 0.3862069010734558, "step": 139840 }, { "epoch": 0.1408533692706083, "grad_norm": 9.57885945424305, "learning_rate": 4.899211832974602e-05, "loss": 2.629, "mean_token_accuracy": 0.4413793087005615, "step": 139845 }, { "epoch": 0.14085840532371247, "grad_norm": 11.856917455682977, "learning_rate": 4.899200743246736e-05, "loss": 2.8038, "mean_token_accuracy": 0.43793103098869324, "step": 139850 }, { "epoch": 0.14086344137681664, "grad_norm": 9.610660169698743, "learning_rate": 4.899189652922783e-05, "loss": 2.2006, "mean_token_accuracy": 0.42758620381355283, "step": 139855 }, { "epoch": 0.1408684774299208, "grad_norm": 9.77261766777269, "learning_rate": 4.899178562002744e-05, "loss": 2.3198, "mean_token_accuracy": 0.4310344815254211, "step": 139860 }, { "epoch": 0.140873513483025, "grad_norm": 9.766768902788696, "learning_rate": 4.899167470486622e-05, "loss": 2.1202, "mean_token_accuracy": 0.4931034564971924, "step": 139865 }, { "epoch": 0.14087854953612916, "grad_norm": 11.156741153298752, "learning_rate": 4.899156378374422e-05, "loss": 2.4204, "mean_token_accuracy": 0.3984875977039337, "step": 139870 }, { "epoch": 0.14088358558923333, "grad_norm": 8.550468581124601, "learning_rate": 4.899145285666144e-05, "loss": 2.6242, "mean_token_accuracy": 0.38965517580509185, "step": 139875 }, { "epoch": 0.14088862164233748, "grad_norm": 11.476300785924476, "learning_rate": 4.8991341923617924e-05, "loss": 2.3596, "mean_token_accuracy": 0.39310345649719236, "step": 139880 }, { "epoch": 0.14089365769544165, "grad_norm": 9.637903713103686, "learning_rate": 4.89912309846137e-05, "loss": 2.3063, "mean_token_accuracy": 0.38620689511299133, "step": 139885 }, { "epoch": 0.14089869374854583, "grad_norm": 11.384715198003379, "learning_rate": 4.899112003964881e-05, "loss": 2.5156, "mean_token_accuracy": 0.4137930989265442, "step": 139890 }, { "epoch": 0.14090372980165, "grad_norm": 10.143126159734122, "learning_rate": 4.899100908872328e-05, "loss": 2.2249, "mean_token_accuracy": 0.4344827592372894, "step": 139895 }, { "epoch": 0.14090876585475418, "grad_norm": 9.208608954645548, "learning_rate": 4.8990898131837134e-05, "loss": 2.2479, "mean_token_accuracy": 0.42413793206214906, "step": 139900 }, { "epoch": 0.14091380190785835, "grad_norm": 10.489483594896107, "learning_rate": 4.899078716899041e-05, "loss": 2.1731, "mean_token_accuracy": 0.46551724076271056, "step": 139905 }, { "epoch": 0.14091883796096252, "grad_norm": 9.781124148368592, "learning_rate": 4.899067620018314e-05, "loss": 2.4242, "mean_token_accuracy": 0.4503932178020477, "step": 139910 }, { "epoch": 0.1409238740140667, "grad_norm": 10.323881929166419, "learning_rate": 4.8990565225415345e-05, "loss": 2.7946, "mean_token_accuracy": 0.41034482717514037, "step": 139915 }, { "epoch": 0.14092891006717087, "grad_norm": 10.819996630872469, "learning_rate": 4.899045424468706e-05, "loss": 2.3114, "mean_token_accuracy": 0.41379310488700866, "step": 139920 }, { "epoch": 0.14093394612027504, "grad_norm": 11.843478603817722, "learning_rate": 4.8990343257998316e-05, "loss": 2.6739, "mean_token_accuracy": 0.4172413766384125, "step": 139925 }, { "epoch": 0.14093898217337922, "grad_norm": 10.319134352011824, "learning_rate": 4.8990232265349156e-05, "loss": 2.338, "mean_token_accuracy": 0.44482758045196535, "step": 139930 }, { "epoch": 0.1409440182264834, "grad_norm": 10.518440036706624, "learning_rate": 4.8990121266739594e-05, "loss": 2.2685, "mean_token_accuracy": 0.41724138259887694, "step": 139935 }, { "epoch": 0.14094905427958757, "grad_norm": 10.728684087341568, "learning_rate": 4.899001026216966e-05, "loss": 2.6778, "mean_token_accuracy": 0.4103448331356049, "step": 139940 }, { "epoch": 0.14095409033269174, "grad_norm": 10.516534104407803, "learning_rate": 4.89898992516394e-05, "loss": 2.4708, "mean_token_accuracy": 0.47586206793785096, "step": 139945 }, { "epoch": 0.1409591263857959, "grad_norm": 9.390443545951973, "learning_rate": 4.898978823514883e-05, "loss": 2.2734, "mean_token_accuracy": 0.4620689690113068, "step": 139950 }, { "epoch": 0.1409641624389001, "grad_norm": 9.4896109536185, "learning_rate": 4.8989677212697996e-05, "loss": 2.3454, "mean_token_accuracy": 0.4517241358757019, "step": 139955 }, { "epoch": 0.14096919849200426, "grad_norm": 9.213443594944872, "learning_rate": 4.898956618428691e-05, "loss": 2.2364, "mean_token_accuracy": 0.4517241358757019, "step": 139960 }, { "epoch": 0.14097423454510843, "grad_norm": 11.50410476388823, "learning_rate": 4.8989455149915615e-05, "loss": 2.3556, "mean_token_accuracy": 0.4103448331356049, "step": 139965 }, { "epoch": 0.1409792705982126, "grad_norm": 14.455722093071376, "learning_rate": 4.898934410958414e-05, "loss": 2.4618, "mean_token_accuracy": 0.4413793087005615, "step": 139970 }, { "epoch": 0.14098430665131678, "grad_norm": 14.396036685765093, "learning_rate": 4.8989233063292516e-05, "loss": 2.5183, "mean_token_accuracy": 0.4068965494632721, "step": 139975 }, { "epoch": 0.14098934270442096, "grad_norm": 10.747526328631206, "learning_rate": 4.898912201104077e-05, "loss": 2.3234, "mean_token_accuracy": 0.47241378426551817, "step": 139980 }, { "epoch": 0.14099437875752513, "grad_norm": 10.97783751007184, "learning_rate": 4.898901095282894e-05, "loss": 2.6459, "mean_token_accuracy": 0.35862069129943847, "step": 139985 }, { "epoch": 0.1409994148106293, "grad_norm": 9.044167778940276, "learning_rate": 4.898889988865706e-05, "loss": 1.7793, "mean_token_accuracy": 0.5793103396892547, "step": 139990 }, { "epoch": 0.14100445086373348, "grad_norm": 10.707788793871304, "learning_rate": 4.898878881852514e-05, "loss": 2.7049, "mean_token_accuracy": 0.3965517163276672, "step": 139995 }, { "epoch": 0.14100948691683765, "grad_norm": 10.898108399224432, "learning_rate": 4.898867774243323e-05, "loss": 2.8714, "mean_token_accuracy": 0.40859044194221494, "step": 140000 }, { "epoch": 0.14101452296994182, "grad_norm": 8.18986887852621, "learning_rate": 4.898856666038136e-05, "loss": 2.2691, "mean_token_accuracy": 0.46896552443504336, "step": 140005 }, { "epoch": 0.141019559023046, "grad_norm": 12.161511605561584, "learning_rate": 4.898845557236955e-05, "loss": 2.5507, "mean_token_accuracy": 0.3896551728248596, "step": 140010 }, { "epoch": 0.14102459507615017, "grad_norm": 10.658953642251863, "learning_rate": 4.898834447839783e-05, "loss": 2.294, "mean_token_accuracy": 0.42413793206214906, "step": 140015 }, { "epoch": 0.14102963112925432, "grad_norm": 10.485906680599228, "learning_rate": 4.898823337846625e-05, "loss": 2.537, "mean_token_accuracy": 0.4068965494632721, "step": 140020 }, { "epoch": 0.1410346671823585, "grad_norm": 11.290276387902704, "learning_rate": 4.898812227257482e-05, "loss": 2.3955, "mean_token_accuracy": 0.40689656138420105, "step": 140025 }, { "epoch": 0.14103970323546267, "grad_norm": 8.670751035579551, "learning_rate": 4.898801116072358e-05, "loss": 2.0738, "mean_token_accuracy": 0.47586206197738645, "step": 140030 }, { "epoch": 0.14104473928856684, "grad_norm": 10.153570688196185, "learning_rate": 4.898790004291257e-05, "loss": 2.2202, "mean_token_accuracy": 0.4551724135875702, "step": 140035 }, { "epoch": 0.141049775341671, "grad_norm": 9.6228233344508, "learning_rate": 4.8987788919141796e-05, "loss": 2.1784, "mean_token_accuracy": 0.4586206912994385, "step": 140040 }, { "epoch": 0.1410548113947752, "grad_norm": 11.342022243938711, "learning_rate": 4.898767778941132e-05, "loss": 2.0338, "mean_token_accuracy": 0.47586206197738645, "step": 140045 }, { "epoch": 0.14105984744787936, "grad_norm": 13.364459776718911, "learning_rate": 4.898756665372114e-05, "loss": 2.2932, "mean_token_accuracy": 0.4770935893058777, "step": 140050 }, { "epoch": 0.14106488350098353, "grad_norm": 10.27067917033712, "learning_rate": 4.898745551207131e-05, "loss": 2.1734, "mean_token_accuracy": 0.4551724135875702, "step": 140055 }, { "epoch": 0.1410699195540877, "grad_norm": 8.861173507432301, "learning_rate": 4.8987344364461856e-05, "loss": 1.9017, "mean_token_accuracy": 0.48620688915252686, "step": 140060 }, { "epoch": 0.14107495560719188, "grad_norm": 9.07117296886931, "learning_rate": 4.898723321089281e-05, "loss": 2.1245, "mean_token_accuracy": 0.506896561384201, "step": 140065 }, { "epoch": 0.14107999166029606, "grad_norm": 10.99698495734675, "learning_rate": 4.8987122051364184e-05, "loss": 2.5377, "mean_token_accuracy": 0.4379310369491577, "step": 140070 }, { "epoch": 0.14108502771340023, "grad_norm": 11.00143540255811, "learning_rate": 4.898701088587604e-05, "loss": 2.5042, "mean_token_accuracy": 0.41034482717514037, "step": 140075 }, { "epoch": 0.1410900637665044, "grad_norm": 9.113838658528676, "learning_rate": 4.898689971442839e-05, "loss": 2.4596, "mean_token_accuracy": 0.3896551728248596, "step": 140080 }, { "epoch": 0.14109509981960858, "grad_norm": 14.643268076382094, "learning_rate": 4.8986788537021264e-05, "loss": 2.8461, "mean_token_accuracy": 0.379310342669487, "step": 140085 }, { "epoch": 0.14110013587271275, "grad_norm": 8.575846274495344, "learning_rate": 4.89866773536547e-05, "loss": 2.4397, "mean_token_accuracy": 0.39310344457626345, "step": 140090 }, { "epoch": 0.14110517192581692, "grad_norm": 9.03748019875443, "learning_rate": 4.898656616432873e-05, "loss": 2.5223, "mean_token_accuracy": 0.4482758641242981, "step": 140095 }, { "epoch": 0.1411102079789211, "grad_norm": 10.441266561970286, "learning_rate": 4.898645496904337e-05, "loss": 2.4091, "mean_token_accuracy": 0.42068966031074523, "step": 140100 }, { "epoch": 0.14111524403202527, "grad_norm": 8.736722222990101, "learning_rate": 4.898634376779867e-05, "loss": 2.7521, "mean_token_accuracy": 0.4034482777118683, "step": 140105 }, { "epoch": 0.14112028008512945, "grad_norm": 12.10196233947874, "learning_rate": 4.898623256059466e-05, "loss": 2.6441, "mean_token_accuracy": 0.3896551728248596, "step": 140110 }, { "epoch": 0.14112531613823362, "grad_norm": 11.653963721379736, "learning_rate": 4.8986121347431345e-05, "loss": 2.3006, "mean_token_accuracy": 0.4172413766384125, "step": 140115 }, { "epoch": 0.1411303521913378, "grad_norm": 12.20022416476889, "learning_rate": 4.8986010128308784e-05, "loss": 2.5637, "mean_token_accuracy": 0.3793103516101837, "step": 140120 }, { "epoch": 0.14113538824444197, "grad_norm": 14.65555960243942, "learning_rate": 4.8985898903227e-05, "loss": 2.2879, "mean_token_accuracy": 0.4448275864124298, "step": 140125 }, { "epoch": 0.14114042429754614, "grad_norm": 12.230146963168526, "learning_rate": 4.898578767218601e-05, "loss": 2.5196, "mean_token_accuracy": 0.4344827592372894, "step": 140130 }, { "epoch": 0.14114546035065031, "grad_norm": 10.025759506542542, "learning_rate": 4.898567643518587e-05, "loss": 2.3615, "mean_token_accuracy": 0.42722323536872864, "step": 140135 }, { "epoch": 0.1411504964037545, "grad_norm": 9.653743575317753, "learning_rate": 4.89855651922266e-05, "loss": 2.2445, "mean_token_accuracy": 0.482758617401123, "step": 140140 }, { "epoch": 0.14115553245685866, "grad_norm": 10.933280997987332, "learning_rate": 4.8985453943308215e-05, "loss": 2.3478, "mean_token_accuracy": 0.4676950931549072, "step": 140145 }, { "epoch": 0.14116056850996284, "grad_norm": 9.70933537446517, "learning_rate": 4.8985342688430766e-05, "loss": 2.4706, "mean_token_accuracy": 0.42068966031074523, "step": 140150 }, { "epoch": 0.141165604563067, "grad_norm": 12.193108437295077, "learning_rate": 4.898523142759428e-05, "loss": 2.6538, "mean_token_accuracy": 0.3758620709180832, "step": 140155 }, { "epoch": 0.14117064061617116, "grad_norm": 10.52239724888491, "learning_rate": 4.8985120160798784e-05, "loss": 2.1904, "mean_token_accuracy": 0.4551724135875702, "step": 140160 }, { "epoch": 0.14117567666927533, "grad_norm": 8.414153533225, "learning_rate": 4.89850088880443e-05, "loss": 2.3012, "mean_token_accuracy": 0.44676345586776733, "step": 140165 }, { "epoch": 0.1411807127223795, "grad_norm": 12.635108441515952, "learning_rate": 4.8984897609330875e-05, "loss": 2.3956, "mean_token_accuracy": 0.3862069010734558, "step": 140170 }, { "epoch": 0.14118574877548368, "grad_norm": 10.341726897217049, "learning_rate": 4.898478632465854e-05, "loss": 2.2588, "mean_token_accuracy": 0.4482758641242981, "step": 140175 }, { "epoch": 0.14119078482858785, "grad_norm": 9.47824428022073, "learning_rate": 4.89846750340273e-05, "loss": 2.1953, "mean_token_accuracy": 0.46896551847457885, "step": 140180 }, { "epoch": 0.14119582088169202, "grad_norm": 17.177649131223404, "learning_rate": 4.898456373743723e-05, "loss": 2.4312, "mean_token_accuracy": 0.4517241358757019, "step": 140185 }, { "epoch": 0.1412008569347962, "grad_norm": 11.872364210819864, "learning_rate": 4.8984452434888325e-05, "loss": 2.5291, "mean_token_accuracy": 0.3999999940395355, "step": 140190 }, { "epoch": 0.14120589298790037, "grad_norm": 9.727180919595572, "learning_rate": 4.8984341126380626e-05, "loss": 2.4045, "mean_token_accuracy": 0.43448275327682495, "step": 140195 }, { "epoch": 0.14121092904100455, "grad_norm": 10.716446686141841, "learning_rate": 4.898422981191416e-05, "loss": 2.3554, "mean_token_accuracy": 0.41724138259887694, "step": 140200 }, { "epoch": 0.14121596509410872, "grad_norm": 9.746793355315841, "learning_rate": 4.898411849148897e-05, "loss": 2.2881, "mean_token_accuracy": 0.44482758045196535, "step": 140205 }, { "epoch": 0.1412210011472129, "grad_norm": 10.276787786817653, "learning_rate": 4.8984007165105075e-05, "loss": 2.1275, "mean_token_accuracy": 0.43448275327682495, "step": 140210 }, { "epoch": 0.14122603720031707, "grad_norm": 13.52914833486473, "learning_rate": 4.898389583276252e-05, "loss": 2.7987, "mean_token_accuracy": 0.35862069129943847, "step": 140215 }, { "epoch": 0.14123107325342124, "grad_norm": 10.313252126485908, "learning_rate": 4.898378449446132e-05, "loss": 2.6326, "mean_token_accuracy": 0.38965517580509185, "step": 140220 }, { "epoch": 0.14123610930652541, "grad_norm": 10.09076594803918, "learning_rate": 4.898367315020151e-05, "loss": 2.2662, "mean_token_accuracy": 0.4551724135875702, "step": 140225 }, { "epoch": 0.1412411453596296, "grad_norm": 9.925232496000808, "learning_rate": 4.898356179998312e-05, "loss": 2.4097, "mean_token_accuracy": 0.4103448301553726, "step": 140230 }, { "epoch": 0.14124618141273376, "grad_norm": 11.172973133006497, "learning_rate": 4.898345044380619e-05, "loss": 2.351, "mean_token_accuracy": 0.43793103098869324, "step": 140235 }, { "epoch": 0.14125121746583794, "grad_norm": 9.712642526144881, "learning_rate": 4.898333908167075e-05, "loss": 2.2714, "mean_token_accuracy": 0.4551724135875702, "step": 140240 }, { "epoch": 0.1412562535189421, "grad_norm": 10.373756526269561, "learning_rate": 4.898322771357682e-05, "loss": 2.3885, "mean_token_accuracy": 0.493103438615799, "step": 140245 }, { "epoch": 0.14126128957204628, "grad_norm": 11.621263239404584, "learning_rate": 4.8983116339524435e-05, "loss": 2.5346, "mean_token_accuracy": 0.4034482777118683, "step": 140250 }, { "epoch": 0.14126632562515046, "grad_norm": 10.09838222890984, "learning_rate": 4.898300495951362e-05, "loss": 2.2988, "mean_token_accuracy": 0.4172413766384125, "step": 140255 }, { "epoch": 0.14127136167825463, "grad_norm": 10.525047106236844, "learning_rate": 4.898289357354442e-05, "loss": 2.3908, "mean_token_accuracy": 0.4154264986515045, "step": 140260 }, { "epoch": 0.1412763977313588, "grad_norm": 9.381272791933913, "learning_rate": 4.898278218161687e-05, "loss": 2.123, "mean_token_accuracy": 0.4551724135875702, "step": 140265 }, { "epoch": 0.14128143378446298, "grad_norm": 13.257994199044582, "learning_rate": 4.898267078373098e-05, "loss": 2.3994, "mean_token_accuracy": 0.4137930989265442, "step": 140270 }, { "epoch": 0.14128646983756715, "grad_norm": 10.18273344471407, "learning_rate": 4.89825593798868e-05, "loss": 2.3609, "mean_token_accuracy": 0.42413792610168455, "step": 140275 }, { "epoch": 0.14129150589067133, "grad_norm": 10.144082618994162, "learning_rate": 4.8982447970084346e-05, "loss": 2.1577, "mean_token_accuracy": 0.4758620738983154, "step": 140280 }, { "epoch": 0.1412965419437755, "grad_norm": 12.126970683517973, "learning_rate": 4.898233655432365e-05, "loss": 2.7429, "mean_token_accuracy": 0.4379310369491577, "step": 140285 }, { "epoch": 0.14130157799687967, "grad_norm": 11.139577454047947, "learning_rate": 4.898222513260475e-05, "loss": 2.388, "mean_token_accuracy": 0.42758620381355283, "step": 140290 }, { "epoch": 0.14130661404998385, "grad_norm": 13.314973888332993, "learning_rate": 4.898211370492768e-05, "loss": 2.6831, "mean_token_accuracy": 0.42413793206214906, "step": 140295 }, { "epoch": 0.141311650103088, "grad_norm": 9.660334531709022, "learning_rate": 4.898200227129246e-05, "loss": 2.3828, "mean_token_accuracy": 0.4517241418361664, "step": 140300 }, { "epoch": 0.14131668615619217, "grad_norm": 11.606781396537196, "learning_rate": 4.898189083169913e-05, "loss": 2.2712, "mean_token_accuracy": 0.4896551728248596, "step": 140305 }, { "epoch": 0.14132172220929634, "grad_norm": 10.287606596183796, "learning_rate": 4.898177938614771e-05, "loss": 2.0992, "mean_token_accuracy": 0.4965517222881317, "step": 140310 }, { "epoch": 0.14132675826240051, "grad_norm": 12.174701373617077, "learning_rate": 4.898166793463825e-05, "loss": 2.0341, "mean_token_accuracy": 0.4620689630508423, "step": 140315 }, { "epoch": 0.1413317943155047, "grad_norm": 8.855311388685147, "learning_rate": 4.8981556477170764e-05, "loss": 2.1052, "mean_token_accuracy": 0.43448275327682495, "step": 140320 }, { "epoch": 0.14133683036860886, "grad_norm": 10.731210668084534, "learning_rate": 4.8981445013745286e-05, "loss": 2.5581, "mean_token_accuracy": 0.4068965494632721, "step": 140325 }, { "epoch": 0.14134186642171304, "grad_norm": 11.96474676320133, "learning_rate": 4.898133354436185e-05, "loss": 2.5796, "mean_token_accuracy": 0.43448275327682495, "step": 140330 }, { "epoch": 0.1413469024748172, "grad_norm": 12.604994495618143, "learning_rate": 4.898122206902049e-05, "loss": 2.5921, "mean_token_accuracy": 0.4413793087005615, "step": 140335 }, { "epoch": 0.14135193852792138, "grad_norm": 9.624686642942736, "learning_rate": 4.898111058772122e-05, "loss": 2.2435, "mean_token_accuracy": 0.3896551787853241, "step": 140340 }, { "epoch": 0.14135697458102556, "grad_norm": 12.948842647200312, "learning_rate": 4.89809991004641e-05, "loss": 2.4847, "mean_token_accuracy": 0.4068965554237366, "step": 140345 }, { "epoch": 0.14136201063412973, "grad_norm": 11.421800298599992, "learning_rate": 4.8980887607249136e-05, "loss": 2.8777, "mean_token_accuracy": 0.3758620649576187, "step": 140350 }, { "epoch": 0.1413670466872339, "grad_norm": 13.720396630523153, "learning_rate": 4.898077610807637e-05, "loss": 2.7729, "mean_token_accuracy": 0.358620685338974, "step": 140355 }, { "epoch": 0.14137208274033808, "grad_norm": 11.030175853799784, "learning_rate": 4.898066460294583e-05, "loss": 2.123, "mean_token_accuracy": 0.4517241418361664, "step": 140360 }, { "epoch": 0.14137711879344225, "grad_norm": 11.493065712468525, "learning_rate": 4.898055309185755e-05, "loss": 2.1129, "mean_token_accuracy": 0.4793103337287903, "step": 140365 }, { "epoch": 0.14138215484654643, "grad_norm": 10.992088475638084, "learning_rate": 4.8980441574811556e-05, "loss": 2.3463, "mean_token_accuracy": 0.42758620381355283, "step": 140370 }, { "epoch": 0.1413871908996506, "grad_norm": 12.328007538954466, "learning_rate": 4.898033005180789e-05, "loss": 2.7487, "mean_token_accuracy": 0.38965516686439516, "step": 140375 }, { "epoch": 0.14139222695275477, "grad_norm": 11.117514349694028, "learning_rate": 4.898021852284656e-05, "loss": 2.3066, "mean_token_accuracy": 0.4172413766384125, "step": 140380 }, { "epoch": 0.14139726300585895, "grad_norm": 10.827037726596515, "learning_rate": 4.898010698792762e-05, "loss": 2.5234, "mean_token_accuracy": 0.4206896543502808, "step": 140385 }, { "epoch": 0.14140229905896312, "grad_norm": 10.78278508323296, "learning_rate": 4.897999544705108e-05, "loss": 1.923, "mean_token_accuracy": 0.46551724076271056, "step": 140390 }, { "epoch": 0.1414073351120673, "grad_norm": 13.052926176205943, "learning_rate": 4.8979883900217e-05, "loss": 2.2951, "mean_token_accuracy": 0.43278887271881106, "step": 140395 }, { "epoch": 0.14141237116517147, "grad_norm": 11.414591722506675, "learning_rate": 4.8979772347425395e-05, "loss": 2.3205, "mean_token_accuracy": 0.42413792610168455, "step": 140400 }, { "epoch": 0.14141740721827564, "grad_norm": 10.634271188252404, "learning_rate": 4.8979660788676286e-05, "loss": 2.4997, "mean_token_accuracy": 0.42413793206214906, "step": 140405 }, { "epoch": 0.14142244327137982, "grad_norm": 9.796247610264974, "learning_rate": 4.897954922396971e-05, "loss": 2.4256, "mean_token_accuracy": 0.4, "step": 140410 }, { "epoch": 0.141427479324484, "grad_norm": 9.402012681689591, "learning_rate": 4.8979437653305706e-05, "loss": 2.4828, "mean_token_accuracy": 0.46406533718109133, "step": 140415 }, { "epoch": 0.14143251537758816, "grad_norm": 10.008120138217288, "learning_rate": 4.89793260766843e-05, "loss": 2.8613, "mean_token_accuracy": 0.3448275953531265, "step": 140420 }, { "epoch": 0.14143755143069234, "grad_norm": 8.451043286737669, "learning_rate": 4.897921449410552e-05, "loss": 2.5674, "mean_token_accuracy": 0.4310344815254211, "step": 140425 }, { "epoch": 0.1414425874837965, "grad_norm": 9.344740315998466, "learning_rate": 4.89791029055694e-05, "loss": 2.4221, "mean_token_accuracy": 0.4206896543502808, "step": 140430 }, { "epoch": 0.14144762353690069, "grad_norm": 11.750915121098629, "learning_rate": 4.8978991311075976e-05, "loss": 2.19, "mean_token_accuracy": 0.4758620738983154, "step": 140435 }, { "epoch": 0.14145265959000483, "grad_norm": 13.94916707577565, "learning_rate": 4.897887971062527e-05, "loss": 2.6946, "mean_token_accuracy": 0.4034482777118683, "step": 140440 }, { "epoch": 0.141457695643109, "grad_norm": 9.422062147159924, "learning_rate": 4.897876810421731e-05, "loss": 2.3551, "mean_token_accuracy": 0.43793103098869324, "step": 140445 }, { "epoch": 0.14146273169621318, "grad_norm": 10.45485081064072, "learning_rate": 4.897865649185214e-05, "loss": 3.1192, "mean_token_accuracy": 0.37241379618644715, "step": 140450 }, { "epoch": 0.14146776774931735, "grad_norm": 8.97546546694762, "learning_rate": 4.8978544873529784e-05, "loss": 2.5455, "mean_token_accuracy": 0.4517241358757019, "step": 140455 }, { "epoch": 0.14147280380242153, "grad_norm": 9.723024655199998, "learning_rate": 4.8978433249250275e-05, "loss": 2.13, "mean_token_accuracy": 0.49655172824859617, "step": 140460 }, { "epoch": 0.1414778398555257, "grad_norm": 9.354849215674433, "learning_rate": 4.8978321619013645e-05, "loss": 2.0469, "mean_token_accuracy": 0.49655171036720275, "step": 140465 }, { "epoch": 0.14148287590862987, "grad_norm": 9.501963620531358, "learning_rate": 4.8978209982819914e-05, "loss": 1.9376, "mean_token_accuracy": 0.5034482657909394, "step": 140470 }, { "epoch": 0.14148791196173405, "grad_norm": 10.944631798962858, "learning_rate": 4.897809834066913e-05, "loss": 3.0087, "mean_token_accuracy": 0.4068965494632721, "step": 140475 }, { "epoch": 0.14149294801483822, "grad_norm": 9.022146326981478, "learning_rate": 4.897798669256131e-05, "loss": 2.233, "mean_token_accuracy": 0.42758620977401735, "step": 140480 }, { "epoch": 0.1414979840679424, "grad_norm": 15.865809505163934, "learning_rate": 4.897787503849649e-05, "loss": 2.4466, "mean_token_accuracy": 0.42413792610168455, "step": 140485 }, { "epoch": 0.14150302012104657, "grad_norm": 9.63268504884066, "learning_rate": 4.89777633784747e-05, "loss": 2.2981, "mean_token_accuracy": 0.4551724135875702, "step": 140490 }, { "epoch": 0.14150805617415074, "grad_norm": 10.635618866900481, "learning_rate": 4.897765171249598e-05, "loss": 2.2536, "mean_token_accuracy": 0.43793103098869324, "step": 140495 }, { "epoch": 0.14151309222725492, "grad_norm": 8.329877176197359, "learning_rate": 4.897754004056034e-05, "loss": 2.033, "mean_token_accuracy": 0.5172413766384125, "step": 140500 }, { "epoch": 0.1415181282803591, "grad_norm": 10.562381120803575, "learning_rate": 4.897742836266784e-05, "loss": 2.4704, "mean_token_accuracy": 0.4172413766384125, "step": 140505 }, { "epoch": 0.14152316433346326, "grad_norm": 11.061384329355556, "learning_rate": 4.8977316678818485e-05, "loss": 2.3914, "mean_token_accuracy": 0.4206896543502808, "step": 140510 }, { "epoch": 0.14152820038656744, "grad_norm": 10.317934170653393, "learning_rate": 4.897720498901232e-05, "loss": 2.2801, "mean_token_accuracy": 0.42256503701210024, "step": 140515 }, { "epoch": 0.1415332364396716, "grad_norm": 12.259346922344315, "learning_rate": 4.897709329324937e-05, "loss": 2.77, "mean_token_accuracy": 0.3999999940395355, "step": 140520 }, { "epoch": 0.14153827249277579, "grad_norm": 11.834567606563937, "learning_rate": 4.897698159152967e-05, "loss": 2.2481, "mean_token_accuracy": 0.4344827592372894, "step": 140525 }, { "epoch": 0.14154330854587996, "grad_norm": 10.065975099062419, "learning_rate": 4.897686988385325e-05, "loss": 2.3026, "mean_token_accuracy": 0.42068966031074523, "step": 140530 }, { "epoch": 0.14154834459898413, "grad_norm": 12.308906671307767, "learning_rate": 4.8976758170220134e-05, "loss": 2.9094, "mean_token_accuracy": 0.4124621868133545, "step": 140535 }, { "epoch": 0.1415533806520883, "grad_norm": 8.744237157228927, "learning_rate": 4.8976646450630365e-05, "loss": 2.2514, "mean_token_accuracy": 0.4517241358757019, "step": 140540 }, { "epoch": 0.14155841670519248, "grad_norm": 9.012054690036376, "learning_rate": 4.897653472508396e-05, "loss": 2.7568, "mean_token_accuracy": 0.3999999940395355, "step": 140545 }, { "epoch": 0.14156345275829665, "grad_norm": 13.739321139411189, "learning_rate": 4.897642299358097e-05, "loss": 2.6432, "mean_token_accuracy": 0.37586207389831544, "step": 140550 }, { "epoch": 0.14156848881140083, "grad_norm": 9.658993374433193, "learning_rate": 4.89763112561214e-05, "loss": 2.3521, "mean_token_accuracy": 0.46412582993507384, "step": 140555 }, { "epoch": 0.141573524864505, "grad_norm": 10.756090730949467, "learning_rate": 4.897619951270531e-05, "loss": 2.1926, "mean_token_accuracy": 0.4517241299152374, "step": 140560 }, { "epoch": 0.14157856091760918, "grad_norm": 9.69602627991262, "learning_rate": 4.897608776333271e-05, "loss": 2.2113, "mean_token_accuracy": 0.4535390198230743, "step": 140565 }, { "epoch": 0.14158359697071335, "grad_norm": 10.914551114022528, "learning_rate": 4.897597600800363e-05, "loss": 2.248, "mean_token_accuracy": 0.4482758641242981, "step": 140570 }, { "epoch": 0.14158863302381752, "grad_norm": 12.221413371806607, "learning_rate": 4.8975864246718114e-05, "loss": 2.4784, "mean_token_accuracy": 0.4034482717514038, "step": 140575 }, { "epoch": 0.14159366907692167, "grad_norm": 8.552375192417331, "learning_rate": 4.897575247947618e-05, "loss": 1.8452, "mean_token_accuracy": 0.5448275923728942, "step": 140580 }, { "epoch": 0.14159870513002584, "grad_norm": 10.830551814681563, "learning_rate": 4.897564070627788e-05, "loss": 2.4263, "mean_token_accuracy": 0.38620689511299133, "step": 140585 }, { "epoch": 0.14160374118313002, "grad_norm": 10.372813144746793, "learning_rate": 4.897552892712322e-05, "loss": 2.7543, "mean_token_accuracy": 0.382758629322052, "step": 140590 }, { "epoch": 0.1416087772362342, "grad_norm": 11.76858045557601, "learning_rate": 4.897541714201224e-05, "loss": 2.6237, "mean_token_accuracy": 0.3655172407627106, "step": 140595 }, { "epoch": 0.14161381328933836, "grad_norm": 10.338713370355235, "learning_rate": 4.897530535094498e-05, "loss": 2.333, "mean_token_accuracy": 0.458620685338974, "step": 140600 }, { "epoch": 0.14161884934244254, "grad_norm": 10.054601672940803, "learning_rate": 4.8975193553921464e-05, "loss": 2.1073, "mean_token_accuracy": 0.4620689570903778, "step": 140605 }, { "epoch": 0.1416238853955467, "grad_norm": 10.043036507454254, "learning_rate": 4.897508175094172e-05, "loss": 2.5921, "mean_token_accuracy": 0.4344827651977539, "step": 140610 }, { "epoch": 0.14162892144865089, "grad_norm": 9.512135524580431, "learning_rate": 4.897496994200579e-05, "loss": 2.229, "mean_token_accuracy": 0.4379310429096222, "step": 140615 }, { "epoch": 0.14163395750175506, "grad_norm": 10.612037186521766, "learning_rate": 4.897485812711368e-05, "loss": 2.5287, "mean_token_accuracy": 0.36551724672317504, "step": 140620 }, { "epoch": 0.14163899355485923, "grad_norm": 10.980393505520793, "learning_rate": 4.897474630626545e-05, "loss": 2.237, "mean_token_accuracy": 0.46551724672317507, "step": 140625 }, { "epoch": 0.1416440296079634, "grad_norm": 11.188049625385073, "learning_rate": 4.897463447946112e-05, "loss": 2.5239, "mean_token_accuracy": 0.36551723480224607, "step": 140630 }, { "epoch": 0.14164906566106758, "grad_norm": 11.796677818491794, "learning_rate": 4.8974522646700714e-05, "loss": 2.1772, "mean_token_accuracy": 0.5068965435028077, "step": 140635 }, { "epoch": 0.14165410171417175, "grad_norm": 11.863186522702293, "learning_rate": 4.897441080798428e-05, "loss": 2.3496, "mean_token_accuracy": 0.42413792610168455, "step": 140640 }, { "epoch": 0.14165913776727593, "grad_norm": 8.572983100789882, "learning_rate": 4.897429896331182e-05, "loss": 2.0626, "mean_token_accuracy": 0.47586206197738645, "step": 140645 }, { "epoch": 0.1416641738203801, "grad_norm": 10.05207531341534, "learning_rate": 4.8974187112683396e-05, "loss": 2.338, "mean_token_accuracy": 0.4379310369491577, "step": 140650 }, { "epoch": 0.14166920987348428, "grad_norm": 10.126779680806836, "learning_rate": 4.897407525609903e-05, "loss": 2.7301, "mean_token_accuracy": 0.3862068891525269, "step": 140655 }, { "epoch": 0.14167424592658845, "grad_norm": 9.008011981512118, "learning_rate": 4.8973963393558735e-05, "loss": 2.1246, "mean_token_accuracy": 0.5081280767917633, "step": 140660 }, { "epoch": 0.14167928197969262, "grad_norm": 10.93459614300818, "learning_rate": 4.8973851525062566e-05, "loss": 2.5802, "mean_token_accuracy": 0.42413793206214906, "step": 140665 }, { "epoch": 0.1416843180327968, "grad_norm": 10.600525510421074, "learning_rate": 4.897373965061054e-05, "loss": 2.5237, "mean_token_accuracy": 0.45027223229408264, "step": 140670 }, { "epoch": 0.14168935408590097, "grad_norm": 11.35846045729706, "learning_rate": 4.89736277702027e-05, "loss": 2.1242, "mean_token_accuracy": 0.4137930989265442, "step": 140675 }, { "epoch": 0.14169439013900514, "grad_norm": 9.047634473736348, "learning_rate": 4.897351588383905e-05, "loss": 2.687, "mean_token_accuracy": 0.37931033968925476, "step": 140680 }, { "epoch": 0.14169942619210932, "grad_norm": 9.971295003993943, "learning_rate": 4.897340399151966e-05, "loss": 2.498, "mean_token_accuracy": 0.39655172228813174, "step": 140685 }, { "epoch": 0.1417044622452135, "grad_norm": 10.846671202121541, "learning_rate": 4.897329209324453e-05, "loss": 2.3081, "mean_token_accuracy": 0.4034482717514038, "step": 140690 }, { "epoch": 0.14170949829831767, "grad_norm": 9.840885510986078, "learning_rate": 4.8973180189013714e-05, "loss": 2.7341, "mean_token_accuracy": 0.4068965554237366, "step": 140695 }, { "epoch": 0.14171453435142184, "grad_norm": 9.388392577875948, "learning_rate": 4.897306827882722e-05, "loss": 2.3073, "mean_token_accuracy": 0.4689655065536499, "step": 140700 }, { "epoch": 0.141719570404526, "grad_norm": 9.677848832983646, "learning_rate": 4.8972956362685095e-05, "loss": 2.2022, "mean_token_accuracy": 0.4363581418991089, "step": 140705 }, { "epoch": 0.1417246064576302, "grad_norm": 8.924277262526973, "learning_rate": 4.8972844440587366e-05, "loss": 2.4606, "mean_token_accuracy": 0.44482758045196535, "step": 140710 }, { "epoch": 0.14172964251073436, "grad_norm": 8.81836687695403, "learning_rate": 4.897273251253407e-05, "loss": 2.1631, "mean_token_accuracy": 0.5103448271751404, "step": 140715 }, { "epoch": 0.1417346785638385, "grad_norm": 9.532537457285772, "learning_rate": 4.897262057852522e-05, "loss": 2.4825, "mean_token_accuracy": 0.36896551847457887, "step": 140720 }, { "epoch": 0.14173971461694268, "grad_norm": 10.598305056216368, "learning_rate": 4.897250863856086e-05, "loss": 2.331, "mean_token_accuracy": 0.46551724672317507, "step": 140725 }, { "epoch": 0.14174475067004685, "grad_norm": 11.849648045054636, "learning_rate": 4.897239669264102e-05, "loss": 2.3966, "mean_token_accuracy": 0.4034482717514038, "step": 140730 }, { "epoch": 0.14174978672315103, "grad_norm": 9.875225871715957, "learning_rate": 4.897228474076574e-05, "loss": 2.3271, "mean_token_accuracy": 0.4551724076271057, "step": 140735 }, { "epoch": 0.1417548227762552, "grad_norm": 9.090098733677863, "learning_rate": 4.897217278293503e-05, "loss": 2.3069, "mean_token_accuracy": 0.44827585816383364, "step": 140740 }, { "epoch": 0.14175985882935938, "grad_norm": 10.361103920331118, "learning_rate": 4.897206081914894e-05, "loss": 2.3218, "mean_token_accuracy": 0.41379310488700866, "step": 140745 }, { "epoch": 0.14176489488246355, "grad_norm": 9.454296164036574, "learning_rate": 4.897194884940749e-05, "loss": 2.2077, "mean_token_accuracy": 0.4620689690113068, "step": 140750 }, { "epoch": 0.14176993093556772, "grad_norm": 9.522459822864715, "learning_rate": 4.8971836873710714e-05, "loss": 2.3656, "mean_token_accuracy": 0.41724138259887694, "step": 140755 }, { "epoch": 0.1417749669886719, "grad_norm": 12.117112242135113, "learning_rate": 4.897172489205865e-05, "loss": 2.7839, "mean_token_accuracy": 0.38463400602340697, "step": 140760 }, { "epoch": 0.14178000304177607, "grad_norm": 10.290577897456522, "learning_rate": 4.897161290445132e-05, "loss": 2.4051, "mean_token_accuracy": 0.41034482717514037, "step": 140765 }, { "epoch": 0.14178503909488024, "grad_norm": 10.597945414836664, "learning_rate": 4.897150091088875e-05, "loss": 2.5502, "mean_token_accuracy": 0.46751360297203065, "step": 140770 }, { "epoch": 0.14179007514798442, "grad_norm": 8.834526259676629, "learning_rate": 4.897138891137099e-05, "loss": 2.2364, "mean_token_accuracy": 0.4517241418361664, "step": 140775 }, { "epoch": 0.1417951112010886, "grad_norm": 9.268951993418353, "learning_rate": 4.8971276905898055e-05, "loss": 2.5022, "mean_token_accuracy": 0.4034482717514038, "step": 140780 }, { "epoch": 0.14180014725419277, "grad_norm": 15.94133768404537, "learning_rate": 4.897116489446998e-05, "loss": 2.7268, "mean_token_accuracy": 0.36896551847457887, "step": 140785 }, { "epoch": 0.14180518330729694, "grad_norm": 9.713797860014633, "learning_rate": 4.8971052877086806e-05, "loss": 2.292, "mean_token_accuracy": 0.49655171632766726, "step": 140790 }, { "epoch": 0.1418102193604011, "grad_norm": 9.255380320866411, "learning_rate": 4.897094085374855e-05, "loss": 2.2783, "mean_token_accuracy": 0.4344827592372894, "step": 140795 }, { "epoch": 0.1418152554135053, "grad_norm": 10.894301802127519, "learning_rate": 4.897082882445524e-05, "loss": 2.4727, "mean_token_accuracy": 0.4724137902259827, "step": 140800 }, { "epoch": 0.14182029146660946, "grad_norm": 10.18585837496387, "learning_rate": 4.897071678920693e-05, "loss": 2.3684, "mean_token_accuracy": 0.46896551847457885, "step": 140805 }, { "epoch": 0.14182532751971363, "grad_norm": 14.05805239361398, "learning_rate": 4.8970604748003634e-05, "loss": 2.4313, "mean_token_accuracy": 0.4275861978530884, "step": 140810 }, { "epoch": 0.1418303635728178, "grad_norm": 9.655686560449553, "learning_rate": 4.897049270084538e-05, "loss": 2.612, "mean_token_accuracy": 0.42589232325553894, "step": 140815 }, { "epoch": 0.14183539962592198, "grad_norm": 11.071090820043537, "learning_rate": 4.897038064773221e-05, "loss": 2.5678, "mean_token_accuracy": 0.4068965554237366, "step": 140820 }, { "epoch": 0.14184043567902616, "grad_norm": 10.058146522739435, "learning_rate": 4.8970268588664134e-05, "loss": 2.6578, "mean_token_accuracy": 0.3862068891525269, "step": 140825 }, { "epoch": 0.14184547173213033, "grad_norm": 9.119488496681655, "learning_rate": 4.8970156523641217e-05, "loss": 2.0372, "mean_token_accuracy": 0.4931034445762634, "step": 140830 }, { "epoch": 0.1418505077852345, "grad_norm": 10.037200280132705, "learning_rate": 4.8970044452663474e-05, "loss": 2.3195, "mean_token_accuracy": 0.44482759237289426, "step": 140835 }, { "epoch": 0.14185554383833868, "grad_norm": 10.46885876160073, "learning_rate": 4.896993237573092e-05, "loss": 2.0095, "mean_token_accuracy": 0.5335148334503174, "step": 140840 }, { "epoch": 0.14186057989144285, "grad_norm": 11.322084224315978, "learning_rate": 4.8969820292843604e-05, "loss": 2.4778, "mean_token_accuracy": 0.41379311084747317, "step": 140845 }, { "epoch": 0.14186561594454702, "grad_norm": 11.422134024762268, "learning_rate": 4.896970820400156e-05, "loss": 2.2072, "mean_token_accuracy": 0.44827587008476255, "step": 140850 }, { "epoch": 0.1418706519976512, "grad_norm": 9.873205871418836, "learning_rate": 4.896959610920481e-05, "loss": 2.2568, "mean_token_accuracy": 0.46896551847457885, "step": 140855 }, { "epoch": 0.14187568805075534, "grad_norm": 10.120730953598713, "learning_rate": 4.8969484008453384e-05, "loss": 2.7048, "mean_token_accuracy": 0.3896551638841629, "step": 140860 }, { "epoch": 0.14188072410385952, "grad_norm": 11.311150174074205, "learning_rate": 4.896937190174731e-05, "loss": 2.5134, "mean_token_accuracy": 0.48844525814056394, "step": 140865 }, { "epoch": 0.1418857601569637, "grad_norm": 9.899761657734727, "learning_rate": 4.896925978908664e-05, "loss": 2.331, "mean_token_accuracy": 0.46896551847457885, "step": 140870 }, { "epoch": 0.14189079621006787, "grad_norm": 10.912927801440418, "learning_rate": 4.896914767047138e-05, "loss": 2.5445, "mean_token_accuracy": 0.3620689630508423, "step": 140875 }, { "epoch": 0.14189583226317204, "grad_norm": 9.32575424118185, "learning_rate": 4.8969035545901575e-05, "loss": 2.4505, "mean_token_accuracy": 0.3862068891525269, "step": 140880 }, { "epoch": 0.1419008683162762, "grad_norm": 11.095413021837498, "learning_rate": 4.896892341537726e-05, "loss": 2.0422, "mean_token_accuracy": 0.43793103098869324, "step": 140885 }, { "epoch": 0.1419059043693804, "grad_norm": 9.911363407725052, "learning_rate": 4.8968811278898454e-05, "loss": 2.0976, "mean_token_accuracy": 0.5034482717514038, "step": 140890 }, { "epoch": 0.14191094042248456, "grad_norm": 10.368251280404074, "learning_rate": 4.896869913646518e-05, "loss": 2.3692, "mean_token_accuracy": 0.4379310429096222, "step": 140895 }, { "epoch": 0.14191597647558873, "grad_norm": 9.285587338149734, "learning_rate": 4.8968586988077505e-05, "loss": 2.3245, "mean_token_accuracy": 0.44482758045196535, "step": 140900 }, { "epoch": 0.1419210125286929, "grad_norm": 9.070771564999916, "learning_rate": 4.896847483373543e-05, "loss": 2.0363, "mean_token_accuracy": 0.4310344815254211, "step": 140905 }, { "epoch": 0.14192604858179708, "grad_norm": 14.42470625561916, "learning_rate": 4.8968362673438986e-05, "loss": 3.1887, "mean_token_accuracy": 0.358620685338974, "step": 140910 }, { "epoch": 0.14193108463490126, "grad_norm": 9.513326418077009, "learning_rate": 4.896825050718821e-05, "loss": 2.4314, "mean_token_accuracy": 0.44482759237289426, "step": 140915 }, { "epoch": 0.14193612068800543, "grad_norm": 11.541750092723131, "learning_rate": 4.896813833498315e-05, "loss": 2.6271, "mean_token_accuracy": 0.4413793087005615, "step": 140920 }, { "epoch": 0.1419411567411096, "grad_norm": 11.632716192325871, "learning_rate": 4.896802615682381e-05, "loss": 2.5468, "mean_token_accuracy": 0.4241379380226135, "step": 140925 }, { "epoch": 0.14194619279421378, "grad_norm": 8.007504897940324, "learning_rate": 4.896791397271023e-05, "loss": 2.08, "mean_token_accuracy": 0.4361161530017853, "step": 140930 }, { "epoch": 0.14195122884731795, "grad_norm": 10.710936697031785, "learning_rate": 4.896780178264245e-05, "loss": 2.3661, "mean_token_accuracy": 0.4241379380226135, "step": 140935 }, { "epoch": 0.14195626490042212, "grad_norm": 8.92219122155108, "learning_rate": 4.89676895866205e-05, "loss": 2.4784, "mean_token_accuracy": 0.38275861740112305, "step": 140940 }, { "epoch": 0.1419613009535263, "grad_norm": 10.811052885337256, "learning_rate": 4.8967577384644396e-05, "loss": 2.2442, "mean_token_accuracy": 0.4448275864124298, "step": 140945 }, { "epoch": 0.14196633700663047, "grad_norm": 7.148822758124012, "learning_rate": 4.896746517671418e-05, "loss": 1.8096, "mean_token_accuracy": 0.5016333878040313, "step": 140950 }, { "epoch": 0.14197137305973465, "grad_norm": 10.426764610385678, "learning_rate": 4.896735296282989e-05, "loss": 2.9665, "mean_token_accuracy": 0.3482758641242981, "step": 140955 }, { "epoch": 0.14197640911283882, "grad_norm": 10.146766630414817, "learning_rate": 4.896724074299154e-05, "loss": 2.3365, "mean_token_accuracy": 0.4551724135875702, "step": 140960 }, { "epoch": 0.141981445165943, "grad_norm": 9.96404973689838, "learning_rate": 4.896712851719918e-05, "loss": 2.4104, "mean_token_accuracy": 0.4000000059604645, "step": 140965 }, { "epoch": 0.14198648121904717, "grad_norm": 10.211723997932223, "learning_rate": 4.8967016285452825e-05, "loss": 2.1196, "mean_token_accuracy": 0.42413793206214906, "step": 140970 }, { "epoch": 0.14199151727215134, "grad_norm": 12.061668823075768, "learning_rate": 4.896690404775252e-05, "loss": 2.2852, "mean_token_accuracy": 0.44482759237289426, "step": 140975 }, { "epoch": 0.14199655332525551, "grad_norm": 12.303384484024459, "learning_rate": 4.8966791804098286e-05, "loss": 2.387, "mean_token_accuracy": 0.3620689630508423, "step": 140980 }, { "epoch": 0.1420015893783597, "grad_norm": 9.525906890567837, "learning_rate": 4.896667955449015e-05, "loss": 2.2701, "mean_token_accuracy": 0.4, "step": 140985 }, { "epoch": 0.14200662543146386, "grad_norm": 11.858504179675501, "learning_rate": 4.8966567298928155e-05, "loss": 2.2479, "mean_token_accuracy": 0.47241379618644713, "step": 140990 }, { "epoch": 0.14201166148456804, "grad_norm": 8.844314648514631, "learning_rate": 4.896645503741234e-05, "loss": 2.4965, "mean_token_accuracy": 0.4275861978530884, "step": 140995 }, { "epoch": 0.14201669753767218, "grad_norm": 14.983828707664037, "learning_rate": 4.8966342769942704e-05, "loss": 2.7224, "mean_token_accuracy": 0.36896551847457887, "step": 141000 }, { "epoch": 0.14202173359077636, "grad_norm": 10.077346424562178, "learning_rate": 4.8966230496519303e-05, "loss": 2.0267, "mean_token_accuracy": 0.43623714447021483, "step": 141005 }, { "epoch": 0.14202676964388053, "grad_norm": 13.948258456025338, "learning_rate": 4.896611821714217e-05, "loss": 2.9053, "mean_token_accuracy": 0.3655172407627106, "step": 141010 }, { "epoch": 0.1420318056969847, "grad_norm": 11.388061491974396, "learning_rate": 4.896600593181133e-05, "loss": 2.932, "mean_token_accuracy": 0.32758620083332063, "step": 141015 }, { "epoch": 0.14203684175008888, "grad_norm": 10.037468793346019, "learning_rate": 4.896589364052681e-05, "loss": 2.2387, "mean_token_accuracy": 0.4172413766384125, "step": 141020 }, { "epoch": 0.14204187780319305, "grad_norm": 10.974305205920933, "learning_rate": 4.8965781343288636e-05, "loss": 2.1298, "mean_token_accuracy": 0.4862068951129913, "step": 141025 }, { "epoch": 0.14204691385629722, "grad_norm": 10.59801725174759, "learning_rate": 4.896566904009686e-05, "loss": 2.1937, "mean_token_accuracy": 0.4911070704460144, "step": 141030 }, { "epoch": 0.1420519499094014, "grad_norm": 10.657638654456118, "learning_rate": 4.896555673095149e-05, "loss": 2.1561, "mean_token_accuracy": 0.4620689690113068, "step": 141035 }, { "epoch": 0.14205698596250557, "grad_norm": 11.44847654385035, "learning_rate": 4.896544441585257e-05, "loss": 2.171, "mean_token_accuracy": 0.4862069010734558, "step": 141040 }, { "epoch": 0.14206202201560975, "grad_norm": 13.137746523735641, "learning_rate": 4.8965332094800135e-05, "loss": 2.2783, "mean_token_accuracy": 0.4705665111541748, "step": 141045 }, { "epoch": 0.14206705806871392, "grad_norm": 16.95535342428976, "learning_rate": 4.896521976779421e-05, "loss": 2.398, "mean_token_accuracy": 0.39310344457626345, "step": 141050 }, { "epoch": 0.1420720941218181, "grad_norm": 10.212740368942896, "learning_rate": 4.8965107434834815e-05, "loss": 2.1489, "mean_token_accuracy": 0.47586206197738645, "step": 141055 }, { "epoch": 0.14207713017492227, "grad_norm": 10.045407358934472, "learning_rate": 4.8964995095922e-05, "loss": 2.0879, "mean_token_accuracy": 0.4517241358757019, "step": 141060 }, { "epoch": 0.14208216622802644, "grad_norm": 10.77785501540689, "learning_rate": 4.896488275105579e-05, "loss": 2.2385, "mean_token_accuracy": 0.458620685338974, "step": 141065 }, { "epoch": 0.14208720228113061, "grad_norm": 8.825761376826689, "learning_rate": 4.8964770400236214e-05, "loss": 2.2815, "mean_token_accuracy": 0.43793103098869324, "step": 141070 }, { "epoch": 0.1420922383342348, "grad_norm": 10.668019924457385, "learning_rate": 4.896465804346331e-05, "loss": 2.3725, "mean_token_accuracy": 0.42068966031074523, "step": 141075 }, { "epoch": 0.14209727438733896, "grad_norm": 9.77261111577999, "learning_rate": 4.8964545680737095e-05, "loss": 2.1173, "mean_token_accuracy": 0.4931034564971924, "step": 141080 }, { "epoch": 0.14210231044044314, "grad_norm": 10.185101362941314, "learning_rate": 4.896443331205761e-05, "loss": 2.9549, "mean_token_accuracy": 0.36896551847457887, "step": 141085 }, { "epoch": 0.1421073464935473, "grad_norm": 13.254612313178525, "learning_rate": 4.896432093742488e-05, "loss": 2.3473, "mean_token_accuracy": 0.45862069725990295, "step": 141090 }, { "epoch": 0.14211238254665148, "grad_norm": 13.987393775269584, "learning_rate": 4.896420855683895e-05, "loss": 2.5207, "mean_token_accuracy": 0.4034482777118683, "step": 141095 }, { "epoch": 0.14211741859975566, "grad_norm": 10.93681104332027, "learning_rate": 4.896409617029983e-05, "loss": 2.7626, "mean_token_accuracy": 0.4068965494632721, "step": 141100 }, { "epoch": 0.14212245465285983, "grad_norm": 10.272301624324763, "learning_rate": 4.896398377780757e-05, "loss": 2.9075, "mean_token_accuracy": 0.37586206793785093, "step": 141105 }, { "epoch": 0.142127490705964, "grad_norm": 13.508000210201565, "learning_rate": 4.896387137936219e-05, "loss": 2.9287, "mean_token_accuracy": 0.358620685338974, "step": 141110 }, { "epoch": 0.14213252675906818, "grad_norm": 10.26811277042134, "learning_rate": 4.8963758974963734e-05, "loss": 2.2963, "mean_token_accuracy": 0.3965517312288284, "step": 141115 }, { "epoch": 0.14213756281217235, "grad_norm": 9.513383640491488, "learning_rate": 4.896364656461221e-05, "loss": 2.2715, "mean_token_accuracy": 0.420689657330513, "step": 141120 }, { "epoch": 0.14214259886527653, "grad_norm": 14.05237692527733, "learning_rate": 4.8963534148307676e-05, "loss": 2.882, "mean_token_accuracy": 0.42413792610168455, "step": 141125 }, { "epoch": 0.1421476349183807, "grad_norm": 11.168760733997585, "learning_rate": 4.8963421726050144e-05, "loss": 2.3859, "mean_token_accuracy": 0.44482758045196535, "step": 141130 }, { "epoch": 0.14215267097148487, "grad_norm": 10.366485561412672, "learning_rate": 4.896330929783966e-05, "loss": 2.4547, "mean_token_accuracy": 0.4413793087005615, "step": 141135 }, { "epoch": 0.14215770702458902, "grad_norm": 10.309330661182006, "learning_rate": 4.8963196863676245e-05, "loss": 2.2219, "mean_token_accuracy": 0.44827585816383364, "step": 141140 }, { "epoch": 0.1421627430776932, "grad_norm": 10.613685918498128, "learning_rate": 4.8963084423559925e-05, "loss": 2.2536, "mean_token_accuracy": 0.4448275864124298, "step": 141145 }, { "epoch": 0.14216777913079737, "grad_norm": 11.271630130401839, "learning_rate": 4.896297197749074e-05, "loss": 2.3566, "mean_token_accuracy": 0.4586206912994385, "step": 141150 }, { "epoch": 0.14217281518390154, "grad_norm": 9.019749163164834, "learning_rate": 4.8962859525468727e-05, "loss": 2.4052, "mean_token_accuracy": 0.4413793087005615, "step": 141155 }, { "epoch": 0.14217785123700571, "grad_norm": 8.435157768006238, "learning_rate": 4.89627470674939e-05, "loss": 2.4991, "mean_token_accuracy": 0.4586206912994385, "step": 141160 }, { "epoch": 0.1421828872901099, "grad_norm": 10.295988183147786, "learning_rate": 4.896263460356631e-05, "loss": 2.3865, "mean_token_accuracy": 0.44827585816383364, "step": 141165 }, { "epoch": 0.14218792334321406, "grad_norm": 10.145669246967278, "learning_rate": 4.8962522133685974e-05, "loss": 2.4595, "mean_token_accuracy": 0.4261947929859161, "step": 141170 }, { "epoch": 0.14219295939631824, "grad_norm": 9.663443038021436, "learning_rate": 4.8962409657852925e-05, "loss": 2.2954, "mean_token_accuracy": 0.4413793087005615, "step": 141175 }, { "epoch": 0.1421979954494224, "grad_norm": 11.220140350573832, "learning_rate": 4.89622971760672e-05, "loss": 2.0092, "mean_token_accuracy": 0.4620689630508423, "step": 141180 }, { "epoch": 0.14220303150252658, "grad_norm": 11.483491156430729, "learning_rate": 4.896218468832882e-05, "loss": 2.5964, "mean_token_accuracy": 0.39818512201309203, "step": 141185 }, { "epoch": 0.14220806755563076, "grad_norm": 8.145439930215328, "learning_rate": 4.896207219463783e-05, "loss": 2.1218, "mean_token_accuracy": 0.4344827592372894, "step": 141190 }, { "epoch": 0.14221310360873493, "grad_norm": 9.869774171982263, "learning_rate": 4.896195969499425e-05, "loss": 2.3997, "mean_token_accuracy": 0.37931033968925476, "step": 141195 }, { "epoch": 0.1422181396618391, "grad_norm": 14.60416652870884, "learning_rate": 4.8961847189398123e-05, "loss": 2.2127, "mean_token_accuracy": 0.4689655125141144, "step": 141200 }, { "epoch": 0.14222317571494328, "grad_norm": 10.039637809862839, "learning_rate": 4.896173467784947e-05, "loss": 2.2846, "mean_token_accuracy": 0.417241370677948, "step": 141205 }, { "epoch": 0.14222821176804745, "grad_norm": 11.299360646116535, "learning_rate": 4.896162216034832e-05, "loss": 2.25, "mean_token_accuracy": 0.4517241358757019, "step": 141210 }, { "epoch": 0.14223324782115163, "grad_norm": 9.687789816232236, "learning_rate": 4.8961509636894705e-05, "loss": 2.2678, "mean_token_accuracy": 0.4103448331356049, "step": 141215 }, { "epoch": 0.1422382838742558, "grad_norm": 9.846454307770834, "learning_rate": 4.896139710748867e-05, "loss": 2.4145, "mean_token_accuracy": 0.4310344815254211, "step": 141220 }, { "epoch": 0.14224331992735997, "grad_norm": 10.14602689265939, "learning_rate": 4.8961284572130225e-05, "loss": 2.2042, "mean_token_accuracy": 0.4655172288417816, "step": 141225 }, { "epoch": 0.14224835598046415, "grad_norm": 9.220046500442663, "learning_rate": 4.896117203081943e-05, "loss": 2.2613, "mean_token_accuracy": 0.40689654350280763, "step": 141230 }, { "epoch": 0.14225339203356832, "grad_norm": 9.086221933721525, "learning_rate": 4.896105948355629e-05, "loss": 2.2105, "mean_token_accuracy": 0.4896551787853241, "step": 141235 }, { "epoch": 0.1422584280866725, "grad_norm": 10.703765692638385, "learning_rate": 4.8960946930340847e-05, "loss": 2.2976, "mean_token_accuracy": 0.4206896543502808, "step": 141240 }, { "epoch": 0.14226346413977667, "grad_norm": 10.550652540569354, "learning_rate": 4.8960834371173124e-05, "loss": 2.1626, "mean_token_accuracy": 0.47931033968925474, "step": 141245 }, { "epoch": 0.14226850019288084, "grad_norm": 11.790994265025699, "learning_rate": 4.896072180605316e-05, "loss": 2.5608, "mean_token_accuracy": 0.4359951615333557, "step": 141250 }, { "epoch": 0.14227353624598502, "grad_norm": 10.668019039878052, "learning_rate": 4.8960609234980985e-05, "loss": 2.3075, "mean_token_accuracy": 0.4551724076271057, "step": 141255 }, { "epoch": 0.1422785722990892, "grad_norm": 11.173237362527454, "learning_rate": 4.8960496657956636e-05, "loss": 2.3156, "mean_token_accuracy": 0.44016939401626587, "step": 141260 }, { "epoch": 0.14228360835219336, "grad_norm": 10.792219036543454, "learning_rate": 4.896038407498013e-05, "loss": 2.4937, "mean_token_accuracy": 0.39655172228813174, "step": 141265 }, { "epoch": 0.14228864440529754, "grad_norm": 8.952040291371814, "learning_rate": 4.8960271486051514e-05, "loss": 2.0192, "mean_token_accuracy": 0.4689655125141144, "step": 141270 }, { "epoch": 0.1422936804584017, "grad_norm": 13.450481730834248, "learning_rate": 4.896015889117081e-05, "loss": 2.5972, "mean_token_accuracy": 0.41379310488700866, "step": 141275 }, { "epoch": 0.14229871651150586, "grad_norm": 9.566379926835328, "learning_rate": 4.8960046290338044e-05, "loss": 2.4572, "mean_token_accuracy": 0.47586206793785096, "step": 141280 }, { "epoch": 0.14230375256461003, "grad_norm": 13.285743698725172, "learning_rate": 4.8959933683553266e-05, "loss": 2.8435, "mean_token_accuracy": 0.356745308637619, "step": 141285 }, { "epoch": 0.1423087886177142, "grad_norm": 14.17482383744256, "learning_rate": 4.8959821070816483e-05, "loss": 2.2732, "mean_token_accuracy": 0.47586206197738645, "step": 141290 }, { "epoch": 0.14231382467081838, "grad_norm": 20.14986715247233, "learning_rate": 4.895970845212775e-05, "loss": 3.0606, "mean_token_accuracy": 0.36551723778247835, "step": 141295 }, { "epoch": 0.14231886072392255, "grad_norm": 15.76744823165706, "learning_rate": 4.8959595827487084e-05, "loss": 2.5924, "mean_token_accuracy": 0.3896551787853241, "step": 141300 }, { "epoch": 0.14232389677702673, "grad_norm": 10.286503459414803, "learning_rate": 4.8959483196894514e-05, "loss": 2.8485, "mean_token_accuracy": 0.3827586233615875, "step": 141305 }, { "epoch": 0.1423289328301309, "grad_norm": 13.26480661680959, "learning_rate": 4.8959370560350074e-05, "loss": 2.4321, "mean_token_accuracy": 0.43103448748588563, "step": 141310 }, { "epoch": 0.14233396888323507, "grad_norm": 10.900787353088155, "learning_rate": 4.89592579178538e-05, "loss": 2.5457, "mean_token_accuracy": 0.39310344457626345, "step": 141315 }, { "epoch": 0.14233900493633925, "grad_norm": 9.448613502341418, "learning_rate": 4.895914526940573e-05, "loss": 2.467, "mean_token_accuracy": 0.4689655125141144, "step": 141320 }, { "epoch": 0.14234404098944342, "grad_norm": 9.36175886338441, "learning_rate": 4.8959032615005874e-05, "loss": 2.1982, "mean_token_accuracy": 0.4586206912994385, "step": 141325 }, { "epoch": 0.1423490770425476, "grad_norm": 10.516834633718496, "learning_rate": 4.8958919954654285e-05, "loss": 2.071, "mean_token_accuracy": 0.4689655125141144, "step": 141330 }, { "epoch": 0.14235411309565177, "grad_norm": 9.838949492684728, "learning_rate": 4.8958807288350976e-05, "loss": 2.6374, "mean_token_accuracy": 0.37931033968925476, "step": 141335 }, { "epoch": 0.14235914914875594, "grad_norm": 12.854416776601228, "learning_rate": 4.8958694616095994e-05, "loss": 2.6133, "mean_token_accuracy": 0.398124623298645, "step": 141340 }, { "epoch": 0.14236418520186012, "grad_norm": 10.656888231225333, "learning_rate": 4.895858193788936e-05, "loss": 2.3674, "mean_token_accuracy": 0.4758620738983154, "step": 141345 }, { "epoch": 0.1423692212549643, "grad_norm": 9.194759405948771, "learning_rate": 4.89584692537311e-05, "loss": 2.4147, "mean_token_accuracy": 0.42413793206214906, "step": 141350 }, { "epoch": 0.14237425730806846, "grad_norm": 10.72545074738459, "learning_rate": 4.895835656362127e-05, "loss": 2.2215, "mean_token_accuracy": 0.4620689630508423, "step": 141355 }, { "epoch": 0.14237929336117264, "grad_norm": 11.605693258750737, "learning_rate": 4.895824386755987e-05, "loss": 2.4759, "mean_token_accuracy": 0.4172413766384125, "step": 141360 }, { "epoch": 0.1423843294142768, "grad_norm": 9.310789496504128, "learning_rate": 4.895813116554696e-05, "loss": 2.3005, "mean_token_accuracy": 0.44827585816383364, "step": 141365 }, { "epoch": 0.14238936546738098, "grad_norm": 11.613144257311893, "learning_rate": 4.895801845758254e-05, "loss": 2.4648, "mean_token_accuracy": 0.3896551787853241, "step": 141370 }, { "epoch": 0.14239440152048516, "grad_norm": 9.475547683301434, "learning_rate": 4.8957905743666665e-05, "loss": 2.452, "mean_token_accuracy": 0.441379314661026, "step": 141375 }, { "epoch": 0.14239943757358933, "grad_norm": 9.899667098061835, "learning_rate": 4.8957793023799364e-05, "loss": 2.5849, "mean_token_accuracy": 0.40859044790267945, "step": 141380 }, { "epoch": 0.1424044736266935, "grad_norm": 11.097587615629124, "learning_rate": 4.895768029798066e-05, "loss": 2.5797, "mean_token_accuracy": 0.38620689511299133, "step": 141385 }, { "epoch": 0.14240950967979768, "grad_norm": 12.721831261273225, "learning_rate": 4.8957567566210595e-05, "loss": 2.6485, "mean_token_accuracy": 0.41379310488700866, "step": 141390 }, { "epoch": 0.14241454573290185, "grad_norm": 10.538936985079907, "learning_rate": 4.895745482848919e-05, "loss": 2.3958, "mean_token_accuracy": 0.4241379380226135, "step": 141395 }, { "epoch": 0.14241958178600603, "grad_norm": 11.422928366408936, "learning_rate": 4.895734208481648e-05, "loss": 2.6473, "mean_token_accuracy": 0.43103448748588563, "step": 141400 }, { "epoch": 0.1424246178391102, "grad_norm": 10.579202051044097, "learning_rate": 4.895722933519249e-05, "loss": 2.6898, "mean_token_accuracy": 0.4172413766384125, "step": 141405 }, { "epoch": 0.14242965389221438, "grad_norm": 9.195161677949168, "learning_rate": 4.895711657961726e-05, "loss": 2.8399, "mean_token_accuracy": 0.41724138855934145, "step": 141410 }, { "epoch": 0.14243468994531855, "grad_norm": 11.884989064117356, "learning_rate": 4.895700381809083e-05, "loss": 2.3041, "mean_token_accuracy": 0.43448275327682495, "step": 141415 }, { "epoch": 0.1424397259984227, "grad_norm": 12.612884798410827, "learning_rate": 4.8956891050613215e-05, "loss": 2.1731, "mean_token_accuracy": 0.4724137902259827, "step": 141420 }, { "epoch": 0.14244476205152687, "grad_norm": 10.313498016522264, "learning_rate": 4.895677827718444e-05, "loss": 2.5638, "mean_token_accuracy": 0.3931034505367279, "step": 141425 }, { "epoch": 0.14244979810463104, "grad_norm": 10.475617810084003, "learning_rate": 4.895666549780455e-05, "loss": 2.5571, "mean_token_accuracy": 0.42068965137004855, "step": 141430 }, { "epoch": 0.14245483415773522, "grad_norm": 11.950850135254194, "learning_rate": 4.8956552712473585e-05, "loss": 2.5198, "mean_token_accuracy": 0.4172413766384125, "step": 141435 }, { "epoch": 0.1424598702108394, "grad_norm": 14.038478340352409, "learning_rate": 4.8956439921191554e-05, "loss": 2.3094, "mean_token_accuracy": 0.46551724076271056, "step": 141440 }, { "epoch": 0.14246490626394356, "grad_norm": 10.737440516268325, "learning_rate": 4.8956327123958506e-05, "loss": 2.2301, "mean_token_accuracy": 0.4517241418361664, "step": 141445 }, { "epoch": 0.14246994231704774, "grad_norm": 10.05330475387196, "learning_rate": 4.895621432077446e-05, "loss": 2.8713, "mean_token_accuracy": 0.3551724076271057, "step": 141450 }, { "epoch": 0.1424749783701519, "grad_norm": 10.875036855316745, "learning_rate": 4.8956101511639464e-05, "loss": 2.5182, "mean_token_accuracy": 0.43793103098869324, "step": 141455 }, { "epoch": 0.14248001442325609, "grad_norm": 9.225976812589868, "learning_rate": 4.895598869655353e-05, "loss": 2.6044, "mean_token_accuracy": 0.39655172228813174, "step": 141460 }, { "epoch": 0.14248505047636026, "grad_norm": 12.412333450213206, "learning_rate": 4.8955875875516695e-05, "loss": 2.5763, "mean_token_accuracy": 0.4, "step": 141465 }, { "epoch": 0.14249008652946443, "grad_norm": 11.264765658813884, "learning_rate": 4.8955763048529e-05, "loss": 2.3981, "mean_token_accuracy": 0.40344828367233276, "step": 141470 }, { "epoch": 0.1424951225825686, "grad_norm": 11.90655647059862, "learning_rate": 4.8955650215590464e-05, "loss": 2.6728, "mean_token_accuracy": 0.4137930989265442, "step": 141475 }, { "epoch": 0.14250015863567278, "grad_norm": 15.634529799107476, "learning_rate": 4.895553737670113e-05, "loss": 2.7791, "mean_token_accuracy": 0.39655172228813174, "step": 141480 }, { "epoch": 0.14250519468877695, "grad_norm": 19.621687256677887, "learning_rate": 4.8955424531861015e-05, "loss": 2.2949, "mean_token_accuracy": 0.4978826344013214, "step": 141485 }, { "epoch": 0.14251023074188113, "grad_norm": 9.88725786825097, "learning_rate": 4.895531168107016e-05, "loss": 2.5174, "mean_token_accuracy": 0.41034482717514037, "step": 141490 }, { "epoch": 0.1425152667949853, "grad_norm": 10.004261285613284, "learning_rate": 4.895519882432859e-05, "loss": 1.9665, "mean_token_accuracy": 0.47586206197738645, "step": 141495 }, { "epoch": 0.14252030284808948, "grad_norm": 10.025632276024881, "learning_rate": 4.895508596163635e-05, "loss": 2.6842, "mean_token_accuracy": 0.3448275804519653, "step": 141500 }, { "epoch": 0.14252533890119365, "grad_norm": 11.662455053865623, "learning_rate": 4.895497309299346e-05, "loss": 2.3911, "mean_token_accuracy": 0.3827586233615875, "step": 141505 }, { "epoch": 0.14253037495429782, "grad_norm": 9.685060655803396, "learning_rate": 4.895486021839994e-05, "loss": 1.8858, "mean_token_accuracy": 0.5424682378768921, "step": 141510 }, { "epoch": 0.142535411007402, "grad_norm": 15.323721565962153, "learning_rate": 4.895474733785585e-05, "loss": 2.1344, "mean_token_accuracy": 0.48475498557090757, "step": 141515 }, { "epoch": 0.14254044706050617, "grad_norm": 9.633800734797438, "learning_rate": 4.8954634451361205e-05, "loss": 2.251, "mean_token_accuracy": 0.46896552443504336, "step": 141520 }, { "epoch": 0.14254548311361034, "grad_norm": 11.132390898904177, "learning_rate": 4.895452155891603e-05, "loss": 2.6389, "mean_token_accuracy": 0.39655171930789945, "step": 141525 }, { "epoch": 0.14255051916671452, "grad_norm": 11.897015722219962, "learning_rate": 4.895440866052036e-05, "loss": 2.2948, "mean_token_accuracy": 0.47931033968925474, "step": 141530 }, { "epoch": 0.1425555552198187, "grad_norm": 15.912652194894457, "learning_rate": 4.8954295756174237e-05, "loss": 2.5154, "mean_token_accuracy": 0.4448275864124298, "step": 141535 }, { "epoch": 0.14256059127292287, "grad_norm": 9.229246553208593, "learning_rate": 4.8954182845877686e-05, "loss": 2.1304, "mean_token_accuracy": 0.4344827592372894, "step": 141540 }, { "epoch": 0.14256562732602704, "grad_norm": 12.401056819916683, "learning_rate": 4.8954069929630736e-05, "loss": 2.2053, "mean_token_accuracy": 0.5241379380226135, "step": 141545 }, { "epoch": 0.1425706633791312, "grad_norm": 11.106357915030257, "learning_rate": 4.8953957007433414e-05, "loss": 2.6212, "mean_token_accuracy": 0.38620689511299133, "step": 141550 }, { "epoch": 0.1425756994322354, "grad_norm": 14.16393875869589, "learning_rate": 4.8953844079285766e-05, "loss": 2.8523, "mean_token_accuracy": 0.3862069010734558, "step": 141555 }, { "epoch": 0.14258073548533953, "grad_norm": 9.021376015537998, "learning_rate": 4.895373114518781e-05, "loss": 2.232, "mean_token_accuracy": 0.42758620381355283, "step": 141560 }, { "epoch": 0.1425857715384437, "grad_norm": 10.94063795694331, "learning_rate": 4.895361820513958e-05, "loss": 2.0968, "mean_token_accuracy": 0.458620685338974, "step": 141565 }, { "epoch": 0.14259080759154788, "grad_norm": 9.38740314338868, "learning_rate": 4.8953505259141116e-05, "loss": 2.3606, "mean_token_accuracy": 0.46444128155708314, "step": 141570 }, { "epoch": 0.14259584364465205, "grad_norm": 10.899526411405446, "learning_rate": 4.895339230719243e-05, "loss": 2.1431, "mean_token_accuracy": 0.44845736026763916, "step": 141575 }, { "epoch": 0.14260087969775623, "grad_norm": 11.584637535459546, "learning_rate": 4.895327934929357e-05, "loss": 2.127, "mean_token_accuracy": 0.4862069010734558, "step": 141580 }, { "epoch": 0.1426059157508604, "grad_norm": 12.549067034394517, "learning_rate": 4.895316638544457e-05, "loss": 2.477, "mean_token_accuracy": 0.4034482777118683, "step": 141585 }, { "epoch": 0.14261095180396458, "grad_norm": 10.435713558871418, "learning_rate": 4.895305341564545e-05, "loss": 2.37, "mean_token_accuracy": 0.4379310429096222, "step": 141590 }, { "epoch": 0.14261598785706875, "grad_norm": 9.502680736548118, "learning_rate": 4.895294043989625e-05, "loss": 1.8999, "mean_token_accuracy": 0.5206896543502808, "step": 141595 }, { "epoch": 0.14262102391017292, "grad_norm": 11.779035599684613, "learning_rate": 4.895282745819699e-05, "loss": 2.4356, "mean_token_accuracy": 0.43448275327682495, "step": 141600 }, { "epoch": 0.1426260599632771, "grad_norm": 8.814478921204374, "learning_rate": 4.895271447054771e-05, "loss": 2.3021, "mean_token_accuracy": 0.4517241358757019, "step": 141605 }, { "epoch": 0.14263109601638127, "grad_norm": 10.207233498975144, "learning_rate": 4.895260147694844e-05, "loss": 2.1742, "mean_token_accuracy": 0.42758620381355283, "step": 141610 }, { "epoch": 0.14263613206948544, "grad_norm": 10.744285534654255, "learning_rate": 4.895248847739921e-05, "loss": 2.6359, "mean_token_accuracy": 0.3724137932062149, "step": 141615 }, { "epoch": 0.14264116812258962, "grad_norm": 8.952708532882186, "learning_rate": 4.895237547190005e-05, "loss": 2.2507, "mean_token_accuracy": 0.46031457781791685, "step": 141620 }, { "epoch": 0.1426462041756938, "grad_norm": 10.828567856076612, "learning_rate": 4.8952262460451005e-05, "loss": 2.4986, "mean_token_accuracy": 0.3862069010734558, "step": 141625 }, { "epoch": 0.14265124022879797, "grad_norm": 8.939414661333021, "learning_rate": 4.8952149443052085e-05, "loss": 2.2078, "mean_token_accuracy": 0.4517241358757019, "step": 141630 }, { "epoch": 0.14265627628190214, "grad_norm": 12.20278928812962, "learning_rate": 4.8952036419703335e-05, "loss": 2.2184, "mean_token_accuracy": 0.43103447556495667, "step": 141635 }, { "epoch": 0.1426613123350063, "grad_norm": 10.665320726712025, "learning_rate": 4.8951923390404774e-05, "loss": 2.3348, "mean_token_accuracy": 0.42068966031074523, "step": 141640 }, { "epoch": 0.1426663483881105, "grad_norm": 8.682706110536937, "learning_rate": 4.895181035515645e-05, "loss": 2.5533, "mean_token_accuracy": 0.43103448748588563, "step": 141645 }, { "epoch": 0.14267138444121466, "grad_norm": 10.984521591415628, "learning_rate": 4.895169731395839e-05, "loss": 2.8462, "mean_token_accuracy": 0.4056866317987442, "step": 141650 }, { "epoch": 0.14267642049431883, "grad_norm": 14.628061707971717, "learning_rate": 4.895158426681062e-05, "loss": 2.7121, "mean_token_accuracy": 0.417241370677948, "step": 141655 }, { "epoch": 0.142681456547423, "grad_norm": 8.964339430464383, "learning_rate": 4.895147121371316e-05, "loss": 2.3723, "mean_token_accuracy": 0.3999999940395355, "step": 141660 }, { "epoch": 0.14268649260052718, "grad_norm": 11.136021743360747, "learning_rate": 4.8951358154666074e-05, "loss": 2.1417, "mean_token_accuracy": 0.46551724076271056, "step": 141665 }, { "epoch": 0.14269152865363136, "grad_norm": 8.182646104682481, "learning_rate": 4.895124508966936e-05, "loss": 2.1598, "mean_token_accuracy": 0.47586206793785096, "step": 141670 }, { "epoch": 0.14269656470673553, "grad_norm": 12.637494700926819, "learning_rate": 4.895113201872307e-05, "loss": 2.1847, "mean_token_accuracy": 0.5068965494632721, "step": 141675 }, { "epoch": 0.1427016007598397, "grad_norm": 8.989064363423491, "learning_rate": 4.895101894182722e-05, "loss": 2.4409, "mean_token_accuracy": 0.43448275327682495, "step": 141680 }, { "epoch": 0.14270663681294388, "grad_norm": 10.214071476566891, "learning_rate": 4.895090585898186e-05, "loss": 2.2494, "mean_token_accuracy": 0.482758617401123, "step": 141685 }, { "epoch": 0.14271167286604805, "grad_norm": 11.091593225008213, "learning_rate": 4.8950792770187005e-05, "loss": 2.7683, "mean_token_accuracy": 0.39655172228813174, "step": 141690 }, { "epoch": 0.14271670891915222, "grad_norm": 10.315438827310407, "learning_rate": 4.8950679675442695e-05, "loss": 2.2512, "mean_token_accuracy": 0.4758620738983154, "step": 141695 }, { "epoch": 0.14272174497225637, "grad_norm": 9.15601974817395, "learning_rate": 4.895056657474896e-05, "loss": 2.5103, "mean_token_accuracy": 0.3862069010734558, "step": 141700 }, { "epoch": 0.14272678102536054, "grad_norm": 11.300411610102584, "learning_rate": 4.895045346810583e-05, "loss": 2.9462, "mean_token_accuracy": 0.3724137991666794, "step": 141705 }, { "epoch": 0.14273181707846472, "grad_norm": 8.528404433474439, "learning_rate": 4.895034035551333e-05, "loss": 2.4233, "mean_token_accuracy": 0.4482758641242981, "step": 141710 }, { "epoch": 0.1427368531315689, "grad_norm": 11.10031752440764, "learning_rate": 4.8950227236971506e-05, "loss": 2.5772, "mean_token_accuracy": 0.43103448748588563, "step": 141715 }, { "epoch": 0.14274188918467307, "grad_norm": 9.284322448936308, "learning_rate": 4.895011411248038e-05, "loss": 2.3742, "mean_token_accuracy": 0.45517241060733793, "step": 141720 }, { "epoch": 0.14274692523777724, "grad_norm": 8.671194285909623, "learning_rate": 4.895000098203999e-05, "loss": 2.5617, "mean_token_accuracy": 0.44137930274009707, "step": 141725 }, { "epoch": 0.1427519612908814, "grad_norm": 9.944315520966793, "learning_rate": 4.894988784565035e-05, "loss": 2.1421, "mean_token_accuracy": 0.48275862336158754, "step": 141730 }, { "epoch": 0.1427569973439856, "grad_norm": 10.457472562897905, "learning_rate": 4.894977470331151e-05, "loss": 2.0252, "mean_token_accuracy": 0.47586206793785096, "step": 141735 }, { "epoch": 0.14276203339708976, "grad_norm": 13.507136978983162, "learning_rate": 4.89496615550235e-05, "loss": 2.3075, "mean_token_accuracy": 0.4448275864124298, "step": 141740 }, { "epoch": 0.14276706945019393, "grad_norm": 10.092187187820274, "learning_rate": 4.894954840078634e-05, "loss": 2.7683, "mean_token_accuracy": 0.41379311084747317, "step": 141745 }, { "epoch": 0.1427721055032981, "grad_norm": 9.605043920109585, "learning_rate": 4.8949435240600066e-05, "loss": 2.3649, "mean_token_accuracy": 0.4620689690113068, "step": 141750 }, { "epoch": 0.14277714155640228, "grad_norm": 13.04330234819312, "learning_rate": 4.894932207446472e-05, "loss": 2.504, "mean_token_accuracy": 0.40514216423034666, "step": 141755 }, { "epoch": 0.14278217760950646, "grad_norm": 10.000405101912865, "learning_rate": 4.8949208902380314e-05, "loss": 2.4243, "mean_token_accuracy": 0.4103448331356049, "step": 141760 }, { "epoch": 0.14278721366261063, "grad_norm": 10.681181248950077, "learning_rate": 4.8949095724346894e-05, "loss": 2.5671, "mean_token_accuracy": 0.4379310369491577, "step": 141765 }, { "epoch": 0.1427922497157148, "grad_norm": 10.58088004630637, "learning_rate": 4.894898254036449e-05, "loss": 2.5083, "mean_token_accuracy": 0.42758620381355283, "step": 141770 }, { "epoch": 0.14279728576881898, "grad_norm": 8.403765330799994, "learning_rate": 4.8948869350433126e-05, "loss": 2.1921, "mean_token_accuracy": 0.45378100872039795, "step": 141775 }, { "epoch": 0.14280232182192315, "grad_norm": 11.229487922552703, "learning_rate": 4.8948756154552844e-05, "loss": 2.4494, "mean_token_accuracy": 0.441379314661026, "step": 141780 }, { "epoch": 0.14280735787502732, "grad_norm": 10.276600902528685, "learning_rate": 4.894864295272367e-05, "loss": 2.2468, "mean_token_accuracy": 0.4517241418361664, "step": 141785 }, { "epoch": 0.1428123939281315, "grad_norm": 9.962363912546921, "learning_rate": 4.894852974494562e-05, "loss": 2.3183, "mean_token_accuracy": 0.3758620619773865, "step": 141790 }, { "epoch": 0.14281742998123567, "grad_norm": 8.784905427274058, "learning_rate": 4.894841653121876e-05, "loss": 2.5422, "mean_token_accuracy": 0.42413793206214906, "step": 141795 }, { "epoch": 0.14282246603433985, "grad_norm": 11.91506185260996, "learning_rate": 4.8948303311543084e-05, "loss": 2.1749, "mean_token_accuracy": 0.47586207985877993, "step": 141800 }, { "epoch": 0.14282750208744402, "grad_norm": 8.563560731086097, "learning_rate": 4.894819008591866e-05, "loss": 2.1818, "mean_token_accuracy": 0.46896551847457885, "step": 141805 }, { "epoch": 0.1428325381405482, "grad_norm": 8.086693389918691, "learning_rate": 4.894807685434548e-05, "loss": 2.4703, "mean_token_accuracy": 0.4437386512756348, "step": 141810 }, { "epoch": 0.14283757419365237, "grad_norm": 9.376110628291343, "learning_rate": 4.8947963616823614e-05, "loss": 2.1402, "mean_token_accuracy": 0.4620689690113068, "step": 141815 }, { "epoch": 0.14284261024675654, "grad_norm": 10.167587644893137, "learning_rate": 4.894785037335306e-05, "loss": 2.4622, "mean_token_accuracy": 0.4655172288417816, "step": 141820 }, { "epoch": 0.14284764629986071, "grad_norm": 12.146371620836993, "learning_rate": 4.8947737123933877e-05, "loss": 2.2601, "mean_token_accuracy": 0.4551724135875702, "step": 141825 }, { "epoch": 0.1428526823529649, "grad_norm": 10.11365606599262, "learning_rate": 4.894762386856608e-05, "loss": 2.5054, "mean_token_accuracy": 0.4344827592372894, "step": 141830 }, { "epoch": 0.14285771840606906, "grad_norm": 9.250691631376075, "learning_rate": 4.89475106072497e-05, "loss": 1.9619, "mean_token_accuracy": 0.5206896543502808, "step": 141835 }, { "epoch": 0.1428627544591732, "grad_norm": 9.84959232650346, "learning_rate": 4.8947397339984785e-05, "loss": 2.1061, "mean_token_accuracy": 0.4655172348022461, "step": 141840 }, { "epoch": 0.14286779051227738, "grad_norm": 10.034749605122135, "learning_rate": 4.8947284066771344e-05, "loss": 2.4274, "mean_token_accuracy": 0.4344827592372894, "step": 141845 }, { "epoch": 0.14287282656538156, "grad_norm": 10.79047950197894, "learning_rate": 4.894717078760942e-05, "loss": 2.4638, "mean_token_accuracy": 0.3965517282485962, "step": 141850 }, { "epoch": 0.14287786261848573, "grad_norm": 9.555681721030963, "learning_rate": 4.8947057502499037e-05, "loss": 2.6829, "mean_token_accuracy": 0.4221415638923645, "step": 141855 }, { "epoch": 0.1428828986715899, "grad_norm": 10.026774264029338, "learning_rate": 4.894694421144025e-05, "loss": 2.1912, "mean_token_accuracy": 0.42413792610168455, "step": 141860 }, { "epoch": 0.14288793472469408, "grad_norm": 11.657065929371347, "learning_rate": 4.894683091443306e-05, "loss": 2.4754, "mean_token_accuracy": 0.42413793206214906, "step": 141865 }, { "epoch": 0.14289297077779825, "grad_norm": 14.326566556699827, "learning_rate": 4.894671761147751e-05, "loss": 2.5484, "mean_token_accuracy": 0.4517241418361664, "step": 141870 }, { "epoch": 0.14289800683090242, "grad_norm": 10.12768213909908, "learning_rate": 4.894660430257364e-05, "loss": 2.4187, "mean_token_accuracy": 0.4344827592372894, "step": 141875 }, { "epoch": 0.1429030428840066, "grad_norm": 18.994702174819135, "learning_rate": 4.894649098772147e-05, "loss": 2.5575, "mean_token_accuracy": 0.42413792610168455, "step": 141880 }, { "epoch": 0.14290807893711077, "grad_norm": 9.9006838276388, "learning_rate": 4.894637766692104e-05, "loss": 2.1047, "mean_token_accuracy": 0.4781609117984772, "step": 141885 }, { "epoch": 0.14291311499021495, "grad_norm": 9.174694897367916, "learning_rate": 4.894626434017237e-05, "loss": 2.1653, "mean_token_accuracy": 0.46013309359550475, "step": 141890 }, { "epoch": 0.14291815104331912, "grad_norm": 9.485597095521344, "learning_rate": 4.894615100747551e-05, "loss": 2.0575, "mean_token_accuracy": 0.5034482777118683, "step": 141895 }, { "epoch": 0.1429231870964233, "grad_norm": 13.328185000393646, "learning_rate": 4.8946037668830466e-05, "loss": 2.2879, "mean_token_accuracy": 0.46551724672317507, "step": 141900 }, { "epoch": 0.14292822314952747, "grad_norm": 11.38363801832749, "learning_rate": 4.8945924324237294e-05, "loss": 2.1266, "mean_token_accuracy": 0.4744101583957672, "step": 141905 }, { "epoch": 0.14293325920263164, "grad_norm": 11.097844435500429, "learning_rate": 4.894581097369601e-05, "loss": 2.2888, "mean_token_accuracy": 0.4586206912994385, "step": 141910 }, { "epoch": 0.14293829525573581, "grad_norm": 10.024183880059436, "learning_rate": 4.894569761720665e-05, "loss": 2.3068, "mean_token_accuracy": 0.44137930274009707, "step": 141915 }, { "epoch": 0.14294333130884, "grad_norm": 13.216345590116223, "learning_rate": 4.8945584254769246e-05, "loss": 2.7632, "mean_token_accuracy": 0.4068965554237366, "step": 141920 }, { "epoch": 0.14294836736194416, "grad_norm": 9.509369766761399, "learning_rate": 4.8945470886383834e-05, "loss": 2.2577, "mean_token_accuracy": 0.46551724672317507, "step": 141925 }, { "epoch": 0.14295340341504834, "grad_norm": 10.317496259801056, "learning_rate": 4.894535751205043e-05, "loss": 2.2601, "mean_token_accuracy": 0.4206896543502808, "step": 141930 }, { "epoch": 0.1429584394681525, "grad_norm": 15.33368799426953, "learning_rate": 4.894524413176909e-05, "loss": 2.702, "mean_token_accuracy": 0.3655172407627106, "step": 141935 }, { "epoch": 0.14296347552125668, "grad_norm": 9.533239394331332, "learning_rate": 4.894513074553982e-05, "loss": 1.9119, "mean_token_accuracy": 0.5019963681697845, "step": 141940 }, { "epoch": 0.14296851157436086, "grad_norm": 12.1401776094745, "learning_rate": 4.8945017353362663e-05, "loss": 2.868, "mean_token_accuracy": 0.3655172407627106, "step": 141945 }, { "epoch": 0.14297354762746503, "grad_norm": 11.291174179479324, "learning_rate": 4.894490395523767e-05, "loss": 2.2961, "mean_token_accuracy": 0.4724137902259827, "step": 141950 }, { "epoch": 0.1429785836805692, "grad_norm": 8.72942414483867, "learning_rate": 4.894479055116483e-05, "loss": 2.6986, "mean_token_accuracy": 0.42413792610168455, "step": 141955 }, { "epoch": 0.14298361973367338, "grad_norm": 10.589229903088244, "learning_rate": 4.8944677141144206e-05, "loss": 2.39, "mean_token_accuracy": 0.3913490533828735, "step": 141960 }, { "epoch": 0.14298865578677755, "grad_norm": 8.648879054593172, "learning_rate": 4.894456372517582e-05, "loss": 2.2915, "mean_token_accuracy": 0.45628078281879425, "step": 141965 }, { "epoch": 0.14299369183988173, "grad_norm": 9.6028357468956, "learning_rate": 4.89444503032597e-05, "loss": 2.3294, "mean_token_accuracy": 0.42068964838981626, "step": 141970 }, { "epoch": 0.1429987278929859, "grad_norm": 16.6827246438967, "learning_rate": 4.894433687539588e-05, "loss": 2.4573, "mean_token_accuracy": 0.4344827592372894, "step": 141975 }, { "epoch": 0.14300376394609005, "grad_norm": 11.24473563606688, "learning_rate": 4.8944223441584406e-05, "loss": 2.2142, "mean_token_accuracy": 0.43793103098869324, "step": 141980 }, { "epoch": 0.14300879999919422, "grad_norm": 10.181697928473278, "learning_rate": 4.894411000182529e-05, "loss": 2.5375, "mean_token_accuracy": 0.3965517282485962, "step": 141985 }, { "epoch": 0.1430138360522984, "grad_norm": 10.622712703749242, "learning_rate": 4.8943996556118574e-05, "loss": 2.3641, "mean_token_accuracy": 0.4344827592372894, "step": 141990 }, { "epoch": 0.14301887210540257, "grad_norm": 10.66806468324331, "learning_rate": 4.894388310446428e-05, "loss": 2.2145, "mean_token_accuracy": 0.4620689690113068, "step": 141995 }, { "epoch": 0.14302390815850674, "grad_norm": 11.58854942185488, "learning_rate": 4.894376964686244e-05, "loss": 2.6412, "mean_token_accuracy": 0.4172413766384125, "step": 142000 }, { "epoch": 0.14302894421161091, "grad_norm": 11.034475025489074, "learning_rate": 4.89436561833131e-05, "loss": 2.1873, "mean_token_accuracy": 0.4655172288417816, "step": 142005 }, { "epoch": 0.1430339802647151, "grad_norm": 10.761635140303058, "learning_rate": 4.894354271381628e-05, "loss": 2.3423, "mean_token_accuracy": 0.37586207389831544, "step": 142010 }, { "epoch": 0.14303901631781926, "grad_norm": 10.175484263393212, "learning_rate": 4.8943429238372014e-05, "loss": 2.2858, "mean_token_accuracy": 0.41034482717514037, "step": 142015 }, { "epoch": 0.14304405237092344, "grad_norm": 12.468903387806174, "learning_rate": 4.894331575698033e-05, "loss": 2.5247, "mean_token_accuracy": 0.4262552857398987, "step": 142020 }, { "epoch": 0.1430490884240276, "grad_norm": 9.768972183838297, "learning_rate": 4.8943202269641265e-05, "loss": 2.624, "mean_token_accuracy": 0.4206896543502808, "step": 142025 }, { "epoch": 0.14305412447713178, "grad_norm": 8.723564683420744, "learning_rate": 4.894308877635485e-05, "loss": 2.2417, "mean_token_accuracy": 0.4586206912994385, "step": 142030 }, { "epoch": 0.14305916053023596, "grad_norm": 12.44844643459062, "learning_rate": 4.894297527712111e-05, "loss": 2.5936, "mean_token_accuracy": 0.42758620381355283, "step": 142035 }, { "epoch": 0.14306419658334013, "grad_norm": 12.31040023863623, "learning_rate": 4.894286177194008e-05, "loss": 2.7149, "mean_token_accuracy": 0.39086509346961973, "step": 142040 }, { "epoch": 0.1430692326364443, "grad_norm": 10.27872409693468, "learning_rate": 4.894274826081179e-05, "loss": 2.1609, "mean_token_accuracy": 0.5, "step": 142045 }, { "epoch": 0.14307426868954848, "grad_norm": 11.683319826760895, "learning_rate": 4.894263474373628e-05, "loss": 2.5845, "mean_token_accuracy": 0.4103448331356049, "step": 142050 }, { "epoch": 0.14307930474265265, "grad_norm": 11.309545943273221, "learning_rate": 4.8942521220713576e-05, "loss": 2.3464, "mean_token_accuracy": 0.482758617401123, "step": 142055 }, { "epoch": 0.14308434079575683, "grad_norm": 10.663402743574363, "learning_rate": 4.89424076917437e-05, "loss": 2.3397, "mean_token_accuracy": 0.3931034505367279, "step": 142060 }, { "epoch": 0.143089376848861, "grad_norm": 14.732504452074618, "learning_rate": 4.8942294156826704e-05, "loss": 2.8556, "mean_token_accuracy": 0.3482758551836014, "step": 142065 }, { "epoch": 0.14309441290196517, "grad_norm": 9.293152140553792, "learning_rate": 4.8942180615962595e-05, "loss": 2.3359, "mean_token_accuracy": 0.4103448331356049, "step": 142070 }, { "epoch": 0.14309944895506935, "grad_norm": 10.314799527898382, "learning_rate": 4.894206706915143e-05, "loss": 2.2959, "mean_token_accuracy": 0.39655172228813174, "step": 142075 }, { "epoch": 0.14310448500817352, "grad_norm": 12.020262661884685, "learning_rate": 4.894195351639322e-05, "loss": 2.9017, "mean_token_accuracy": 0.32413792610168457, "step": 142080 }, { "epoch": 0.1431095210612777, "grad_norm": 9.32319227505478, "learning_rate": 4.8941839957688e-05, "loss": 2.4485, "mean_token_accuracy": 0.4068965554237366, "step": 142085 }, { "epoch": 0.14311455711438187, "grad_norm": 12.616994441756953, "learning_rate": 4.8941726393035814e-05, "loss": 2.3747, "mean_token_accuracy": 0.41724138259887694, "step": 142090 }, { "epoch": 0.14311959316748604, "grad_norm": 9.696162821039412, "learning_rate": 4.894161282243668e-05, "loss": 2.4629, "mean_token_accuracy": 0.46551724672317507, "step": 142095 }, { "epoch": 0.14312462922059022, "grad_norm": 11.866414190036519, "learning_rate": 4.894149924589064e-05, "loss": 2.5934, "mean_token_accuracy": 0.43103448748588563, "step": 142100 }, { "epoch": 0.1431296652736944, "grad_norm": 8.865310754608231, "learning_rate": 4.894138566339771e-05, "loss": 2.106, "mean_token_accuracy": 0.4689655125141144, "step": 142105 }, { "epoch": 0.14313470132679856, "grad_norm": 10.18423949656946, "learning_rate": 4.8941272074957945e-05, "loss": 2.602, "mean_token_accuracy": 0.38620689511299133, "step": 142110 }, { "epoch": 0.14313973737990274, "grad_norm": 12.771904845471733, "learning_rate": 4.8941158480571356e-05, "loss": 2.4986, "mean_token_accuracy": 0.41379310488700866, "step": 142115 }, { "epoch": 0.14314477343300688, "grad_norm": 14.332773844458725, "learning_rate": 4.8941044880237985e-05, "loss": 2.2115, "mean_token_accuracy": 0.4344827592372894, "step": 142120 }, { "epoch": 0.14314980948611106, "grad_norm": 12.679406207532665, "learning_rate": 4.894093127395786e-05, "loss": 2.3028, "mean_token_accuracy": 0.39310345351696013, "step": 142125 }, { "epoch": 0.14315484553921523, "grad_norm": 10.211925230908582, "learning_rate": 4.8940817661731016e-05, "loss": 2.1921, "mean_token_accuracy": 0.38620689511299133, "step": 142130 }, { "epoch": 0.1431598815923194, "grad_norm": 11.023294846962461, "learning_rate": 4.894070404355748e-05, "loss": 2.4552, "mean_token_accuracy": 0.4137930989265442, "step": 142135 }, { "epoch": 0.14316491764542358, "grad_norm": 12.815710717691665, "learning_rate": 4.8940590419437274e-05, "loss": 2.4909, "mean_token_accuracy": 0.45517241954803467, "step": 142140 }, { "epoch": 0.14316995369852775, "grad_norm": 10.20301371367607, "learning_rate": 4.8940476789370445e-05, "loss": 2.1383, "mean_token_accuracy": 0.4931034564971924, "step": 142145 }, { "epoch": 0.14317498975163193, "grad_norm": 11.292461760416774, "learning_rate": 4.8940363153357025e-05, "loss": 2.3221, "mean_token_accuracy": 0.482758617401123, "step": 142150 }, { "epoch": 0.1431800258047361, "grad_norm": 11.77007366153293, "learning_rate": 4.894024951139703e-05, "loss": 2.3725, "mean_token_accuracy": 0.4034482717514038, "step": 142155 }, { "epoch": 0.14318506185784027, "grad_norm": 10.2591162277197, "learning_rate": 4.894013586349052e-05, "loss": 2.1751, "mean_token_accuracy": 0.46551724076271056, "step": 142160 }, { "epoch": 0.14319009791094445, "grad_norm": 8.648655269257274, "learning_rate": 4.894002220963749e-05, "loss": 1.9065, "mean_token_accuracy": 0.5068965494632721, "step": 142165 }, { "epoch": 0.14319513396404862, "grad_norm": 8.565182878262995, "learning_rate": 4.8939908549838e-05, "loss": 2.029, "mean_token_accuracy": 0.49999998807907103, "step": 142170 }, { "epoch": 0.1432001700171528, "grad_norm": 13.197079025667072, "learning_rate": 4.893979488409207e-05, "loss": 2.6033, "mean_token_accuracy": 0.39655172228813174, "step": 142175 }, { "epoch": 0.14320520607025697, "grad_norm": 10.69538313057205, "learning_rate": 4.8939681212399735e-05, "loss": 2.4933, "mean_token_accuracy": 0.3896551787853241, "step": 142180 }, { "epoch": 0.14321024212336114, "grad_norm": 11.014820888254176, "learning_rate": 4.8939567534761015e-05, "loss": 2.5046, "mean_token_accuracy": 0.43793103098869324, "step": 142185 }, { "epoch": 0.14321527817646532, "grad_norm": 10.308990344874427, "learning_rate": 4.893945385117596e-05, "loss": 2.3255, "mean_token_accuracy": 0.4172413766384125, "step": 142190 }, { "epoch": 0.1432203142295695, "grad_norm": 11.021667571167184, "learning_rate": 4.8939340161644586e-05, "loss": 2.5997, "mean_token_accuracy": 0.4068965494632721, "step": 142195 }, { "epoch": 0.14322535028267366, "grad_norm": 9.243108800555603, "learning_rate": 4.893922646616694e-05, "loss": 2.3038, "mean_token_accuracy": 0.4137930989265442, "step": 142200 }, { "epoch": 0.14323038633577784, "grad_norm": 10.51774654850775, "learning_rate": 4.893911276474304e-05, "loss": 2.2999, "mean_token_accuracy": 0.47586206793785096, "step": 142205 }, { "epoch": 0.143235422388882, "grad_norm": 9.34607383792057, "learning_rate": 4.893899905737292e-05, "loss": 2.5204, "mean_token_accuracy": 0.39310344457626345, "step": 142210 }, { "epoch": 0.14324045844198618, "grad_norm": 10.980732864523231, "learning_rate": 4.893888534405662e-05, "loss": 2.4498, "mean_token_accuracy": 0.39655172228813174, "step": 142215 }, { "epoch": 0.14324549449509036, "grad_norm": 9.370142588671659, "learning_rate": 4.8938771624794155e-05, "loss": 2.4707, "mean_token_accuracy": 0.4379310369491577, "step": 142220 }, { "epoch": 0.14325053054819453, "grad_norm": 10.171151405433928, "learning_rate": 4.893865789958557e-05, "loss": 2.1027, "mean_token_accuracy": 0.4517241358757019, "step": 142225 }, { "epoch": 0.1432555666012987, "grad_norm": 10.034188907682145, "learning_rate": 4.8938544168430895e-05, "loss": 2.4175, "mean_token_accuracy": 0.42068964838981626, "step": 142230 }, { "epoch": 0.14326060265440288, "grad_norm": 10.59538714056528, "learning_rate": 4.893843043133016e-05, "loss": 2.4987, "mean_token_accuracy": 0.4189957737922668, "step": 142235 }, { "epoch": 0.14326563870750705, "grad_norm": 10.253105010731302, "learning_rate": 4.89383166882834e-05, "loss": 2.3388, "mean_token_accuracy": 0.41379311084747317, "step": 142240 }, { "epoch": 0.14327067476061123, "grad_norm": 8.801381718030939, "learning_rate": 4.8938202939290635e-05, "loss": 2.1775, "mean_token_accuracy": 0.4310344815254211, "step": 142245 }, { "epoch": 0.1432757108137154, "grad_norm": 10.40045671004918, "learning_rate": 4.893808918435192e-05, "loss": 2.6706, "mean_token_accuracy": 0.39999999701976774, "step": 142250 }, { "epoch": 0.14328074686681957, "grad_norm": 12.647901745772565, "learning_rate": 4.8937975423467254e-05, "loss": 2.2626, "mean_token_accuracy": 0.4344827651977539, "step": 142255 }, { "epoch": 0.14328578291992372, "grad_norm": 10.663679437221587, "learning_rate": 4.893786165663668e-05, "loss": 2.5321, "mean_token_accuracy": 0.39655172228813174, "step": 142260 }, { "epoch": 0.1432908189730279, "grad_norm": 9.456632714736738, "learning_rate": 4.893774788386025e-05, "loss": 2.2809, "mean_token_accuracy": 0.4034482777118683, "step": 142265 }, { "epoch": 0.14329585502613207, "grad_norm": 10.69034084326508, "learning_rate": 4.893763410513798e-05, "loss": 2.5798, "mean_token_accuracy": 0.4034482777118683, "step": 142270 }, { "epoch": 0.14330089107923624, "grad_norm": 8.105797660799526, "learning_rate": 4.89375203204699e-05, "loss": 2.1371, "mean_token_accuracy": 0.4482758641242981, "step": 142275 }, { "epoch": 0.14330592713234042, "grad_norm": 10.02794028244597, "learning_rate": 4.893740652985604e-05, "loss": 2.2947, "mean_token_accuracy": 0.4137930989265442, "step": 142280 }, { "epoch": 0.1433109631854446, "grad_norm": 11.6287036775017, "learning_rate": 4.893729273329644e-05, "loss": 2.3262, "mean_token_accuracy": 0.4448275864124298, "step": 142285 }, { "epoch": 0.14331599923854876, "grad_norm": 11.493178326797263, "learning_rate": 4.8937178930791125e-05, "loss": 2.0035, "mean_token_accuracy": 0.4918330252170563, "step": 142290 }, { "epoch": 0.14332103529165294, "grad_norm": 9.74977521948768, "learning_rate": 4.893706512234013e-05, "loss": 2.1981, "mean_token_accuracy": 0.48275861144065857, "step": 142295 }, { "epoch": 0.1433260713447571, "grad_norm": 12.018917736964728, "learning_rate": 4.893695130794348e-05, "loss": 2.4145, "mean_token_accuracy": 0.43448275327682495, "step": 142300 }, { "epoch": 0.14333110739786128, "grad_norm": 10.532603538100084, "learning_rate": 4.8936837487601214e-05, "loss": 2.5207, "mean_token_accuracy": 0.38620689511299133, "step": 142305 }, { "epoch": 0.14333614345096546, "grad_norm": 9.027791921491541, "learning_rate": 4.893672366131337e-05, "loss": 1.9632, "mean_token_accuracy": 0.5156684815883636, "step": 142310 }, { "epoch": 0.14334117950406963, "grad_norm": 9.032475741494851, "learning_rate": 4.893660982907996e-05, "loss": 2.4655, "mean_token_accuracy": 0.39310344457626345, "step": 142315 }, { "epoch": 0.1433462155571738, "grad_norm": 11.448570660489818, "learning_rate": 4.893649599090103e-05, "loss": 2.5471, "mean_token_accuracy": 0.4, "step": 142320 }, { "epoch": 0.14335125161027798, "grad_norm": 10.220177076975029, "learning_rate": 4.8936382146776605e-05, "loss": 2.2138, "mean_token_accuracy": 0.42413793206214906, "step": 142325 }, { "epoch": 0.14335628766338215, "grad_norm": 9.753661335563747, "learning_rate": 4.893626829670672e-05, "loss": 2.5121, "mean_token_accuracy": 0.40508167147636415, "step": 142330 }, { "epoch": 0.14336132371648633, "grad_norm": 9.36079811304816, "learning_rate": 4.893615444069141e-05, "loss": 2.3672, "mean_token_accuracy": 0.46400484442710876, "step": 142335 }, { "epoch": 0.1433663597695905, "grad_norm": 10.56503968391867, "learning_rate": 4.8936040578730704e-05, "loss": 2.6566, "mean_token_accuracy": 0.39655172228813174, "step": 142340 }, { "epoch": 0.14337139582269467, "grad_norm": 11.57958333115118, "learning_rate": 4.893592671082462e-05, "loss": 2.2532, "mean_token_accuracy": 0.4156079888343811, "step": 142345 }, { "epoch": 0.14337643187579885, "grad_norm": 13.011856053615718, "learning_rate": 4.893581283697322e-05, "loss": 2.3316, "mean_token_accuracy": 0.458620685338974, "step": 142350 }, { "epoch": 0.14338146792890302, "grad_norm": 9.904553765279603, "learning_rate": 4.89356989571765e-05, "loss": 2.7929, "mean_token_accuracy": 0.3620689660310745, "step": 142355 }, { "epoch": 0.1433865039820072, "grad_norm": 11.753921076166261, "learning_rate": 4.893558507143452e-05, "loss": 2.5901, "mean_token_accuracy": 0.4, "step": 142360 }, { "epoch": 0.14339154003511137, "grad_norm": 10.497688320923682, "learning_rate": 4.8935471179747296e-05, "loss": 2.5919, "mean_token_accuracy": 0.42758620381355283, "step": 142365 }, { "epoch": 0.14339657608821554, "grad_norm": 8.736465012408782, "learning_rate": 4.893535728211486e-05, "loss": 2.4362, "mean_token_accuracy": 0.4103448331356049, "step": 142370 }, { "epoch": 0.14340161214131972, "grad_norm": 7.743734806437265, "learning_rate": 4.893524337853725e-05, "loss": 2.2784, "mean_token_accuracy": 0.4620689690113068, "step": 142375 }, { "epoch": 0.1434066481944239, "grad_norm": 11.084259441332001, "learning_rate": 4.89351294690145e-05, "loss": 2.6369, "mean_token_accuracy": 0.4172413766384125, "step": 142380 }, { "epoch": 0.14341168424752807, "grad_norm": 9.398699408113318, "learning_rate": 4.893501555354664e-05, "loss": 2.231, "mean_token_accuracy": 0.43103448748588563, "step": 142385 }, { "epoch": 0.14341672030063224, "grad_norm": 10.144258018426022, "learning_rate": 4.893490163213369e-05, "loss": 2.5378, "mean_token_accuracy": 0.443254691362381, "step": 142390 }, { "epoch": 0.1434217563537364, "grad_norm": 8.438241155200776, "learning_rate": 4.893478770477569e-05, "loss": 2.6303, "mean_token_accuracy": 0.4413793087005615, "step": 142395 }, { "epoch": 0.14342679240684056, "grad_norm": 10.585559060276362, "learning_rate": 4.893467377147268e-05, "loss": 2.4211, "mean_token_accuracy": 0.4500302493572235, "step": 142400 }, { "epoch": 0.14343182845994473, "grad_norm": 7.276441687843659, "learning_rate": 4.893455983222468e-05, "loss": 2.1778, "mean_token_accuracy": 0.5183908104896545, "step": 142405 }, { "epoch": 0.1434368645130489, "grad_norm": 10.72973024094628, "learning_rate": 4.893444588703172e-05, "loss": 2.5671, "mean_token_accuracy": 0.4206896543502808, "step": 142410 }, { "epoch": 0.14344190056615308, "grad_norm": 14.851634928703682, "learning_rate": 4.893433193589385e-05, "loss": 2.5111, "mean_token_accuracy": 0.4172413766384125, "step": 142415 }, { "epoch": 0.14344693661925725, "grad_norm": 10.997921866015664, "learning_rate": 4.893421797881107e-05, "loss": 2.3297, "mean_token_accuracy": 0.43950393199920657, "step": 142420 }, { "epoch": 0.14345197267236143, "grad_norm": 8.909375251276662, "learning_rate": 4.8934104015783436e-05, "loss": 2.4451, "mean_token_accuracy": 0.41554749608039854, "step": 142425 }, { "epoch": 0.1434570087254656, "grad_norm": 10.02553404993454, "learning_rate": 4.893399004681097e-05, "loss": 2.2527, "mean_token_accuracy": 0.42413793206214906, "step": 142430 }, { "epoch": 0.14346204477856977, "grad_norm": 11.292403071647975, "learning_rate": 4.8933876071893714e-05, "loss": 2.1369, "mean_token_accuracy": 0.4862068951129913, "step": 142435 }, { "epoch": 0.14346708083167395, "grad_norm": 10.521073119759935, "learning_rate": 4.89337620910317e-05, "loss": 2.5993, "mean_token_accuracy": 0.4103448331356049, "step": 142440 }, { "epoch": 0.14347211688477812, "grad_norm": 12.589572489070424, "learning_rate": 4.893364810422493e-05, "loss": 2.7581, "mean_token_accuracy": 0.40344826579093934, "step": 142445 }, { "epoch": 0.1434771529378823, "grad_norm": 9.386092869301782, "learning_rate": 4.893353411147348e-05, "loss": 2.2453, "mean_token_accuracy": 0.42413793206214906, "step": 142450 }, { "epoch": 0.14348218899098647, "grad_norm": 18.646650802756803, "learning_rate": 4.8933420112777344e-05, "loss": 2.3946, "mean_token_accuracy": 0.4068965554237366, "step": 142455 }, { "epoch": 0.14348722504409064, "grad_norm": 12.281833304853626, "learning_rate": 4.8933306108136574e-05, "loss": 2.4013, "mean_token_accuracy": 0.39310344457626345, "step": 142460 }, { "epoch": 0.14349226109719482, "grad_norm": 10.939944439176033, "learning_rate": 4.89331920975512e-05, "loss": 2.5234, "mean_token_accuracy": 0.41034482717514037, "step": 142465 }, { "epoch": 0.143497297150299, "grad_norm": 10.605068418150696, "learning_rate": 4.893307808102124e-05, "loss": 2.1838, "mean_token_accuracy": 0.46896551847457885, "step": 142470 }, { "epoch": 0.14350233320340317, "grad_norm": 9.507031802556483, "learning_rate": 4.893296405854676e-05, "loss": 2.0857, "mean_token_accuracy": 0.4635813653469086, "step": 142475 }, { "epoch": 0.14350736925650734, "grad_norm": 10.335730599836609, "learning_rate": 4.893285003012774e-05, "loss": 2.2415, "mean_token_accuracy": 0.4586206912994385, "step": 142480 }, { "epoch": 0.1435124053096115, "grad_norm": 8.428254249613822, "learning_rate": 4.893273599576425e-05, "loss": 1.8031, "mean_token_accuracy": 0.5158499658107758, "step": 142485 }, { "epoch": 0.1435174413627157, "grad_norm": 10.457251660945689, "learning_rate": 4.893262195545631e-05, "loss": 2.5364, "mean_token_accuracy": 0.41034482717514037, "step": 142490 }, { "epoch": 0.14352247741581986, "grad_norm": 11.449481289769475, "learning_rate": 4.893250790920396e-05, "loss": 2.381, "mean_token_accuracy": 0.42758620381355283, "step": 142495 }, { "epoch": 0.14352751346892403, "grad_norm": 13.515358398855035, "learning_rate": 4.893239385700721e-05, "loss": 2.6162, "mean_token_accuracy": 0.3999999940395355, "step": 142500 }, { "epoch": 0.1435325495220282, "grad_norm": 8.921441336596908, "learning_rate": 4.893227979886611e-05, "loss": 2.3353, "mean_token_accuracy": 0.4379310369491577, "step": 142505 }, { "epoch": 0.14353758557513238, "grad_norm": 10.055709108630017, "learning_rate": 4.893216573478069e-05, "loss": 2.6292, "mean_token_accuracy": 0.39310344457626345, "step": 142510 }, { "epoch": 0.14354262162823656, "grad_norm": 12.05665796282248, "learning_rate": 4.893205166475098e-05, "loss": 2.4443, "mean_token_accuracy": 0.4068965524435043, "step": 142515 }, { "epoch": 0.14354765768134073, "grad_norm": 11.963735281836508, "learning_rate": 4.893193758877701e-05, "loss": 2.9896, "mean_token_accuracy": 0.36551724672317504, "step": 142520 }, { "epoch": 0.1435526937344449, "grad_norm": 10.565423138164762, "learning_rate": 4.893182350685881e-05, "loss": 2.4077, "mean_token_accuracy": 0.41724138259887694, "step": 142525 }, { "epoch": 0.14355772978754908, "grad_norm": 10.716848205079073, "learning_rate": 4.893170941899641e-05, "loss": 2.3145, "mean_token_accuracy": 0.4689655125141144, "step": 142530 }, { "epoch": 0.14356276584065325, "grad_norm": 12.29720545120543, "learning_rate": 4.893159532518986e-05, "loss": 2.7395, "mean_token_accuracy": 0.37586206793785093, "step": 142535 }, { "epoch": 0.1435678018937574, "grad_norm": 11.653353411762163, "learning_rate": 4.8931481225439166e-05, "loss": 2.8983, "mean_token_accuracy": 0.4186932861804962, "step": 142540 }, { "epoch": 0.14357283794686157, "grad_norm": 9.83492647673044, "learning_rate": 4.893136711974437e-05, "loss": 2.3948, "mean_token_accuracy": 0.4172413766384125, "step": 142545 }, { "epoch": 0.14357787399996574, "grad_norm": 10.976336079313233, "learning_rate": 4.89312530081055e-05, "loss": 2.515, "mean_token_accuracy": 0.40344828367233276, "step": 142550 }, { "epoch": 0.14358291005306992, "grad_norm": 10.00846077115079, "learning_rate": 4.89311388905226e-05, "loss": 2.507, "mean_token_accuracy": 0.4413793087005615, "step": 142555 }, { "epoch": 0.1435879461061741, "grad_norm": 11.00399921509478, "learning_rate": 4.8931024766995695e-05, "loss": 2.1564, "mean_token_accuracy": 0.4517241358757019, "step": 142560 }, { "epoch": 0.14359298215927827, "grad_norm": 14.913380959347803, "learning_rate": 4.893091063752481e-05, "loss": 2.5114, "mean_token_accuracy": 0.44022988677024844, "step": 142565 }, { "epoch": 0.14359801821238244, "grad_norm": 12.594649962353268, "learning_rate": 4.893079650210998e-05, "loss": 2.94, "mean_token_accuracy": 0.3793103516101837, "step": 142570 }, { "epoch": 0.1436030542654866, "grad_norm": 9.047690944809839, "learning_rate": 4.893068236075125e-05, "loss": 2.1825, "mean_token_accuracy": 0.48130671977996825, "step": 142575 }, { "epoch": 0.1436080903185908, "grad_norm": 9.057478762686248, "learning_rate": 4.893056821344863e-05, "loss": 2.2233, "mean_token_accuracy": 0.43448275327682495, "step": 142580 }, { "epoch": 0.14361312637169496, "grad_norm": 10.349138760724113, "learning_rate": 4.8930454060202165e-05, "loss": 2.7152, "mean_token_accuracy": 0.38965516686439516, "step": 142585 }, { "epoch": 0.14361816242479913, "grad_norm": 9.973013972395451, "learning_rate": 4.8930339901011885e-05, "loss": 2.5617, "mean_token_accuracy": 0.42758620977401735, "step": 142590 }, { "epoch": 0.1436231984779033, "grad_norm": 9.956689188971596, "learning_rate": 4.893022573587782e-05, "loss": 2.6338, "mean_token_accuracy": 0.3931034505367279, "step": 142595 }, { "epoch": 0.14362823453100748, "grad_norm": 15.062488479970959, "learning_rate": 4.893011156480001e-05, "loss": 2.3249, "mean_token_accuracy": 0.4517241358757019, "step": 142600 }, { "epoch": 0.14363327058411166, "grad_norm": 10.167529023060766, "learning_rate": 4.8929997387778466e-05, "loss": 2.2516, "mean_token_accuracy": 0.4620689570903778, "step": 142605 }, { "epoch": 0.14363830663721583, "grad_norm": 10.327946073253557, "learning_rate": 4.892988320481324e-05, "loss": 2.6832, "mean_token_accuracy": 0.39655172228813174, "step": 142610 }, { "epoch": 0.14364334269032, "grad_norm": 14.274125365171392, "learning_rate": 4.892976901590435e-05, "loss": 2.6174, "mean_token_accuracy": 0.40689656138420105, "step": 142615 }, { "epoch": 0.14364837874342418, "grad_norm": 10.082656262496846, "learning_rate": 4.892965482105183e-05, "loss": 2.3299, "mean_token_accuracy": 0.4206896543502808, "step": 142620 }, { "epoch": 0.14365341479652835, "grad_norm": 12.980735696920775, "learning_rate": 4.892954062025572e-05, "loss": 2.131, "mean_token_accuracy": 0.48965516686439514, "step": 142625 }, { "epoch": 0.14365845084963252, "grad_norm": 9.569055479685993, "learning_rate": 4.892942641351605e-05, "loss": 1.8868, "mean_token_accuracy": 0.4979064047336578, "step": 142630 }, { "epoch": 0.1436634869027367, "grad_norm": 10.08435068373998, "learning_rate": 4.8929312200832845e-05, "loss": 2.5762, "mean_token_accuracy": 0.4330308556556702, "step": 142635 }, { "epoch": 0.14366852295584087, "grad_norm": 8.307796679513682, "learning_rate": 4.8929197982206135e-05, "loss": 2.203, "mean_token_accuracy": 0.45728976726531984, "step": 142640 }, { "epoch": 0.14367355900894505, "grad_norm": 15.810018936395103, "learning_rate": 4.892908375763597e-05, "loss": 3.0555, "mean_token_accuracy": 0.41379310488700866, "step": 142645 }, { "epoch": 0.14367859506204922, "grad_norm": 16.86658739806709, "learning_rate": 4.8928969527122366e-05, "loss": 2.5404, "mean_token_accuracy": 0.441379314661026, "step": 142650 }, { "epoch": 0.1436836311151534, "grad_norm": 8.742553063071579, "learning_rate": 4.8928855290665355e-05, "loss": 2.3121, "mean_token_accuracy": 0.44482758045196535, "step": 142655 }, { "epoch": 0.14368866716825757, "grad_norm": 11.062227354150226, "learning_rate": 4.8928741048264964e-05, "loss": 2.3729, "mean_token_accuracy": 0.3896551728248596, "step": 142660 }, { "epoch": 0.14369370322136174, "grad_norm": 8.749707763812081, "learning_rate": 4.892862679992124e-05, "loss": 2.2242, "mean_token_accuracy": 0.43103447556495667, "step": 142665 }, { "epoch": 0.14369873927446591, "grad_norm": 8.784119900665745, "learning_rate": 4.892851254563421e-05, "loss": 2.1423, "mean_token_accuracy": 0.47791893482208253, "step": 142670 }, { "epoch": 0.1437037753275701, "grad_norm": 8.644596631490936, "learning_rate": 4.8928398285403894e-05, "loss": 2.2741, "mean_token_accuracy": 0.49183303117752075, "step": 142675 }, { "epoch": 0.14370881138067423, "grad_norm": 13.500314469573821, "learning_rate": 4.892828401923033e-05, "loss": 2.4512, "mean_token_accuracy": 0.4068965554237366, "step": 142680 }, { "epoch": 0.1437138474337784, "grad_norm": 12.742102927445767, "learning_rate": 4.892816974711356e-05, "loss": 2.6951, "mean_token_accuracy": 0.3551724135875702, "step": 142685 }, { "epoch": 0.14371888348688258, "grad_norm": 9.636528134294212, "learning_rate": 4.8928055469053595e-05, "loss": 2.2518, "mean_token_accuracy": 0.44482757449150084, "step": 142690 }, { "epoch": 0.14372391953998676, "grad_norm": 14.573304839801267, "learning_rate": 4.892794118505049e-05, "loss": 2.4239, "mean_token_accuracy": 0.4413793087005615, "step": 142695 }, { "epoch": 0.14372895559309093, "grad_norm": 10.123872182408846, "learning_rate": 4.892782689510427e-05, "loss": 2.2216, "mean_token_accuracy": 0.43103447556495667, "step": 142700 }, { "epoch": 0.1437339916461951, "grad_norm": 9.28094841795952, "learning_rate": 4.892771259921495e-05, "loss": 2.433, "mean_token_accuracy": 0.40689654350280763, "step": 142705 }, { "epoch": 0.14373902769929928, "grad_norm": 9.579510297904683, "learning_rate": 4.892759829738258e-05, "loss": 2.1139, "mean_token_accuracy": 0.4379310369491577, "step": 142710 }, { "epoch": 0.14374406375240345, "grad_norm": 12.808606537632743, "learning_rate": 4.892748398960718e-05, "loss": 2.5815, "mean_token_accuracy": 0.4103448331356049, "step": 142715 }, { "epoch": 0.14374909980550762, "grad_norm": 12.547084600884551, "learning_rate": 4.892736967588879e-05, "loss": 2.7051, "mean_token_accuracy": 0.3965517282485962, "step": 142720 }, { "epoch": 0.1437541358586118, "grad_norm": 10.806085269547168, "learning_rate": 4.892725535622744e-05, "loss": 2.5653, "mean_token_accuracy": 0.42413792610168455, "step": 142725 }, { "epoch": 0.14375917191171597, "grad_norm": 9.315899254453186, "learning_rate": 4.892714103062317e-05, "loss": 2.548, "mean_token_accuracy": 0.3965517282485962, "step": 142730 }, { "epoch": 0.14376420796482015, "grad_norm": 9.085792076205212, "learning_rate": 4.892702669907599e-05, "loss": 2.1217, "mean_token_accuracy": 0.43793103098869324, "step": 142735 }, { "epoch": 0.14376924401792432, "grad_norm": 10.004012546130024, "learning_rate": 4.8926912361585945e-05, "loss": 2.3576, "mean_token_accuracy": 0.4, "step": 142740 }, { "epoch": 0.1437742800710285, "grad_norm": 12.469702844078748, "learning_rate": 4.892679801815307e-05, "loss": 2.9837, "mean_token_accuracy": 0.37931033968925476, "step": 142745 }, { "epoch": 0.14377931612413267, "grad_norm": 12.871215290354318, "learning_rate": 4.892668366877739e-05, "loss": 2.583, "mean_token_accuracy": 0.43103448748588563, "step": 142750 }, { "epoch": 0.14378435217723684, "grad_norm": 10.421704312204175, "learning_rate": 4.8926569313458944e-05, "loss": 2.4026, "mean_token_accuracy": 0.41379311084747317, "step": 142755 }, { "epoch": 0.14378938823034101, "grad_norm": 9.895373798535891, "learning_rate": 4.892645495219775e-05, "loss": 2.2403, "mean_token_accuracy": 0.4413793087005615, "step": 142760 }, { "epoch": 0.1437944242834452, "grad_norm": 11.124880046749746, "learning_rate": 4.8926340584993854e-05, "loss": 2.489, "mean_token_accuracy": 0.42068964838981626, "step": 142765 }, { "epoch": 0.14379946033654936, "grad_norm": 9.161263075594407, "learning_rate": 4.892622621184729e-05, "loss": 2.1112, "mean_token_accuracy": 0.458620685338974, "step": 142770 }, { "epoch": 0.14380449638965354, "grad_norm": 8.468632696864248, "learning_rate": 4.892611183275807e-05, "loss": 1.9238, "mean_token_accuracy": 0.5034482777118683, "step": 142775 }, { "epoch": 0.1438095324427577, "grad_norm": 8.991145199280211, "learning_rate": 4.8925997447726245e-05, "loss": 2.2855, "mean_token_accuracy": 0.4275861978530884, "step": 142780 }, { "epoch": 0.14381456849586188, "grad_norm": 11.383034455708005, "learning_rate": 4.892588305675184e-05, "loss": 2.5955, "mean_token_accuracy": 0.4448275864124298, "step": 142785 }, { "epoch": 0.14381960454896606, "grad_norm": 11.491862019309762, "learning_rate": 4.892576865983489e-05, "loss": 2.502, "mean_token_accuracy": 0.3931034505367279, "step": 142790 }, { "epoch": 0.14382464060207023, "grad_norm": 8.651065048269503, "learning_rate": 4.892565425697541e-05, "loss": 2.5239, "mean_token_accuracy": 0.4724137902259827, "step": 142795 }, { "epoch": 0.1438296766551744, "grad_norm": 10.464849541224504, "learning_rate": 4.892553984817346e-05, "loss": 2.0151, "mean_token_accuracy": 0.4767241358757019, "step": 142800 }, { "epoch": 0.14383471270827858, "grad_norm": 10.842183693315686, "learning_rate": 4.892542543342904e-05, "loss": 2.1677, "mean_token_accuracy": 0.42758620977401735, "step": 142805 }, { "epoch": 0.14383974876138275, "grad_norm": 9.730482362758144, "learning_rate": 4.8925311012742216e-05, "loss": 2.3562, "mean_token_accuracy": 0.45862069725990295, "step": 142810 }, { "epoch": 0.14384478481448693, "grad_norm": 10.228069904042458, "learning_rate": 4.892519658611299e-05, "loss": 2.367, "mean_token_accuracy": 0.41379310488700866, "step": 142815 }, { "epoch": 0.14384982086759107, "grad_norm": 10.667346683059518, "learning_rate": 4.8925082153541406e-05, "loss": 2.3034, "mean_token_accuracy": 0.42758620977401735, "step": 142820 }, { "epoch": 0.14385485692069525, "grad_norm": 11.759707994048476, "learning_rate": 4.89249677150275e-05, "loss": 2.3183, "mean_token_accuracy": 0.42413792610168455, "step": 142825 }, { "epoch": 0.14385989297379942, "grad_norm": 13.097501186563894, "learning_rate": 4.89248532705713e-05, "loss": 2.5067, "mean_token_accuracy": 0.382758629322052, "step": 142830 }, { "epoch": 0.1438649290269036, "grad_norm": 9.467002903542522, "learning_rate": 4.892473882017284e-05, "loss": 2.1393, "mean_token_accuracy": 0.4689655125141144, "step": 142835 }, { "epoch": 0.14386996508000777, "grad_norm": 10.855511156032291, "learning_rate": 4.892462436383214e-05, "loss": 2.3535, "mean_token_accuracy": 0.4448275864124298, "step": 142840 }, { "epoch": 0.14387500113311194, "grad_norm": 14.699044491647461, "learning_rate": 4.892450990154924e-05, "loss": 2.7592, "mean_token_accuracy": 0.4344827592372894, "step": 142845 }, { "epoch": 0.14388003718621611, "grad_norm": 17.951133107832828, "learning_rate": 4.892439543332418e-05, "loss": 2.2737, "mean_token_accuracy": 0.44137930274009707, "step": 142850 }, { "epoch": 0.1438850732393203, "grad_norm": 11.935975563885311, "learning_rate": 4.892428095915698e-05, "loss": 1.9793, "mean_token_accuracy": 0.482758617401123, "step": 142855 }, { "epoch": 0.14389010929242446, "grad_norm": 11.93193313017178, "learning_rate": 4.892416647904767e-05, "loss": 2.6294, "mean_token_accuracy": 0.39655172228813174, "step": 142860 }, { "epoch": 0.14389514534552864, "grad_norm": 11.509025055382219, "learning_rate": 4.89240519929963e-05, "loss": 2.2546, "mean_token_accuracy": 0.45517241954803467, "step": 142865 }, { "epoch": 0.1439001813986328, "grad_norm": 12.35155089295012, "learning_rate": 4.892393750100289e-05, "loss": 2.5757, "mean_token_accuracy": 0.4379310369491577, "step": 142870 }, { "epoch": 0.14390521745173698, "grad_norm": 9.107493141827403, "learning_rate": 4.892382300306746e-05, "loss": 2.3494, "mean_token_accuracy": 0.4, "step": 142875 }, { "epoch": 0.14391025350484116, "grad_norm": 9.706155659895366, "learning_rate": 4.892370849919006e-05, "loss": 2.5125, "mean_token_accuracy": 0.4137930989265442, "step": 142880 }, { "epoch": 0.14391528955794533, "grad_norm": 10.051711676076456, "learning_rate": 4.892359398937071e-05, "loss": 2.0437, "mean_token_accuracy": 0.4724137902259827, "step": 142885 }, { "epoch": 0.1439203256110495, "grad_norm": 9.271766836140744, "learning_rate": 4.892347947360945e-05, "loss": 2.3653, "mean_token_accuracy": 0.45172414779663084, "step": 142890 }, { "epoch": 0.14392536166415368, "grad_norm": 14.211886455223294, "learning_rate": 4.89233649519063e-05, "loss": 2.3437, "mean_token_accuracy": 0.44137930274009707, "step": 142895 }, { "epoch": 0.14393039771725785, "grad_norm": 12.761658982227075, "learning_rate": 4.892325042426131e-05, "loss": 2.2547, "mean_token_accuracy": 0.4517241358757019, "step": 142900 }, { "epoch": 0.14393543377036203, "grad_norm": 13.60360436706123, "learning_rate": 4.892313589067449e-05, "loss": 2.3689, "mean_token_accuracy": 0.41379310488700866, "step": 142905 }, { "epoch": 0.1439404698234662, "grad_norm": 10.97842570300436, "learning_rate": 4.8923021351145894e-05, "loss": 2.1077, "mean_token_accuracy": 0.44827587008476255, "step": 142910 }, { "epoch": 0.14394550587657037, "grad_norm": 10.144580239775088, "learning_rate": 4.892290680567554e-05, "loss": 2.3137, "mean_token_accuracy": 0.4620689630508423, "step": 142915 }, { "epoch": 0.14395054192967455, "grad_norm": 8.362098187957232, "learning_rate": 4.8922792254263465e-05, "loss": 2.5756, "mean_token_accuracy": 0.4344827592372894, "step": 142920 }, { "epoch": 0.14395557798277872, "grad_norm": 11.499226014986755, "learning_rate": 4.8922677696909695e-05, "loss": 2.5759, "mean_token_accuracy": 0.4068965554237366, "step": 142925 }, { "epoch": 0.1439606140358829, "grad_norm": 10.976927907102782, "learning_rate": 4.892256313361426e-05, "loss": 2.2757, "mean_token_accuracy": 0.4620689630508423, "step": 142930 }, { "epoch": 0.14396565008898707, "grad_norm": 9.320894159894085, "learning_rate": 4.892244856437721e-05, "loss": 2.2437, "mean_token_accuracy": 0.4586206912994385, "step": 142935 }, { "epoch": 0.14397068614209124, "grad_norm": 10.061830760009846, "learning_rate": 4.892233398919855e-05, "loss": 2.569, "mean_token_accuracy": 0.4068965554237366, "step": 142940 }, { "epoch": 0.14397572219519542, "grad_norm": 9.423061277116496, "learning_rate": 4.892221940807834e-05, "loss": 2.4951, "mean_token_accuracy": 0.43998789191246035, "step": 142945 }, { "epoch": 0.1439807582482996, "grad_norm": 9.72547668072557, "learning_rate": 4.892210482101659e-05, "loss": 2.6852, "mean_token_accuracy": 0.42758620977401735, "step": 142950 }, { "epoch": 0.14398579430140374, "grad_norm": 11.770839052130134, "learning_rate": 4.8921990228013344e-05, "loss": 2.0784, "mean_token_accuracy": 0.4344827592372894, "step": 142955 }, { "epoch": 0.1439908303545079, "grad_norm": 9.130040836127705, "learning_rate": 4.8921875629068624e-05, "loss": 2.7847, "mean_token_accuracy": 0.358620685338974, "step": 142960 }, { "epoch": 0.14399586640761208, "grad_norm": 12.249613521821624, "learning_rate": 4.8921761024182475e-05, "loss": 2.8651, "mean_token_accuracy": 0.40175439715385436, "step": 142965 }, { "epoch": 0.14400090246071626, "grad_norm": 10.401410801803454, "learning_rate": 4.892164641335491e-05, "loss": 2.4592, "mean_token_accuracy": 0.4344827592372894, "step": 142970 }, { "epoch": 0.14400593851382043, "grad_norm": 12.195850219459128, "learning_rate": 4.8921531796585976e-05, "loss": 2.3793, "mean_token_accuracy": 0.4448275864124298, "step": 142975 }, { "epoch": 0.1440109745669246, "grad_norm": 11.443995089878216, "learning_rate": 4.892141717387569e-05, "loss": 2.5288, "mean_token_accuracy": 0.39310344457626345, "step": 142980 }, { "epoch": 0.14401601062002878, "grad_norm": 10.035048004010267, "learning_rate": 4.892130254522411e-05, "loss": 2.4465, "mean_token_accuracy": 0.4517241418361664, "step": 142985 }, { "epoch": 0.14402104667313295, "grad_norm": 8.892666886622552, "learning_rate": 4.892118791063125e-05, "loss": 2.1489, "mean_token_accuracy": 0.46896551847457885, "step": 142990 }, { "epoch": 0.14402608272623713, "grad_norm": 8.853451102214152, "learning_rate": 4.892107327009714e-05, "loss": 2.0947, "mean_token_accuracy": 0.5228675007820129, "step": 142995 }, { "epoch": 0.1440311187793413, "grad_norm": 10.36582724648141, "learning_rate": 4.8920958623621814e-05, "loss": 2.3684, "mean_token_accuracy": 0.42413792610168455, "step": 143000 }, { "epoch": 0.14403615483244547, "grad_norm": 10.178254030609075, "learning_rate": 4.8920843971205306e-05, "loss": 2.2843, "mean_token_accuracy": 0.441379314661026, "step": 143005 }, { "epoch": 0.14404119088554965, "grad_norm": 10.352139427298674, "learning_rate": 4.892072931284765e-05, "loss": 2.0948, "mean_token_accuracy": 0.43103448748588563, "step": 143010 }, { "epoch": 0.14404622693865382, "grad_norm": 9.310362326069008, "learning_rate": 4.892061464854887e-05, "loss": 2.3319, "mean_token_accuracy": 0.4448275864124298, "step": 143015 }, { "epoch": 0.144051262991758, "grad_norm": 10.919517886956424, "learning_rate": 4.8920499978309e-05, "loss": 2.4419, "mean_token_accuracy": 0.4325468838214874, "step": 143020 }, { "epoch": 0.14405629904486217, "grad_norm": 10.886104098074167, "learning_rate": 4.892038530212808e-05, "loss": 2.5615, "mean_token_accuracy": 0.42068964838981626, "step": 143025 }, { "epoch": 0.14406133509796634, "grad_norm": 11.708641266032057, "learning_rate": 4.892027062000614e-05, "loss": 2.6762, "mean_token_accuracy": 0.42758620977401735, "step": 143030 }, { "epoch": 0.14406637115107052, "grad_norm": 9.038643535728255, "learning_rate": 4.89201559319432e-05, "loss": 2.2711, "mean_token_accuracy": 0.46896551847457885, "step": 143035 }, { "epoch": 0.1440714072041747, "grad_norm": 10.023533905000063, "learning_rate": 4.89200412379393e-05, "loss": 2.3468, "mean_token_accuracy": 0.39655172228813174, "step": 143040 }, { "epoch": 0.14407644325727886, "grad_norm": 9.515347929939889, "learning_rate": 4.891992653799447e-05, "loss": 1.9717, "mean_token_accuracy": 0.4931034505367279, "step": 143045 }, { "epoch": 0.14408147931038304, "grad_norm": 9.57465746427421, "learning_rate": 4.891981183210875e-05, "loss": 2.2037, "mean_token_accuracy": 0.4310344815254211, "step": 143050 }, { "epoch": 0.1440865153634872, "grad_norm": 9.275227159154056, "learning_rate": 4.8919697120282165e-05, "loss": 2.2079, "mean_token_accuracy": 0.4744101583957672, "step": 143055 }, { "epoch": 0.14409155141659138, "grad_norm": 10.82387792293545, "learning_rate": 4.8919582402514755e-05, "loss": 2.3596, "mean_token_accuracy": 0.43793103098869324, "step": 143060 }, { "epoch": 0.14409658746969556, "grad_norm": 10.027555293206424, "learning_rate": 4.891946767880653e-05, "loss": 2.2664, "mean_token_accuracy": 0.43103448748588563, "step": 143065 }, { "epoch": 0.14410162352279973, "grad_norm": 10.302555316224726, "learning_rate": 4.8919352949157534e-05, "loss": 2.1611, "mean_token_accuracy": 0.4344827651977539, "step": 143070 }, { "epoch": 0.1441066595759039, "grad_norm": 8.502397602443207, "learning_rate": 4.8919238213567806e-05, "loss": 2.0091, "mean_token_accuracy": 0.5034482777118683, "step": 143075 }, { "epoch": 0.14411169562900808, "grad_norm": 9.34425559516383, "learning_rate": 4.891912347203738e-05, "loss": 2.5947, "mean_token_accuracy": 0.43103447556495667, "step": 143080 }, { "epoch": 0.14411673168211225, "grad_norm": 10.170930713389188, "learning_rate": 4.8919008724566266e-05, "loss": 2.5939, "mean_token_accuracy": 0.4068965554237366, "step": 143085 }, { "epoch": 0.14412176773521643, "grad_norm": 8.503277582441921, "learning_rate": 4.8918893971154524e-05, "loss": 2.432, "mean_token_accuracy": 0.4275861978530884, "step": 143090 }, { "epoch": 0.14412680378832057, "grad_norm": 9.46354659690624, "learning_rate": 4.891877921180216e-05, "loss": 2.1359, "mean_token_accuracy": 0.47931034564971925, "step": 143095 }, { "epoch": 0.14413183984142475, "grad_norm": 10.137913563625476, "learning_rate": 4.891866444650922e-05, "loss": 2.2744, "mean_token_accuracy": 0.4638838529586792, "step": 143100 }, { "epoch": 0.14413687589452892, "grad_norm": 16.099043559733598, "learning_rate": 4.8918549675275736e-05, "loss": 2.7884, "mean_token_accuracy": 0.4206896543502808, "step": 143105 }, { "epoch": 0.1441419119476331, "grad_norm": 8.901882095246567, "learning_rate": 4.891843489810174e-05, "loss": 2.2721, "mean_token_accuracy": 0.5178463518619537, "step": 143110 }, { "epoch": 0.14414694800073727, "grad_norm": 13.038421232099818, "learning_rate": 4.891832011498726e-05, "loss": 2.5571, "mean_token_accuracy": 0.3965517163276672, "step": 143115 }, { "epoch": 0.14415198405384144, "grad_norm": 11.266259422140152, "learning_rate": 4.8918205325932324e-05, "loss": 2.6415, "mean_token_accuracy": 0.41034482717514037, "step": 143120 }, { "epoch": 0.14415702010694562, "grad_norm": 9.480887745959103, "learning_rate": 4.8918090530936976e-05, "loss": 2.4457, "mean_token_accuracy": 0.4137930989265442, "step": 143125 }, { "epoch": 0.1441620561600498, "grad_norm": 13.243234792372652, "learning_rate": 4.891797573000124e-05, "loss": 2.312, "mean_token_accuracy": 0.40689656138420105, "step": 143130 }, { "epoch": 0.14416709221315396, "grad_norm": 11.138285367638243, "learning_rate": 4.891786092312514e-05, "loss": 2.6573, "mean_token_accuracy": 0.37241379022598264, "step": 143135 }, { "epoch": 0.14417212826625814, "grad_norm": 10.895345954382462, "learning_rate": 4.891774611030873e-05, "loss": 2.4479, "mean_token_accuracy": 0.44283121824264526, "step": 143140 }, { "epoch": 0.1441771643193623, "grad_norm": 9.091524903573514, "learning_rate": 4.891763129155201e-05, "loss": 2.3169, "mean_token_accuracy": 0.45862069725990295, "step": 143145 }, { "epoch": 0.14418220037246648, "grad_norm": 10.69980679483711, "learning_rate": 4.891751646685505e-05, "loss": 2.2274, "mean_token_accuracy": 0.4344827651977539, "step": 143150 }, { "epoch": 0.14418723642557066, "grad_norm": 11.215358222077949, "learning_rate": 4.8917401636217854e-05, "loss": 2.6825, "mean_token_accuracy": 0.4172413766384125, "step": 143155 }, { "epoch": 0.14419227247867483, "grad_norm": 10.291894865198412, "learning_rate": 4.891728679964046e-05, "loss": 2.2819, "mean_token_accuracy": 0.4, "step": 143160 }, { "epoch": 0.144197308531779, "grad_norm": 9.485089848847432, "learning_rate": 4.89171719571229e-05, "loss": 2.0843, "mean_token_accuracy": 0.47586206793785096, "step": 143165 }, { "epoch": 0.14420234458488318, "grad_norm": 12.620909235561669, "learning_rate": 4.891705710866521e-05, "loss": 2.5386, "mean_token_accuracy": 0.417241370677948, "step": 143170 }, { "epoch": 0.14420738063798735, "grad_norm": 8.761753680024981, "learning_rate": 4.891694225426742e-05, "loss": 2.0999, "mean_token_accuracy": 0.4620689690113068, "step": 143175 }, { "epoch": 0.14421241669109153, "grad_norm": 11.052827646988586, "learning_rate": 4.891682739392955e-05, "loss": 2.2684, "mean_token_accuracy": 0.43793103098869324, "step": 143180 }, { "epoch": 0.1442174527441957, "grad_norm": 7.482576051375531, "learning_rate": 4.891671252765165e-05, "loss": 2.2995, "mean_token_accuracy": 0.4413793087005615, "step": 143185 }, { "epoch": 0.14422248879729987, "grad_norm": 11.638067613836233, "learning_rate": 4.891659765543376e-05, "loss": 2.5494, "mean_token_accuracy": 0.4068965554237366, "step": 143190 }, { "epoch": 0.14422752485040405, "grad_norm": 10.08076645440741, "learning_rate": 4.8916482777275877e-05, "loss": 2.4693, "mean_token_accuracy": 0.4189352750778198, "step": 143195 }, { "epoch": 0.14423256090350822, "grad_norm": 9.752454165354598, "learning_rate": 4.891636789317806e-05, "loss": 2.2789, "mean_token_accuracy": 0.42413793206214906, "step": 143200 }, { "epoch": 0.1442375969566124, "grad_norm": 17.415005957185635, "learning_rate": 4.891625300314033e-05, "loss": 2.61, "mean_token_accuracy": 0.4379310369491577, "step": 143205 }, { "epoch": 0.14424263300971657, "grad_norm": 14.367878434977694, "learning_rate": 4.891613810716273e-05, "loss": 2.4245, "mean_token_accuracy": 0.4709618926048279, "step": 143210 }, { "epoch": 0.14424766906282074, "grad_norm": 14.849141981857443, "learning_rate": 4.891602320524528e-05, "loss": 2.3179, "mean_token_accuracy": 0.4689655125141144, "step": 143215 }, { "epoch": 0.14425270511592492, "grad_norm": 9.688235532458037, "learning_rate": 4.891590829738802e-05, "loss": 2.1415, "mean_token_accuracy": 0.4620689630508423, "step": 143220 }, { "epoch": 0.1442577411690291, "grad_norm": 10.787323860969947, "learning_rate": 4.891579338359098e-05, "loss": 2.2664, "mean_token_accuracy": 0.4413793087005615, "step": 143225 }, { "epoch": 0.14426277722213326, "grad_norm": 10.068157699710326, "learning_rate": 4.891567846385418e-05, "loss": 2.3069, "mean_token_accuracy": 0.44482759237289426, "step": 143230 }, { "epoch": 0.1442678132752374, "grad_norm": 11.02357286740229, "learning_rate": 4.891556353817766e-05, "loss": 2.7127, "mean_token_accuracy": 0.4034482717514038, "step": 143235 }, { "epoch": 0.14427284932834158, "grad_norm": 10.844049314339482, "learning_rate": 4.891544860656146e-05, "loss": 2.5795, "mean_token_accuracy": 0.3931034505367279, "step": 143240 }, { "epoch": 0.14427788538144576, "grad_norm": 11.516562104779565, "learning_rate": 4.89153336690056e-05, "loss": 2.5312, "mean_token_accuracy": 0.43448275327682495, "step": 143245 }, { "epoch": 0.14428292143454993, "grad_norm": 9.863996749709504, "learning_rate": 4.8915218725510124e-05, "loss": 2.5991, "mean_token_accuracy": 0.33793103098869326, "step": 143250 }, { "epoch": 0.1442879574876541, "grad_norm": 12.275832968848517, "learning_rate": 4.8915103776075056e-05, "loss": 2.2932, "mean_token_accuracy": 0.4068965494632721, "step": 143255 }, { "epoch": 0.14429299354075828, "grad_norm": 10.95577215644058, "learning_rate": 4.891498882070043e-05, "loss": 2.0273, "mean_token_accuracy": 0.5241379380226135, "step": 143260 }, { "epoch": 0.14429802959386245, "grad_norm": 8.574616501875061, "learning_rate": 4.891487385938627e-05, "loss": 2.5562, "mean_token_accuracy": 0.4034482777118683, "step": 143265 }, { "epoch": 0.14430306564696663, "grad_norm": 10.340551851702875, "learning_rate": 4.891475889213263e-05, "loss": 2.4589, "mean_token_accuracy": 0.4, "step": 143270 }, { "epoch": 0.1443081017000708, "grad_norm": 9.705779604379604, "learning_rate": 4.891464391893951e-05, "loss": 1.9368, "mean_token_accuracy": 0.5206896483898162, "step": 143275 }, { "epoch": 0.14431313775317497, "grad_norm": 8.623271610185295, "learning_rate": 4.8914528939806966e-05, "loss": 2.1983, "mean_token_accuracy": 0.43793103098869324, "step": 143280 }, { "epoch": 0.14431817380627915, "grad_norm": 9.591571117723104, "learning_rate": 4.891441395473502e-05, "loss": 2.3306, "mean_token_accuracy": 0.4379310369491577, "step": 143285 }, { "epoch": 0.14432320985938332, "grad_norm": 8.771184680138322, "learning_rate": 4.891429896372371e-05, "loss": 2.482, "mean_token_accuracy": 0.3896551787853241, "step": 143290 }, { "epoch": 0.1443282459124875, "grad_norm": 10.690560367218001, "learning_rate": 4.891418396677306e-05, "loss": 2.1146, "mean_token_accuracy": 0.4517241418361664, "step": 143295 }, { "epoch": 0.14433328196559167, "grad_norm": 10.843560529973189, "learning_rate": 4.8914068963883106e-05, "loss": 2.5281, "mean_token_accuracy": 0.4172413766384125, "step": 143300 }, { "epoch": 0.14433831801869584, "grad_norm": 10.618059285439788, "learning_rate": 4.8913953955053884e-05, "loss": 2.2213, "mean_token_accuracy": 0.44482759237289426, "step": 143305 }, { "epoch": 0.14434335407180002, "grad_norm": 9.805842091284974, "learning_rate": 4.891383894028543e-05, "loss": 2.0299, "mean_token_accuracy": 0.47931034564971925, "step": 143310 }, { "epoch": 0.1443483901249042, "grad_norm": 10.48552327669885, "learning_rate": 4.891372391957775e-05, "loss": 2.6893, "mean_token_accuracy": 0.42758620381355283, "step": 143315 }, { "epoch": 0.14435342617800836, "grad_norm": 12.89758142964133, "learning_rate": 4.8913608892930905e-05, "loss": 2.6489, "mean_token_accuracy": 0.3931034505367279, "step": 143320 }, { "epoch": 0.14435846223111254, "grad_norm": 12.64000100251207, "learning_rate": 4.8913493860344916e-05, "loss": 2.2976, "mean_token_accuracy": 0.3793103456497192, "step": 143325 }, { "epoch": 0.1443634982842167, "grad_norm": 9.952823769657149, "learning_rate": 4.891337882181981e-05, "loss": 2.4684, "mean_token_accuracy": 0.37241379022598264, "step": 143330 }, { "epoch": 0.1443685343373209, "grad_norm": 12.520595778832488, "learning_rate": 4.891326377735562e-05, "loss": 2.6512, "mean_token_accuracy": 0.38620689511299133, "step": 143335 }, { "epoch": 0.14437357039042506, "grad_norm": 11.066279583864217, "learning_rate": 4.891314872695239e-05, "loss": 2.6658, "mean_token_accuracy": 0.41034482717514037, "step": 143340 }, { "epoch": 0.14437860644352923, "grad_norm": 13.380214678805146, "learning_rate": 4.891303367061015e-05, "loss": 3.1223, "mean_token_accuracy": 0.3517241418361664, "step": 143345 }, { "epoch": 0.1443836424966334, "grad_norm": 7.514151261822749, "learning_rate": 4.8912918608328905e-05, "loss": 2.4334, "mean_token_accuracy": 0.43793103098869324, "step": 143350 }, { "epoch": 0.14438867854973758, "grad_norm": 9.585838792894119, "learning_rate": 4.891280354010872e-05, "loss": 2.1383, "mean_token_accuracy": 0.4241379380226135, "step": 143355 }, { "epoch": 0.14439371460284176, "grad_norm": 11.104781221037275, "learning_rate": 4.891268846594961e-05, "loss": 2.234, "mean_token_accuracy": 0.458620685338974, "step": 143360 }, { "epoch": 0.14439875065594593, "grad_norm": 11.944601811366978, "learning_rate": 4.891257338585161e-05, "loss": 2.9673, "mean_token_accuracy": 0.35862069129943847, "step": 143365 }, { "epoch": 0.1444037867090501, "grad_norm": 10.148460206790432, "learning_rate": 4.8912458299814754e-05, "loss": 2.3615, "mean_token_accuracy": 0.441379314661026, "step": 143370 }, { "epoch": 0.14440882276215425, "grad_norm": 10.721689126599045, "learning_rate": 4.8912343207839077e-05, "loss": 1.9903, "mean_token_accuracy": 0.5103448271751404, "step": 143375 }, { "epoch": 0.14441385881525842, "grad_norm": 13.132127896281627, "learning_rate": 4.891222810992461e-05, "loss": 2.1801, "mean_token_accuracy": 0.48275862336158754, "step": 143380 }, { "epoch": 0.1444188948683626, "grad_norm": 10.862661004551356, "learning_rate": 4.891211300607137e-05, "loss": 2.7413, "mean_token_accuracy": 0.4620689630508423, "step": 143385 }, { "epoch": 0.14442393092146677, "grad_norm": 11.849320444589557, "learning_rate": 4.89119978962794e-05, "loss": 2.1309, "mean_token_accuracy": 0.43103447556495667, "step": 143390 }, { "epoch": 0.14442896697457094, "grad_norm": 11.592519995421126, "learning_rate": 4.891188278054875e-05, "loss": 2.4221, "mean_token_accuracy": 0.4517241358757019, "step": 143395 }, { "epoch": 0.14443400302767512, "grad_norm": 11.637374528984818, "learning_rate": 4.8911767658879416e-05, "loss": 2.4152, "mean_token_accuracy": 0.4586206912994385, "step": 143400 }, { "epoch": 0.1444390390807793, "grad_norm": 10.958413941240806, "learning_rate": 4.8911652531271455e-05, "loss": 2.4273, "mean_token_accuracy": 0.42758620977401735, "step": 143405 }, { "epoch": 0.14444407513388346, "grad_norm": 12.207092557902424, "learning_rate": 4.891153739772489e-05, "loss": 2.801, "mean_token_accuracy": 0.3517241388559341, "step": 143410 }, { "epoch": 0.14444911118698764, "grad_norm": 9.957504072861148, "learning_rate": 4.891142225823977e-05, "loss": 2.3038, "mean_token_accuracy": 0.4137930989265442, "step": 143415 }, { "epoch": 0.1444541472400918, "grad_norm": 10.479548449903048, "learning_rate": 4.891130711281609e-05, "loss": 2.7249, "mean_token_accuracy": 0.3758620619773865, "step": 143420 }, { "epoch": 0.144459183293196, "grad_norm": 11.93547655899798, "learning_rate": 4.8911191961453914e-05, "loss": 2.7033, "mean_token_accuracy": 0.37586206793785093, "step": 143425 }, { "epoch": 0.14446421934630016, "grad_norm": 9.575876754599406, "learning_rate": 4.8911076804153274e-05, "loss": 2.3689, "mean_token_accuracy": 0.44827586114406587, "step": 143430 }, { "epoch": 0.14446925539940433, "grad_norm": 8.01727140145651, "learning_rate": 4.8910961640914174e-05, "loss": 2.2264, "mean_token_accuracy": 0.4468844473361969, "step": 143435 }, { "epoch": 0.1444742914525085, "grad_norm": 11.87748897012932, "learning_rate": 4.891084647173667e-05, "loss": 2.5445, "mean_token_accuracy": 0.4413793087005615, "step": 143440 }, { "epoch": 0.14447932750561268, "grad_norm": 10.887153348004146, "learning_rate": 4.891073129662079e-05, "loss": 2.3559, "mean_token_accuracy": 0.4878078818321228, "step": 143445 }, { "epoch": 0.14448436355871686, "grad_norm": 9.895765928188267, "learning_rate": 4.891061611556657e-05, "loss": 2.2234, "mean_token_accuracy": 0.46896551847457885, "step": 143450 }, { "epoch": 0.14448939961182103, "grad_norm": 9.411800088793186, "learning_rate": 4.891050092857403e-05, "loss": 2.2195, "mean_token_accuracy": 0.40889292359352114, "step": 143455 }, { "epoch": 0.1444944356649252, "grad_norm": 17.251257754668526, "learning_rate": 4.8910385735643206e-05, "loss": 3.2276, "mean_token_accuracy": 0.3827586203813553, "step": 143460 }, { "epoch": 0.14449947171802938, "grad_norm": 11.387152471798426, "learning_rate": 4.8910270536774126e-05, "loss": 2.4489, "mean_token_accuracy": 0.42068964838981626, "step": 143465 }, { "epoch": 0.14450450777113355, "grad_norm": 8.928583937612485, "learning_rate": 4.891015533196684e-05, "loss": 2.0579, "mean_token_accuracy": 0.49655171632766726, "step": 143470 }, { "epoch": 0.14450954382423772, "grad_norm": 9.385929338101583, "learning_rate": 4.891004012122136e-05, "loss": 2.1213, "mean_token_accuracy": 0.4620689570903778, "step": 143475 }, { "epoch": 0.1445145798773419, "grad_norm": 9.638575526377231, "learning_rate": 4.890992490453773e-05, "loss": 2.4355, "mean_token_accuracy": 0.4467634618282318, "step": 143480 }, { "epoch": 0.14451961593044607, "grad_norm": 11.146512895679482, "learning_rate": 4.890980968191597e-05, "loss": 2.3323, "mean_token_accuracy": 0.3896551728248596, "step": 143485 }, { "epoch": 0.14452465198355025, "grad_norm": 10.041846521572744, "learning_rate": 4.890969445335613e-05, "loss": 2.7848, "mean_token_accuracy": 0.4068965494632721, "step": 143490 }, { "epoch": 0.14452968803665442, "grad_norm": 11.61979238003141, "learning_rate": 4.890957921885823e-05, "loss": 2.4216, "mean_token_accuracy": 0.43103448748588563, "step": 143495 }, { "epoch": 0.1445347240897586, "grad_norm": 12.021135306229978, "learning_rate": 4.8909463978422294e-05, "loss": 2.1835, "mean_token_accuracy": 0.4882637560367584, "step": 143500 }, { "epoch": 0.14453976014286277, "grad_norm": 13.628813098609225, "learning_rate": 4.890934873204837e-05, "loss": 2.3797, "mean_token_accuracy": 0.4482758641242981, "step": 143505 }, { "epoch": 0.14454479619596694, "grad_norm": 8.37439364426545, "learning_rate": 4.8909233479736476e-05, "loss": 2.1544, "mean_token_accuracy": 0.4137930989265442, "step": 143510 }, { "epoch": 0.1445498322490711, "grad_norm": 10.731539105938554, "learning_rate": 4.890911822148666e-05, "loss": 2.2245, "mean_token_accuracy": 0.42413792610168455, "step": 143515 }, { "epoch": 0.14455486830217526, "grad_norm": 10.665186522741884, "learning_rate": 4.890900295729894e-05, "loss": 2.5141, "mean_token_accuracy": 0.38620689809322356, "step": 143520 }, { "epoch": 0.14455990435527943, "grad_norm": 10.553039706236603, "learning_rate": 4.8908887687173354e-05, "loss": 2.2351, "mean_token_accuracy": 0.4534180283546448, "step": 143525 }, { "epoch": 0.1445649404083836, "grad_norm": 7.852568444741093, "learning_rate": 4.890877241110994e-05, "loss": 2.7851, "mean_token_accuracy": 0.40532365441322327, "step": 143530 }, { "epoch": 0.14456997646148778, "grad_norm": 12.041815270921463, "learning_rate": 4.890865712910871e-05, "loss": 2.8099, "mean_token_accuracy": 0.39655172228813174, "step": 143535 }, { "epoch": 0.14457501251459196, "grad_norm": 10.350282734708696, "learning_rate": 4.890854184116972e-05, "loss": 2.3552, "mean_token_accuracy": 0.42758620977401735, "step": 143540 }, { "epoch": 0.14458004856769613, "grad_norm": 10.244674767421802, "learning_rate": 4.890842654729299e-05, "loss": 2.1739, "mean_token_accuracy": 0.4310344815254211, "step": 143545 }, { "epoch": 0.1445850846208003, "grad_norm": 10.646964327069766, "learning_rate": 4.890831124747855e-05, "loss": 2.3636, "mean_token_accuracy": 0.49165154695510865, "step": 143550 }, { "epoch": 0.14459012067390448, "grad_norm": 11.933970967377196, "learning_rate": 4.890819594172644e-05, "loss": 2.4161, "mean_token_accuracy": 0.44137930274009707, "step": 143555 }, { "epoch": 0.14459515672700865, "grad_norm": 9.8570826445566, "learning_rate": 4.890808063003667e-05, "loss": 2.2871, "mean_token_accuracy": 0.4344827592372894, "step": 143560 }, { "epoch": 0.14460019278011282, "grad_norm": 9.144568269671431, "learning_rate": 4.890796531240931e-05, "loss": 2.4222, "mean_token_accuracy": 0.45317604541778567, "step": 143565 }, { "epoch": 0.144605228833217, "grad_norm": 8.426753187654638, "learning_rate": 4.890784998884436e-05, "loss": 2.2292, "mean_token_accuracy": 0.5172413766384125, "step": 143570 }, { "epoch": 0.14461026488632117, "grad_norm": 9.789786819455657, "learning_rate": 4.890773465934187e-05, "loss": 2.4364, "mean_token_accuracy": 0.43793103098869324, "step": 143575 }, { "epoch": 0.14461530093942535, "grad_norm": 11.07853969471019, "learning_rate": 4.890761932390186e-05, "loss": 2.3389, "mean_token_accuracy": 0.4517241299152374, "step": 143580 }, { "epoch": 0.14462033699252952, "grad_norm": 9.86452615311759, "learning_rate": 4.890750398252436e-05, "loss": 2.1072, "mean_token_accuracy": 0.4793103337287903, "step": 143585 }, { "epoch": 0.1446253730456337, "grad_norm": 10.310264323735417, "learning_rate": 4.890738863520942e-05, "loss": 2.2695, "mean_token_accuracy": 0.458620685338974, "step": 143590 }, { "epoch": 0.14463040909873787, "grad_norm": 11.954127932678544, "learning_rate": 4.890727328195706e-05, "loss": 2.0617, "mean_token_accuracy": 0.4724137902259827, "step": 143595 }, { "epoch": 0.14463544515184204, "grad_norm": 10.064799958434044, "learning_rate": 4.890715792276731e-05, "loss": 2.9538, "mean_token_accuracy": 0.3655172407627106, "step": 143600 }, { "epoch": 0.14464048120494621, "grad_norm": 11.880906390614564, "learning_rate": 4.8907042557640196e-05, "loss": 2.1926, "mean_token_accuracy": 0.4206896543502808, "step": 143605 }, { "epoch": 0.1446455172580504, "grad_norm": 9.074700355558805, "learning_rate": 4.8906927186575776e-05, "loss": 2.1178, "mean_token_accuracy": 0.4655172288417816, "step": 143610 }, { "epoch": 0.14465055331115456, "grad_norm": 14.580498351142044, "learning_rate": 4.8906811809574054e-05, "loss": 2.2088, "mean_token_accuracy": 0.458620685338974, "step": 143615 }, { "epoch": 0.14465558936425874, "grad_norm": 7.764915631679862, "learning_rate": 4.890669642663507e-05, "loss": 2.3413, "mean_token_accuracy": 0.4222625494003296, "step": 143620 }, { "epoch": 0.1446606254173629, "grad_norm": 15.547314601621624, "learning_rate": 4.8906581037758864e-05, "loss": 2.3682, "mean_token_accuracy": 0.40689654350280763, "step": 143625 }, { "epoch": 0.14466566147046708, "grad_norm": 10.311099846790047, "learning_rate": 4.890646564294547e-05, "loss": 2.2407, "mean_token_accuracy": 0.4620689630508423, "step": 143630 }, { "epoch": 0.14467069752357126, "grad_norm": 13.303215093196087, "learning_rate": 4.89063502421949e-05, "loss": 2.4223, "mean_token_accuracy": 0.4379310250282288, "step": 143635 }, { "epoch": 0.14467573357667543, "grad_norm": 11.191849086810171, "learning_rate": 4.890623483550721e-05, "loss": 2.3015, "mean_token_accuracy": 0.44827585220336913, "step": 143640 }, { "epoch": 0.1446807696297796, "grad_norm": 9.40574544389425, "learning_rate": 4.890611942288241e-05, "loss": 2.3691, "mean_token_accuracy": 0.4793103337287903, "step": 143645 }, { "epoch": 0.14468580568288378, "grad_norm": 13.39674112459057, "learning_rate": 4.890600400432055e-05, "loss": 1.9823, "mean_token_accuracy": 0.4931034445762634, "step": 143650 }, { "epoch": 0.14469084173598792, "grad_norm": 10.63053464737138, "learning_rate": 4.8905888579821655e-05, "loss": 2.3018, "mean_token_accuracy": 0.4517241418361664, "step": 143655 }, { "epoch": 0.1446958777890921, "grad_norm": 11.041860587795977, "learning_rate": 4.890577314938576e-05, "loss": 2.4835, "mean_token_accuracy": 0.42758620977401735, "step": 143660 }, { "epoch": 0.14470091384219627, "grad_norm": 7.713130534594883, "learning_rate": 4.890565771301289e-05, "loss": 2.0691, "mean_token_accuracy": 0.5229280233383179, "step": 143665 }, { "epoch": 0.14470594989530045, "grad_norm": 12.0256046617136, "learning_rate": 4.8905542270703086e-05, "loss": 2.8295, "mean_token_accuracy": 0.4482758641242981, "step": 143670 }, { "epoch": 0.14471098594840462, "grad_norm": 11.197485021689355, "learning_rate": 4.890542682245636e-05, "loss": 2.5196, "mean_token_accuracy": 0.4034482717514038, "step": 143675 }, { "epoch": 0.1447160220015088, "grad_norm": 10.002284322971597, "learning_rate": 4.890531136827277e-05, "loss": 2.4056, "mean_token_accuracy": 0.47931034564971925, "step": 143680 }, { "epoch": 0.14472105805461297, "grad_norm": 13.844670141628352, "learning_rate": 4.890519590815234e-05, "loss": 2.8464, "mean_token_accuracy": 0.41034482717514037, "step": 143685 }, { "epoch": 0.14472609410771714, "grad_norm": 10.90476468155278, "learning_rate": 4.890508044209509e-05, "loss": 2.6711, "mean_token_accuracy": 0.39310344457626345, "step": 143690 }, { "epoch": 0.14473113016082131, "grad_norm": 7.351219742039671, "learning_rate": 4.890496497010107e-05, "loss": 2.3591, "mean_token_accuracy": 0.4413793087005615, "step": 143695 }, { "epoch": 0.1447361662139255, "grad_norm": 8.678514876001069, "learning_rate": 4.89048494921703e-05, "loss": 2.5466, "mean_token_accuracy": 0.43448275327682495, "step": 143700 }, { "epoch": 0.14474120226702966, "grad_norm": 8.508233171386774, "learning_rate": 4.890473400830282e-05, "loss": 1.9673, "mean_token_accuracy": 0.5158499598503112, "step": 143705 }, { "epoch": 0.14474623832013384, "grad_norm": 9.973779322066303, "learning_rate": 4.8904618518498644e-05, "loss": 2.104, "mean_token_accuracy": 0.49352691173553465, "step": 143710 }, { "epoch": 0.144751274373238, "grad_norm": 13.70444786587233, "learning_rate": 4.8904503022757824e-05, "loss": 2.6227, "mean_token_accuracy": 0.4172413766384125, "step": 143715 }, { "epoch": 0.14475631042634218, "grad_norm": 9.773112128442369, "learning_rate": 4.890438752108039e-05, "loss": 2.3243, "mean_token_accuracy": 0.43448275327682495, "step": 143720 }, { "epoch": 0.14476134647944636, "grad_norm": 10.901284331543764, "learning_rate": 4.890427201346636e-05, "loss": 2.3806, "mean_token_accuracy": 0.4275862157344818, "step": 143725 }, { "epoch": 0.14476638253255053, "grad_norm": 10.053272252718848, "learning_rate": 4.890415649991578e-05, "loss": 2.1494, "mean_token_accuracy": 0.4586206912994385, "step": 143730 }, { "epoch": 0.1447714185856547, "grad_norm": 8.806981634197653, "learning_rate": 4.890404098042868e-05, "loss": 2.4275, "mean_token_accuracy": 0.4551724135875702, "step": 143735 }, { "epoch": 0.14477645463875888, "grad_norm": 9.854604057972457, "learning_rate": 4.890392545500509e-05, "loss": 2.1196, "mean_token_accuracy": 0.4965517222881317, "step": 143740 }, { "epoch": 0.14478149069186305, "grad_norm": 9.044237419350468, "learning_rate": 4.890380992364503e-05, "loss": 2.2412, "mean_token_accuracy": 0.47586206793785096, "step": 143745 }, { "epoch": 0.14478652674496723, "grad_norm": 9.760579061885334, "learning_rate": 4.890369438634856e-05, "loss": 2.3011, "mean_token_accuracy": 0.41724138259887694, "step": 143750 }, { "epoch": 0.1447915627980714, "grad_norm": 9.526871137559967, "learning_rate": 4.890357884311569e-05, "loss": 2.0646, "mean_token_accuracy": 0.47586207985877993, "step": 143755 }, { "epoch": 0.14479659885117557, "grad_norm": 9.34023446067502, "learning_rate": 4.890346329394645e-05, "loss": 2.427, "mean_token_accuracy": 0.44137930274009707, "step": 143760 }, { "epoch": 0.14480163490427975, "grad_norm": 10.966220555941506, "learning_rate": 4.8903347738840885e-05, "loss": 2.6758, "mean_token_accuracy": 0.35862068831920624, "step": 143765 }, { "epoch": 0.14480667095738392, "grad_norm": 9.180049688747873, "learning_rate": 4.8903232177799016e-05, "loss": 2.6232, "mean_token_accuracy": 0.4310344815254211, "step": 143770 }, { "epoch": 0.1448117070104881, "grad_norm": 9.935572026306854, "learning_rate": 4.890311661082089e-05, "loss": 2.2243, "mean_token_accuracy": 0.4344827592372894, "step": 143775 }, { "epoch": 0.14481674306359227, "grad_norm": 10.180242047202107, "learning_rate": 4.890300103790653e-05, "loss": 2.3633, "mean_token_accuracy": 0.4620689630508423, "step": 143780 }, { "epoch": 0.14482177911669644, "grad_norm": 10.296106936531059, "learning_rate": 4.890288545905596e-05, "loss": 2.6595, "mean_token_accuracy": 0.4068965494632721, "step": 143785 }, { "epoch": 0.14482681516980062, "grad_norm": 10.871421274096532, "learning_rate": 4.890276987426923e-05, "loss": 2.3724, "mean_token_accuracy": 0.44827587008476255, "step": 143790 }, { "epoch": 0.14483185122290476, "grad_norm": 11.610330271156743, "learning_rate": 4.890265428354635e-05, "loss": 2.5323, "mean_token_accuracy": 0.3999999940395355, "step": 143795 }, { "epoch": 0.14483688727600894, "grad_norm": 9.325752887287116, "learning_rate": 4.8902538686887374e-05, "loss": 2.2223, "mean_token_accuracy": 0.4689655125141144, "step": 143800 }, { "epoch": 0.1448419233291131, "grad_norm": 10.764415738498748, "learning_rate": 4.890242308429231e-05, "loss": 2.5769, "mean_token_accuracy": 0.4103448331356049, "step": 143805 }, { "epoch": 0.14484695938221728, "grad_norm": 11.105961830874552, "learning_rate": 4.890230747576122e-05, "loss": 2.3693, "mean_token_accuracy": 0.46551724076271056, "step": 143810 }, { "epoch": 0.14485199543532146, "grad_norm": 9.875217497119914, "learning_rate": 4.890219186129412e-05, "loss": 2.1929, "mean_token_accuracy": 0.47241379618644713, "step": 143815 }, { "epoch": 0.14485703148842563, "grad_norm": 8.880986339448448, "learning_rate": 4.890207624089103e-05, "loss": 2.303, "mean_token_accuracy": 0.47931034564971925, "step": 143820 }, { "epoch": 0.1448620675415298, "grad_norm": 10.336485399460186, "learning_rate": 4.8901960614552e-05, "loss": 2.492, "mean_token_accuracy": 0.4344827651977539, "step": 143825 }, { "epoch": 0.14486710359463398, "grad_norm": 10.687870028820543, "learning_rate": 4.890184498227706e-05, "loss": 2.5015, "mean_token_accuracy": 0.358620685338974, "step": 143830 }, { "epoch": 0.14487213964773815, "grad_norm": 10.55102736755888, "learning_rate": 4.8901729344066226e-05, "loss": 2.3304, "mean_token_accuracy": 0.4068965494632721, "step": 143835 }, { "epoch": 0.14487717570084233, "grad_norm": 8.726308751882007, "learning_rate": 4.8901613699919555e-05, "loss": 2.3365, "mean_token_accuracy": 0.44827585816383364, "step": 143840 }, { "epoch": 0.1448822117539465, "grad_norm": 8.48515825576751, "learning_rate": 4.890149804983707e-05, "loss": 2.4447, "mean_token_accuracy": 0.35862069129943847, "step": 143845 }, { "epoch": 0.14488724780705067, "grad_norm": 9.840174812097512, "learning_rate": 4.8901382393818783e-05, "loss": 2.3096, "mean_token_accuracy": 0.4379310369491577, "step": 143850 }, { "epoch": 0.14489228386015485, "grad_norm": 10.298639407294521, "learning_rate": 4.890126673186476e-05, "loss": 2.3572, "mean_token_accuracy": 0.42758620977401735, "step": 143855 }, { "epoch": 0.14489731991325902, "grad_norm": 12.306937003980694, "learning_rate": 4.8901151063975e-05, "loss": 2.5047, "mean_token_accuracy": 0.3655172407627106, "step": 143860 }, { "epoch": 0.1449023559663632, "grad_norm": 8.951201298382005, "learning_rate": 4.890103539014957e-05, "loss": 2.1799, "mean_token_accuracy": 0.4745916426181793, "step": 143865 }, { "epoch": 0.14490739201946737, "grad_norm": 8.840214351787456, "learning_rate": 4.8900919710388464e-05, "loss": 2.7421, "mean_token_accuracy": 0.3620689660310745, "step": 143870 }, { "epoch": 0.14491242807257154, "grad_norm": 10.873226613590463, "learning_rate": 4.8900804024691744e-05, "loss": 2.2906, "mean_token_accuracy": 0.4034482717514038, "step": 143875 }, { "epoch": 0.14491746412567572, "grad_norm": 9.147833652217198, "learning_rate": 4.890068833305943e-05, "loss": 2.1011, "mean_token_accuracy": 0.47931034564971925, "step": 143880 }, { "epoch": 0.1449225001787799, "grad_norm": 8.761168710927961, "learning_rate": 4.890057263549155e-05, "loss": 2.6739, "mean_token_accuracy": 0.37931033968925476, "step": 143885 }, { "epoch": 0.14492753623188406, "grad_norm": 10.937823363854655, "learning_rate": 4.890045693198815e-05, "loss": 2.4387, "mean_token_accuracy": 0.39655172228813174, "step": 143890 }, { "epoch": 0.14493257228498824, "grad_norm": 9.293145269453115, "learning_rate": 4.890034122254925e-05, "loss": 2.1434, "mean_token_accuracy": 0.4862068951129913, "step": 143895 }, { "epoch": 0.1449376083380924, "grad_norm": 10.425609436580242, "learning_rate": 4.890022550717489e-05, "loss": 2.3657, "mean_token_accuracy": 0.4310344815254211, "step": 143900 }, { "epoch": 0.14494264439119658, "grad_norm": 9.38570091352509, "learning_rate": 4.890010978586509e-05, "loss": 2.411, "mean_token_accuracy": 0.41034482717514037, "step": 143905 }, { "epoch": 0.14494768044430076, "grad_norm": 8.32616852429847, "learning_rate": 4.8899994058619884e-05, "loss": 2.4168, "mean_token_accuracy": 0.39655172228813174, "step": 143910 }, { "epoch": 0.14495271649740493, "grad_norm": 9.779628102895474, "learning_rate": 4.8899878325439324e-05, "loss": 2.1989, "mean_token_accuracy": 0.4379310369491577, "step": 143915 }, { "epoch": 0.1449577525505091, "grad_norm": 10.694421346236242, "learning_rate": 4.889976258632342e-05, "loss": 2.5396, "mean_token_accuracy": 0.3793103456497192, "step": 143920 }, { "epoch": 0.14496278860361328, "grad_norm": 8.627128455322516, "learning_rate": 4.889964684127221e-05, "loss": 2.3402, "mean_token_accuracy": 0.4620689630508423, "step": 143925 }, { "epoch": 0.14496782465671745, "grad_norm": 15.435647028289457, "learning_rate": 4.889953109028573e-05, "loss": 2.7457, "mean_token_accuracy": 0.4344827592372894, "step": 143930 }, { "epoch": 0.1449728607098216, "grad_norm": 14.648393878955599, "learning_rate": 4.8899415333364015e-05, "loss": 2.563, "mean_token_accuracy": 0.42758620381355283, "step": 143935 }, { "epoch": 0.14497789676292577, "grad_norm": 10.800983892020598, "learning_rate": 4.889929957050709e-05, "loss": 2.5798, "mean_token_accuracy": 0.40344828367233276, "step": 143940 }, { "epoch": 0.14498293281602995, "grad_norm": 11.240491480234054, "learning_rate": 4.889918380171499e-05, "loss": 2.0261, "mean_token_accuracy": 0.48965516686439514, "step": 143945 }, { "epoch": 0.14498796886913412, "grad_norm": 12.04618851131278, "learning_rate": 4.889906802698775e-05, "loss": 2.5256, "mean_token_accuracy": 0.40344828367233276, "step": 143950 }, { "epoch": 0.1449930049222383, "grad_norm": 10.037136549439175, "learning_rate": 4.889895224632539e-05, "loss": 2.4872, "mean_token_accuracy": 0.4241379380226135, "step": 143955 }, { "epoch": 0.14499804097534247, "grad_norm": 10.301120866236207, "learning_rate": 4.889883645972795e-05, "loss": 2.7662, "mean_token_accuracy": 0.3876587986946106, "step": 143960 }, { "epoch": 0.14500307702844664, "grad_norm": 10.856067419009127, "learning_rate": 4.889872066719547e-05, "loss": 2.3838, "mean_token_accuracy": 0.4, "step": 143965 }, { "epoch": 0.14500811308155082, "grad_norm": 9.641385640304472, "learning_rate": 4.889860486872797e-05, "loss": 2.4189, "mean_token_accuracy": 0.4068965554237366, "step": 143970 }, { "epoch": 0.145013149134655, "grad_norm": 11.699035947852236, "learning_rate": 4.8898489064325494e-05, "loss": 2.5506, "mean_token_accuracy": 0.44827585220336913, "step": 143975 }, { "epoch": 0.14501818518775916, "grad_norm": 11.057039924185045, "learning_rate": 4.889837325398806e-05, "loss": 2.1407, "mean_token_accuracy": 0.4620689690113068, "step": 143980 }, { "epoch": 0.14502322124086334, "grad_norm": 10.45350702650619, "learning_rate": 4.889825743771571e-05, "loss": 2.2872, "mean_token_accuracy": 0.4551724135875702, "step": 143985 }, { "epoch": 0.1450282572939675, "grad_norm": 12.202902585242608, "learning_rate": 4.8898141615508476e-05, "loss": 2.8517, "mean_token_accuracy": 0.41034482717514037, "step": 143990 }, { "epoch": 0.14503329334707168, "grad_norm": 14.710312929269715, "learning_rate": 4.889802578736638e-05, "loss": 2.2663, "mean_token_accuracy": 0.4551724076271057, "step": 143995 }, { "epoch": 0.14503832940017586, "grad_norm": 12.836859931946204, "learning_rate": 4.8897909953289466e-05, "loss": 2.4801, "mean_token_accuracy": 0.3896551728248596, "step": 144000 }, { "epoch": 0.14504336545328003, "grad_norm": 9.863589158517167, "learning_rate": 4.889779411327777e-05, "loss": 2.4554, "mean_token_accuracy": 0.4034482717514038, "step": 144005 }, { "epoch": 0.1450484015063842, "grad_norm": 10.612544796577074, "learning_rate": 4.889767826733131e-05, "loss": 2.4213, "mean_token_accuracy": 0.41379310488700866, "step": 144010 }, { "epoch": 0.14505343755948838, "grad_norm": 10.389338223339522, "learning_rate": 4.889756241545012e-05, "loss": 2.5346, "mean_token_accuracy": 0.3793103456497192, "step": 144015 }, { "epoch": 0.14505847361259255, "grad_norm": 9.35086172850557, "learning_rate": 4.889744655763424e-05, "loss": 2.1159, "mean_token_accuracy": 0.4379310369491577, "step": 144020 }, { "epoch": 0.14506350966569673, "grad_norm": 13.299832752644116, "learning_rate": 4.88973306938837e-05, "loss": 2.2956, "mean_token_accuracy": 0.5, "step": 144025 }, { "epoch": 0.1450685457188009, "grad_norm": 10.985377744809222, "learning_rate": 4.889721482419852e-05, "loss": 2.6731, "mean_token_accuracy": 0.37931033968925476, "step": 144030 }, { "epoch": 0.14507358177190507, "grad_norm": 19.253025299708447, "learning_rate": 4.889709894857875e-05, "loss": 2.1133, "mean_token_accuracy": 0.42413792610168455, "step": 144035 }, { "epoch": 0.14507861782500925, "grad_norm": 13.273572952704141, "learning_rate": 4.889698306702442e-05, "loss": 3.144, "mean_token_accuracy": 0.34137930274009703, "step": 144040 }, { "epoch": 0.14508365387811342, "grad_norm": 11.204478365249356, "learning_rate": 4.889686717953556e-05, "loss": 2.5541, "mean_token_accuracy": 0.37586206793785093, "step": 144045 }, { "epoch": 0.1450886899312176, "grad_norm": 9.790070463210167, "learning_rate": 4.889675128611218e-05, "loss": 2.7196, "mean_token_accuracy": 0.4188142716884613, "step": 144050 }, { "epoch": 0.14509372598432177, "grad_norm": 9.169571866707972, "learning_rate": 4.889663538675435e-05, "loss": 2.6706, "mean_token_accuracy": 0.4, "step": 144055 }, { "epoch": 0.14509876203742594, "grad_norm": 9.731395408579983, "learning_rate": 4.889651948146207e-05, "loss": 2.4059, "mean_token_accuracy": 0.4310344815254211, "step": 144060 }, { "epoch": 0.14510379809053012, "grad_norm": 10.655534593442033, "learning_rate": 4.88964035702354e-05, "loss": 2.2562, "mean_token_accuracy": 0.4534785211086273, "step": 144065 }, { "epoch": 0.1451088341436343, "grad_norm": 9.43224057953019, "learning_rate": 4.889628765307435e-05, "loss": 2.0552, "mean_token_accuracy": 0.4931034445762634, "step": 144070 }, { "epoch": 0.14511387019673844, "grad_norm": 11.139666092470963, "learning_rate": 4.889617172997895e-05, "loss": 2.4151, "mean_token_accuracy": 0.4586206912994385, "step": 144075 }, { "epoch": 0.1451189062498426, "grad_norm": 10.036144004922507, "learning_rate": 4.889605580094925e-05, "loss": 2.6416, "mean_token_accuracy": 0.4449485898017883, "step": 144080 }, { "epoch": 0.14512394230294678, "grad_norm": 10.019564781152859, "learning_rate": 4.8895939865985276e-05, "loss": 2.4254, "mean_token_accuracy": 0.4068965494632721, "step": 144085 }, { "epoch": 0.14512897835605096, "grad_norm": 11.033667837102554, "learning_rate": 4.889582392508705e-05, "loss": 2.3985, "mean_token_accuracy": 0.44482758045196535, "step": 144090 }, { "epoch": 0.14513401440915513, "grad_norm": 12.277800521706919, "learning_rate": 4.889570797825463e-05, "loss": 2.2557, "mean_token_accuracy": 0.4344827592372894, "step": 144095 }, { "epoch": 0.1451390504622593, "grad_norm": 10.14419244234258, "learning_rate": 4.889559202548801e-05, "loss": 2.2994, "mean_token_accuracy": 0.39655172228813174, "step": 144100 }, { "epoch": 0.14514408651536348, "grad_norm": 14.973001143199081, "learning_rate": 4.889547606678726e-05, "loss": 2.7306, "mean_token_accuracy": 0.3965517163276672, "step": 144105 }, { "epoch": 0.14514912256846765, "grad_norm": 9.416034107534177, "learning_rate": 4.889536010215238e-05, "loss": 2.1685, "mean_token_accuracy": 0.4586206912994385, "step": 144110 }, { "epoch": 0.14515415862157183, "grad_norm": 11.049515826058766, "learning_rate": 4.889524413158343e-05, "loss": 2.6506, "mean_token_accuracy": 0.334482753276825, "step": 144115 }, { "epoch": 0.145159194674676, "grad_norm": 9.46568139107262, "learning_rate": 4.8895128155080425e-05, "loss": 2.3446, "mean_token_accuracy": 0.4286751329898834, "step": 144120 }, { "epoch": 0.14516423072778017, "grad_norm": 10.525009490374117, "learning_rate": 4.88950121726434e-05, "loss": 2.5606, "mean_token_accuracy": 0.4344827592372894, "step": 144125 }, { "epoch": 0.14516926678088435, "grad_norm": 12.206420934587465, "learning_rate": 4.889489618427239e-05, "loss": 2.2318, "mean_token_accuracy": 0.44482758045196535, "step": 144130 }, { "epoch": 0.14517430283398852, "grad_norm": 8.235072848658808, "learning_rate": 4.889478018996742e-05, "loss": 2.4855, "mean_token_accuracy": 0.40199636816978457, "step": 144135 }, { "epoch": 0.1451793388870927, "grad_norm": 15.38345435333593, "learning_rate": 4.889466418972854e-05, "loss": 2.3603, "mean_token_accuracy": 0.482758617401123, "step": 144140 }, { "epoch": 0.14518437494019687, "grad_norm": 10.226437540212723, "learning_rate": 4.889454818355576e-05, "loss": 2.2697, "mean_token_accuracy": 0.495099812746048, "step": 144145 }, { "epoch": 0.14518941099330104, "grad_norm": 10.255810840952867, "learning_rate": 4.889443217144913e-05, "loss": 2.3935, "mean_token_accuracy": 0.4, "step": 144150 }, { "epoch": 0.14519444704640522, "grad_norm": 9.170845774254225, "learning_rate": 4.8894316153408665e-05, "loss": 2.5126, "mean_token_accuracy": 0.4172413766384125, "step": 144155 }, { "epoch": 0.1451994830995094, "grad_norm": 13.151268072543482, "learning_rate": 4.889420012943441e-05, "loss": 2.662, "mean_token_accuracy": 0.4206896543502808, "step": 144160 }, { "epoch": 0.14520451915261356, "grad_norm": 8.869687218031247, "learning_rate": 4.889408409952639e-05, "loss": 2.1596, "mean_token_accuracy": 0.44827585816383364, "step": 144165 }, { "epoch": 0.14520955520571774, "grad_norm": 9.361657593129753, "learning_rate": 4.889396806368465e-05, "loss": 2.6107, "mean_token_accuracy": 0.4082274615764618, "step": 144170 }, { "epoch": 0.1452145912588219, "grad_norm": 11.213247843621472, "learning_rate": 4.889385202190921e-05, "loss": 2.2497, "mean_token_accuracy": 0.4413793087005615, "step": 144175 }, { "epoch": 0.14521962731192609, "grad_norm": 21.232124236132805, "learning_rate": 4.88937359742001e-05, "loss": 3.0354, "mean_token_accuracy": 0.44137930274009707, "step": 144180 }, { "epoch": 0.14522466336503026, "grad_norm": 9.486034007939356, "learning_rate": 4.8893619920557366e-05, "loss": 2.5019, "mean_token_accuracy": 0.41034482717514037, "step": 144185 }, { "epoch": 0.14522969941813443, "grad_norm": 11.67197113763365, "learning_rate": 4.889350386098103e-05, "loss": 2.3774, "mean_token_accuracy": 0.4294010937213898, "step": 144190 }, { "epoch": 0.1452347354712386, "grad_norm": 10.244797816806349, "learning_rate": 4.889338779547112e-05, "loss": 2.4288, "mean_token_accuracy": 0.4034482717514038, "step": 144195 }, { "epoch": 0.14523977152434278, "grad_norm": 9.340278528944898, "learning_rate": 4.8893271724027686e-05, "loss": 2.3846, "mean_token_accuracy": 0.40508167147636415, "step": 144200 }, { "epoch": 0.14524480757744695, "grad_norm": 9.9208010306362, "learning_rate": 4.8893155646650734e-05, "loss": 2.325, "mean_token_accuracy": 0.42068966031074523, "step": 144205 }, { "epoch": 0.14524984363055113, "grad_norm": 10.508248707737343, "learning_rate": 4.889303956334032e-05, "loss": 2.3219, "mean_token_accuracy": 0.41724138259887694, "step": 144210 }, { "epoch": 0.14525487968365527, "grad_norm": 10.969547268367997, "learning_rate": 4.889292347409646e-05, "loss": 2.5017, "mean_token_accuracy": 0.4068965494632721, "step": 144215 }, { "epoch": 0.14525991573675945, "grad_norm": 8.92339435587438, "learning_rate": 4.889280737891919e-05, "loss": 2.3111, "mean_token_accuracy": 0.42758620381355283, "step": 144220 }, { "epoch": 0.14526495178986362, "grad_norm": 12.869771875676484, "learning_rate": 4.8892691277808554e-05, "loss": 2.8821, "mean_token_accuracy": 0.4103448212146759, "step": 144225 }, { "epoch": 0.1452699878429678, "grad_norm": 9.532101150508655, "learning_rate": 4.889257517076458e-05, "loss": 2.202, "mean_token_accuracy": 0.4275861978530884, "step": 144230 }, { "epoch": 0.14527502389607197, "grad_norm": 9.401701099697474, "learning_rate": 4.889245905778728e-05, "loss": 2.6174, "mean_token_accuracy": 0.41034482717514037, "step": 144235 }, { "epoch": 0.14528005994917614, "grad_norm": 10.289291533082476, "learning_rate": 4.8892342938876704e-05, "loss": 2.3804, "mean_token_accuracy": 0.44827585816383364, "step": 144240 }, { "epoch": 0.14528509600228032, "grad_norm": 9.365251462868741, "learning_rate": 4.889222681403289e-05, "loss": 2.5318, "mean_token_accuracy": 0.41240169703960416, "step": 144245 }, { "epoch": 0.1452901320553845, "grad_norm": 14.171632730720827, "learning_rate": 4.889211068325587e-05, "loss": 2.3127, "mean_token_accuracy": 0.48275862336158754, "step": 144250 }, { "epoch": 0.14529516810848866, "grad_norm": 27.186534065406306, "learning_rate": 4.889199454654565e-05, "loss": 3.1015, "mean_token_accuracy": 0.38965516686439516, "step": 144255 }, { "epoch": 0.14530020416159284, "grad_norm": 12.761725413589406, "learning_rate": 4.8891878403902286e-05, "loss": 2.1759, "mean_token_accuracy": 0.47241379618644713, "step": 144260 }, { "epoch": 0.145305240214697, "grad_norm": 12.483021539479228, "learning_rate": 4.889176225532581e-05, "loss": 2.8166, "mean_token_accuracy": 0.3840290367603302, "step": 144265 }, { "epoch": 0.14531027626780119, "grad_norm": 8.647452654036163, "learning_rate": 4.8891646100816244e-05, "loss": 2.0301, "mean_token_accuracy": 0.4862069010734558, "step": 144270 }, { "epoch": 0.14531531232090536, "grad_norm": 8.676734337797246, "learning_rate": 4.8891529940373624e-05, "loss": 2.4508, "mean_token_accuracy": 0.44137930274009707, "step": 144275 }, { "epoch": 0.14532034837400953, "grad_norm": 10.376473478992587, "learning_rate": 4.8891413773997985e-05, "loss": 2.3951, "mean_token_accuracy": 0.404779189825058, "step": 144280 }, { "epoch": 0.1453253844271137, "grad_norm": 11.34289837909769, "learning_rate": 4.889129760168936e-05, "loss": 2.6268, "mean_token_accuracy": 0.38620689511299133, "step": 144285 }, { "epoch": 0.14533042048021788, "grad_norm": 11.995882775810296, "learning_rate": 4.889118142344778e-05, "loss": 2.4344, "mean_token_accuracy": 0.4379310250282288, "step": 144290 }, { "epoch": 0.14533545653332205, "grad_norm": 9.681560242490411, "learning_rate": 4.8891065239273277e-05, "loss": 2.5085, "mean_token_accuracy": 0.42413792610168455, "step": 144295 }, { "epoch": 0.14534049258642623, "grad_norm": 12.58500341296998, "learning_rate": 4.889094904916588e-05, "loss": 2.6026, "mean_token_accuracy": 0.41724138259887694, "step": 144300 }, { "epoch": 0.1453455286395304, "grad_norm": 11.11766631688563, "learning_rate": 4.889083285312562e-05, "loss": 2.4082, "mean_token_accuracy": 0.45517240166664125, "step": 144305 }, { "epoch": 0.14535056469263458, "grad_norm": 13.12004566989439, "learning_rate": 4.889071665115253e-05, "loss": 2.0867, "mean_token_accuracy": 0.47241379618644713, "step": 144310 }, { "epoch": 0.14535560074573875, "grad_norm": 9.25192071354174, "learning_rate": 4.889060044324665e-05, "loss": 2.2359, "mean_token_accuracy": 0.4517241299152374, "step": 144315 }, { "epoch": 0.14536063679884292, "grad_norm": 8.62132500190601, "learning_rate": 4.889048422940801e-05, "loss": 2.434, "mean_token_accuracy": 0.42413793206214906, "step": 144320 }, { "epoch": 0.1453656728519471, "grad_norm": 10.00444366000446, "learning_rate": 4.889036800963664e-05, "loss": 2.359, "mean_token_accuracy": 0.42413793206214906, "step": 144325 }, { "epoch": 0.14537070890505127, "grad_norm": 10.381742255197999, "learning_rate": 4.8890251783932565e-05, "loss": 2.7726, "mean_token_accuracy": 0.3793103516101837, "step": 144330 }, { "epoch": 0.14537574495815545, "grad_norm": 10.554536801910256, "learning_rate": 4.889013555229583e-05, "loss": 2.0251, "mean_token_accuracy": 0.4931034505367279, "step": 144335 }, { "epoch": 0.14538078101125962, "grad_norm": 10.825861935288485, "learning_rate": 4.8890019314726455e-05, "loss": 2.2847, "mean_token_accuracy": 0.4275862127542496, "step": 144340 }, { "epoch": 0.1453858170643638, "grad_norm": 10.392230735549658, "learning_rate": 4.888990307122448e-05, "loss": 2.3777, "mean_token_accuracy": 0.4, "step": 144345 }, { "epoch": 0.14539085311746797, "grad_norm": 11.413451660256577, "learning_rate": 4.888978682178994e-05, "loss": 2.4745, "mean_token_accuracy": 0.3620689570903778, "step": 144350 }, { "epoch": 0.1453958891705721, "grad_norm": 11.464395644345553, "learning_rate": 4.8889670566422856e-05, "loss": 2.1961, "mean_token_accuracy": 0.4482758641242981, "step": 144355 }, { "epoch": 0.1454009252236763, "grad_norm": 10.265047272305177, "learning_rate": 4.888955430512328e-05, "loss": 2.168, "mean_token_accuracy": 0.482758629322052, "step": 144360 }, { "epoch": 0.14540596127678046, "grad_norm": 12.192577203677107, "learning_rate": 4.8889438037891216e-05, "loss": 2.2678, "mean_token_accuracy": 0.4620689690113068, "step": 144365 }, { "epoch": 0.14541099732988463, "grad_norm": 10.837831977032371, "learning_rate": 4.888932176472672e-05, "loss": 2.2325, "mean_token_accuracy": 0.4173645317554474, "step": 144370 }, { "epoch": 0.1454160333829888, "grad_norm": 7.604900852444863, "learning_rate": 4.8889205485629806e-05, "loss": 2.3525, "mean_token_accuracy": 0.43793103098869324, "step": 144375 }, { "epoch": 0.14542106943609298, "grad_norm": 9.625856543534868, "learning_rate": 4.888908920060053e-05, "loss": 2.7043, "mean_token_accuracy": 0.43103447556495667, "step": 144380 }, { "epoch": 0.14542610548919715, "grad_norm": 12.043598692983185, "learning_rate": 4.88889729096389e-05, "loss": 2.0682, "mean_token_accuracy": 0.48275862336158754, "step": 144385 }, { "epoch": 0.14543114154230133, "grad_norm": 11.3587931051436, "learning_rate": 4.8888856612744974e-05, "loss": 2.5668, "mean_token_accuracy": 0.35862069129943847, "step": 144390 }, { "epoch": 0.1454361775954055, "grad_norm": 9.963673080854955, "learning_rate": 4.888874030991875e-05, "loss": 2.5672, "mean_token_accuracy": 0.47586206793785096, "step": 144395 }, { "epoch": 0.14544121364850968, "grad_norm": 12.708974744406197, "learning_rate": 4.888862400116028e-05, "loss": 2.2343, "mean_token_accuracy": 0.42068966031074523, "step": 144400 }, { "epoch": 0.14544624970161385, "grad_norm": 9.616563885099296, "learning_rate": 4.88885076864696e-05, "loss": 2.7389, "mean_token_accuracy": 0.36896551251411436, "step": 144405 }, { "epoch": 0.14545128575471802, "grad_norm": 9.197532718057804, "learning_rate": 4.888839136584675e-05, "loss": 2.2281, "mean_token_accuracy": 0.5068965375423431, "step": 144410 }, { "epoch": 0.1454563218078222, "grad_norm": 11.832377091990546, "learning_rate": 4.888827503929174e-05, "loss": 2.6893, "mean_token_accuracy": 0.4517241358757019, "step": 144415 }, { "epoch": 0.14546135786092637, "grad_norm": 9.884544140312132, "learning_rate": 4.888815870680461e-05, "loss": 2.376, "mean_token_accuracy": 0.42758620381355283, "step": 144420 }, { "epoch": 0.14546639391403055, "grad_norm": 11.07355858223955, "learning_rate": 4.8888042368385395e-05, "loss": 2.3433, "mean_token_accuracy": 0.4172413766384125, "step": 144425 }, { "epoch": 0.14547142996713472, "grad_norm": 10.661454421438036, "learning_rate": 4.8887926024034134e-05, "loss": 2.2484, "mean_token_accuracy": 0.39310344457626345, "step": 144430 }, { "epoch": 0.1454764660202389, "grad_norm": 10.424891466571518, "learning_rate": 4.888780967375084e-05, "loss": 2.2839, "mean_token_accuracy": 0.4655172288417816, "step": 144435 }, { "epoch": 0.14548150207334307, "grad_norm": 15.509448158895806, "learning_rate": 4.888769331753556e-05, "loss": 2.6166, "mean_token_accuracy": 0.41724138259887694, "step": 144440 }, { "epoch": 0.14548653812644724, "grad_norm": 10.743288438907072, "learning_rate": 4.888757695538833e-05, "loss": 2.4881, "mean_token_accuracy": 0.4551724135875702, "step": 144445 }, { "epoch": 0.14549157417955141, "grad_norm": 10.242524086885805, "learning_rate": 4.888746058730917e-05, "loss": 2.2962, "mean_token_accuracy": 0.42758620381355283, "step": 144450 }, { "epoch": 0.1454966102326556, "grad_norm": 11.292890722852055, "learning_rate": 4.888734421329812e-05, "loss": 2.3614, "mean_token_accuracy": 0.4448275983333588, "step": 144455 }, { "epoch": 0.14550164628575976, "grad_norm": 10.741908329471995, "learning_rate": 4.888722783335521e-05, "loss": 2.2343, "mean_token_accuracy": 0.4965517222881317, "step": 144460 }, { "epoch": 0.14550668233886394, "grad_norm": 9.362354509511386, "learning_rate": 4.888711144748047e-05, "loss": 2.4286, "mean_token_accuracy": 0.4724137902259827, "step": 144465 }, { "epoch": 0.1455117183919681, "grad_norm": 12.36193028359177, "learning_rate": 4.888699505567394e-05, "loss": 2.1978, "mean_token_accuracy": 0.47931033968925474, "step": 144470 }, { "epoch": 0.14551675444507228, "grad_norm": 10.337179575992268, "learning_rate": 4.888687865793565e-05, "loss": 2.4558, "mean_token_accuracy": 0.36896551847457887, "step": 144475 }, { "epoch": 0.14552179049817646, "grad_norm": 10.519706713246107, "learning_rate": 4.888676225426562e-05, "loss": 2.3247, "mean_token_accuracy": 0.44482759237289426, "step": 144480 }, { "epoch": 0.14552682655128063, "grad_norm": 10.951551038644812, "learning_rate": 4.8886645844663895e-05, "loss": 2.2377, "mean_token_accuracy": 0.4379310369491577, "step": 144485 }, { "epoch": 0.1455318626043848, "grad_norm": 12.437850575141454, "learning_rate": 4.8886529429130506e-05, "loss": 2.4488, "mean_token_accuracy": 0.4620689690113068, "step": 144490 }, { "epoch": 0.14553689865748895, "grad_norm": 16.517742840874035, "learning_rate": 4.888641300766549e-05, "loss": 2.2771, "mean_token_accuracy": 0.4862069010734558, "step": 144495 }, { "epoch": 0.14554193471059312, "grad_norm": 10.909007851881578, "learning_rate": 4.888629658026886e-05, "loss": 2.5681, "mean_token_accuracy": 0.4172413766384125, "step": 144500 }, { "epoch": 0.1455469707636973, "grad_norm": 10.717016654914923, "learning_rate": 4.8886180146940665e-05, "loss": 2.4762, "mean_token_accuracy": 0.44313369393348695, "step": 144505 }, { "epoch": 0.14555200681680147, "grad_norm": 10.697570069704058, "learning_rate": 4.888606370768093e-05, "loss": 2.4675, "mean_token_accuracy": 0.43448275327682495, "step": 144510 }, { "epoch": 0.14555704286990565, "grad_norm": 9.545413177348333, "learning_rate": 4.8885947262489704e-05, "loss": 2.2167, "mean_token_accuracy": 0.4344827592372894, "step": 144515 }, { "epoch": 0.14556207892300982, "grad_norm": 9.894923804973, "learning_rate": 4.8885830811366986e-05, "loss": 2.0823, "mean_token_accuracy": 0.4517241418361664, "step": 144520 }, { "epoch": 0.145567114976114, "grad_norm": 11.294093033446769, "learning_rate": 4.888571435431284e-05, "loss": 2.065, "mean_token_accuracy": 0.4896551728248596, "step": 144525 }, { "epoch": 0.14557215102921817, "grad_norm": 14.935364140106937, "learning_rate": 4.8885597891327285e-05, "loss": 2.6554, "mean_token_accuracy": 0.3931034505367279, "step": 144530 }, { "epoch": 0.14557718708232234, "grad_norm": 10.080596441750812, "learning_rate": 4.888548142241035e-05, "loss": 2.432, "mean_token_accuracy": 0.4137930989265442, "step": 144535 }, { "epoch": 0.14558222313542651, "grad_norm": 10.142654123478007, "learning_rate": 4.888536494756208e-05, "loss": 2.2946, "mean_token_accuracy": 0.4586206912994385, "step": 144540 }, { "epoch": 0.1455872591885307, "grad_norm": 9.648018681217982, "learning_rate": 4.88852484667825e-05, "loss": 2.0938, "mean_token_accuracy": 0.458620685338974, "step": 144545 }, { "epoch": 0.14559229524163486, "grad_norm": 10.32925051710693, "learning_rate": 4.888513198007164e-05, "loss": 2.2811, "mean_token_accuracy": 0.4620689690113068, "step": 144550 }, { "epoch": 0.14559733129473904, "grad_norm": 10.24128686709177, "learning_rate": 4.888501548742952e-05, "loss": 2.3273, "mean_token_accuracy": 0.4551724076271057, "step": 144555 }, { "epoch": 0.1456023673478432, "grad_norm": 10.271126002448788, "learning_rate": 4.8884898988856205e-05, "loss": 2.5657, "mean_token_accuracy": 0.41724138259887694, "step": 144560 }, { "epoch": 0.14560740340094738, "grad_norm": 10.905717947339431, "learning_rate": 4.8884782484351695e-05, "loss": 2.6367, "mean_token_accuracy": 0.3931034505367279, "step": 144565 }, { "epoch": 0.14561243945405156, "grad_norm": 8.531675511345025, "learning_rate": 4.888466597391604e-05, "loss": 2.0644, "mean_token_accuracy": 0.4931034445762634, "step": 144570 }, { "epoch": 0.14561747550715573, "grad_norm": 9.93483498824128, "learning_rate": 4.8884549457549265e-05, "loss": 2.3304, "mean_token_accuracy": 0.4379310369491577, "step": 144575 }, { "epoch": 0.1456225115602599, "grad_norm": 11.004631565552463, "learning_rate": 4.888443293525141e-05, "loss": 2.1023, "mean_token_accuracy": 0.441379314661026, "step": 144580 }, { "epoch": 0.14562754761336408, "grad_norm": 8.735859471364368, "learning_rate": 4.8884316407022505e-05, "loss": 2.4805, "mean_token_accuracy": 0.4551724076271057, "step": 144585 }, { "epoch": 0.14563258366646825, "grad_norm": 10.749132653151824, "learning_rate": 4.888419987286257e-05, "loss": 2.4813, "mean_token_accuracy": 0.4310344815254211, "step": 144590 }, { "epoch": 0.14563761971957243, "grad_norm": 9.67260251110979, "learning_rate": 4.888408333277166e-05, "loss": 2.0654, "mean_token_accuracy": 0.46206897497177124, "step": 144595 }, { "epoch": 0.1456426557726766, "grad_norm": 10.701243916762587, "learning_rate": 4.88839667867498e-05, "loss": 2.3127, "mean_token_accuracy": 0.4551724076271057, "step": 144600 }, { "epoch": 0.14564769182578077, "grad_norm": 9.96710520524778, "learning_rate": 4.8883850234797e-05, "loss": 2.6524, "mean_token_accuracy": 0.36896551847457887, "step": 144605 }, { "epoch": 0.14565272787888495, "grad_norm": 9.511383630915285, "learning_rate": 4.8883733676913316e-05, "loss": 2.3262, "mean_token_accuracy": 0.4482758641242981, "step": 144610 }, { "epoch": 0.14565776393198912, "grad_norm": 10.993620650344084, "learning_rate": 4.8883617113098774e-05, "loss": 2.1842, "mean_token_accuracy": 0.47056649923324584, "step": 144615 }, { "epoch": 0.1456627999850933, "grad_norm": 13.003419172956475, "learning_rate": 4.888350054335341e-05, "loss": 2.2209, "mean_token_accuracy": 0.4344827502965927, "step": 144620 }, { "epoch": 0.14566783603819747, "grad_norm": 8.647210635726747, "learning_rate": 4.8883383967677246e-05, "loss": 1.8081, "mean_token_accuracy": 0.5329703629016876, "step": 144625 }, { "epoch": 0.14567287209130164, "grad_norm": 12.14729915501484, "learning_rate": 4.888326738607032e-05, "loss": 2.6107, "mean_token_accuracy": 0.39310344457626345, "step": 144630 }, { "epoch": 0.1456779081444058, "grad_norm": 10.757932704802165, "learning_rate": 4.888315079853267e-05, "loss": 2.602, "mean_token_accuracy": 0.4172413796186447, "step": 144635 }, { "epoch": 0.14568294419750996, "grad_norm": 9.619088775702318, "learning_rate": 4.8883034205064324e-05, "loss": 2.5713, "mean_token_accuracy": 0.42952207922935487, "step": 144640 }, { "epoch": 0.14568798025061414, "grad_norm": 10.68771651191752, "learning_rate": 4.8882917605665304e-05, "loss": 2.4639, "mean_token_accuracy": 0.41724138259887694, "step": 144645 }, { "epoch": 0.1456930163037183, "grad_norm": 10.225864230620648, "learning_rate": 4.888280100033566e-05, "loss": 2.1057, "mean_token_accuracy": 0.4882637619972229, "step": 144650 }, { "epoch": 0.14569805235682248, "grad_norm": 8.644768773421884, "learning_rate": 4.888268438907541e-05, "loss": 2.1361, "mean_token_accuracy": 0.5034482717514038, "step": 144655 }, { "epoch": 0.14570308840992666, "grad_norm": 10.463296715261665, "learning_rate": 4.88825677718846e-05, "loss": 2.43, "mean_token_accuracy": 0.42068964838981626, "step": 144660 }, { "epoch": 0.14570812446303083, "grad_norm": 8.812853379726597, "learning_rate": 4.8882451148763256e-05, "loss": 2.3339, "mean_token_accuracy": 0.4896551728248596, "step": 144665 }, { "epoch": 0.145713160516135, "grad_norm": 11.800741116160715, "learning_rate": 4.888233451971141e-05, "loss": 2.1268, "mean_token_accuracy": 0.505626130104065, "step": 144670 }, { "epoch": 0.14571819656923918, "grad_norm": 10.695167028861896, "learning_rate": 4.8882217884729096e-05, "loss": 2.5146, "mean_token_accuracy": 0.4379310369491577, "step": 144675 }, { "epoch": 0.14572323262234335, "grad_norm": 14.041231694539492, "learning_rate": 4.8882101243816333e-05, "loss": 2.7504, "mean_token_accuracy": 0.3862069010734558, "step": 144680 }, { "epoch": 0.14572826867544753, "grad_norm": 11.26069328809716, "learning_rate": 4.888198459697317e-05, "loss": 2.2443, "mean_token_accuracy": 0.4517241299152374, "step": 144685 }, { "epoch": 0.1457333047285517, "grad_norm": 11.164173089689157, "learning_rate": 4.8881867944199634e-05, "loss": 2.4696, "mean_token_accuracy": 0.3862069010734558, "step": 144690 }, { "epoch": 0.14573834078165587, "grad_norm": 9.87908515702691, "learning_rate": 4.888175128549575e-05, "loss": 2.2186, "mean_token_accuracy": 0.44827585220336913, "step": 144695 }, { "epoch": 0.14574337683476005, "grad_norm": 9.334249511607466, "learning_rate": 4.888163462086157e-05, "loss": 2.2677, "mean_token_accuracy": 0.4206896543502808, "step": 144700 }, { "epoch": 0.14574841288786422, "grad_norm": 10.264404103126378, "learning_rate": 4.888151795029711e-05, "loss": 2.6023, "mean_token_accuracy": 0.42758620381355283, "step": 144705 }, { "epoch": 0.1457534489409684, "grad_norm": 14.628177256114146, "learning_rate": 4.88814012738024e-05, "loss": 2.8311, "mean_token_accuracy": 0.3448275804519653, "step": 144710 }, { "epoch": 0.14575848499407257, "grad_norm": 10.255341114933445, "learning_rate": 4.8881284591377484e-05, "loss": 2.4064, "mean_token_accuracy": 0.4103448212146759, "step": 144715 }, { "epoch": 0.14576352104717674, "grad_norm": 9.817811501160612, "learning_rate": 4.888116790302239e-05, "loss": 2.2874, "mean_token_accuracy": 0.4034482777118683, "step": 144720 }, { "epoch": 0.14576855710028092, "grad_norm": 14.262689213451118, "learning_rate": 4.888105120873714e-05, "loss": 2.5467, "mean_token_accuracy": 0.43793103098869324, "step": 144725 }, { "epoch": 0.1457735931533851, "grad_norm": 11.520927008923671, "learning_rate": 4.888093450852179e-05, "loss": 2.1516, "mean_token_accuracy": 0.46551724076271056, "step": 144730 }, { "epoch": 0.14577862920648926, "grad_norm": 10.016385601931507, "learning_rate": 4.888081780237635e-05, "loss": 2.1276, "mean_token_accuracy": 0.4327283680438995, "step": 144735 }, { "epoch": 0.14578366525959344, "grad_norm": 8.022945322771255, "learning_rate": 4.8880701090300865e-05, "loss": 2.4319, "mean_token_accuracy": 0.46067755222320556, "step": 144740 }, { "epoch": 0.1457887013126976, "grad_norm": 10.68885753251828, "learning_rate": 4.888058437229535e-05, "loss": 2.1898, "mean_token_accuracy": 0.44827585816383364, "step": 144745 }, { "epoch": 0.14579373736580178, "grad_norm": 16.14074096211515, "learning_rate": 4.888046764835987e-05, "loss": 2.1014, "mean_token_accuracy": 0.4586206912994385, "step": 144750 }, { "epoch": 0.14579877341890596, "grad_norm": 14.552204652856918, "learning_rate": 4.8880350918494424e-05, "loss": 2.7088, "mean_token_accuracy": 0.35862069129943847, "step": 144755 }, { "epoch": 0.14580380947201013, "grad_norm": 9.37388644784603, "learning_rate": 4.888023418269906e-05, "loss": 2.4195, "mean_token_accuracy": 0.4344827592372894, "step": 144760 }, { "epoch": 0.1458088455251143, "grad_norm": 10.50407241743583, "learning_rate": 4.88801174409738e-05, "loss": 2.7224, "mean_token_accuracy": 0.39310344457626345, "step": 144765 }, { "epoch": 0.14581388157821848, "grad_norm": 13.889496809619446, "learning_rate": 4.88800006933187e-05, "loss": 2.5856, "mean_token_accuracy": 0.39310344457626345, "step": 144770 }, { "epoch": 0.14581891763132263, "grad_norm": 10.190710151364954, "learning_rate": 4.887988393973377e-05, "loss": 2.0719, "mean_token_accuracy": 0.4586206912994385, "step": 144775 }, { "epoch": 0.1458239536844268, "grad_norm": 9.850653234987464, "learning_rate": 4.887976718021905e-05, "loss": 2.4285, "mean_token_accuracy": 0.4551724135875702, "step": 144780 }, { "epoch": 0.14582898973753097, "grad_norm": 12.893192656296447, "learning_rate": 4.887965041477457e-05, "loss": 2.3197, "mean_token_accuracy": 0.44827585220336913, "step": 144785 }, { "epoch": 0.14583402579063515, "grad_norm": 9.405295492733737, "learning_rate": 4.8879533643400374e-05, "loss": 2.3793, "mean_token_accuracy": 0.4206896543502808, "step": 144790 }, { "epoch": 0.14583906184373932, "grad_norm": 9.637295185609057, "learning_rate": 4.887941686609647e-05, "loss": 2.7076, "mean_token_accuracy": 0.40145190954208376, "step": 144795 }, { "epoch": 0.1458440978968435, "grad_norm": 16.44437324800028, "learning_rate": 4.8879300082862914e-05, "loss": 2.939, "mean_token_accuracy": 0.4517241358757019, "step": 144800 }, { "epoch": 0.14584913394994767, "grad_norm": 10.621185130230701, "learning_rate": 4.8879183293699735e-05, "loss": 2.7376, "mean_token_accuracy": 0.36206896901130675, "step": 144805 }, { "epoch": 0.14585417000305184, "grad_norm": 11.449473763842597, "learning_rate": 4.8879066498606945e-05, "loss": 2.2818, "mean_token_accuracy": 0.47241379618644713, "step": 144810 }, { "epoch": 0.14585920605615602, "grad_norm": 16.575242993507, "learning_rate": 4.88789496975846e-05, "loss": 2.5262, "mean_token_accuracy": 0.42068966031074523, "step": 144815 }, { "epoch": 0.1458642421092602, "grad_norm": 9.991273353251882, "learning_rate": 4.887883289063272e-05, "loss": 2.0831, "mean_token_accuracy": 0.4640048384666443, "step": 144820 }, { "epoch": 0.14586927816236436, "grad_norm": 12.521648153428837, "learning_rate": 4.8878716077751344e-05, "loss": 2.697, "mean_token_accuracy": 0.4172413766384125, "step": 144825 }, { "epoch": 0.14587431421546854, "grad_norm": 9.451919881371168, "learning_rate": 4.887859925894049e-05, "loss": 2.5282, "mean_token_accuracy": 0.4103448212146759, "step": 144830 }, { "epoch": 0.1458793502685727, "grad_norm": 15.12196023674418, "learning_rate": 4.8878482434200216e-05, "loss": 2.7586, "mean_token_accuracy": 0.42758620381355283, "step": 144835 }, { "epoch": 0.14588438632167688, "grad_norm": 11.906774201948894, "learning_rate": 4.887836560353053e-05, "loss": 2.7351, "mean_token_accuracy": 0.44482758045196535, "step": 144840 }, { "epoch": 0.14588942237478106, "grad_norm": 9.744268153766406, "learning_rate": 4.887824876693149e-05, "loss": 2.2401, "mean_token_accuracy": 0.41724138259887694, "step": 144845 }, { "epoch": 0.14589445842788523, "grad_norm": 9.31832115400844, "learning_rate": 4.8878131924403105e-05, "loss": 2.2482, "mean_token_accuracy": 0.4482758641242981, "step": 144850 }, { "epoch": 0.1458994944809894, "grad_norm": 12.447226022350334, "learning_rate": 4.887801507594541e-05, "loss": 2.2735, "mean_token_accuracy": 0.43793103098869324, "step": 144855 }, { "epoch": 0.14590453053409358, "grad_norm": 10.5553289273563, "learning_rate": 4.8877898221558445e-05, "loss": 2.5304, "mean_token_accuracy": 0.4, "step": 144860 }, { "epoch": 0.14590956658719775, "grad_norm": 10.332532340072381, "learning_rate": 4.887778136124224e-05, "loss": 2.3174, "mean_token_accuracy": 0.5, "step": 144865 }, { "epoch": 0.14591460264030193, "grad_norm": 11.227655717835669, "learning_rate": 4.887766449499684e-05, "loss": 2.9671, "mean_token_accuracy": 0.3655172437429428, "step": 144870 }, { "epoch": 0.1459196386934061, "grad_norm": 10.788963230355808, "learning_rate": 4.887754762282225e-05, "loss": 2.2361, "mean_token_accuracy": 0.45862069725990295, "step": 144875 }, { "epoch": 0.14592467474651027, "grad_norm": 9.199285460307284, "learning_rate": 4.887743074471852e-05, "loss": 2.3151, "mean_token_accuracy": 0.4344827592372894, "step": 144880 }, { "epoch": 0.14592971079961445, "grad_norm": 11.87326953583054, "learning_rate": 4.887731386068568e-05, "loss": 2.686, "mean_token_accuracy": 0.3931034505367279, "step": 144885 }, { "epoch": 0.14593474685271862, "grad_norm": 9.187627287430418, "learning_rate": 4.887719697072376e-05, "loss": 2.4483, "mean_token_accuracy": 0.4689655065536499, "step": 144890 }, { "epoch": 0.1459397829058228, "grad_norm": 10.03346699047246, "learning_rate": 4.8877080074832805e-05, "loss": 2.2087, "mean_token_accuracy": 0.4586206912994385, "step": 144895 }, { "epoch": 0.14594481895892697, "grad_norm": 10.606343132149872, "learning_rate": 4.887696317301283e-05, "loss": 2.5578, "mean_token_accuracy": 0.3913490653038025, "step": 144900 }, { "epoch": 0.14594985501203114, "grad_norm": 8.970144784164436, "learning_rate": 4.887684626526387e-05, "loss": 2.2966, "mean_token_accuracy": 0.42413793206214906, "step": 144905 }, { "epoch": 0.14595489106513532, "grad_norm": 10.741852925020835, "learning_rate": 4.887672935158597e-05, "loss": 2.3262, "mean_token_accuracy": 0.43793103098869324, "step": 144910 }, { "epoch": 0.14595992711823946, "grad_norm": 10.67657215363258, "learning_rate": 4.887661243197915e-05, "loss": 2.284, "mean_token_accuracy": 0.4210591077804565, "step": 144915 }, { "epoch": 0.14596496317134364, "grad_norm": 12.2909904199333, "learning_rate": 4.887649550644345e-05, "loss": 2.5811, "mean_token_accuracy": 0.4000000059604645, "step": 144920 }, { "epoch": 0.1459699992244478, "grad_norm": 10.135351060230445, "learning_rate": 4.88763785749789e-05, "loss": 2.5141, "mean_token_accuracy": 0.42413793206214906, "step": 144925 }, { "epoch": 0.14597503527755198, "grad_norm": 9.41953744451291, "learning_rate": 4.887626163758552e-05, "loss": 2.4457, "mean_token_accuracy": 0.4034482777118683, "step": 144930 }, { "epoch": 0.14598007133065616, "grad_norm": 12.382012870046914, "learning_rate": 4.887614469426337e-05, "loss": 2.4097, "mean_token_accuracy": 0.4965517222881317, "step": 144935 }, { "epoch": 0.14598510738376033, "grad_norm": 9.502633008980666, "learning_rate": 4.887602774501246e-05, "loss": 2.3572, "mean_token_accuracy": 0.45517241954803467, "step": 144940 }, { "epoch": 0.1459901434368645, "grad_norm": 10.56196352739203, "learning_rate": 4.887591078983283e-05, "loss": 2.3738, "mean_token_accuracy": 0.441379314661026, "step": 144945 }, { "epoch": 0.14599517948996868, "grad_norm": 9.595458511014769, "learning_rate": 4.8875793828724504e-05, "loss": 2.2889, "mean_token_accuracy": 0.38620689511299133, "step": 144950 }, { "epoch": 0.14600021554307285, "grad_norm": 11.25543358036961, "learning_rate": 4.887567686168753e-05, "loss": 2.4449, "mean_token_accuracy": 0.42413792610168455, "step": 144955 }, { "epoch": 0.14600525159617703, "grad_norm": 10.655611753061484, "learning_rate": 4.8875559888721935e-05, "loss": 2.6358, "mean_token_accuracy": 0.3896551787853241, "step": 144960 }, { "epoch": 0.1460102876492812, "grad_norm": 9.803499990848563, "learning_rate": 4.887544290982774e-05, "loss": 1.8953, "mean_token_accuracy": 0.5482758462429047, "step": 144965 }, { "epoch": 0.14601532370238537, "grad_norm": 12.138479923900167, "learning_rate": 4.887532592500499e-05, "loss": 1.9609, "mean_token_accuracy": 0.49999998807907103, "step": 144970 }, { "epoch": 0.14602035975548955, "grad_norm": 12.150081441851995, "learning_rate": 4.8875208934253716e-05, "loss": 2.7599, "mean_token_accuracy": 0.43793103098869324, "step": 144975 }, { "epoch": 0.14602539580859372, "grad_norm": 10.643119612542845, "learning_rate": 4.887509193757394e-05, "loss": 2.5193, "mean_token_accuracy": 0.42758620977401735, "step": 144980 }, { "epoch": 0.1460304318616979, "grad_norm": 10.235546829888639, "learning_rate": 4.8874974934965705e-05, "loss": 2.5595, "mean_token_accuracy": 0.43103447556495667, "step": 144985 }, { "epoch": 0.14603546791480207, "grad_norm": 8.764113881085224, "learning_rate": 4.887485792642905e-05, "loss": 2.547, "mean_token_accuracy": 0.4137930929660797, "step": 144990 }, { "epoch": 0.14604050396790624, "grad_norm": 9.833380073682754, "learning_rate": 4.887474091196399e-05, "loss": 2.604, "mean_token_accuracy": 0.41379310488700866, "step": 144995 }, { "epoch": 0.14604554002101042, "grad_norm": 12.703148089164713, "learning_rate": 4.887462389157057e-05, "loss": 2.3965, "mean_token_accuracy": 0.43103448748588563, "step": 145000 }, { "epoch": 0.1460505760741146, "grad_norm": 10.74192104244049, "learning_rate": 4.8874506865248815e-05, "loss": 2.6119, "mean_token_accuracy": 0.3862069010734558, "step": 145005 }, { "epoch": 0.14605561212721876, "grad_norm": 9.778281601497044, "learning_rate": 4.887438983299876e-05, "loss": 2.1423, "mean_token_accuracy": 0.5034482717514038, "step": 145010 }, { "epoch": 0.14606064818032294, "grad_norm": 10.005973435299799, "learning_rate": 4.8874272794820445e-05, "loss": 2.4001, "mean_token_accuracy": 0.4206896424293518, "step": 145015 }, { "epoch": 0.1460656842334271, "grad_norm": 8.16984249525536, "learning_rate": 4.8874155750713894e-05, "loss": 1.981, "mean_token_accuracy": 0.4482758641242981, "step": 145020 }, { "epoch": 0.14607072028653129, "grad_norm": 9.253231203084932, "learning_rate": 4.8874038700679136e-05, "loss": 2.4178, "mean_token_accuracy": 0.47241379618644713, "step": 145025 }, { "epoch": 0.14607575633963546, "grad_norm": 9.520827044277427, "learning_rate": 4.887392164471621e-05, "loss": 2.2402, "mean_token_accuracy": 0.47761645913124084, "step": 145030 }, { "epoch": 0.14608079239273963, "grad_norm": 8.605783949684087, "learning_rate": 4.887380458282515e-05, "loss": 2.2401, "mean_token_accuracy": 0.44482759237289426, "step": 145035 }, { "epoch": 0.1460858284458438, "grad_norm": 12.92517872220332, "learning_rate": 4.887368751500598e-05, "loss": 2.4023, "mean_token_accuracy": 0.42413792610168455, "step": 145040 }, { "epoch": 0.14609086449894798, "grad_norm": 11.228400260967698, "learning_rate": 4.887357044125874e-05, "loss": 2.3198, "mean_token_accuracy": 0.4379310369491577, "step": 145045 }, { "epoch": 0.14609590055205215, "grad_norm": 11.77582828047089, "learning_rate": 4.887345336158347e-05, "loss": 2.6586, "mean_token_accuracy": 0.3965517282485962, "step": 145050 }, { "epoch": 0.1461009366051563, "grad_norm": 9.257639507098217, "learning_rate": 4.8873336275980184e-05, "loss": 2.2383, "mean_token_accuracy": 0.43793103098869324, "step": 145055 }, { "epoch": 0.14610597265826047, "grad_norm": 15.881600711056246, "learning_rate": 4.887321918444893e-05, "loss": 2.9169, "mean_token_accuracy": 0.3482758581638336, "step": 145060 }, { "epoch": 0.14611100871136465, "grad_norm": 10.521979159805525, "learning_rate": 4.8873102086989725e-05, "loss": 2.0545, "mean_token_accuracy": 0.47586206197738645, "step": 145065 }, { "epoch": 0.14611604476446882, "grad_norm": 6.059787968589027, "learning_rate": 4.887298498360261e-05, "loss": 2.011, "mean_token_accuracy": 0.4782819151878357, "step": 145070 }, { "epoch": 0.146121080817573, "grad_norm": 10.82478389979442, "learning_rate": 4.887286787428762e-05, "loss": 2.254, "mean_token_accuracy": 0.4413793087005615, "step": 145075 }, { "epoch": 0.14612611687067717, "grad_norm": 12.594626774951399, "learning_rate": 4.887275075904479e-05, "loss": 2.6804, "mean_token_accuracy": 0.4034482777118683, "step": 145080 }, { "epoch": 0.14613115292378134, "grad_norm": 10.213882292348739, "learning_rate": 4.887263363787415e-05, "loss": 2.462, "mean_token_accuracy": 0.4517241418361664, "step": 145085 }, { "epoch": 0.14613618897688552, "grad_norm": 16.243449315093645, "learning_rate": 4.887251651077573e-05, "loss": 2.3108, "mean_token_accuracy": 0.43103448748588563, "step": 145090 }, { "epoch": 0.1461412250299897, "grad_norm": 9.518482863534286, "learning_rate": 4.887239937774955e-05, "loss": 2.5031, "mean_token_accuracy": 0.4068965494632721, "step": 145095 }, { "epoch": 0.14614626108309386, "grad_norm": 14.651437891454007, "learning_rate": 4.8872282238795664e-05, "loss": 2.6737, "mean_token_accuracy": 0.4137930989265442, "step": 145100 }, { "epoch": 0.14615129713619804, "grad_norm": 15.969765915014376, "learning_rate": 4.8872165093914094e-05, "loss": 2.143, "mean_token_accuracy": 0.4620689630508423, "step": 145105 }, { "epoch": 0.1461563331893022, "grad_norm": 9.623776600143229, "learning_rate": 4.887204794310488e-05, "loss": 2.5268, "mean_token_accuracy": 0.4310344815254211, "step": 145110 }, { "epoch": 0.14616136924240639, "grad_norm": 11.175351583446634, "learning_rate": 4.8871930786368045e-05, "loss": 2.288, "mean_token_accuracy": 0.4620689690113068, "step": 145115 }, { "epoch": 0.14616640529551056, "grad_norm": 12.608747568334623, "learning_rate": 4.887181362370362e-05, "loss": 2.6211, "mean_token_accuracy": 0.3944948613643646, "step": 145120 }, { "epoch": 0.14617144134861473, "grad_norm": 9.880089663522657, "learning_rate": 4.8871696455111654e-05, "loss": 2.7392, "mean_token_accuracy": 0.39310344457626345, "step": 145125 }, { "epoch": 0.1461764774017189, "grad_norm": 10.977726237919411, "learning_rate": 4.887157928059216e-05, "loss": 2.407, "mean_token_accuracy": 0.4206896424293518, "step": 145130 }, { "epoch": 0.14618151345482308, "grad_norm": 10.771524191081141, "learning_rate": 4.887146210014518e-05, "loss": 2.3254, "mean_token_accuracy": 0.4517241358757019, "step": 145135 }, { "epoch": 0.14618654950792725, "grad_norm": 12.72612007111849, "learning_rate": 4.887134491377074e-05, "loss": 2.3624, "mean_token_accuracy": 0.4379310369491577, "step": 145140 }, { "epoch": 0.14619158556103143, "grad_norm": 10.386076914076511, "learning_rate": 4.887122772146889e-05, "loss": 2.7472, "mean_token_accuracy": 0.3965517282485962, "step": 145145 }, { "epoch": 0.1461966216141356, "grad_norm": 12.509018692820693, "learning_rate": 4.8871110523239635e-05, "loss": 2.5605, "mean_token_accuracy": 0.417241370677948, "step": 145150 }, { "epoch": 0.14620165766723978, "grad_norm": 8.319202162844528, "learning_rate": 4.887099331908304e-05, "loss": 2.2244, "mean_token_accuracy": 0.4620689690113068, "step": 145155 }, { "epoch": 0.14620669372034395, "grad_norm": 9.336887161305897, "learning_rate": 4.8870876108999114e-05, "loss": 2.6074, "mean_token_accuracy": 0.417241370677948, "step": 145160 }, { "epoch": 0.14621172977344812, "grad_norm": 9.818742450836854, "learning_rate": 4.887075889298789e-05, "loss": 2.3112, "mean_token_accuracy": 0.4137930989265442, "step": 145165 }, { "epoch": 0.1462167658265523, "grad_norm": 14.405686081070389, "learning_rate": 4.887064167104941e-05, "loss": 2.2286, "mean_token_accuracy": 0.4517241299152374, "step": 145170 }, { "epoch": 0.14622180187965647, "grad_norm": 8.62197203962883, "learning_rate": 4.88705244431837e-05, "loss": 2.2324, "mean_token_accuracy": 0.4689655125141144, "step": 145175 }, { "epoch": 0.14622683793276064, "grad_norm": 10.482808664392508, "learning_rate": 4.88704072093908e-05, "loss": 2.2475, "mean_token_accuracy": 0.441379314661026, "step": 145180 }, { "epoch": 0.14623187398586482, "grad_norm": 10.305381296142238, "learning_rate": 4.887028996967074e-05, "loss": 1.9466, "mean_token_accuracy": 0.46896551847457885, "step": 145185 }, { "epoch": 0.146236910038969, "grad_norm": 9.108531661038388, "learning_rate": 4.887017272402355e-05, "loss": 2.3282, "mean_token_accuracy": 0.43448275327682495, "step": 145190 }, { "epoch": 0.14624194609207314, "grad_norm": 11.23526797433356, "learning_rate": 4.887005547244925e-05, "loss": 2.0589, "mean_token_accuracy": 0.49534180760383606, "step": 145195 }, { "epoch": 0.1462469821451773, "grad_norm": 9.714520385387342, "learning_rate": 4.8869938214947894e-05, "loss": 2.1993, "mean_token_accuracy": 0.4398064136505127, "step": 145200 }, { "epoch": 0.14625201819828149, "grad_norm": 9.586311871662996, "learning_rate": 4.8869820951519515e-05, "loss": 2.5142, "mean_token_accuracy": 0.44827585816383364, "step": 145205 }, { "epoch": 0.14625705425138566, "grad_norm": 8.767572791243037, "learning_rate": 4.886970368216412e-05, "loss": 2.3565, "mean_token_accuracy": 0.43793103098869324, "step": 145210 }, { "epoch": 0.14626209030448983, "grad_norm": 9.161731312713252, "learning_rate": 4.886958640688177e-05, "loss": 2.3168, "mean_token_accuracy": 0.4448275864124298, "step": 145215 }, { "epoch": 0.146267126357594, "grad_norm": 10.536170174216142, "learning_rate": 4.886946912567248e-05, "loss": 2.2404, "mean_token_accuracy": 0.47241379618644713, "step": 145220 }, { "epoch": 0.14627216241069818, "grad_norm": 9.092385023752758, "learning_rate": 4.8869351838536295e-05, "loss": 2.7206, "mean_token_accuracy": 0.36896550953388213, "step": 145225 }, { "epoch": 0.14627719846380235, "grad_norm": 11.271421972496517, "learning_rate": 4.886923454547323e-05, "loss": 2.2669, "mean_token_accuracy": 0.458620685338974, "step": 145230 }, { "epoch": 0.14628223451690653, "grad_norm": 9.986911204398492, "learning_rate": 4.886911724648333e-05, "loss": 2.5131, "mean_token_accuracy": 0.4000000059604645, "step": 145235 }, { "epoch": 0.1462872705700107, "grad_norm": 12.166151040017963, "learning_rate": 4.886899994156663e-05, "loss": 2.37, "mean_token_accuracy": 0.4724137902259827, "step": 145240 }, { "epoch": 0.14629230662311488, "grad_norm": 10.630490326639443, "learning_rate": 4.886888263072316e-05, "loss": 2.0936, "mean_token_accuracy": 0.5034482836723327, "step": 145245 }, { "epoch": 0.14629734267621905, "grad_norm": 11.499656341197936, "learning_rate": 4.8868765313952945e-05, "loss": 2.0978, "mean_token_accuracy": 0.4707804024219513, "step": 145250 }, { "epoch": 0.14630237872932322, "grad_norm": 11.46836326496535, "learning_rate": 4.886864799125603e-05, "loss": 2.4239, "mean_token_accuracy": 0.4, "step": 145255 }, { "epoch": 0.1463074147824274, "grad_norm": 10.375279649780268, "learning_rate": 4.886853066263244e-05, "loss": 2.7977, "mean_token_accuracy": 0.44137930274009707, "step": 145260 }, { "epoch": 0.14631245083553157, "grad_norm": 12.346465022982581, "learning_rate": 4.88684133280822e-05, "loss": 2.4361, "mean_token_accuracy": 0.4103448331356049, "step": 145265 }, { "epoch": 0.14631748688863574, "grad_norm": 8.116920205755989, "learning_rate": 4.886829598760536e-05, "loss": 2.1886, "mean_token_accuracy": 0.47586206197738645, "step": 145270 }, { "epoch": 0.14632252294173992, "grad_norm": 8.748390537776741, "learning_rate": 4.8868178641201936e-05, "loss": 2.4177, "mean_token_accuracy": 0.417241370677948, "step": 145275 }, { "epoch": 0.1463275589948441, "grad_norm": 9.844327932894803, "learning_rate": 4.886806128887197e-05, "loss": 2.3415, "mean_token_accuracy": 0.4154264986515045, "step": 145280 }, { "epoch": 0.14633259504794827, "grad_norm": 9.610179290266823, "learning_rate": 4.886794393061549e-05, "loss": 2.3172, "mean_token_accuracy": 0.42758620977401735, "step": 145285 }, { "epoch": 0.14633763110105244, "grad_norm": 9.9467020217488, "learning_rate": 4.886782656643254e-05, "loss": 2.3103, "mean_token_accuracy": 0.4471869349479675, "step": 145290 }, { "epoch": 0.1463426671541566, "grad_norm": 13.027939965778941, "learning_rate": 4.886770919632313e-05, "loss": 2.4696, "mean_token_accuracy": 0.38275861740112305, "step": 145295 }, { "epoch": 0.1463477032072608, "grad_norm": 9.822032466540874, "learning_rate": 4.886759182028732e-05, "loss": 2.6223, "mean_token_accuracy": 0.41379310488700866, "step": 145300 }, { "epoch": 0.14635273926036496, "grad_norm": 8.86115774028444, "learning_rate": 4.8867474438325125e-05, "loss": 2.1155, "mean_token_accuracy": 0.4379310369491577, "step": 145305 }, { "epoch": 0.14635777531346914, "grad_norm": 9.15420892047723, "learning_rate": 4.886735705043658e-05, "loss": 2.0348, "mean_token_accuracy": 0.4862068831920624, "step": 145310 }, { "epoch": 0.1463628113665733, "grad_norm": 13.90180387713896, "learning_rate": 4.886723965662171e-05, "loss": 2.5824, "mean_token_accuracy": 0.3999999940395355, "step": 145315 }, { "epoch": 0.14636784741967748, "grad_norm": 9.560134711230804, "learning_rate": 4.886712225688057e-05, "loss": 2.6392, "mean_token_accuracy": 0.3862068891525269, "step": 145320 }, { "epoch": 0.14637288347278166, "grad_norm": 9.257689002483183, "learning_rate": 4.886700485121318e-05, "loss": 2.3795, "mean_token_accuracy": 0.4379310369491577, "step": 145325 }, { "epoch": 0.14637791952588583, "grad_norm": 9.88636912998103, "learning_rate": 4.8866887439619555e-05, "loss": 2.5104, "mean_token_accuracy": 0.39310344457626345, "step": 145330 }, { "epoch": 0.14638295557898998, "grad_norm": 12.641085148222354, "learning_rate": 4.8866770022099756e-05, "loss": 2.6555, "mean_token_accuracy": 0.38965516686439516, "step": 145335 }, { "epoch": 0.14638799163209415, "grad_norm": 10.587093023030333, "learning_rate": 4.88666525986538e-05, "loss": 2.385, "mean_token_accuracy": 0.42413792610168455, "step": 145340 }, { "epoch": 0.14639302768519832, "grad_norm": 9.942878377791454, "learning_rate": 4.886653516928173e-05, "loss": 3.044, "mean_token_accuracy": 0.3758620619773865, "step": 145345 }, { "epoch": 0.1463980637383025, "grad_norm": 10.328591194376648, "learning_rate": 4.886641773398356e-05, "loss": 2.4966, "mean_token_accuracy": 0.4137930989265442, "step": 145350 }, { "epoch": 0.14640309979140667, "grad_norm": 11.040157081548655, "learning_rate": 4.886630029275935e-05, "loss": 2.4591, "mean_token_accuracy": 0.45716878175735476, "step": 145355 }, { "epoch": 0.14640813584451084, "grad_norm": 9.512107579349054, "learning_rate": 4.88661828456091e-05, "loss": 2.3314, "mean_token_accuracy": 0.44482759237289426, "step": 145360 }, { "epoch": 0.14641317189761502, "grad_norm": 10.014396220280643, "learning_rate": 4.886606539253287e-05, "loss": 2.3752, "mean_token_accuracy": 0.42758620977401735, "step": 145365 }, { "epoch": 0.1464182079507192, "grad_norm": 9.076690843344158, "learning_rate": 4.886594793353068e-05, "loss": 2.2153, "mean_token_accuracy": 0.4344827651977539, "step": 145370 }, { "epoch": 0.14642324400382337, "grad_norm": 10.510217708281013, "learning_rate": 4.8865830468602565e-05, "loss": 2.7783, "mean_token_accuracy": 0.334482753276825, "step": 145375 }, { "epoch": 0.14642828005692754, "grad_norm": 9.385462644882063, "learning_rate": 4.886571299774855e-05, "loss": 2.1743, "mean_token_accuracy": 0.441379314661026, "step": 145380 }, { "epoch": 0.1464333161100317, "grad_norm": 11.103320864408245, "learning_rate": 4.886559552096868e-05, "loss": 2.6643, "mean_token_accuracy": 0.42413793206214906, "step": 145385 }, { "epoch": 0.1464383521631359, "grad_norm": 10.882895083002902, "learning_rate": 4.886547803826299e-05, "loss": 2.3235, "mean_token_accuracy": 0.4903940916061401, "step": 145390 }, { "epoch": 0.14644338821624006, "grad_norm": 10.353339572409475, "learning_rate": 4.886536054963149e-05, "loss": 2.5862, "mean_token_accuracy": 0.38620689511299133, "step": 145395 }, { "epoch": 0.14644842426934424, "grad_norm": 10.53692987441267, "learning_rate": 4.886524305507424e-05, "loss": 2.4441, "mean_token_accuracy": 0.3517241358757019, "step": 145400 }, { "epoch": 0.1464534603224484, "grad_norm": 9.812351582450763, "learning_rate": 4.8865125554591254e-05, "loss": 2.2427, "mean_token_accuracy": 0.4172413766384125, "step": 145405 }, { "epoch": 0.14645849637555258, "grad_norm": 11.825106032235364, "learning_rate": 4.886500804818258e-05, "loss": 2.0443, "mean_token_accuracy": 0.4896551728248596, "step": 145410 }, { "epoch": 0.14646353242865676, "grad_norm": 11.246363289145366, "learning_rate": 4.886489053584823e-05, "loss": 2.281, "mean_token_accuracy": 0.4793103337287903, "step": 145415 }, { "epoch": 0.14646856848176093, "grad_norm": 10.221547869805566, "learning_rate": 4.886477301758825e-05, "loss": 2.2731, "mean_token_accuracy": 0.4551724135875702, "step": 145420 }, { "epoch": 0.1464736045348651, "grad_norm": 9.076963328962508, "learning_rate": 4.886465549340267e-05, "loss": 2.1997, "mean_token_accuracy": 0.43448275327682495, "step": 145425 }, { "epoch": 0.14647864058796928, "grad_norm": 10.908653531218096, "learning_rate": 4.886453796329153e-05, "loss": 2.0624, "mean_token_accuracy": 0.4551724135875702, "step": 145430 }, { "epoch": 0.14648367664107345, "grad_norm": 8.602774489269136, "learning_rate": 4.886442042725485e-05, "loss": 2.4516, "mean_token_accuracy": 0.4500302493572235, "step": 145435 }, { "epoch": 0.14648871269417763, "grad_norm": 9.171289380464234, "learning_rate": 4.886430288529267e-05, "loss": 2.4143, "mean_token_accuracy": 0.4137930989265442, "step": 145440 }, { "epoch": 0.1464937487472818, "grad_norm": 9.473873284077241, "learning_rate": 4.8864185337405015e-05, "loss": 2.3762, "mean_token_accuracy": 0.4655172348022461, "step": 145445 }, { "epoch": 0.14649878480038597, "grad_norm": 9.450784426802032, "learning_rate": 4.886406778359193e-05, "loss": 2.2168, "mean_token_accuracy": 0.49879008531570435, "step": 145450 }, { "epoch": 0.14650382085349015, "grad_norm": 10.045406209960491, "learning_rate": 4.886395022385344e-05, "loss": 2.4078, "mean_token_accuracy": 0.441379314661026, "step": 145455 }, { "epoch": 0.14650885690659432, "grad_norm": 12.911688311279304, "learning_rate": 4.8863832658189586e-05, "loss": 2.5789, "mean_token_accuracy": 0.3896551638841629, "step": 145460 }, { "epoch": 0.1465138929596985, "grad_norm": 10.831772057316453, "learning_rate": 4.886371508660038e-05, "loss": 2.6407, "mean_token_accuracy": 0.4034482717514038, "step": 145465 }, { "epoch": 0.14651892901280267, "grad_norm": 12.57706631841123, "learning_rate": 4.886359750908588e-05, "loss": 2.3528, "mean_token_accuracy": 0.4586206912994385, "step": 145470 }, { "epoch": 0.1465239650659068, "grad_norm": 14.186897043768306, "learning_rate": 4.8863479925646095e-05, "loss": 2.5375, "mean_token_accuracy": 0.4068965494632721, "step": 145475 }, { "epoch": 0.146529001119011, "grad_norm": 14.189834806165358, "learning_rate": 4.886336233628108e-05, "loss": 2.7029, "mean_token_accuracy": 0.36206896901130675, "step": 145480 }, { "epoch": 0.14653403717211516, "grad_norm": 11.351023344231312, "learning_rate": 4.886324474099085e-05, "loss": 2.2815, "mean_token_accuracy": 0.47586206793785096, "step": 145485 }, { "epoch": 0.14653907322521934, "grad_norm": 9.745486790013976, "learning_rate": 4.886312713977544e-05, "loss": 2.4366, "mean_token_accuracy": 0.441379314661026, "step": 145490 }, { "epoch": 0.1465441092783235, "grad_norm": 9.986840580878154, "learning_rate": 4.8863009532634904e-05, "loss": 2.4682, "mean_token_accuracy": 0.41542649269104004, "step": 145495 }, { "epoch": 0.14654914533142768, "grad_norm": 9.979332401274913, "learning_rate": 4.886289191956924e-05, "loss": 2.2858, "mean_token_accuracy": 0.4359951615333557, "step": 145500 }, { "epoch": 0.14655418138453186, "grad_norm": 11.922447717112854, "learning_rate": 4.886277430057851e-05, "loss": 2.6458, "mean_token_accuracy": 0.3931034505367279, "step": 145505 }, { "epoch": 0.14655921743763603, "grad_norm": 11.247768707166953, "learning_rate": 4.886265667566272e-05, "loss": 2.4043, "mean_token_accuracy": 0.42758620977401735, "step": 145510 }, { "epoch": 0.1465642534907402, "grad_norm": 11.350808532624301, "learning_rate": 4.886253904482193e-05, "loss": 2.2584, "mean_token_accuracy": 0.4413793206214905, "step": 145515 }, { "epoch": 0.14656928954384438, "grad_norm": 11.445006644942893, "learning_rate": 4.8862421408056156e-05, "loss": 2.4112, "mean_token_accuracy": 0.4601330876350403, "step": 145520 }, { "epoch": 0.14657432559694855, "grad_norm": 10.538342848766543, "learning_rate": 4.8862303765365435e-05, "loss": 2.156, "mean_token_accuracy": 0.4379310369491577, "step": 145525 }, { "epoch": 0.14657936165005273, "grad_norm": 13.66871592578065, "learning_rate": 4.88621861167498e-05, "loss": 2.6588, "mean_token_accuracy": 0.37586206793785093, "step": 145530 }, { "epoch": 0.1465843977031569, "grad_norm": 10.509664315299132, "learning_rate": 4.8862068462209284e-05, "loss": 2.2062, "mean_token_accuracy": 0.44827585816383364, "step": 145535 }, { "epoch": 0.14658943375626107, "grad_norm": 13.571823734696153, "learning_rate": 4.886195080174392e-05, "loss": 2.7454, "mean_token_accuracy": 0.3965517163276672, "step": 145540 }, { "epoch": 0.14659446980936525, "grad_norm": 9.084225900083837, "learning_rate": 4.8861833135353734e-05, "loss": 2.5542, "mean_token_accuracy": 0.4034482777118683, "step": 145545 }, { "epoch": 0.14659950586246942, "grad_norm": 10.65159439266977, "learning_rate": 4.886171546303877e-05, "loss": 2.6409, "mean_token_accuracy": 0.4068965554237366, "step": 145550 }, { "epoch": 0.1466045419155736, "grad_norm": 10.186836341788988, "learning_rate": 4.886159778479905e-05, "loss": 2.2854, "mean_token_accuracy": 0.47586206793785096, "step": 145555 }, { "epoch": 0.14660957796867777, "grad_norm": 11.683359317168403, "learning_rate": 4.886148010063461e-05, "loss": 2.2593, "mean_token_accuracy": 0.47931034564971925, "step": 145560 }, { "epoch": 0.14661461402178194, "grad_norm": 10.465072147948156, "learning_rate": 4.886136241054549e-05, "loss": 1.9789, "mean_token_accuracy": 0.5285714328289032, "step": 145565 }, { "epoch": 0.14661965007488612, "grad_norm": 10.995216407366447, "learning_rate": 4.886124471453171e-05, "loss": 2.3427, "mean_token_accuracy": 0.45862067937850953, "step": 145570 }, { "epoch": 0.1466246861279903, "grad_norm": 11.83054520627998, "learning_rate": 4.886112701259331e-05, "loss": 2.5649, "mean_token_accuracy": 0.3946158468723297, "step": 145575 }, { "epoch": 0.14662972218109446, "grad_norm": 9.973110372983491, "learning_rate": 4.8861009304730325e-05, "loss": 2.8555, "mean_token_accuracy": 0.42413792610168455, "step": 145580 }, { "epoch": 0.14663475823419864, "grad_norm": 10.28904497414514, "learning_rate": 4.886089159094278e-05, "loss": 2.4667, "mean_token_accuracy": 0.42758620977401735, "step": 145585 }, { "epoch": 0.1466397942873028, "grad_norm": 12.150565256179517, "learning_rate": 4.886077387123072e-05, "loss": 2.5299, "mean_token_accuracy": 0.42758620381355283, "step": 145590 }, { "epoch": 0.14664483034040698, "grad_norm": 9.636097363438672, "learning_rate": 4.8860656145594167e-05, "loss": 2.0032, "mean_token_accuracy": 0.5103448331356049, "step": 145595 }, { "epoch": 0.14664986639351116, "grad_norm": 11.035740832358089, "learning_rate": 4.8860538414033155e-05, "loss": 2.4901, "mean_token_accuracy": 0.42413792610168455, "step": 145600 }, { "epoch": 0.14665490244661533, "grad_norm": 11.133286192020643, "learning_rate": 4.886042067654771e-05, "loss": 2.165, "mean_token_accuracy": 0.46436781883239747, "step": 145605 }, { "epoch": 0.1466599384997195, "grad_norm": 11.052990380521088, "learning_rate": 4.886030293313788e-05, "loss": 2.3456, "mean_token_accuracy": 0.4413793087005615, "step": 145610 }, { "epoch": 0.14666497455282365, "grad_norm": 11.031521595009053, "learning_rate": 4.886018518380369e-05, "loss": 2.1885, "mean_token_accuracy": 0.43103448748588563, "step": 145615 }, { "epoch": 0.14667001060592783, "grad_norm": 10.377146410079444, "learning_rate": 4.886006742854517e-05, "loss": 2.4718, "mean_token_accuracy": 0.38275861740112305, "step": 145620 }, { "epoch": 0.146675046659032, "grad_norm": 8.90540535417392, "learning_rate": 4.885994966736236e-05, "loss": 2.8863, "mean_token_accuracy": 0.38620689511299133, "step": 145625 }, { "epoch": 0.14668008271213617, "grad_norm": 15.74599136085124, "learning_rate": 4.8859831900255294e-05, "loss": 2.3255, "mean_token_accuracy": 0.40344828367233276, "step": 145630 }, { "epoch": 0.14668511876524035, "grad_norm": 14.888125979975138, "learning_rate": 4.885971412722399e-05, "loss": 2.3624, "mean_token_accuracy": 0.4310344815254211, "step": 145635 }, { "epoch": 0.14669015481834452, "grad_norm": 9.107663751917197, "learning_rate": 4.885959634826849e-05, "loss": 2.5915, "mean_token_accuracy": 0.4586206912994385, "step": 145640 }, { "epoch": 0.1466951908714487, "grad_norm": 10.089719794649529, "learning_rate": 4.885947856338883e-05, "loss": 2.4501, "mean_token_accuracy": 0.4501512348651886, "step": 145645 }, { "epoch": 0.14670022692455287, "grad_norm": 9.851325316151765, "learning_rate": 4.885936077258504e-05, "loss": 1.8812, "mean_token_accuracy": 0.5379310250282288, "step": 145650 }, { "epoch": 0.14670526297765704, "grad_norm": 11.715360924264887, "learning_rate": 4.885924297585715e-05, "loss": 2.679, "mean_token_accuracy": 0.41899576783180237, "step": 145655 }, { "epoch": 0.14671029903076122, "grad_norm": 10.347225279342785, "learning_rate": 4.885912517320518e-05, "loss": 2.4853, "mean_token_accuracy": 0.42413793206214906, "step": 145660 }, { "epoch": 0.1467153350838654, "grad_norm": 12.14598176067242, "learning_rate": 4.88590073646292e-05, "loss": 2.7475, "mean_token_accuracy": 0.3655172407627106, "step": 145665 }, { "epoch": 0.14672037113696956, "grad_norm": 9.64587504699479, "learning_rate": 4.88588895501292e-05, "loss": 2.1623, "mean_token_accuracy": 0.44301270246505736, "step": 145670 }, { "epoch": 0.14672540719007374, "grad_norm": 11.070236644950853, "learning_rate": 4.8858771729705245e-05, "loss": 2.4456, "mean_token_accuracy": 0.4379310369491577, "step": 145675 }, { "epoch": 0.1467304432431779, "grad_norm": 10.066350406119934, "learning_rate": 4.885865390335735e-05, "loss": 2.211, "mean_token_accuracy": 0.47241380214691164, "step": 145680 }, { "epoch": 0.14673547929628208, "grad_norm": 9.692268990257364, "learning_rate": 4.885853607108556e-05, "loss": 2.2728, "mean_token_accuracy": 0.4068965494632721, "step": 145685 }, { "epoch": 0.14674051534938626, "grad_norm": 9.183747003832968, "learning_rate": 4.885841823288989e-05, "loss": 2.355, "mean_token_accuracy": 0.4103448301553726, "step": 145690 }, { "epoch": 0.14674555140249043, "grad_norm": 10.9120006558695, "learning_rate": 4.885830038877039e-05, "loss": 2.407, "mean_token_accuracy": 0.41034482717514037, "step": 145695 }, { "epoch": 0.1467505874555946, "grad_norm": 11.854291828484142, "learning_rate": 4.885818253872708e-05, "loss": 2.4213, "mean_token_accuracy": 0.47071990966796873, "step": 145700 }, { "epoch": 0.14675562350869878, "grad_norm": 9.838840960091375, "learning_rate": 4.885806468276e-05, "loss": 2.3391, "mean_token_accuracy": 0.441379314661026, "step": 145705 }, { "epoch": 0.14676065956180295, "grad_norm": 10.682206779740259, "learning_rate": 4.885794682086919e-05, "loss": 2.0728, "mean_token_accuracy": 0.5021173596382141, "step": 145710 }, { "epoch": 0.14676569561490713, "grad_norm": 12.44005279101634, "learning_rate": 4.885782895305466e-05, "loss": 2.3435, "mean_token_accuracy": 0.4137930989265442, "step": 145715 }, { "epoch": 0.1467707316680113, "grad_norm": 6.374985827492658, "learning_rate": 4.885771107931646e-05, "loss": 1.9341, "mean_token_accuracy": 0.5034482777118683, "step": 145720 }, { "epoch": 0.14677576772111547, "grad_norm": 10.600125199136711, "learning_rate": 4.885759319965463e-05, "loss": 2.2459, "mean_token_accuracy": 0.482758629322052, "step": 145725 }, { "epoch": 0.14678080377421965, "grad_norm": 9.128536649817926, "learning_rate": 4.885747531406919e-05, "loss": 2.233, "mean_token_accuracy": 0.4344827592372894, "step": 145730 }, { "epoch": 0.14678583982732382, "grad_norm": 8.463474046444865, "learning_rate": 4.885735742256016e-05, "loss": 2.2102, "mean_token_accuracy": 0.4013309180736542, "step": 145735 }, { "epoch": 0.146790875880428, "grad_norm": 12.409515188585273, "learning_rate": 4.885723952512759e-05, "loss": 2.5447, "mean_token_accuracy": 0.3896551728248596, "step": 145740 }, { "epoch": 0.14679591193353217, "grad_norm": 9.474995442155242, "learning_rate": 4.8857121621771515e-05, "loss": 2.7443, "mean_token_accuracy": 0.3931034505367279, "step": 145745 }, { "epoch": 0.14680094798663634, "grad_norm": 9.516640948538726, "learning_rate": 4.885700371249197e-05, "loss": 2.1318, "mean_token_accuracy": 0.42413793206214906, "step": 145750 }, { "epoch": 0.1468059840397405, "grad_norm": 11.653741616179131, "learning_rate": 4.885688579728897e-05, "loss": 2.5068, "mean_token_accuracy": 0.37241379022598264, "step": 145755 }, { "epoch": 0.14681102009284466, "grad_norm": 12.80646141498987, "learning_rate": 4.8856767876162565e-05, "loss": 2.2815, "mean_token_accuracy": 0.45317604541778567, "step": 145760 }, { "epoch": 0.14681605614594884, "grad_norm": 10.876172713804136, "learning_rate": 4.8856649949112774e-05, "loss": 2.5727, "mean_token_accuracy": 0.36896551251411436, "step": 145765 }, { "epoch": 0.146821092199053, "grad_norm": 12.618811538450247, "learning_rate": 4.885653201613964e-05, "loss": 2.3381, "mean_token_accuracy": 0.4068965494632721, "step": 145770 }, { "epoch": 0.14682612825215718, "grad_norm": 11.41519940743099, "learning_rate": 4.885641407724319e-05, "loss": 2.1336, "mean_token_accuracy": 0.4862069010734558, "step": 145775 }, { "epoch": 0.14683116430526136, "grad_norm": 12.008356128631757, "learning_rate": 4.885629613242346e-05, "loss": 2.907, "mean_token_accuracy": 0.3793103516101837, "step": 145780 }, { "epoch": 0.14683620035836553, "grad_norm": 11.428787394343868, "learning_rate": 4.8856178181680485e-05, "loss": 2.601, "mean_token_accuracy": 0.4310344934463501, "step": 145785 }, { "epoch": 0.1468412364114697, "grad_norm": 13.890208340176272, "learning_rate": 4.8856060225014294e-05, "loss": 2.2851, "mean_token_accuracy": 0.48965516686439514, "step": 145790 }, { "epoch": 0.14684627246457388, "grad_norm": 9.546459224088771, "learning_rate": 4.885594226242491e-05, "loss": 2.2061, "mean_token_accuracy": 0.4172413766384125, "step": 145795 }, { "epoch": 0.14685130851767805, "grad_norm": 9.459151425908313, "learning_rate": 4.885582429391239e-05, "loss": 2.4398, "mean_token_accuracy": 0.4, "step": 145800 }, { "epoch": 0.14685634457078223, "grad_norm": 9.159487169346878, "learning_rate": 4.885570631947674e-05, "loss": 2.3402, "mean_token_accuracy": 0.43793103098869324, "step": 145805 }, { "epoch": 0.1468613806238864, "grad_norm": 9.331179555651648, "learning_rate": 4.885558833911802e-05, "loss": 2.1356, "mean_token_accuracy": 0.4379310369491577, "step": 145810 }, { "epoch": 0.14686641667699057, "grad_norm": 11.26027385746331, "learning_rate": 4.8855470352836234e-05, "loss": 2.2584, "mean_token_accuracy": 0.4793103516101837, "step": 145815 }, { "epoch": 0.14687145273009475, "grad_norm": 11.14431777521666, "learning_rate": 4.885535236063144e-05, "loss": 2.3869, "mean_token_accuracy": 0.4221415638923645, "step": 145820 }, { "epoch": 0.14687648878319892, "grad_norm": 9.738854023127862, "learning_rate": 4.8855234362503646e-05, "loss": 2.2654, "mean_token_accuracy": 0.42068966031074523, "step": 145825 }, { "epoch": 0.1468815248363031, "grad_norm": 8.808123157843095, "learning_rate": 4.885511635845291e-05, "loss": 1.9636, "mean_token_accuracy": 0.5402903914451599, "step": 145830 }, { "epoch": 0.14688656088940727, "grad_norm": 13.935103884008377, "learning_rate": 4.885499834847925e-05, "loss": 2.3731, "mean_token_accuracy": 0.4379310429096222, "step": 145835 }, { "epoch": 0.14689159694251144, "grad_norm": 12.446096841380216, "learning_rate": 4.885488033258269e-05, "loss": 2.1414, "mean_token_accuracy": 0.4655172526836395, "step": 145840 }, { "epoch": 0.14689663299561562, "grad_norm": 9.523884079967656, "learning_rate": 4.8854762310763284e-05, "loss": 2.7792, "mean_token_accuracy": 0.4430127084255219, "step": 145845 }, { "epoch": 0.1469016690487198, "grad_norm": 12.15296460653672, "learning_rate": 4.8854644283021055e-05, "loss": 2.2877, "mean_token_accuracy": 0.45517241954803467, "step": 145850 }, { "epoch": 0.14690670510182396, "grad_norm": 11.280378439355024, "learning_rate": 4.885452624935604e-05, "loss": 2.4809, "mean_token_accuracy": 0.4034482717514038, "step": 145855 }, { "epoch": 0.14691174115492814, "grad_norm": 10.045969162404205, "learning_rate": 4.885440820976826e-05, "loss": 2.3391, "mean_token_accuracy": 0.4497882664203644, "step": 145860 }, { "epoch": 0.1469167772080323, "grad_norm": 10.652652667077671, "learning_rate": 4.885429016425776e-05, "loss": 2.5874, "mean_token_accuracy": 0.43248639106750486, "step": 145865 }, { "epoch": 0.14692181326113649, "grad_norm": 8.126211499967068, "learning_rate": 4.8854172112824566e-05, "loss": 2.0861, "mean_token_accuracy": 0.4551724076271057, "step": 145870 }, { "epoch": 0.14692684931424066, "grad_norm": 11.053502152155357, "learning_rate": 4.885405405546871e-05, "loss": 2.3849, "mean_token_accuracy": 0.4482758641242981, "step": 145875 }, { "epoch": 0.14693188536734483, "grad_norm": 8.014216011502475, "learning_rate": 4.8853935992190236e-05, "loss": 2.8637, "mean_token_accuracy": 0.39038113355636594, "step": 145880 }, { "epoch": 0.146936921420449, "grad_norm": 8.073890692666042, "learning_rate": 4.885381792298916e-05, "loss": 2.3527, "mean_token_accuracy": 0.43793103098869324, "step": 145885 }, { "epoch": 0.14694195747355318, "grad_norm": 10.47429765825648, "learning_rate": 4.8853699847865527e-05, "loss": 2.1991, "mean_token_accuracy": 0.4551724135875702, "step": 145890 }, { "epoch": 0.14694699352665733, "grad_norm": 9.686457165065384, "learning_rate": 4.8853581766819365e-05, "loss": 2.2451, "mean_token_accuracy": 0.4482758641242981, "step": 145895 }, { "epoch": 0.1469520295797615, "grad_norm": 9.966345895629013, "learning_rate": 4.885346367985071e-05, "loss": 2.2633, "mean_token_accuracy": 0.4620689690113068, "step": 145900 }, { "epoch": 0.14695706563286567, "grad_norm": 10.145368446867176, "learning_rate": 4.8853345586959584e-05, "loss": 2.3222, "mean_token_accuracy": 0.4344827592372894, "step": 145905 }, { "epoch": 0.14696210168596985, "grad_norm": 11.920314836844916, "learning_rate": 4.885322748814603e-05, "loss": 2.5712, "mean_token_accuracy": 0.4034482777118683, "step": 145910 }, { "epoch": 0.14696713773907402, "grad_norm": 9.37710871569525, "learning_rate": 4.8853109383410086e-05, "loss": 2.2676, "mean_token_accuracy": 0.45668481588363646, "step": 145915 }, { "epoch": 0.1469721737921782, "grad_norm": 11.302681313322084, "learning_rate": 4.885299127275177e-05, "loss": 2.3941, "mean_token_accuracy": 0.4433756828308105, "step": 145920 }, { "epoch": 0.14697720984528237, "grad_norm": 9.407473182492451, "learning_rate": 4.8852873156171124e-05, "loss": 2.3374, "mean_token_accuracy": 0.42487684488296507, "step": 145925 }, { "epoch": 0.14698224589838654, "grad_norm": 9.681881942910286, "learning_rate": 4.885275503366818e-05, "loss": 2.2165, "mean_token_accuracy": 0.4517241418361664, "step": 145930 }, { "epoch": 0.14698728195149072, "grad_norm": 9.536498020615863, "learning_rate": 4.885263690524297e-05, "loss": 2.2131, "mean_token_accuracy": 0.4655172288417816, "step": 145935 }, { "epoch": 0.1469923180045949, "grad_norm": 16.169762331169718, "learning_rate": 4.885251877089553e-05, "loss": 2.5006, "mean_token_accuracy": 0.38620689511299133, "step": 145940 }, { "epoch": 0.14699735405769906, "grad_norm": 15.228956956296214, "learning_rate": 4.8852400630625875e-05, "loss": 2.5596, "mean_token_accuracy": 0.4034482717514038, "step": 145945 }, { "epoch": 0.14700239011080324, "grad_norm": 9.122632604535788, "learning_rate": 4.885228248443407e-05, "loss": 2.2286, "mean_token_accuracy": 0.4671506345272064, "step": 145950 }, { "epoch": 0.1470074261639074, "grad_norm": 12.105011885731672, "learning_rate": 4.8852164332320124e-05, "loss": 2.9517, "mean_token_accuracy": 0.3413793116807938, "step": 145955 }, { "epoch": 0.14701246221701159, "grad_norm": 10.893302459486996, "learning_rate": 4.8852046174284075e-05, "loss": 2.3943, "mean_token_accuracy": 0.4517241358757019, "step": 145960 }, { "epoch": 0.14701749827011576, "grad_norm": 11.925568275375268, "learning_rate": 4.885192801032595e-05, "loss": 2.6216, "mean_token_accuracy": 0.37586206793785093, "step": 145965 }, { "epoch": 0.14702253432321993, "grad_norm": 13.107476373766335, "learning_rate": 4.885180984044579e-05, "loss": 2.3008, "mean_token_accuracy": 0.4361161530017853, "step": 145970 }, { "epoch": 0.1470275703763241, "grad_norm": 9.874422450074443, "learning_rate": 4.885169166464364e-05, "loss": 2.4107, "mean_token_accuracy": 0.43103448748588563, "step": 145975 }, { "epoch": 0.14703260642942828, "grad_norm": 10.381301587209133, "learning_rate": 4.88515734829195e-05, "loss": 2.2267, "mean_token_accuracy": 0.4379310369491577, "step": 145980 }, { "epoch": 0.14703764248253245, "grad_norm": 9.437146978980236, "learning_rate": 4.885145529527343e-05, "loss": 2.0833, "mean_token_accuracy": 0.4965517222881317, "step": 145985 }, { "epoch": 0.14704267853563663, "grad_norm": 11.392550468317465, "learning_rate": 4.885133710170546e-05, "loss": 2.5354, "mean_token_accuracy": 0.482758617401123, "step": 145990 }, { "epoch": 0.1470477145887408, "grad_norm": 8.982177489988265, "learning_rate": 4.8851218902215606e-05, "loss": 2.2162, "mean_token_accuracy": 0.4689655125141144, "step": 145995 }, { "epoch": 0.14705275064184498, "grad_norm": 8.216930060179326, "learning_rate": 4.885110069680391e-05, "loss": 2.0107, "mean_token_accuracy": 0.4517241299152374, "step": 146000 }, { "epoch": 0.14705778669494915, "grad_norm": 11.541375977142929, "learning_rate": 4.885098248547042e-05, "loss": 2.6444, "mean_token_accuracy": 0.3965517163276672, "step": 146005 }, { "epoch": 0.14706282274805332, "grad_norm": 9.540545112429314, "learning_rate": 4.885086426821515e-05, "loss": 2.0853, "mean_token_accuracy": 0.4551724135875702, "step": 146010 }, { "epoch": 0.1470678588011575, "grad_norm": 10.405333466756243, "learning_rate": 4.885074604503814e-05, "loss": 2.4269, "mean_token_accuracy": 0.41379310488700866, "step": 146015 }, { "epoch": 0.14707289485426167, "grad_norm": 10.129672544050004, "learning_rate": 4.8850627815939415e-05, "loss": 2.2649, "mean_token_accuracy": 0.4931034505367279, "step": 146020 }, { "epoch": 0.14707793090736584, "grad_norm": 9.660299529419826, "learning_rate": 4.8850509580919014e-05, "loss": 2.561, "mean_token_accuracy": 0.3999999940395355, "step": 146025 }, { "epoch": 0.14708296696047002, "grad_norm": 14.01725811037699, "learning_rate": 4.885039133997697e-05, "loss": 2.4122, "mean_token_accuracy": 0.441379314661026, "step": 146030 }, { "epoch": 0.14708800301357416, "grad_norm": 9.192443450348641, "learning_rate": 4.885027309311331e-05, "loss": 2.1291, "mean_token_accuracy": 0.4586206912994385, "step": 146035 }, { "epoch": 0.14709303906667834, "grad_norm": 9.589854701168147, "learning_rate": 4.8850154840328095e-05, "loss": 2.4355, "mean_token_accuracy": 0.42758620977401735, "step": 146040 }, { "epoch": 0.1470980751197825, "grad_norm": 11.26730163418241, "learning_rate": 4.885003658162132e-05, "loss": 2.5889, "mean_token_accuracy": 0.3517241358757019, "step": 146045 }, { "epoch": 0.14710311117288669, "grad_norm": 9.548168623782244, "learning_rate": 4.8849918316993025e-05, "loss": 2.7101, "mean_token_accuracy": 0.39655172228813174, "step": 146050 }, { "epoch": 0.14710814722599086, "grad_norm": 9.345142666263232, "learning_rate": 4.884980004644326e-05, "loss": 2.3462, "mean_token_accuracy": 0.42758620977401735, "step": 146055 }, { "epoch": 0.14711318327909503, "grad_norm": 10.592801790736106, "learning_rate": 4.884968176997205e-05, "loss": 2.3286, "mean_token_accuracy": 0.4379310369491577, "step": 146060 }, { "epoch": 0.1471182193321992, "grad_norm": 11.60479820521365, "learning_rate": 4.884956348757941e-05, "loss": 2.0803, "mean_token_accuracy": 0.4931034445762634, "step": 146065 }, { "epoch": 0.14712325538530338, "grad_norm": 9.607710392518346, "learning_rate": 4.8849445199265405e-05, "loss": 2.2281, "mean_token_accuracy": 0.47749546766281126, "step": 146070 }, { "epoch": 0.14712829143840755, "grad_norm": 9.143813328211191, "learning_rate": 4.884932690503004e-05, "loss": 2.2761, "mean_token_accuracy": 0.47241378426551817, "step": 146075 }, { "epoch": 0.14713332749151173, "grad_norm": 10.92389833516743, "learning_rate": 4.884920860487337e-05, "loss": 2.3137, "mean_token_accuracy": 0.4103448212146759, "step": 146080 }, { "epoch": 0.1471383635446159, "grad_norm": 10.549786870677622, "learning_rate": 4.884909029879542e-05, "loss": 2.3712, "mean_token_accuracy": 0.3965517163276672, "step": 146085 }, { "epoch": 0.14714339959772008, "grad_norm": 9.228650300356833, "learning_rate": 4.884897198679621e-05, "loss": 2.7161, "mean_token_accuracy": 0.4344827592372894, "step": 146090 }, { "epoch": 0.14714843565082425, "grad_norm": 9.961927474969645, "learning_rate": 4.8848853668875786e-05, "loss": 2.544, "mean_token_accuracy": 0.42758620977401735, "step": 146095 }, { "epoch": 0.14715347170392842, "grad_norm": 9.235307517937771, "learning_rate": 4.884873534503418e-05, "loss": 2.6298, "mean_token_accuracy": 0.37931033968925476, "step": 146100 }, { "epoch": 0.1471585077570326, "grad_norm": 12.61601264092315, "learning_rate": 4.8848617015271426e-05, "loss": 2.627, "mean_token_accuracy": 0.40689656138420105, "step": 146105 }, { "epoch": 0.14716354381013677, "grad_norm": 12.235489887095952, "learning_rate": 4.8848498679587543e-05, "loss": 2.445, "mean_token_accuracy": 0.4137930989265442, "step": 146110 }, { "epoch": 0.14716857986324094, "grad_norm": 11.280128927197522, "learning_rate": 4.884838033798259e-05, "loss": 2.5774, "mean_token_accuracy": 0.41379310488700866, "step": 146115 }, { "epoch": 0.14717361591634512, "grad_norm": 10.699662333630153, "learning_rate": 4.884826199045657e-05, "loss": 2.0909, "mean_token_accuracy": 0.42413792610168455, "step": 146120 }, { "epoch": 0.1471786519694493, "grad_norm": 12.644917554089782, "learning_rate": 4.884814363700954e-05, "loss": 2.3941, "mean_token_accuracy": 0.4137930989265442, "step": 146125 }, { "epoch": 0.14718368802255347, "grad_norm": 11.07824378501426, "learning_rate": 4.884802527764151e-05, "loss": 2.5755, "mean_token_accuracy": 0.43103448748588563, "step": 146130 }, { "epoch": 0.14718872407565764, "grad_norm": 10.704466925862716, "learning_rate": 4.884790691235253e-05, "loss": 2.2047, "mean_token_accuracy": 0.41724138259887694, "step": 146135 }, { "epoch": 0.1471937601287618, "grad_norm": 11.065507394242646, "learning_rate": 4.8847788541142635e-05, "loss": 2.3789, "mean_token_accuracy": 0.4896551728248596, "step": 146140 }, { "epoch": 0.147198796181866, "grad_norm": 11.552009885028353, "learning_rate": 4.884767016401185e-05, "loss": 2.6443, "mean_token_accuracy": 0.3931034505367279, "step": 146145 }, { "epoch": 0.14720383223497016, "grad_norm": 13.544766515680962, "learning_rate": 4.8847551780960206e-05, "loss": 2.7922, "mean_token_accuracy": 0.41379310488700866, "step": 146150 }, { "epoch": 0.14720886828807433, "grad_norm": 13.099892066700825, "learning_rate": 4.884743339198773e-05, "loss": 2.0267, "mean_token_accuracy": 0.4551724135875702, "step": 146155 }, { "epoch": 0.1472139043411785, "grad_norm": 9.914650598937762, "learning_rate": 4.884731499709448e-05, "loss": 2.6797, "mean_token_accuracy": 0.3965517282485962, "step": 146160 }, { "epoch": 0.14721894039428268, "grad_norm": 10.885627993540094, "learning_rate": 4.884719659628046e-05, "loss": 2.3055, "mean_token_accuracy": 0.4310344815254211, "step": 146165 }, { "epoch": 0.14722397644738686, "grad_norm": 10.279930308818976, "learning_rate": 4.884707818954573e-05, "loss": 2.0381, "mean_token_accuracy": 0.4862069010734558, "step": 146170 }, { "epoch": 0.147229012500491, "grad_norm": 14.802490336619384, "learning_rate": 4.8846959776890294e-05, "loss": 2.4012, "mean_token_accuracy": 0.4034482777118683, "step": 146175 }, { "epoch": 0.14723404855359518, "grad_norm": 18.691997385468763, "learning_rate": 4.88468413583142e-05, "loss": 2.9284, "mean_token_accuracy": 0.3896551698446274, "step": 146180 }, { "epoch": 0.14723908460669935, "grad_norm": 10.363947018254711, "learning_rate": 4.8846722933817485e-05, "loss": 2.0783, "mean_token_accuracy": 0.46412582993507384, "step": 146185 }, { "epoch": 0.14724412065980352, "grad_norm": 12.29520099921546, "learning_rate": 4.8846604503400176e-05, "loss": 2.4025, "mean_token_accuracy": 0.4480943739414215, "step": 146190 }, { "epoch": 0.1472491567129077, "grad_norm": 10.836766934614507, "learning_rate": 4.884648606706231e-05, "loss": 2.3418, "mean_token_accuracy": 0.4206896543502808, "step": 146195 }, { "epoch": 0.14725419276601187, "grad_norm": 9.770898605575736, "learning_rate": 4.884636762480391e-05, "loss": 2.5394, "mean_token_accuracy": 0.3999999940395355, "step": 146200 }, { "epoch": 0.14725922881911604, "grad_norm": 10.929832378382041, "learning_rate": 4.884624917662502e-05, "loss": 2.2056, "mean_token_accuracy": 0.38275861740112305, "step": 146205 }, { "epoch": 0.14726426487222022, "grad_norm": 10.414347683380713, "learning_rate": 4.8846130722525665e-05, "loss": 2.3572, "mean_token_accuracy": 0.42413792610168455, "step": 146210 }, { "epoch": 0.1472693009253244, "grad_norm": 12.836375186931496, "learning_rate": 4.8846012262505884e-05, "loss": 2.5569, "mean_token_accuracy": 0.4448275864124298, "step": 146215 }, { "epoch": 0.14727433697842857, "grad_norm": 11.862338603483455, "learning_rate": 4.884589379656571e-05, "loss": 2.1429, "mean_token_accuracy": 0.4918330252170563, "step": 146220 }, { "epoch": 0.14727937303153274, "grad_norm": 14.677319642046886, "learning_rate": 4.8845775324705165e-05, "loss": 2.5143, "mean_token_accuracy": 0.39655172228813174, "step": 146225 }, { "epoch": 0.1472844090846369, "grad_norm": 12.325516327666973, "learning_rate": 4.8845656846924295e-05, "loss": 2.2689, "mean_token_accuracy": 0.42413793206214906, "step": 146230 }, { "epoch": 0.1472894451377411, "grad_norm": 11.34140828625138, "learning_rate": 4.884553836322312e-05, "loss": 2.2706, "mean_token_accuracy": 0.47586206793785096, "step": 146235 }, { "epoch": 0.14729448119084526, "grad_norm": 10.034739788075274, "learning_rate": 4.884541987360169e-05, "loss": 2.2461, "mean_token_accuracy": 0.44827585816383364, "step": 146240 }, { "epoch": 0.14729951724394943, "grad_norm": 8.317131215685407, "learning_rate": 4.884530137806002e-05, "loss": 1.7991, "mean_token_accuracy": 0.5400483965873718, "step": 146245 }, { "epoch": 0.1473045532970536, "grad_norm": 9.722296539234446, "learning_rate": 4.884518287659815e-05, "loss": 2.7371, "mean_token_accuracy": 0.3586206793785095, "step": 146250 }, { "epoch": 0.14730958935015778, "grad_norm": 9.472312115737532, "learning_rate": 4.884506436921612e-05, "loss": 2.1124, "mean_token_accuracy": 0.4620689630508423, "step": 146255 }, { "epoch": 0.14731462540326196, "grad_norm": 10.0180877102542, "learning_rate": 4.884494585591395e-05, "loss": 2.2207, "mean_token_accuracy": 0.46551724076271056, "step": 146260 }, { "epoch": 0.14731966145636613, "grad_norm": 13.030258134888165, "learning_rate": 4.884482733669169e-05, "loss": 2.2696, "mean_token_accuracy": 0.4332123279571533, "step": 146265 }, { "epoch": 0.1473246975094703, "grad_norm": 11.729951151696431, "learning_rate": 4.884470881154935e-05, "loss": 2.8376, "mean_token_accuracy": 0.39310344457626345, "step": 146270 }, { "epoch": 0.14732973356257448, "grad_norm": 12.155701166412321, "learning_rate": 4.884459028048699e-05, "loss": 2.3954, "mean_token_accuracy": 0.44827585220336913, "step": 146275 }, { "epoch": 0.14733476961567865, "grad_norm": 8.762977626177326, "learning_rate": 4.884447174350462e-05, "loss": 2.0821, "mean_token_accuracy": 0.4862068951129913, "step": 146280 }, { "epoch": 0.14733980566878283, "grad_norm": 8.71140092842227, "learning_rate": 4.8844353200602285e-05, "loss": 2.2939, "mean_token_accuracy": 0.4448275864124298, "step": 146285 }, { "epoch": 0.147344841721887, "grad_norm": 10.090694337174211, "learning_rate": 4.884423465178001e-05, "loss": 2.4566, "mean_token_accuracy": 0.458620685338974, "step": 146290 }, { "epoch": 0.14734987777499117, "grad_norm": 11.481517029902541, "learning_rate": 4.8844116097037835e-05, "loss": 2.285, "mean_token_accuracy": 0.42758620381355283, "step": 146295 }, { "epoch": 0.14735491382809535, "grad_norm": 10.278153610245294, "learning_rate": 4.884399753637579e-05, "loss": 2.1368, "mean_token_accuracy": 0.41379311084747317, "step": 146300 }, { "epoch": 0.14735994988119952, "grad_norm": 10.90852096163775, "learning_rate": 4.88438789697939e-05, "loss": 2.3939, "mean_token_accuracy": 0.49183303117752075, "step": 146305 }, { "epoch": 0.1473649859343037, "grad_norm": 10.705909895860405, "learning_rate": 4.884376039729222e-05, "loss": 2.4323, "mean_token_accuracy": 0.42068966031074523, "step": 146310 }, { "epoch": 0.14737002198740784, "grad_norm": 9.015219956092839, "learning_rate": 4.884364181887075e-05, "loss": 2.2666, "mean_token_accuracy": 0.4517241418361664, "step": 146315 }, { "epoch": 0.147375058040512, "grad_norm": 11.08313096632233, "learning_rate": 4.884352323452955e-05, "loss": 2.5189, "mean_token_accuracy": 0.4068965494632721, "step": 146320 }, { "epoch": 0.1473800940936162, "grad_norm": 10.33400878703134, "learning_rate": 4.884340464426865e-05, "loss": 2.3309, "mean_token_accuracy": 0.4517241418361664, "step": 146325 }, { "epoch": 0.14738513014672036, "grad_norm": 12.154108546770908, "learning_rate": 4.884328604808807e-05, "loss": 2.332, "mean_token_accuracy": 0.4448275864124298, "step": 146330 }, { "epoch": 0.14739016619982453, "grad_norm": 8.830048582363991, "learning_rate": 4.8843167445987854e-05, "loss": 2.7378, "mean_token_accuracy": 0.3896551728248596, "step": 146335 }, { "epoch": 0.1473952022529287, "grad_norm": 9.68747061073109, "learning_rate": 4.8843048837968025e-05, "loss": 2.5257, "mean_token_accuracy": 0.4103448331356049, "step": 146340 }, { "epoch": 0.14740023830603288, "grad_norm": 10.260987762594217, "learning_rate": 4.884293022402863e-05, "loss": 2.1342, "mean_token_accuracy": 0.4448275864124298, "step": 146345 }, { "epoch": 0.14740527435913706, "grad_norm": 11.132578723949782, "learning_rate": 4.884281160416969e-05, "loss": 2.2275, "mean_token_accuracy": 0.44568965435028074, "step": 146350 }, { "epoch": 0.14741031041224123, "grad_norm": 10.54870335403427, "learning_rate": 4.884269297839124e-05, "loss": 2.3216, "mean_token_accuracy": 0.4551724135875702, "step": 146355 }, { "epoch": 0.1474153464653454, "grad_norm": 11.433128773766368, "learning_rate": 4.884257434669332e-05, "loss": 2.5821, "mean_token_accuracy": 0.4103448212146759, "step": 146360 }, { "epoch": 0.14742038251844958, "grad_norm": 10.30787475567201, "learning_rate": 4.884245570907595e-05, "loss": 2.2467, "mean_token_accuracy": 0.44827587008476255, "step": 146365 }, { "epoch": 0.14742541857155375, "grad_norm": 10.083902013749096, "learning_rate": 4.884233706553917e-05, "loss": 2.4386, "mean_token_accuracy": 0.41034482717514037, "step": 146370 }, { "epoch": 0.14743045462465793, "grad_norm": 8.56388368351227, "learning_rate": 4.8842218416083015e-05, "loss": 2.3002, "mean_token_accuracy": 0.4537205100059509, "step": 146375 }, { "epoch": 0.1474354906777621, "grad_norm": 10.110050960454688, "learning_rate": 4.884209976070752e-05, "loss": 2.4973, "mean_token_accuracy": 0.4294010877609253, "step": 146380 }, { "epoch": 0.14744052673086627, "grad_norm": 8.274268190874139, "learning_rate": 4.884198109941271e-05, "loss": 2.2145, "mean_token_accuracy": 0.46551724076271056, "step": 146385 }, { "epoch": 0.14744556278397045, "grad_norm": 8.71122633345135, "learning_rate": 4.884186243219862e-05, "loss": 2.6268, "mean_token_accuracy": 0.460556560754776, "step": 146390 }, { "epoch": 0.14745059883707462, "grad_norm": 8.68519901573476, "learning_rate": 4.884174375906529e-05, "loss": 2.2366, "mean_token_accuracy": 0.46896552443504336, "step": 146395 }, { "epoch": 0.1474556348901788, "grad_norm": 12.495651136904074, "learning_rate": 4.884162508001274e-05, "loss": 2.2491, "mean_token_accuracy": 0.4327283799648285, "step": 146400 }, { "epoch": 0.14746067094328297, "grad_norm": 9.91038335120461, "learning_rate": 4.884150639504103e-05, "loss": 2.2684, "mean_token_accuracy": 0.441379314661026, "step": 146405 }, { "epoch": 0.14746570699638714, "grad_norm": 9.188514823770724, "learning_rate": 4.884138770415015e-05, "loss": 2.1695, "mean_token_accuracy": 0.5206896483898162, "step": 146410 }, { "epoch": 0.14747074304949132, "grad_norm": 9.237549188900097, "learning_rate": 4.884126900734017e-05, "loss": 2.0063, "mean_token_accuracy": 0.49655172824859617, "step": 146415 }, { "epoch": 0.1474757791025955, "grad_norm": 10.956473396832555, "learning_rate": 4.8841150304611106e-05, "loss": 2.1587, "mean_token_accuracy": 0.49522080421447756, "step": 146420 }, { "epoch": 0.14748081515569966, "grad_norm": 10.815094748530132, "learning_rate": 4.8841031595963e-05, "loss": 2.2209, "mean_token_accuracy": 0.4206896543502808, "step": 146425 }, { "epoch": 0.14748585120880384, "grad_norm": 9.17025455778591, "learning_rate": 4.884091288139587e-05, "loss": 2.2416, "mean_token_accuracy": 0.4724137902259827, "step": 146430 }, { "epoch": 0.147490887261908, "grad_norm": 9.225252261706242, "learning_rate": 4.8840794160909757e-05, "loss": 2.2493, "mean_token_accuracy": 0.458620685338974, "step": 146435 }, { "epoch": 0.14749592331501218, "grad_norm": 10.240961357357124, "learning_rate": 4.88406754345047e-05, "loss": 2.4335, "mean_token_accuracy": 0.44978825449943544, "step": 146440 }, { "epoch": 0.14750095936811636, "grad_norm": 11.787256924838278, "learning_rate": 4.8840556702180726e-05, "loss": 2.1962, "mean_token_accuracy": 0.5091349065303803, "step": 146445 }, { "epoch": 0.14750599542122053, "grad_norm": 10.038616450124481, "learning_rate": 4.8840437963937875e-05, "loss": 2.3103, "mean_token_accuracy": 0.43448275327682495, "step": 146450 }, { "epoch": 0.14751103147432468, "grad_norm": 5.7273493980233185, "learning_rate": 4.884031921977617e-05, "loss": 1.8326, "mean_token_accuracy": 0.5257053256034852, "step": 146455 }, { "epoch": 0.14751606752742885, "grad_norm": 10.79087451912435, "learning_rate": 4.8840200469695646e-05, "loss": 2.2156, "mean_token_accuracy": 0.48275862336158754, "step": 146460 }, { "epoch": 0.14752110358053303, "grad_norm": 11.136187144581514, "learning_rate": 4.8840081713696336e-05, "loss": 2.3738, "mean_token_accuracy": 0.4344827651977539, "step": 146465 }, { "epoch": 0.1475261396336372, "grad_norm": 9.369930552973205, "learning_rate": 4.883996295177828e-05, "loss": 2.2108, "mean_token_accuracy": 0.4137930989265442, "step": 146470 }, { "epoch": 0.14753117568674137, "grad_norm": 11.28572459734448, "learning_rate": 4.8839844183941505e-05, "loss": 2.467, "mean_token_accuracy": 0.44482757449150084, "step": 146475 }, { "epoch": 0.14753621173984555, "grad_norm": 17.614949550329765, "learning_rate": 4.883972541018604e-05, "loss": 2.8033, "mean_token_accuracy": 0.3517241358757019, "step": 146480 }, { "epoch": 0.14754124779294972, "grad_norm": 10.475821291730258, "learning_rate": 4.883960663051193e-05, "loss": 2.7628, "mean_token_accuracy": 0.41724138259887694, "step": 146485 }, { "epoch": 0.1475462838460539, "grad_norm": 9.951961565298, "learning_rate": 4.883948784491919e-05, "loss": 2.4026, "mean_token_accuracy": 0.44482759237289426, "step": 146490 }, { "epoch": 0.14755131989915807, "grad_norm": 14.591603339776848, "learning_rate": 4.8839369053407874e-05, "loss": 2.4288, "mean_token_accuracy": 0.37241379022598264, "step": 146495 }, { "epoch": 0.14755635595226224, "grad_norm": 10.326592833186215, "learning_rate": 4.8839250255978e-05, "loss": 2.3553, "mean_token_accuracy": 0.4310344815254211, "step": 146500 }, { "epoch": 0.14756139200536642, "grad_norm": 10.85624867571091, "learning_rate": 4.88391314526296e-05, "loss": 2.7313, "mean_token_accuracy": 0.36896551847457887, "step": 146505 }, { "epoch": 0.1475664280584706, "grad_norm": 10.579138715553771, "learning_rate": 4.883901264336272e-05, "loss": 2.402, "mean_token_accuracy": 0.4379310369491577, "step": 146510 }, { "epoch": 0.14757146411157476, "grad_norm": 8.566260735839993, "learning_rate": 4.8838893828177376e-05, "loss": 2.2108, "mean_token_accuracy": 0.47586206793785096, "step": 146515 }, { "epoch": 0.14757650016467894, "grad_norm": 12.388594840787238, "learning_rate": 4.883877500707362e-05, "loss": 2.6501, "mean_token_accuracy": 0.4137930929660797, "step": 146520 }, { "epoch": 0.1475815362177831, "grad_norm": 9.676457560637665, "learning_rate": 4.883865618005147e-05, "loss": 2.3913, "mean_token_accuracy": 0.48275862336158754, "step": 146525 }, { "epoch": 0.14758657227088728, "grad_norm": 9.493631459136834, "learning_rate": 4.883853734711097e-05, "loss": 2.4776, "mean_token_accuracy": 0.39655172228813174, "step": 146530 }, { "epoch": 0.14759160832399146, "grad_norm": 11.880314317638344, "learning_rate": 4.8838418508252144e-05, "loss": 2.4365, "mean_token_accuracy": 0.44137929677963256, "step": 146535 }, { "epoch": 0.14759664437709563, "grad_norm": 10.421708421766668, "learning_rate": 4.8838299663475026e-05, "loss": 2.1948, "mean_token_accuracy": 0.47586206793785096, "step": 146540 }, { "epoch": 0.1476016804301998, "grad_norm": 10.640010068441317, "learning_rate": 4.8838180812779655e-05, "loss": 2.3917, "mean_token_accuracy": 0.4068965494632721, "step": 146545 }, { "epoch": 0.14760671648330398, "grad_norm": 12.38843265282111, "learning_rate": 4.883806195616605e-05, "loss": 2.7118, "mean_token_accuracy": 0.341379314661026, "step": 146550 }, { "epoch": 0.14761175253640815, "grad_norm": 10.730498109941115, "learning_rate": 4.883794309363426e-05, "loss": 2.767, "mean_token_accuracy": 0.3620689630508423, "step": 146555 }, { "epoch": 0.14761678858951233, "grad_norm": 9.462429444011796, "learning_rate": 4.883782422518431e-05, "loss": 2.1872, "mean_token_accuracy": 0.44972777366638184, "step": 146560 }, { "epoch": 0.1476218246426165, "grad_norm": 10.733736294996554, "learning_rate": 4.883770535081624e-05, "loss": 2.2791, "mean_token_accuracy": 0.4620689690113068, "step": 146565 }, { "epoch": 0.14762686069572067, "grad_norm": 10.149472125156075, "learning_rate": 4.883758647053008e-05, "loss": 2.2839, "mean_token_accuracy": 0.4620689690113068, "step": 146570 }, { "epoch": 0.14763189674882485, "grad_norm": 11.807214800314915, "learning_rate": 4.883746758432586e-05, "loss": 1.9094, "mean_token_accuracy": 0.4984271049499512, "step": 146575 }, { "epoch": 0.14763693280192902, "grad_norm": 9.609306155537865, "learning_rate": 4.8837348692203604e-05, "loss": 2.3248, "mean_token_accuracy": 0.4517241299152374, "step": 146580 }, { "epoch": 0.1476419688550332, "grad_norm": 10.27693372920149, "learning_rate": 4.883722979416336e-05, "loss": 2.5637, "mean_token_accuracy": 0.41179673075675965, "step": 146585 }, { "epoch": 0.14764700490813734, "grad_norm": 11.134791560753781, "learning_rate": 4.8837110890205154e-05, "loss": 2.7382, "mean_token_accuracy": 0.3896551728248596, "step": 146590 }, { "epoch": 0.14765204096124152, "grad_norm": 9.8899887104632, "learning_rate": 4.883699198032903e-05, "loss": 2.5688, "mean_token_accuracy": 0.43448275327682495, "step": 146595 }, { "epoch": 0.1476570770143457, "grad_norm": 9.55192809333942, "learning_rate": 4.8836873064535e-05, "loss": 2.6242, "mean_token_accuracy": 0.35977011919021606, "step": 146600 }, { "epoch": 0.14766211306744986, "grad_norm": 13.373182946888855, "learning_rate": 4.883675414282312e-05, "loss": 2.8299, "mean_token_accuracy": 0.4344827651977539, "step": 146605 }, { "epoch": 0.14766714912055404, "grad_norm": 9.577822422674908, "learning_rate": 4.88366352151934e-05, "loss": 2.5368, "mean_token_accuracy": 0.42758620381355283, "step": 146610 }, { "epoch": 0.1476721851736582, "grad_norm": 11.56112300602276, "learning_rate": 4.883651628164589e-05, "loss": 2.5942, "mean_token_accuracy": 0.41379310488700866, "step": 146615 }, { "epoch": 0.14767722122676238, "grad_norm": 10.63430778923677, "learning_rate": 4.883639734218062e-05, "loss": 2.4569, "mean_token_accuracy": 0.4068965494632721, "step": 146620 }, { "epoch": 0.14768225727986656, "grad_norm": 9.847355802926, "learning_rate": 4.883627839679761e-05, "loss": 2.0555, "mean_token_accuracy": 0.47586206793785096, "step": 146625 }, { "epoch": 0.14768729333297073, "grad_norm": 10.602198340302646, "learning_rate": 4.883615944549692e-05, "loss": 2.2781, "mean_token_accuracy": 0.46551724076271056, "step": 146630 }, { "epoch": 0.1476923293860749, "grad_norm": 9.208961841963491, "learning_rate": 4.8836040488278554e-05, "loss": 2.1805, "mean_token_accuracy": 0.5021173596382141, "step": 146635 }, { "epoch": 0.14769736543917908, "grad_norm": 9.608708423595962, "learning_rate": 4.883592152514256e-05, "loss": 2.5949, "mean_token_accuracy": 0.3862068921327591, "step": 146640 }, { "epoch": 0.14770240149228325, "grad_norm": 27.001126321356463, "learning_rate": 4.883580255608897e-05, "loss": 2.4959, "mean_token_accuracy": 0.4206896543502808, "step": 146645 }, { "epoch": 0.14770743754538743, "grad_norm": 11.383972108827463, "learning_rate": 4.883568358111781e-05, "loss": 2.2764, "mean_token_accuracy": 0.45517241954803467, "step": 146650 }, { "epoch": 0.1477124735984916, "grad_norm": 9.810501025851796, "learning_rate": 4.8835564600229134e-05, "loss": 2.2377, "mean_token_accuracy": 0.5, "step": 146655 }, { "epoch": 0.14771750965159577, "grad_norm": 13.543985645032281, "learning_rate": 4.883544561342295e-05, "loss": 2.1642, "mean_token_accuracy": 0.45172414779663084, "step": 146660 }, { "epoch": 0.14772254570469995, "grad_norm": 10.319474986798374, "learning_rate": 4.88353266206993e-05, "loss": 2.5935, "mean_token_accuracy": 0.37586206793785093, "step": 146665 }, { "epoch": 0.14772758175780412, "grad_norm": 10.089699163416679, "learning_rate": 4.883520762205822e-05, "loss": 2.2842, "mean_token_accuracy": 0.4586206912994385, "step": 146670 }, { "epoch": 0.1477326178109083, "grad_norm": 8.722409595277039, "learning_rate": 4.883508861749973e-05, "loss": 2.2236, "mean_token_accuracy": 0.4379310369491577, "step": 146675 }, { "epoch": 0.14773765386401247, "grad_norm": 9.33123581133712, "learning_rate": 4.883496960702389e-05, "loss": 2.1117, "mean_token_accuracy": 0.4448275864124298, "step": 146680 }, { "epoch": 0.14774268991711664, "grad_norm": 8.994285269822763, "learning_rate": 4.883485059063071e-05, "loss": 2.3298, "mean_token_accuracy": 0.4137930989265442, "step": 146685 }, { "epoch": 0.14774772597022082, "grad_norm": 9.849933635004636, "learning_rate": 4.8834731568320225e-05, "loss": 2.2012, "mean_token_accuracy": 0.4620689630508423, "step": 146690 }, { "epoch": 0.147752762023325, "grad_norm": 11.920533190500235, "learning_rate": 4.883461254009248e-05, "loss": 1.9521, "mean_token_accuracy": 0.5227465212345124, "step": 146695 }, { "epoch": 0.14775779807642916, "grad_norm": 8.508531091560226, "learning_rate": 4.883449350594749e-05, "loss": 2.1337, "mean_token_accuracy": 0.46551724076271056, "step": 146700 }, { "epoch": 0.14776283412953334, "grad_norm": 11.148575961078116, "learning_rate": 4.883437446588531e-05, "loss": 2.1781, "mean_token_accuracy": 0.4172413766384125, "step": 146705 }, { "epoch": 0.1477678701826375, "grad_norm": 11.104648109231968, "learning_rate": 4.8834255419905956e-05, "loss": 2.4606, "mean_token_accuracy": 0.41724138259887694, "step": 146710 }, { "epoch": 0.14777290623574169, "grad_norm": 10.183326127010416, "learning_rate": 4.883413636800946e-05, "loss": 2.2684, "mean_token_accuracy": 0.4137930989265442, "step": 146715 }, { "epoch": 0.14777794228884586, "grad_norm": 10.157512761626103, "learning_rate": 4.883401731019587e-05, "loss": 2.4628, "mean_token_accuracy": 0.41379311084747317, "step": 146720 }, { "epoch": 0.14778297834195003, "grad_norm": 12.331485137323705, "learning_rate": 4.883389824646522e-05, "loss": 2.2155, "mean_token_accuracy": 0.4620689690113068, "step": 146725 }, { "epoch": 0.14778801439505418, "grad_norm": 8.691006152056522, "learning_rate": 4.883377917681752e-05, "loss": 2.2292, "mean_token_accuracy": 0.44827585816383364, "step": 146730 }, { "epoch": 0.14779305044815835, "grad_norm": 9.33166202423798, "learning_rate": 4.883366010125282e-05, "loss": 1.9815, "mean_token_accuracy": 0.5129461526870728, "step": 146735 }, { "epoch": 0.14779808650126253, "grad_norm": 8.82905311199546, "learning_rate": 4.883354101977115e-05, "loss": 2.1317, "mean_token_accuracy": 0.47931033968925474, "step": 146740 }, { "epoch": 0.1478031225543667, "grad_norm": 8.156492362756103, "learning_rate": 4.883342193237254e-05, "loss": 2.0015, "mean_token_accuracy": 0.4896551728248596, "step": 146745 }, { "epoch": 0.14780815860747087, "grad_norm": 8.553112541562346, "learning_rate": 4.883330283905703e-05, "loss": 2.6608, "mean_token_accuracy": 0.38620689511299133, "step": 146750 }, { "epoch": 0.14781319466057505, "grad_norm": 10.045827060021612, "learning_rate": 4.883318373982464e-05, "loss": 2.3827, "mean_token_accuracy": 0.38620689511299133, "step": 146755 }, { "epoch": 0.14781823071367922, "grad_norm": 11.670589655730081, "learning_rate": 4.883306463467543e-05, "loss": 2.1676, "mean_token_accuracy": 0.4602216809988022, "step": 146760 }, { "epoch": 0.1478232667667834, "grad_norm": 9.917873117249139, "learning_rate": 4.88329455236094e-05, "loss": 2.4888, "mean_token_accuracy": 0.42758620977401735, "step": 146765 }, { "epoch": 0.14782830281988757, "grad_norm": 11.94839686292455, "learning_rate": 4.8832826406626605e-05, "loss": 2.4104, "mean_token_accuracy": 0.42758620381355283, "step": 146770 }, { "epoch": 0.14783333887299174, "grad_norm": 9.446728488471768, "learning_rate": 4.883270728372707e-05, "loss": 2.7662, "mean_token_accuracy": 0.3842710226774216, "step": 146775 }, { "epoch": 0.14783837492609592, "grad_norm": 11.060741567921166, "learning_rate": 4.883258815491083e-05, "loss": 2.4951, "mean_token_accuracy": 0.41379310488700866, "step": 146780 }, { "epoch": 0.1478434109792001, "grad_norm": 9.97709318867642, "learning_rate": 4.883246902017791e-05, "loss": 2.3365, "mean_token_accuracy": 0.417241370677948, "step": 146785 }, { "epoch": 0.14784844703230426, "grad_norm": 7.888987031662764, "learning_rate": 4.8832349879528354e-05, "loss": 2.3099, "mean_token_accuracy": 0.48275861144065857, "step": 146790 }, { "epoch": 0.14785348308540844, "grad_norm": 10.081654616337394, "learning_rate": 4.88322307329622e-05, "loss": 2.1395, "mean_token_accuracy": 0.4671506345272064, "step": 146795 }, { "epoch": 0.1478585191385126, "grad_norm": 9.190143550317845, "learning_rate": 4.8832111580479464e-05, "loss": 2.6588, "mean_token_accuracy": 0.45015124082565305, "step": 146800 }, { "epoch": 0.14786355519161679, "grad_norm": 11.854260742078802, "learning_rate": 4.883199242208018e-05, "loss": 2.3756, "mean_token_accuracy": 0.46551724076271056, "step": 146805 }, { "epoch": 0.14786859124472096, "grad_norm": 9.971496348896805, "learning_rate": 4.88318732577644e-05, "loss": 2.4286, "mean_token_accuracy": 0.4068965494632721, "step": 146810 }, { "epoch": 0.14787362729782513, "grad_norm": 13.351636449077631, "learning_rate": 4.883175408753214e-05, "loss": 2.3931, "mean_token_accuracy": 0.4011494219303131, "step": 146815 }, { "epoch": 0.1478786633509293, "grad_norm": 13.151232499274933, "learning_rate": 4.883163491138343e-05, "loss": 2.3639, "mean_token_accuracy": 0.4620689630508423, "step": 146820 }, { "epoch": 0.14788369940403348, "grad_norm": 10.853120985157206, "learning_rate": 4.883151572931833e-05, "loss": 2.5652, "mean_token_accuracy": 0.3862069010734558, "step": 146825 }, { "epoch": 0.14788873545713765, "grad_norm": 10.810063094173854, "learning_rate": 4.883139654133684e-05, "loss": 2.6214, "mean_token_accuracy": 0.4137930989265442, "step": 146830 }, { "epoch": 0.14789377151024183, "grad_norm": 11.68174636967568, "learning_rate": 4.883127734743902e-05, "loss": 2.4171, "mean_token_accuracy": 0.4448275864124298, "step": 146835 }, { "epoch": 0.147898807563346, "grad_norm": 10.452539981283827, "learning_rate": 4.8831158147624886e-05, "loss": 2.2912, "mean_token_accuracy": 0.43793103098869324, "step": 146840 }, { "epoch": 0.14790384361645018, "grad_norm": 11.457516000585034, "learning_rate": 4.883103894189447e-05, "loss": 2.5971, "mean_token_accuracy": 0.4318965435028076, "step": 146845 }, { "epoch": 0.14790887966955435, "grad_norm": 10.724010091830898, "learning_rate": 4.883091973024782e-05, "loss": 2.1616, "mean_token_accuracy": 0.4517241358757019, "step": 146850 }, { "epoch": 0.14791391572265852, "grad_norm": 14.631399793116891, "learning_rate": 4.883080051268495e-05, "loss": 2.3651, "mean_token_accuracy": 0.42413793206214906, "step": 146855 }, { "epoch": 0.1479189517757627, "grad_norm": 7.779416434873946, "learning_rate": 4.88306812892059e-05, "loss": 1.8556, "mean_token_accuracy": 0.5, "step": 146860 }, { "epoch": 0.14792398782886687, "grad_norm": 12.32688825140249, "learning_rate": 4.883056205981072e-05, "loss": 2.5505, "mean_token_accuracy": 0.4052026569843292, "step": 146865 }, { "epoch": 0.14792902388197102, "grad_norm": 10.818373112727803, "learning_rate": 4.8830442824499425e-05, "loss": 2.2349, "mean_token_accuracy": 0.41724138259887694, "step": 146870 }, { "epoch": 0.1479340599350752, "grad_norm": 19.448318614458486, "learning_rate": 4.8830323583272046e-05, "loss": 2.4397, "mean_token_accuracy": 0.3896551728248596, "step": 146875 }, { "epoch": 0.14793909598817936, "grad_norm": 8.917478137375324, "learning_rate": 4.883020433612863e-05, "loss": 2.3465, "mean_token_accuracy": 0.4379310429096222, "step": 146880 }, { "epoch": 0.14794413204128354, "grad_norm": 10.562110699761524, "learning_rate": 4.8830085083069196e-05, "loss": 2.3972, "mean_token_accuracy": 0.4310344815254211, "step": 146885 }, { "epoch": 0.1479491680943877, "grad_norm": 12.2661494371973, "learning_rate": 4.882996582409379e-05, "loss": 2.232, "mean_token_accuracy": 0.4137930989265442, "step": 146890 }, { "epoch": 0.14795420414749189, "grad_norm": 10.656975402327985, "learning_rate": 4.882984655920243e-05, "loss": 2.2246, "mean_token_accuracy": 0.4379310250282288, "step": 146895 }, { "epoch": 0.14795924020059606, "grad_norm": 9.811381690529911, "learning_rate": 4.8829727288395154e-05, "loss": 2.355, "mean_token_accuracy": 0.4482758641242981, "step": 146900 }, { "epoch": 0.14796427625370023, "grad_norm": 10.749886293706474, "learning_rate": 4.8829608011672005e-05, "loss": 2.6515, "mean_token_accuracy": 0.4068965554237366, "step": 146905 }, { "epoch": 0.1479693123068044, "grad_norm": 10.051160330449358, "learning_rate": 4.8829488729033016e-05, "loss": 2.1394, "mean_token_accuracy": 0.48620688915252686, "step": 146910 }, { "epoch": 0.14797434835990858, "grad_norm": 9.063474302065995, "learning_rate": 4.8829369440478206e-05, "loss": 2.207, "mean_token_accuracy": 0.4517241358757019, "step": 146915 }, { "epoch": 0.14797938441301275, "grad_norm": 14.468489261088424, "learning_rate": 4.8829250146007616e-05, "loss": 2.6316, "mean_token_accuracy": 0.4517241418361664, "step": 146920 }, { "epoch": 0.14798442046611693, "grad_norm": 9.873052513519497, "learning_rate": 4.882913084562129e-05, "loss": 2.1174, "mean_token_accuracy": 0.48275861144065857, "step": 146925 }, { "epoch": 0.1479894565192211, "grad_norm": 11.33735986014298, "learning_rate": 4.882901153931923e-05, "loss": 2.1657, "mean_token_accuracy": 0.4896551728248596, "step": 146930 }, { "epoch": 0.14799449257232528, "grad_norm": 13.303743717214939, "learning_rate": 4.8828892227101494e-05, "loss": 2.3075, "mean_token_accuracy": 0.4586206912994385, "step": 146935 }, { "epoch": 0.14799952862542945, "grad_norm": 11.751778294002708, "learning_rate": 4.8828772908968124e-05, "loss": 2.5029, "mean_token_accuracy": 0.4068965554237366, "step": 146940 }, { "epoch": 0.14800456467853362, "grad_norm": 13.795411416776913, "learning_rate": 4.882865358491913e-05, "loss": 2.4626, "mean_token_accuracy": 0.43448275327682495, "step": 146945 }, { "epoch": 0.1480096007316378, "grad_norm": 11.153811359712817, "learning_rate": 4.8828534254954554e-05, "loss": 2.5096, "mean_token_accuracy": 0.3793103456497192, "step": 146950 }, { "epoch": 0.14801463678474197, "grad_norm": 10.91217027596112, "learning_rate": 4.882841491907444e-05, "loss": 2.2654, "mean_token_accuracy": 0.4465819776058197, "step": 146955 }, { "epoch": 0.14801967283784614, "grad_norm": 12.347349310664852, "learning_rate": 4.882829557727879e-05, "loss": 2.3912, "mean_token_accuracy": 0.4344827592372894, "step": 146960 }, { "epoch": 0.14802470889095032, "grad_norm": 12.046115658967606, "learning_rate": 4.8828176229567676e-05, "loss": 2.347, "mean_token_accuracy": 0.4068965554237366, "step": 146965 }, { "epoch": 0.1480297449440545, "grad_norm": 8.633677325080502, "learning_rate": 4.8828056875941105e-05, "loss": 2.1154, "mean_token_accuracy": 0.46551724076271056, "step": 146970 }, { "epoch": 0.14803478099715867, "grad_norm": 9.70258487075281, "learning_rate": 4.882793751639912e-05, "loss": 2.3861, "mean_token_accuracy": 0.4206896543502808, "step": 146975 }, { "epoch": 0.14803981705026284, "grad_norm": 9.357207390730874, "learning_rate": 4.882781815094175e-05, "loss": 1.975, "mean_token_accuracy": 0.4931034505367279, "step": 146980 }, { "epoch": 0.148044853103367, "grad_norm": 10.092193929985303, "learning_rate": 4.882769877956903e-05, "loss": 2.3329, "mean_token_accuracy": 0.4758620738983154, "step": 146985 }, { "epoch": 0.1480498891564712, "grad_norm": 9.337911435620743, "learning_rate": 4.8827579402280994e-05, "loss": 2.4896, "mean_token_accuracy": 0.42068966031074523, "step": 146990 }, { "epoch": 0.14805492520957536, "grad_norm": 13.956530833603086, "learning_rate": 4.882746001907767e-05, "loss": 2.4379, "mean_token_accuracy": 0.4379310369491577, "step": 146995 }, { "epoch": 0.14805996126267953, "grad_norm": 9.694196148159097, "learning_rate": 4.88273406299591e-05, "loss": 2.0896, "mean_token_accuracy": 0.4886267364025116, "step": 147000 }, { "epoch": 0.1480649973157837, "grad_norm": 10.14038209671101, "learning_rate": 4.8827221234925316e-05, "loss": 2.3088, "mean_token_accuracy": 0.4310344815254211, "step": 147005 }, { "epoch": 0.14807003336888785, "grad_norm": 8.47962060171844, "learning_rate": 4.882710183397634e-05, "loss": 2.231, "mean_token_accuracy": 0.4724137902259827, "step": 147010 }, { "epoch": 0.14807506942199203, "grad_norm": 9.375338667017155, "learning_rate": 4.882698242711222e-05, "loss": 3.0113, "mean_token_accuracy": 0.4310344934463501, "step": 147015 }, { "epoch": 0.1480801054750962, "grad_norm": 13.394190069173844, "learning_rate": 4.882686301433298e-05, "loss": 2.5559, "mean_token_accuracy": 0.4482758641242981, "step": 147020 }, { "epoch": 0.14808514152820038, "grad_norm": 10.340004248478332, "learning_rate": 4.882674359563865e-05, "loss": 2.5679, "mean_token_accuracy": 0.4172413766384125, "step": 147025 }, { "epoch": 0.14809017758130455, "grad_norm": 10.805249286955643, "learning_rate": 4.882662417102928e-05, "loss": 2.6067, "mean_token_accuracy": 0.4172413766384125, "step": 147030 }, { "epoch": 0.14809521363440872, "grad_norm": 10.958884524553648, "learning_rate": 4.8826504740504875e-05, "loss": 2.4093, "mean_token_accuracy": 0.42413792610168455, "step": 147035 }, { "epoch": 0.1481002496875129, "grad_norm": 10.55947653101469, "learning_rate": 4.882638530406549e-05, "loss": 2.6572, "mean_token_accuracy": 0.3965517163276672, "step": 147040 }, { "epoch": 0.14810528574061707, "grad_norm": 8.80993214292461, "learning_rate": 4.882626586171116e-05, "loss": 2.1816, "mean_token_accuracy": 0.4620689630508423, "step": 147045 }, { "epoch": 0.14811032179372124, "grad_norm": 8.988300597427417, "learning_rate": 4.88261464134419e-05, "loss": 2.1558, "mean_token_accuracy": 0.46896551847457885, "step": 147050 }, { "epoch": 0.14811535784682542, "grad_norm": 11.325289023431656, "learning_rate": 4.882602695925776e-05, "loss": 2.6253, "mean_token_accuracy": 0.3724137872457504, "step": 147055 }, { "epoch": 0.1481203938999296, "grad_norm": 11.427566943905516, "learning_rate": 4.882590749915877e-05, "loss": 2.0728, "mean_token_accuracy": 0.47931034564971925, "step": 147060 }, { "epoch": 0.14812542995303377, "grad_norm": 9.773811031514485, "learning_rate": 4.882578803314496e-05, "loss": 2.5847, "mean_token_accuracy": 0.4068965494632721, "step": 147065 }, { "epoch": 0.14813046600613794, "grad_norm": 9.22098351003182, "learning_rate": 4.882566856121636e-05, "loss": 2.4532, "mean_token_accuracy": 0.4620689570903778, "step": 147070 }, { "epoch": 0.1481355020592421, "grad_norm": 10.794101241930523, "learning_rate": 4.8825549083372995e-05, "loss": 2.2344, "mean_token_accuracy": 0.48965516686439514, "step": 147075 }, { "epoch": 0.1481405381123463, "grad_norm": 11.782254365490648, "learning_rate": 4.8825429599614916e-05, "loss": 2.1795, "mean_token_accuracy": 0.458620685338974, "step": 147080 }, { "epoch": 0.14814557416545046, "grad_norm": 12.516758655723082, "learning_rate": 4.882531010994216e-05, "loss": 2.5111, "mean_token_accuracy": 0.43793103098869324, "step": 147085 }, { "epoch": 0.14815061021855463, "grad_norm": 9.292017853443491, "learning_rate": 4.8825190614354746e-05, "loss": 2.2597, "mean_token_accuracy": 0.43793103098869324, "step": 147090 }, { "epoch": 0.1481556462716588, "grad_norm": 11.525909744506324, "learning_rate": 4.88250711128527e-05, "loss": 2.7411, "mean_token_accuracy": 0.3793103516101837, "step": 147095 }, { "epoch": 0.14816068232476298, "grad_norm": 8.434245892908029, "learning_rate": 4.8824951605436085e-05, "loss": 2.376, "mean_token_accuracy": 0.3896551728248596, "step": 147100 }, { "epoch": 0.14816571837786716, "grad_norm": 11.081474491162624, "learning_rate": 4.88248320921049e-05, "loss": 2.3704, "mean_token_accuracy": 0.37586206793785093, "step": 147105 }, { "epoch": 0.14817075443097133, "grad_norm": 11.118349859095188, "learning_rate": 4.88247125728592e-05, "loss": 2.2949, "mean_token_accuracy": 0.4137930989265442, "step": 147110 }, { "epoch": 0.1481757904840755, "grad_norm": 17.346127663866344, "learning_rate": 4.8824593047699e-05, "loss": 2.5886, "mean_token_accuracy": 0.441379314661026, "step": 147115 }, { "epoch": 0.14818082653717968, "grad_norm": 11.41250087747657, "learning_rate": 4.8824473516624354e-05, "loss": 2.5483, "mean_token_accuracy": 0.4413793087005615, "step": 147120 }, { "epoch": 0.14818586259028385, "grad_norm": 10.469391016498895, "learning_rate": 4.882435397963528e-05, "loss": 2.5192, "mean_token_accuracy": 0.37241379618644715, "step": 147125 }, { "epoch": 0.14819089864338802, "grad_norm": 10.485920367909888, "learning_rate": 4.882423443673183e-05, "loss": 2.2307, "mean_token_accuracy": 0.4551724076271057, "step": 147130 }, { "epoch": 0.1481959346964922, "grad_norm": 12.344392980457553, "learning_rate": 4.8824114887914014e-05, "loss": 2.5895, "mean_token_accuracy": 0.4068965494632721, "step": 147135 }, { "epoch": 0.14820097074959637, "grad_norm": 9.396068885652161, "learning_rate": 4.8823995333181885e-05, "loss": 2.2102, "mean_token_accuracy": 0.44996975660324096, "step": 147140 }, { "epoch": 0.14820600680270055, "grad_norm": 9.656948972456169, "learning_rate": 4.882387577253545e-05, "loss": 2.5612, "mean_token_accuracy": 0.4137930989265442, "step": 147145 }, { "epoch": 0.1482110428558047, "grad_norm": 9.092816251556293, "learning_rate": 4.882375620597476e-05, "loss": 2.1762, "mean_token_accuracy": 0.4551724135875702, "step": 147150 }, { "epoch": 0.14821607890890887, "grad_norm": 9.671967396975852, "learning_rate": 4.882363663349986e-05, "loss": 2.1739, "mean_token_accuracy": 0.44482758045196535, "step": 147155 }, { "epoch": 0.14822111496201304, "grad_norm": 9.750344720450771, "learning_rate": 4.882351705511076e-05, "loss": 2.3524, "mean_token_accuracy": 0.42068966031074523, "step": 147160 }, { "epoch": 0.1482261510151172, "grad_norm": 9.83591948492884, "learning_rate": 4.882339747080751e-05, "loss": 2.6674, "mean_token_accuracy": 0.39655173420906065, "step": 147165 }, { "epoch": 0.1482311870682214, "grad_norm": 7.771731488091805, "learning_rate": 4.882327788059014e-05, "loss": 2.1133, "mean_token_accuracy": 0.4862068951129913, "step": 147170 }, { "epoch": 0.14823622312132556, "grad_norm": 9.491716710061436, "learning_rate": 4.8823158284458664e-05, "loss": 2.3641, "mean_token_accuracy": 0.4586206912994385, "step": 147175 }, { "epoch": 0.14824125917442973, "grad_norm": 10.11588444594482, "learning_rate": 4.882303868241314e-05, "loss": 1.9879, "mean_token_accuracy": 0.4862068951129913, "step": 147180 }, { "epoch": 0.1482462952275339, "grad_norm": 8.953449011958325, "learning_rate": 4.882291907445359e-05, "loss": 1.87, "mean_token_accuracy": 0.5436176657676697, "step": 147185 }, { "epoch": 0.14825133128063808, "grad_norm": 12.349147779226927, "learning_rate": 4.882279946058004e-05, "loss": 2.3262, "mean_token_accuracy": 0.4379310369491577, "step": 147190 }, { "epoch": 0.14825636733374226, "grad_norm": 9.809502896048327, "learning_rate": 4.882267984079255e-05, "loss": 2.1363, "mean_token_accuracy": 0.4448275864124298, "step": 147195 }, { "epoch": 0.14826140338684643, "grad_norm": 9.520689599131728, "learning_rate": 4.882256021509112e-05, "loss": 2.4987, "mean_token_accuracy": 0.42758620977401735, "step": 147200 }, { "epoch": 0.1482664394399506, "grad_norm": 10.98789466847955, "learning_rate": 4.8822440583475804e-05, "loss": 2.3156, "mean_token_accuracy": 0.3793103456497192, "step": 147205 }, { "epoch": 0.14827147549305478, "grad_norm": 7.906718049284113, "learning_rate": 4.882232094594663e-05, "loss": 2.284, "mean_token_accuracy": 0.4275861978530884, "step": 147210 }, { "epoch": 0.14827651154615895, "grad_norm": 10.278386647021943, "learning_rate": 4.882220130250363e-05, "loss": 2.2462, "mean_token_accuracy": 0.4842710256576538, "step": 147215 }, { "epoch": 0.14828154759926312, "grad_norm": 11.365946559692356, "learning_rate": 4.8822081653146844e-05, "loss": 2.3849, "mean_token_accuracy": 0.4551724135875702, "step": 147220 }, { "epoch": 0.1482865836523673, "grad_norm": 11.38413584827024, "learning_rate": 4.882196199787629e-05, "loss": 3.0469, "mean_token_accuracy": 0.34827586114406583, "step": 147225 }, { "epoch": 0.14829161970547147, "grad_norm": 9.04667339586641, "learning_rate": 4.8821842336692014e-05, "loss": 2.7558, "mean_token_accuracy": 0.38620689511299133, "step": 147230 }, { "epoch": 0.14829665575857565, "grad_norm": 11.260133601294315, "learning_rate": 4.882172266959405e-05, "loss": 2.2808, "mean_token_accuracy": 0.4275862157344818, "step": 147235 }, { "epoch": 0.14830169181167982, "grad_norm": 10.54253472660178, "learning_rate": 4.8821602996582414e-05, "loss": 3.0929, "mean_token_accuracy": 0.3379310369491577, "step": 147240 }, { "epoch": 0.148306727864784, "grad_norm": 10.29737561459594, "learning_rate": 4.882148331765717e-05, "loss": 2.2856, "mean_token_accuracy": 0.4551724076271057, "step": 147245 }, { "epoch": 0.14831176391788817, "grad_norm": 15.701734282528049, "learning_rate": 4.882136363281832e-05, "loss": 2.6464, "mean_token_accuracy": 0.358620685338974, "step": 147250 }, { "epoch": 0.14831679997099234, "grad_norm": 11.015287121419911, "learning_rate": 4.882124394206591e-05, "loss": 2.0684, "mean_token_accuracy": 0.42068964838981626, "step": 147255 }, { "epoch": 0.14832183602409652, "grad_norm": 9.730699479997783, "learning_rate": 4.882112424539998e-05, "loss": 2.5134, "mean_token_accuracy": 0.3999999940395355, "step": 147260 }, { "epoch": 0.1483268720772007, "grad_norm": 8.858295651404818, "learning_rate": 4.882100454282055e-05, "loss": 2.1884, "mean_token_accuracy": 0.45517241954803467, "step": 147265 }, { "epoch": 0.14833190813030486, "grad_norm": 11.316933747512023, "learning_rate": 4.8820884834327665e-05, "loss": 2.3365, "mean_token_accuracy": 0.43103448748588563, "step": 147270 }, { "epoch": 0.14833694418340904, "grad_norm": 12.726977749976244, "learning_rate": 4.8820765119921355e-05, "loss": 2.3813, "mean_token_accuracy": 0.43103448748588563, "step": 147275 }, { "epoch": 0.1483419802365132, "grad_norm": 19.096433258255583, "learning_rate": 4.8820645399601645e-05, "loss": 2.3175, "mean_token_accuracy": 0.46945813298225403, "step": 147280 }, { "epoch": 0.14834701628961738, "grad_norm": 11.658912463993541, "learning_rate": 4.8820525673368573e-05, "loss": 2.5151, "mean_token_accuracy": 0.4068965494632721, "step": 147285 }, { "epoch": 0.14835205234272153, "grad_norm": 10.908673894126805, "learning_rate": 4.882040594122218e-05, "loss": 2.0161, "mean_token_accuracy": 0.4517241358757019, "step": 147290 }, { "epoch": 0.1483570883958257, "grad_norm": 12.597855863039289, "learning_rate": 4.882028620316249e-05, "loss": 2.5975, "mean_token_accuracy": 0.4068965554237366, "step": 147295 }, { "epoch": 0.14836212444892988, "grad_norm": 10.314037616979181, "learning_rate": 4.8820166459189535e-05, "loss": 2.7618, "mean_token_accuracy": 0.441379314661026, "step": 147300 }, { "epoch": 0.14836716050203405, "grad_norm": 10.347742383963164, "learning_rate": 4.882004670930336e-05, "loss": 2.6356, "mean_token_accuracy": 0.4068965494632721, "step": 147305 }, { "epoch": 0.14837219655513822, "grad_norm": 10.840890410064661, "learning_rate": 4.881992695350399e-05, "loss": 2.3133, "mean_token_accuracy": 0.42758620381355283, "step": 147310 }, { "epoch": 0.1483772326082424, "grad_norm": 11.786699244765305, "learning_rate": 4.881980719179146e-05, "loss": 3.2202, "mean_token_accuracy": 0.36206896901130675, "step": 147315 }, { "epoch": 0.14838226866134657, "grad_norm": 8.944905191616261, "learning_rate": 4.8819687424165796e-05, "loss": 2.0835, "mean_token_accuracy": 0.48620688915252686, "step": 147320 }, { "epoch": 0.14838730471445075, "grad_norm": 10.845567889878728, "learning_rate": 4.881956765062703e-05, "loss": 2.4206, "mean_token_accuracy": 0.4068965554237366, "step": 147325 }, { "epoch": 0.14839234076755492, "grad_norm": 8.877828491981624, "learning_rate": 4.881944787117522e-05, "loss": 2.302, "mean_token_accuracy": 0.47586206197738645, "step": 147330 }, { "epoch": 0.1483973768206591, "grad_norm": 9.054392925784292, "learning_rate": 4.881932808581037e-05, "loss": 2.3116, "mean_token_accuracy": 0.4344827473163605, "step": 147335 }, { "epoch": 0.14840241287376327, "grad_norm": 10.744226897523658, "learning_rate": 4.881920829453253e-05, "loss": 2.3018, "mean_token_accuracy": 0.43793103098869324, "step": 147340 }, { "epoch": 0.14840744892686744, "grad_norm": 10.581016193136323, "learning_rate": 4.8819088497341726e-05, "loss": 2.2516, "mean_token_accuracy": 0.4517241418361664, "step": 147345 }, { "epoch": 0.14841248497997162, "grad_norm": 8.957900175853252, "learning_rate": 4.8818968694237996e-05, "loss": 2.0733, "mean_token_accuracy": 0.4551724135875702, "step": 147350 }, { "epoch": 0.1484175210330758, "grad_norm": 12.499291710417538, "learning_rate": 4.8818848885221366e-05, "loss": 2.5497, "mean_token_accuracy": 0.4689655125141144, "step": 147355 }, { "epoch": 0.14842255708617996, "grad_norm": 10.814685269998908, "learning_rate": 4.8818729070291876e-05, "loss": 2.1931, "mean_token_accuracy": 0.4379310369491577, "step": 147360 }, { "epoch": 0.14842759313928414, "grad_norm": 10.544726952120056, "learning_rate": 4.8818609249449555e-05, "loss": 2.2425, "mean_token_accuracy": 0.4327888786792755, "step": 147365 }, { "epoch": 0.1484326291923883, "grad_norm": 10.876655510084825, "learning_rate": 4.881848942269445e-05, "loss": 2.1023, "mean_token_accuracy": 0.4620689570903778, "step": 147370 }, { "epoch": 0.14843766524549248, "grad_norm": 10.004284273292003, "learning_rate": 4.8818369590026565e-05, "loss": 2.8315, "mean_token_accuracy": 0.39655172228813174, "step": 147375 }, { "epoch": 0.14844270129859666, "grad_norm": 13.002677917979186, "learning_rate": 4.881824975144596e-05, "loss": 2.3526, "mean_token_accuracy": 0.4583787083625793, "step": 147380 }, { "epoch": 0.14844773735170083, "grad_norm": 12.207589444904194, "learning_rate": 4.881812990695265e-05, "loss": 2.6505, "mean_token_accuracy": 0.37398669123649597, "step": 147385 }, { "epoch": 0.148452773404805, "grad_norm": 8.983110729782561, "learning_rate": 4.8818010056546686e-05, "loss": 2.0905, "mean_token_accuracy": 0.5137931048870087, "step": 147390 }, { "epoch": 0.14845780945790918, "grad_norm": 10.12616103653861, "learning_rate": 4.881789020022809e-05, "loss": 1.9616, "mean_token_accuracy": 0.4931034505367279, "step": 147395 }, { "epoch": 0.14846284551101335, "grad_norm": 10.835814003903316, "learning_rate": 4.88177703379969e-05, "loss": 2.1957, "mean_token_accuracy": 0.4551724076271057, "step": 147400 }, { "epoch": 0.14846788156411753, "grad_norm": 9.695695453402482, "learning_rate": 4.881765046985315e-05, "loss": 2.2653, "mean_token_accuracy": 0.42758620977401735, "step": 147405 }, { "epoch": 0.1484729176172217, "grad_norm": 10.379687579898073, "learning_rate": 4.8817530595796856e-05, "loss": 2.5179, "mean_token_accuracy": 0.3827586233615875, "step": 147410 }, { "epoch": 0.14847795367032587, "grad_norm": 11.628701836231295, "learning_rate": 4.881741071582808e-05, "loss": 2.4247, "mean_token_accuracy": 0.4034482777118683, "step": 147415 }, { "epoch": 0.14848298972343005, "grad_norm": 8.51684260282957, "learning_rate": 4.881729082994683e-05, "loss": 1.9461, "mean_token_accuracy": 0.482758617401123, "step": 147420 }, { "epoch": 0.14848802577653422, "grad_norm": 12.550463696593551, "learning_rate": 4.8817170938153156e-05, "loss": 2.5477, "mean_token_accuracy": 0.3862068891525269, "step": 147425 }, { "epoch": 0.14849306182963837, "grad_norm": 11.278154452532258, "learning_rate": 4.881705104044709e-05, "loss": 2.1288, "mean_token_accuracy": 0.47241378426551817, "step": 147430 }, { "epoch": 0.14849809788274254, "grad_norm": 11.713783084567902, "learning_rate": 4.8816931136828654e-05, "loss": 2.3778, "mean_token_accuracy": 0.42758620381355283, "step": 147435 }, { "epoch": 0.14850313393584672, "grad_norm": 9.540428320584386, "learning_rate": 4.881681122729789e-05, "loss": 2.2388, "mean_token_accuracy": 0.43448275327682495, "step": 147440 }, { "epoch": 0.1485081699889509, "grad_norm": 11.725206211368194, "learning_rate": 4.881669131185482e-05, "loss": 2.1806, "mean_token_accuracy": 0.4551724135875702, "step": 147445 }, { "epoch": 0.14851320604205506, "grad_norm": 11.10041759910945, "learning_rate": 4.881657139049949e-05, "loss": 2.4277, "mean_token_accuracy": 0.42413793206214906, "step": 147450 }, { "epoch": 0.14851824209515924, "grad_norm": 11.611295353632105, "learning_rate": 4.881645146323194e-05, "loss": 2.1882, "mean_token_accuracy": 0.4896551787853241, "step": 147455 }, { "epoch": 0.1485232781482634, "grad_norm": 12.018790977597904, "learning_rate": 4.881633153005218e-05, "loss": 2.1082, "mean_token_accuracy": 0.4776164650917053, "step": 147460 }, { "epoch": 0.14852831420136758, "grad_norm": 9.018404724864865, "learning_rate": 4.881621159096026e-05, "loss": 2.249, "mean_token_accuracy": 0.46551724076271056, "step": 147465 }, { "epoch": 0.14853335025447176, "grad_norm": 10.485857512990242, "learning_rate": 4.881609164595621e-05, "loss": 2.3322, "mean_token_accuracy": 0.4068965494632721, "step": 147470 }, { "epoch": 0.14853838630757593, "grad_norm": 10.614656554974275, "learning_rate": 4.881597169504006e-05, "loss": 2.818, "mean_token_accuracy": 0.4275861978530884, "step": 147475 }, { "epoch": 0.1485434223606801, "grad_norm": 9.60084873716458, "learning_rate": 4.8815851738211854e-05, "loss": 2.3064, "mean_token_accuracy": 0.43793103098869324, "step": 147480 }, { "epoch": 0.14854845841378428, "grad_norm": 13.41067614510491, "learning_rate": 4.881573177547161e-05, "loss": 2.21, "mean_token_accuracy": 0.5158499777317047, "step": 147485 }, { "epoch": 0.14855349446688845, "grad_norm": 10.319058287197134, "learning_rate": 4.8815611806819364e-05, "loss": 2.2256, "mean_token_accuracy": 0.4379310429096222, "step": 147490 }, { "epoch": 0.14855853051999263, "grad_norm": 10.532452207198254, "learning_rate": 4.8815491832255166e-05, "loss": 2.3208, "mean_token_accuracy": 0.441379314661026, "step": 147495 }, { "epoch": 0.1485635665730968, "grad_norm": 10.398734760351038, "learning_rate": 4.881537185177903e-05, "loss": 2.5989, "mean_token_accuracy": 0.358620685338974, "step": 147500 }, { "epoch": 0.14856860262620097, "grad_norm": 10.077617237512756, "learning_rate": 4.8815251865390997e-05, "loss": 2.4949, "mean_token_accuracy": 0.4, "step": 147505 }, { "epoch": 0.14857363867930515, "grad_norm": 9.634292263149101, "learning_rate": 4.8815131873091094e-05, "loss": 2.2713, "mean_token_accuracy": 0.4034482717514038, "step": 147510 }, { "epoch": 0.14857867473240932, "grad_norm": 12.523660076574277, "learning_rate": 4.881501187487936e-05, "loss": 2.4412, "mean_token_accuracy": 0.39655171930789945, "step": 147515 }, { "epoch": 0.1485837107855135, "grad_norm": 12.938800069132133, "learning_rate": 4.8814891870755834e-05, "loss": 2.5707, "mean_token_accuracy": 0.4298850536346436, "step": 147520 }, { "epoch": 0.14858874683861767, "grad_norm": 9.960086648719676, "learning_rate": 4.8814771860720545e-05, "loss": 2.4222, "mean_token_accuracy": 0.40689654350280763, "step": 147525 }, { "epoch": 0.14859378289172184, "grad_norm": 9.432934319798065, "learning_rate": 4.8814651844773515e-05, "loss": 2.2201, "mean_token_accuracy": 0.47126437425613404, "step": 147530 }, { "epoch": 0.14859881894482602, "grad_norm": 11.86340934981577, "learning_rate": 4.88145318229148e-05, "loss": 2.5709, "mean_token_accuracy": 0.429764062166214, "step": 147535 }, { "epoch": 0.1486038549979302, "grad_norm": 10.92149632769888, "learning_rate": 4.881441179514441e-05, "loss": 2.743, "mean_token_accuracy": 0.3793103456497192, "step": 147540 }, { "epoch": 0.14860889105103436, "grad_norm": 9.907376755441344, "learning_rate": 4.8814291761462386e-05, "loss": 2.2283, "mean_token_accuracy": 0.4931034445762634, "step": 147545 }, { "epoch": 0.14861392710413854, "grad_norm": 11.77960396762107, "learning_rate": 4.881417172186877e-05, "loss": 2.4171, "mean_token_accuracy": 0.3896551728248596, "step": 147550 }, { "epoch": 0.1486189631572427, "grad_norm": 10.121882667042206, "learning_rate": 4.881405167636359e-05, "loss": 2.0961, "mean_token_accuracy": 0.4620689630508423, "step": 147555 }, { "epoch": 0.14862399921034689, "grad_norm": 8.929683759149132, "learning_rate": 4.881393162494688e-05, "loss": 2.0055, "mean_token_accuracy": 0.5034482657909394, "step": 147560 }, { "epoch": 0.14862903526345106, "grad_norm": 12.849172693828802, "learning_rate": 4.8813811567618665e-05, "loss": 2.4908, "mean_token_accuracy": 0.40344828367233276, "step": 147565 }, { "epoch": 0.1486340713165552, "grad_norm": 10.906387330924511, "learning_rate": 4.881369150437899e-05, "loss": 2.3693, "mean_token_accuracy": 0.43950392603874205, "step": 147570 }, { "epoch": 0.14863910736965938, "grad_norm": 8.816824411742006, "learning_rate": 4.881357143522788e-05, "loss": 2.0319, "mean_token_accuracy": 0.4793103516101837, "step": 147575 }, { "epoch": 0.14864414342276355, "grad_norm": 15.477416527170242, "learning_rate": 4.881345136016538e-05, "loss": 2.8386, "mean_token_accuracy": 0.4068965494632721, "step": 147580 }, { "epoch": 0.14864917947586773, "grad_norm": 11.423052016586015, "learning_rate": 4.88133312791915e-05, "loss": 2.4312, "mean_token_accuracy": 0.4034482717514038, "step": 147585 }, { "epoch": 0.1486542155289719, "grad_norm": 10.382182083059218, "learning_rate": 4.8813211192306293e-05, "loss": 2.3771, "mean_token_accuracy": 0.4310344815254211, "step": 147590 }, { "epoch": 0.14865925158207607, "grad_norm": 10.319707353086667, "learning_rate": 4.881309109950978e-05, "loss": 2.7897, "mean_token_accuracy": 0.44137929677963256, "step": 147595 }, { "epoch": 0.14866428763518025, "grad_norm": 9.990199109384305, "learning_rate": 4.8812971000802014e-05, "loss": 2.4214, "mean_token_accuracy": 0.4310344815254211, "step": 147600 }, { "epoch": 0.14866932368828442, "grad_norm": 14.841897328334882, "learning_rate": 4.8812850896183015e-05, "loss": 2.8444, "mean_token_accuracy": 0.3655172407627106, "step": 147605 }, { "epoch": 0.1486743597413886, "grad_norm": 9.43993592075201, "learning_rate": 4.8812730785652816e-05, "loss": 2.4449, "mean_token_accuracy": 0.41724138259887694, "step": 147610 }, { "epoch": 0.14867939579449277, "grad_norm": 9.578414853263228, "learning_rate": 4.881261066921145e-05, "loss": 2.5002, "mean_token_accuracy": 0.41724138259887694, "step": 147615 }, { "epoch": 0.14868443184759694, "grad_norm": 8.461088875133578, "learning_rate": 4.881249054685896e-05, "loss": 2.0822, "mean_token_accuracy": 0.4862068951129913, "step": 147620 }, { "epoch": 0.14868946790070112, "grad_norm": 12.254562617019001, "learning_rate": 4.881237041859536e-05, "loss": 2.7349, "mean_token_accuracy": 0.3896551698446274, "step": 147625 }, { "epoch": 0.1486945039538053, "grad_norm": 10.02094343171751, "learning_rate": 4.88122502844207e-05, "loss": 2.5769, "mean_token_accuracy": 0.3793103456497192, "step": 147630 }, { "epoch": 0.14869954000690946, "grad_norm": 10.701606304690337, "learning_rate": 4.8812130144335e-05, "loss": 2.5673, "mean_token_accuracy": 0.3793103456497192, "step": 147635 }, { "epoch": 0.14870457606001364, "grad_norm": 9.376950709664813, "learning_rate": 4.881200999833831e-05, "loss": 2.4436, "mean_token_accuracy": 0.3896551728248596, "step": 147640 }, { "epoch": 0.1487096121131178, "grad_norm": 10.074649854714757, "learning_rate": 4.8811889846430655e-05, "loss": 2.4225, "mean_token_accuracy": 0.46551724672317507, "step": 147645 }, { "epoch": 0.14871464816622199, "grad_norm": 13.419942909230395, "learning_rate": 4.8811769688612064e-05, "loss": 2.4955, "mean_token_accuracy": 0.41379310488700866, "step": 147650 }, { "epoch": 0.14871968421932616, "grad_norm": 10.717162758198729, "learning_rate": 4.881164952488257e-05, "loss": 2.6211, "mean_token_accuracy": 0.3482758641242981, "step": 147655 }, { "epoch": 0.14872472027243033, "grad_norm": 8.995823963517623, "learning_rate": 4.881152935524222e-05, "loss": 2.2135, "mean_token_accuracy": 0.46551724076271056, "step": 147660 }, { "epoch": 0.1487297563255345, "grad_norm": 10.786590980877957, "learning_rate": 4.881140917969103e-05, "loss": 2.5346, "mean_token_accuracy": 0.4241379380226135, "step": 147665 }, { "epoch": 0.14873479237863868, "grad_norm": 9.489613952739157, "learning_rate": 4.8811288998229045e-05, "loss": 2.1175, "mean_token_accuracy": 0.47241379618644713, "step": 147670 }, { "epoch": 0.14873982843174285, "grad_norm": 12.306436383054477, "learning_rate": 4.8811168810856296e-05, "loss": 2.3714, "mean_token_accuracy": 0.4379310369491577, "step": 147675 }, { "epoch": 0.14874486448484703, "grad_norm": 8.856816820285616, "learning_rate": 4.8811048617572815e-05, "loss": 2.3793, "mean_token_accuracy": 0.41379310488700866, "step": 147680 }, { "epoch": 0.1487499005379512, "grad_norm": 15.026180005863395, "learning_rate": 4.881092841837863e-05, "loss": 2.5043, "mean_token_accuracy": 0.43103448748588563, "step": 147685 }, { "epoch": 0.14875493659105538, "grad_norm": 8.865992710769632, "learning_rate": 4.881080821327379e-05, "loss": 1.9951, "mean_token_accuracy": 0.519116747379303, "step": 147690 }, { "epoch": 0.14875997264415955, "grad_norm": 7.730155789677263, "learning_rate": 4.88106880022583e-05, "loss": 2.2018, "mean_token_accuracy": 0.5137930870056152, "step": 147695 }, { "epoch": 0.14876500869726372, "grad_norm": 8.94501605541947, "learning_rate": 4.881056778533222e-05, "loss": 2.249, "mean_token_accuracy": 0.47931034564971925, "step": 147700 }, { "epoch": 0.1487700447503679, "grad_norm": 11.35672634689044, "learning_rate": 4.8810447562495584e-05, "loss": 2.6019, "mean_token_accuracy": 0.4034482717514038, "step": 147705 }, { "epoch": 0.14877508080347204, "grad_norm": 10.510078978084575, "learning_rate": 4.8810327333748406e-05, "loss": 2.4527, "mean_token_accuracy": 0.37931033968925476, "step": 147710 }, { "epoch": 0.14878011685657622, "grad_norm": 9.062219936452191, "learning_rate": 4.881020709909073e-05, "loss": 2.0534, "mean_token_accuracy": 0.482758617401123, "step": 147715 }, { "epoch": 0.1487851529096804, "grad_norm": 8.528278659335324, "learning_rate": 4.8810086858522594e-05, "loss": 2.1196, "mean_token_accuracy": 0.48275862336158754, "step": 147720 }, { "epoch": 0.14879018896278456, "grad_norm": 10.355541838456608, "learning_rate": 4.880996661204402e-05, "loss": 2.6029, "mean_token_accuracy": 0.4310344815254211, "step": 147725 }, { "epoch": 0.14879522501588874, "grad_norm": 10.253693849275912, "learning_rate": 4.880984635965504e-05, "loss": 2.4976, "mean_token_accuracy": 0.4186327874660492, "step": 147730 }, { "epoch": 0.1488002610689929, "grad_norm": 11.489037045231054, "learning_rate": 4.8809726101355704e-05, "loss": 2.2262, "mean_token_accuracy": 0.4551724076271057, "step": 147735 }, { "epoch": 0.14880529712209709, "grad_norm": 9.50023536174181, "learning_rate": 4.880960583714604e-05, "loss": 2.1036, "mean_token_accuracy": 0.517241370677948, "step": 147740 }, { "epoch": 0.14881033317520126, "grad_norm": 10.366810459896795, "learning_rate": 4.880948556702607e-05, "loss": 2.5006, "mean_token_accuracy": 0.4068965494632721, "step": 147745 }, { "epoch": 0.14881536922830543, "grad_norm": 9.746488774058648, "learning_rate": 4.880936529099584e-05, "loss": 2.5077, "mean_token_accuracy": 0.3724137842655182, "step": 147750 }, { "epoch": 0.1488204052814096, "grad_norm": 12.204200318487382, "learning_rate": 4.8809245009055375e-05, "loss": 2.7431, "mean_token_accuracy": 0.3551724195480347, "step": 147755 }, { "epoch": 0.14882544133451378, "grad_norm": 11.305479314792676, "learning_rate": 4.880912472120472e-05, "loss": 2.4984, "mean_token_accuracy": 0.42758620381355283, "step": 147760 }, { "epoch": 0.14883047738761795, "grad_norm": 10.101441916861413, "learning_rate": 4.8809004427443885e-05, "loss": 2.4116, "mean_token_accuracy": 0.4758620738983154, "step": 147765 }, { "epoch": 0.14883551344072213, "grad_norm": 10.319224736819612, "learning_rate": 4.8808884127772924e-05, "loss": 2.4711, "mean_token_accuracy": 0.4068965554237366, "step": 147770 }, { "epoch": 0.1488405494938263, "grad_norm": 9.112383205884079, "learning_rate": 4.880876382219186e-05, "loss": 2.1193, "mean_token_accuracy": 0.4344827651977539, "step": 147775 }, { "epoch": 0.14884558554693048, "grad_norm": 12.345336425615708, "learning_rate": 4.880864351070074e-05, "loss": 2.2056, "mean_token_accuracy": 0.39310343861579894, "step": 147780 }, { "epoch": 0.14885062160003465, "grad_norm": 12.790394877488497, "learning_rate": 4.880852319329959e-05, "loss": 2.6085, "mean_token_accuracy": 0.4137930989265442, "step": 147785 }, { "epoch": 0.14885565765313882, "grad_norm": 16.995596870202775, "learning_rate": 4.880840286998843e-05, "loss": 2.6637, "mean_token_accuracy": 0.37931033968925476, "step": 147790 }, { "epoch": 0.148860693706243, "grad_norm": 11.57785781847776, "learning_rate": 4.880828254076732e-05, "loss": 2.576, "mean_token_accuracy": 0.4034482777118683, "step": 147795 }, { "epoch": 0.14886572975934717, "grad_norm": 8.066931882905035, "learning_rate": 4.880816220563626e-05, "loss": 2.2702, "mean_token_accuracy": 0.5030464053153991, "step": 147800 }, { "epoch": 0.14887076581245134, "grad_norm": 10.112780067315288, "learning_rate": 4.880804186459532e-05, "loss": 2.1863, "mean_token_accuracy": 0.4689655125141144, "step": 147805 }, { "epoch": 0.14887580186555552, "grad_norm": 11.163048650329113, "learning_rate": 4.88079215176445e-05, "loss": 2.2771, "mean_token_accuracy": 0.46358135938644407, "step": 147810 }, { "epoch": 0.1488808379186597, "grad_norm": 8.09102862164424, "learning_rate": 4.8807801164783856e-05, "loss": 1.9657, "mean_token_accuracy": 0.4862068951129913, "step": 147815 }, { "epoch": 0.14888587397176387, "grad_norm": 11.350811064503564, "learning_rate": 4.8807680806013414e-05, "loss": 2.1794, "mean_token_accuracy": 0.4655172348022461, "step": 147820 }, { "epoch": 0.14889091002486804, "grad_norm": 10.94535222072632, "learning_rate": 4.88075604413332e-05, "loss": 2.4754, "mean_token_accuracy": 0.4103448212146759, "step": 147825 }, { "epoch": 0.1488959460779722, "grad_norm": 8.66256721180295, "learning_rate": 4.880744007074326e-05, "loss": 2.2151, "mean_token_accuracy": 0.44827585816383364, "step": 147830 }, { "epoch": 0.1489009821310764, "grad_norm": 9.559076743415517, "learning_rate": 4.880731969424363e-05, "loss": 2.4939, "mean_token_accuracy": 0.4448275864124298, "step": 147835 }, { "epoch": 0.14890601818418056, "grad_norm": 8.925513148237549, "learning_rate": 4.8807199311834326e-05, "loss": 2.0131, "mean_token_accuracy": 0.47586206793785096, "step": 147840 }, { "epoch": 0.14891105423728473, "grad_norm": 11.116767062339065, "learning_rate": 4.880707892351539e-05, "loss": 2.3807, "mean_token_accuracy": 0.44827585816383364, "step": 147845 }, { "epoch": 0.14891609029038888, "grad_norm": 9.692581156856463, "learning_rate": 4.8806958529286854e-05, "loss": 2.4221, "mean_token_accuracy": 0.4068965554237366, "step": 147850 }, { "epoch": 0.14892112634349305, "grad_norm": 10.206136318908854, "learning_rate": 4.8806838129148766e-05, "loss": 2.3195, "mean_token_accuracy": 0.42413793206214906, "step": 147855 }, { "epoch": 0.14892616239659723, "grad_norm": 13.058360767419309, "learning_rate": 4.880671772310114e-05, "loss": 2.5849, "mean_token_accuracy": 0.4137930989265442, "step": 147860 }, { "epoch": 0.1489311984497014, "grad_norm": 10.876942210023405, "learning_rate": 4.880659731114401e-05, "loss": 2.4157, "mean_token_accuracy": 0.4448275864124298, "step": 147865 }, { "epoch": 0.14893623450280558, "grad_norm": 13.05876552219092, "learning_rate": 4.880647689327742e-05, "loss": 2.9669, "mean_token_accuracy": 0.35862069129943847, "step": 147870 }, { "epoch": 0.14894127055590975, "grad_norm": 13.758972929381663, "learning_rate": 4.8806356469501405e-05, "loss": 2.2748, "mean_token_accuracy": 0.4326073855161667, "step": 147875 }, { "epoch": 0.14894630660901392, "grad_norm": 9.031203565914096, "learning_rate": 4.880623603981599e-05, "loss": 2.4215, "mean_token_accuracy": 0.4482758641242981, "step": 147880 }, { "epoch": 0.1489513426621181, "grad_norm": 8.93043135600148, "learning_rate": 4.88061156042212e-05, "loss": 2.3501, "mean_token_accuracy": 0.4034482717514038, "step": 147885 }, { "epoch": 0.14895637871522227, "grad_norm": 12.75446161689368, "learning_rate": 4.8805995162717096e-05, "loss": 2.7954, "mean_token_accuracy": 0.3896551728248596, "step": 147890 }, { "epoch": 0.14896141476832644, "grad_norm": 12.473579975919137, "learning_rate": 4.8805874715303686e-05, "loss": 2.7843, "mean_token_accuracy": 0.4, "step": 147895 }, { "epoch": 0.14896645082143062, "grad_norm": 10.831228544433472, "learning_rate": 4.880575426198101e-05, "loss": 2.217, "mean_token_accuracy": 0.4206896543502808, "step": 147900 }, { "epoch": 0.1489714868745348, "grad_norm": 9.928650113786528, "learning_rate": 4.880563380274911e-05, "loss": 2.6684, "mean_token_accuracy": 0.3931034505367279, "step": 147905 }, { "epoch": 0.14897652292763897, "grad_norm": 10.192390372796485, "learning_rate": 4.880551333760801e-05, "loss": 2.4841, "mean_token_accuracy": 0.43448275327682495, "step": 147910 }, { "epoch": 0.14898155898074314, "grad_norm": 10.904328943273141, "learning_rate": 4.8805392866557745e-05, "loss": 2.3218, "mean_token_accuracy": 0.45862069725990295, "step": 147915 }, { "epoch": 0.1489865950338473, "grad_norm": 9.474393123707559, "learning_rate": 4.8805272389598356e-05, "loss": 2.4437, "mean_token_accuracy": 0.4448275864124298, "step": 147920 }, { "epoch": 0.1489916310869515, "grad_norm": 10.103336800914134, "learning_rate": 4.880515190672986e-05, "loss": 2.4376, "mean_token_accuracy": 0.4034482777118683, "step": 147925 }, { "epoch": 0.14899666714005566, "grad_norm": 11.397077837596717, "learning_rate": 4.880503141795231e-05, "loss": 2.4252, "mean_token_accuracy": 0.4310344815254211, "step": 147930 }, { "epoch": 0.14900170319315983, "grad_norm": 11.372523411108572, "learning_rate": 4.8804910923265714e-05, "loss": 2.2694, "mean_token_accuracy": 0.42758620381355283, "step": 147935 }, { "epoch": 0.149006739246264, "grad_norm": 10.13739814350393, "learning_rate": 4.880479042267014e-05, "loss": 2.4235, "mean_token_accuracy": 0.4310344815254211, "step": 147940 }, { "epoch": 0.14901177529936818, "grad_norm": 9.563234727808558, "learning_rate": 4.8804669916165593e-05, "loss": 2.0983, "mean_token_accuracy": 0.4849364757537842, "step": 147945 }, { "epoch": 0.14901681135247236, "grad_norm": 14.129655277679928, "learning_rate": 4.880454940375212e-05, "loss": 2.3818, "mean_token_accuracy": 0.4310344815254211, "step": 147950 }, { "epoch": 0.14902184740557653, "grad_norm": 9.535677167662188, "learning_rate": 4.880442888542976e-05, "loss": 2.3972, "mean_token_accuracy": 0.42758620381355283, "step": 147955 }, { "epoch": 0.1490268834586807, "grad_norm": 8.742882719173116, "learning_rate": 4.8804308361198516e-05, "loss": 1.9176, "mean_token_accuracy": 0.524137943983078, "step": 147960 }, { "epoch": 0.14903191951178488, "grad_norm": 12.473851344487406, "learning_rate": 4.8804187831058455e-05, "loss": 2.7512, "mean_token_accuracy": 0.3551724135875702, "step": 147965 }, { "epoch": 0.14903695556488905, "grad_norm": 10.01582926439821, "learning_rate": 4.88040672950096e-05, "loss": 2.2561, "mean_token_accuracy": 0.5196007251739502, "step": 147970 }, { "epoch": 0.14904199161799322, "grad_norm": 9.899810880791657, "learning_rate": 4.880394675305198e-05, "loss": 2.2497, "mean_token_accuracy": 0.39999999701976774, "step": 147975 }, { "epoch": 0.1490470276710974, "grad_norm": 9.77434742918475, "learning_rate": 4.8803826205185633e-05, "loss": 2.0911, "mean_token_accuracy": 0.4586206912994385, "step": 147980 }, { "epoch": 0.14905206372420157, "grad_norm": 15.18536553971495, "learning_rate": 4.880370565141059e-05, "loss": 2.2278, "mean_token_accuracy": 0.48620688915252686, "step": 147985 }, { "epoch": 0.14905709977730572, "grad_norm": 10.029070090872178, "learning_rate": 4.8803585091726886e-05, "loss": 2.0095, "mean_token_accuracy": 0.5034482717514038, "step": 147990 }, { "epoch": 0.1490621358304099, "grad_norm": 11.370103732242823, "learning_rate": 4.880346452613454e-05, "loss": 2.3296, "mean_token_accuracy": 0.44482759237289426, "step": 147995 }, { "epoch": 0.14906717188351407, "grad_norm": 11.39297337300726, "learning_rate": 4.880334395463361e-05, "loss": 2.4859, "mean_token_accuracy": 0.45517241954803467, "step": 148000 }, { "epoch": 0.14907220793661824, "grad_norm": 10.198390889706975, "learning_rate": 4.880322337722412e-05, "loss": 2.1748, "mean_token_accuracy": 0.4482758641242981, "step": 148005 }, { "epoch": 0.1490772439897224, "grad_norm": 13.72151304920062, "learning_rate": 4.88031027939061e-05, "loss": 2.5864, "mean_token_accuracy": 0.4206896543502808, "step": 148010 }, { "epoch": 0.1490822800428266, "grad_norm": 12.757458801764914, "learning_rate": 4.880298220467957e-05, "loss": 2.3495, "mean_token_accuracy": 0.4758620738983154, "step": 148015 }, { "epoch": 0.14908731609593076, "grad_norm": 15.292942184564707, "learning_rate": 4.88028616095446e-05, "loss": 2.3623, "mean_token_accuracy": 0.4433151841163635, "step": 148020 }, { "epoch": 0.14909235214903493, "grad_norm": 10.3950684465651, "learning_rate": 4.8802741008501194e-05, "loss": 2.3249, "mean_token_accuracy": 0.41034482717514037, "step": 148025 }, { "epoch": 0.1490973882021391, "grad_norm": 12.299271022344156, "learning_rate": 4.880262040154939e-05, "loss": 2.7619, "mean_token_accuracy": 0.3655172407627106, "step": 148030 }, { "epoch": 0.14910242425524328, "grad_norm": 9.655104205759264, "learning_rate": 4.880249978868923e-05, "loss": 1.987, "mean_token_accuracy": 0.4862068951129913, "step": 148035 }, { "epoch": 0.14910746030834746, "grad_norm": 11.344340266457227, "learning_rate": 4.880237916992074e-05, "loss": 2.5613, "mean_token_accuracy": 0.3758620649576187, "step": 148040 }, { "epoch": 0.14911249636145163, "grad_norm": 10.600874858143706, "learning_rate": 4.8802258545243955e-05, "loss": 2.423, "mean_token_accuracy": 0.37241379022598264, "step": 148045 }, { "epoch": 0.1491175324145558, "grad_norm": 12.347228527759603, "learning_rate": 4.880213791465891e-05, "loss": 2.1328, "mean_token_accuracy": 0.44482759237289426, "step": 148050 }, { "epoch": 0.14912256846765998, "grad_norm": 11.719124955892893, "learning_rate": 4.880201727816564e-05, "loss": 2.5571, "mean_token_accuracy": 0.39310344457626345, "step": 148055 }, { "epoch": 0.14912760452076415, "grad_norm": 10.685215262592372, "learning_rate": 4.880189663576417e-05, "loss": 2.078, "mean_token_accuracy": 0.49655172824859617, "step": 148060 }, { "epoch": 0.14913264057386832, "grad_norm": 11.196217155355464, "learning_rate": 4.8801775987454546e-05, "loss": 2.2808, "mean_token_accuracy": 0.41724138259887694, "step": 148065 }, { "epoch": 0.1491376766269725, "grad_norm": 12.032237079586652, "learning_rate": 4.880165533323679e-05, "loss": 2.4513, "mean_token_accuracy": 0.4068965494632721, "step": 148070 }, { "epoch": 0.14914271268007667, "grad_norm": 7.982271596568263, "learning_rate": 4.880153467311094e-05, "loss": 2.18, "mean_token_accuracy": 0.4517241358757019, "step": 148075 }, { "epoch": 0.14914774873318085, "grad_norm": 10.324089178687746, "learning_rate": 4.8801414007077036e-05, "loss": 2.3984, "mean_token_accuracy": 0.4586206912994385, "step": 148080 }, { "epoch": 0.14915278478628502, "grad_norm": 9.682314737900265, "learning_rate": 4.8801293335135095e-05, "loss": 2.2261, "mean_token_accuracy": 0.48275861144065857, "step": 148085 }, { "epoch": 0.1491578208393892, "grad_norm": 10.53415016224788, "learning_rate": 4.880117265728517e-05, "loss": 2.3294, "mean_token_accuracy": 0.43629764318466185, "step": 148090 }, { "epoch": 0.14916285689249337, "grad_norm": 10.695377573487987, "learning_rate": 4.8801051973527284e-05, "loss": 2.1814, "mean_token_accuracy": 0.4586206912994385, "step": 148095 }, { "epoch": 0.14916789294559754, "grad_norm": 11.201467064617074, "learning_rate": 4.880093128386147e-05, "loss": 2.1282, "mean_token_accuracy": 0.4551724076271057, "step": 148100 }, { "epoch": 0.14917292899870171, "grad_norm": 9.098638102134485, "learning_rate": 4.880081058828776e-05, "loss": 2.3743, "mean_token_accuracy": 0.458620685338974, "step": 148105 }, { "epoch": 0.1491779650518059, "grad_norm": 12.150650350062113, "learning_rate": 4.88006898868062e-05, "loss": 2.4131, "mean_token_accuracy": 0.43103447556495667, "step": 148110 }, { "epoch": 0.14918300110491006, "grad_norm": 9.611284191717868, "learning_rate": 4.880056917941681e-05, "loss": 2.4236, "mean_token_accuracy": 0.40344826579093934, "step": 148115 }, { "epoch": 0.14918803715801424, "grad_norm": 13.619647064722711, "learning_rate": 4.880044846611962e-05, "loss": 2.6716, "mean_token_accuracy": 0.4034482717514038, "step": 148120 }, { "epoch": 0.1491930732111184, "grad_norm": 11.68044294464124, "learning_rate": 4.880032774691467e-05, "loss": 2.5673, "mean_token_accuracy": 0.3448275804519653, "step": 148125 }, { "epoch": 0.14919810926422256, "grad_norm": 10.596723446342825, "learning_rate": 4.880020702180201e-05, "loss": 2.4072, "mean_token_accuracy": 0.39655172228813174, "step": 148130 }, { "epoch": 0.14920314531732673, "grad_norm": 14.501522636336029, "learning_rate": 4.880008629078164e-05, "loss": 2.6094, "mean_token_accuracy": 0.41034482717514037, "step": 148135 }, { "epoch": 0.1492081813704309, "grad_norm": 10.564000987435252, "learning_rate": 4.8799965553853626e-05, "loss": 2.2041, "mean_token_accuracy": 0.4689655125141144, "step": 148140 }, { "epoch": 0.14921321742353508, "grad_norm": 12.94808724437348, "learning_rate": 4.879984481101799e-05, "loss": 2.52, "mean_token_accuracy": 0.4172413766384125, "step": 148145 }, { "epoch": 0.14921825347663925, "grad_norm": 11.230065534700563, "learning_rate": 4.879972406227475e-05, "loss": 2.293, "mean_token_accuracy": 0.44990925788879393, "step": 148150 }, { "epoch": 0.14922328952974342, "grad_norm": 9.76289953961644, "learning_rate": 4.8799603307623956e-05, "loss": 2.3523, "mean_token_accuracy": 0.42413793206214906, "step": 148155 }, { "epoch": 0.1492283255828476, "grad_norm": 10.300957512961741, "learning_rate": 4.8799482547065645e-05, "loss": 2.3979, "mean_token_accuracy": 0.4586206912994385, "step": 148160 }, { "epoch": 0.14923336163595177, "grad_norm": 11.172738754575848, "learning_rate": 4.879936178059983e-05, "loss": 2.3189, "mean_token_accuracy": 0.417241370677948, "step": 148165 }, { "epoch": 0.14923839768905595, "grad_norm": 12.392856946602523, "learning_rate": 4.8799241008226564e-05, "loss": 2.3853, "mean_token_accuracy": 0.4206896543502808, "step": 148170 }, { "epoch": 0.14924343374216012, "grad_norm": 13.787329741753858, "learning_rate": 4.879912022994588e-05, "loss": 2.4156, "mean_token_accuracy": 0.4310344815254211, "step": 148175 }, { "epoch": 0.1492484697952643, "grad_norm": 10.937126774578767, "learning_rate": 4.87989994457578e-05, "loss": 2.9653, "mean_token_accuracy": 0.34482758939266206, "step": 148180 }, { "epoch": 0.14925350584836847, "grad_norm": 8.180717057531883, "learning_rate": 4.8798878655662364e-05, "loss": 2.1, "mean_token_accuracy": 0.4689655125141144, "step": 148185 }, { "epoch": 0.14925854190147264, "grad_norm": 10.266403607283968, "learning_rate": 4.87987578596596e-05, "loss": 2.4839, "mean_token_accuracy": 0.3793103456497192, "step": 148190 }, { "epoch": 0.14926357795457681, "grad_norm": 14.077174798230866, "learning_rate": 4.8798637057749546e-05, "loss": 2.1807, "mean_token_accuracy": 0.4413793087005615, "step": 148195 }, { "epoch": 0.149268614007681, "grad_norm": 10.008264155026232, "learning_rate": 4.8798516249932244e-05, "loss": 2.1304, "mean_token_accuracy": 0.458620685338974, "step": 148200 }, { "epoch": 0.14927365006078516, "grad_norm": 9.375009881029253, "learning_rate": 4.8798395436207714e-05, "loss": 2.1686, "mean_token_accuracy": 0.46317734122276305, "step": 148205 }, { "epoch": 0.14927868611388934, "grad_norm": 7.737577122020157, "learning_rate": 4.879827461657599e-05, "loss": 2.4191, "mean_token_accuracy": 0.46733213067054746, "step": 148210 }, { "epoch": 0.1492837221669935, "grad_norm": 10.915379533960568, "learning_rate": 4.879815379103712e-05, "loss": 2.5869, "mean_token_accuracy": 0.41724138855934145, "step": 148215 }, { "epoch": 0.14928875822009768, "grad_norm": 9.97862536975699, "learning_rate": 4.8798032959591114e-05, "loss": 2.4009, "mean_token_accuracy": 0.43272837400436404, "step": 148220 }, { "epoch": 0.14929379427320186, "grad_norm": 10.10233242277236, "learning_rate": 4.879791212223803e-05, "loss": 2.3473, "mean_token_accuracy": 0.4068965554237366, "step": 148225 }, { "epoch": 0.14929883032630603, "grad_norm": 8.989365844841098, "learning_rate": 4.8797791278977895e-05, "loss": 2.3237, "mean_token_accuracy": 0.4551724076271057, "step": 148230 }, { "epoch": 0.1493038663794102, "grad_norm": 10.403670656294587, "learning_rate": 4.879767042981073e-05, "loss": 2.5127, "mean_token_accuracy": 0.4310344815254211, "step": 148235 }, { "epoch": 0.14930890243251438, "grad_norm": 11.875467960287512, "learning_rate": 4.879754957473657e-05, "loss": 2.389, "mean_token_accuracy": 0.4448275864124298, "step": 148240 }, { "epoch": 0.14931393848561855, "grad_norm": 10.850534147072421, "learning_rate": 4.879742871375547e-05, "loss": 2.0959, "mean_token_accuracy": 0.49655171632766726, "step": 148245 }, { "epoch": 0.14931897453872273, "grad_norm": 15.019775288432555, "learning_rate": 4.879730784686744e-05, "loss": 2.5207, "mean_token_accuracy": 0.4241379380226135, "step": 148250 }, { "epoch": 0.1493240105918269, "grad_norm": 9.466701582848748, "learning_rate": 4.8797186974072523e-05, "loss": 2.3528, "mean_token_accuracy": 0.46896552443504336, "step": 148255 }, { "epoch": 0.14932904664493107, "grad_norm": 9.708752915221913, "learning_rate": 4.879706609537076e-05, "loss": 2.4561, "mean_token_accuracy": 0.4068965494632721, "step": 148260 }, { "epoch": 0.14933408269803525, "grad_norm": 11.789880332533471, "learning_rate": 4.8796945210762163e-05, "loss": 2.8097, "mean_token_accuracy": 0.37586207389831544, "step": 148265 }, { "epoch": 0.1493391187511394, "grad_norm": 12.661856126260206, "learning_rate": 4.879682432024679e-05, "loss": 2.9115, "mean_token_accuracy": 0.379310342669487, "step": 148270 }, { "epoch": 0.14934415480424357, "grad_norm": 9.00311771537564, "learning_rate": 4.879670342382466e-05, "loss": 1.8237, "mean_token_accuracy": 0.510344821214676, "step": 148275 }, { "epoch": 0.14934919085734774, "grad_norm": 9.195596600836756, "learning_rate": 4.87965825214958e-05, "loss": 2.3685, "mean_token_accuracy": 0.3931034505367279, "step": 148280 }, { "epoch": 0.14935422691045191, "grad_norm": 10.939414970578273, "learning_rate": 4.879646161326026e-05, "loss": 2.6741, "mean_token_accuracy": 0.4294010937213898, "step": 148285 }, { "epoch": 0.1493592629635561, "grad_norm": 11.24508777724285, "learning_rate": 4.879634069911808e-05, "loss": 2.5377, "mean_token_accuracy": 0.4172413766384125, "step": 148290 }, { "epoch": 0.14936429901666026, "grad_norm": 11.252053841601317, "learning_rate": 4.8796219779069264e-05, "loss": 2.4785, "mean_token_accuracy": 0.43103448748588563, "step": 148295 }, { "epoch": 0.14936933506976444, "grad_norm": 11.025939425139963, "learning_rate": 4.8796098853113864e-05, "loss": 2.2564, "mean_token_accuracy": 0.46551724076271056, "step": 148300 }, { "epoch": 0.1493743711228686, "grad_norm": 11.297055593307292, "learning_rate": 4.8795977921251914e-05, "loss": 2.1477, "mean_token_accuracy": 0.4482758641242981, "step": 148305 }, { "epoch": 0.14937940717597278, "grad_norm": 10.265772440832475, "learning_rate": 4.8795856983483454e-05, "loss": 2.5669, "mean_token_accuracy": 0.4620689570903778, "step": 148310 }, { "epoch": 0.14938444322907696, "grad_norm": 14.469361902827545, "learning_rate": 4.879573603980849e-05, "loss": 2.7066, "mean_token_accuracy": 0.4, "step": 148315 }, { "epoch": 0.14938947928218113, "grad_norm": 9.65081543193366, "learning_rate": 4.879561509022708e-05, "loss": 2.3258, "mean_token_accuracy": 0.4379310369491577, "step": 148320 }, { "epoch": 0.1493945153352853, "grad_norm": 11.969685504550256, "learning_rate": 4.879549413473926e-05, "loss": 2.5683, "mean_token_accuracy": 0.3827586233615875, "step": 148325 }, { "epoch": 0.14939955138838948, "grad_norm": 10.111833002790497, "learning_rate": 4.879537317334505e-05, "loss": 2.3123, "mean_token_accuracy": 0.47931034564971925, "step": 148330 }, { "epoch": 0.14940458744149365, "grad_norm": 8.73186656743287, "learning_rate": 4.879525220604449e-05, "loss": 2.4074, "mean_token_accuracy": 0.44482758045196535, "step": 148335 }, { "epoch": 0.14940962349459783, "grad_norm": 9.168036204906091, "learning_rate": 4.879513123283762e-05, "loss": 2.2334, "mean_token_accuracy": 0.46896552443504336, "step": 148340 }, { "epoch": 0.149414659547702, "grad_norm": 11.846550094738497, "learning_rate": 4.879501025372445e-05, "loss": 2.5143, "mean_token_accuracy": 0.41724138259887694, "step": 148345 }, { "epoch": 0.14941969560080617, "grad_norm": 10.590952234268762, "learning_rate": 4.879488926870504e-05, "loss": 2.3816, "mean_token_accuracy": 0.46551724672317507, "step": 148350 }, { "epoch": 0.14942473165391035, "grad_norm": 9.803524960239452, "learning_rate": 4.8794768277779415e-05, "loss": 1.9604, "mean_token_accuracy": 0.5241379320621491, "step": 148355 }, { "epoch": 0.14942976770701452, "grad_norm": 14.868326483510446, "learning_rate": 4.87946472809476e-05, "loss": 2.6741, "mean_token_accuracy": 0.4, "step": 148360 }, { "epoch": 0.1494348037601187, "grad_norm": 7.333397551595195, "learning_rate": 4.879452627820963e-05, "loss": 2.1696, "mean_token_accuracy": 0.4551724135875702, "step": 148365 }, { "epoch": 0.14943983981322287, "grad_norm": 9.724838965693143, "learning_rate": 4.879440526956556e-05, "loss": 2.436, "mean_token_accuracy": 0.42758620977401735, "step": 148370 }, { "epoch": 0.14944487586632704, "grad_norm": 10.310872502385898, "learning_rate": 4.87942842550154e-05, "loss": 2.1634, "mean_token_accuracy": 0.460496062040329, "step": 148375 }, { "epoch": 0.14944991191943122, "grad_norm": 10.677030462892514, "learning_rate": 4.8794163234559185e-05, "loss": 2.5883, "mean_token_accuracy": 0.40344826579093934, "step": 148380 }, { "epoch": 0.1494549479725354, "grad_norm": 9.613311474343398, "learning_rate": 4.879404220819696e-05, "loss": 2.0175, "mean_token_accuracy": 0.5206896483898162, "step": 148385 }, { "epoch": 0.14945998402563956, "grad_norm": 13.896996971768822, "learning_rate": 4.8793921175928755e-05, "loss": 2.9477, "mean_token_accuracy": 0.4117967367172241, "step": 148390 }, { "epoch": 0.14946502007874374, "grad_norm": 11.309013381608043, "learning_rate": 4.8793800137754595e-05, "loss": 1.8786, "mean_token_accuracy": 0.5551724135875702, "step": 148395 }, { "epoch": 0.1494700561318479, "grad_norm": 10.927716761509304, "learning_rate": 4.879367909367452e-05, "loss": 2.1021, "mean_token_accuracy": 0.4620689630508423, "step": 148400 }, { "epoch": 0.14947509218495209, "grad_norm": 9.130444915693413, "learning_rate": 4.879355804368857e-05, "loss": 2.277, "mean_token_accuracy": 0.4379310369491577, "step": 148405 }, { "epoch": 0.14948012823805623, "grad_norm": 8.930308734693071, "learning_rate": 4.879343698779676e-05, "loss": 2.3452, "mean_token_accuracy": 0.42758620381355283, "step": 148410 }, { "epoch": 0.1494851642911604, "grad_norm": 10.4499694150736, "learning_rate": 4.879331592599915e-05, "loss": 2.1978, "mean_token_accuracy": 0.4931034505367279, "step": 148415 }, { "epoch": 0.14949020034426458, "grad_norm": 11.749447215994785, "learning_rate": 4.879319485829576e-05, "loss": 2.5527, "mean_token_accuracy": 0.3896551728248596, "step": 148420 }, { "epoch": 0.14949523639736875, "grad_norm": 10.884922832677695, "learning_rate": 4.8793073784686625e-05, "loss": 2.6011, "mean_token_accuracy": 0.4379310369491577, "step": 148425 }, { "epoch": 0.14950027245047293, "grad_norm": 10.015806435975714, "learning_rate": 4.8792952705171755e-05, "loss": 2.2615, "mean_token_accuracy": 0.4068965554237366, "step": 148430 }, { "epoch": 0.1495053085035771, "grad_norm": 10.630052758406004, "learning_rate": 4.879283161975122e-05, "loss": 2.0222, "mean_token_accuracy": 0.4965517222881317, "step": 148435 }, { "epoch": 0.14951034455668127, "grad_norm": 10.98089015132552, "learning_rate": 4.879271052842504e-05, "loss": 1.9883, "mean_token_accuracy": 0.4896551728248596, "step": 148440 }, { "epoch": 0.14951538060978545, "grad_norm": 8.067609525428704, "learning_rate": 4.879258943119325e-05, "loss": 2.7161, "mean_token_accuracy": 0.384633994102478, "step": 148445 }, { "epoch": 0.14952041666288962, "grad_norm": 10.345105820653858, "learning_rate": 4.879246832805587e-05, "loss": 2.5786, "mean_token_accuracy": 0.4103448331356049, "step": 148450 }, { "epoch": 0.1495254527159938, "grad_norm": 10.10041364528631, "learning_rate": 4.879234721901295e-05, "loss": 2.1977, "mean_token_accuracy": 0.44827585816383364, "step": 148455 }, { "epoch": 0.14953048876909797, "grad_norm": 9.71605378251738, "learning_rate": 4.8792226104064524e-05, "loss": 2.3384, "mean_token_accuracy": 0.46721113920211793, "step": 148460 }, { "epoch": 0.14953552482220214, "grad_norm": 16.277063381384682, "learning_rate": 4.879210498321061e-05, "loss": 2.5945, "mean_token_accuracy": 0.47791893482208253, "step": 148465 }, { "epoch": 0.14954056087530632, "grad_norm": 12.194419489531699, "learning_rate": 4.879198385645126e-05, "loss": 2.1959, "mean_token_accuracy": 0.441379314661026, "step": 148470 }, { "epoch": 0.1495455969284105, "grad_norm": 9.71061939736646, "learning_rate": 4.879186272378649e-05, "loss": 1.931, "mean_token_accuracy": 0.5020568609237671, "step": 148475 }, { "epoch": 0.14955063298151466, "grad_norm": 13.272642274508467, "learning_rate": 4.879174158521635e-05, "loss": 2.5405, "mean_token_accuracy": 0.3620689630508423, "step": 148480 }, { "epoch": 0.14955566903461884, "grad_norm": 14.746158025462833, "learning_rate": 4.879162044074086e-05, "loss": 2.8331, "mean_token_accuracy": 0.37241379618644715, "step": 148485 }, { "epoch": 0.149560705087723, "grad_norm": 11.632739596647642, "learning_rate": 4.879149929036006e-05, "loss": 2.5489, "mean_token_accuracy": 0.4465819776058197, "step": 148490 }, { "epoch": 0.14956574114082719, "grad_norm": 9.695752195287898, "learning_rate": 4.879137813407399e-05, "loss": 1.9983, "mean_token_accuracy": 0.5, "step": 148495 }, { "epoch": 0.14957077719393136, "grad_norm": 10.519858993061627, "learning_rate": 4.879125697188267e-05, "loss": 2.488, "mean_token_accuracy": 0.41379311084747317, "step": 148500 }, { "epoch": 0.14957581324703553, "grad_norm": 10.993756191527162, "learning_rate": 4.8791135803786146e-05, "loss": 2.3292, "mean_token_accuracy": 0.46551724076271056, "step": 148505 }, { "epoch": 0.1495808493001397, "grad_norm": 8.359580108318214, "learning_rate": 4.879101462978444e-05, "loss": 2.3767, "mean_token_accuracy": 0.4310344815254211, "step": 148510 }, { "epoch": 0.14958588535324388, "grad_norm": 9.309462509988045, "learning_rate": 4.87908934498776e-05, "loss": 2.382, "mean_token_accuracy": 0.4206896543502808, "step": 148515 }, { "epoch": 0.14959092140634805, "grad_norm": 10.295602515321162, "learning_rate": 4.879077226406564e-05, "loss": 2.3324, "mean_token_accuracy": 0.4551724135875702, "step": 148520 }, { "epoch": 0.14959595745945223, "grad_norm": 10.38944902971831, "learning_rate": 4.8790651072348614e-05, "loss": 2.7604, "mean_token_accuracy": 0.4206896543502808, "step": 148525 }, { "epoch": 0.1496009935125564, "grad_norm": 9.178152618584285, "learning_rate": 4.879052987472654e-05, "loss": 2.3404, "mean_token_accuracy": 0.4068965554237366, "step": 148530 }, { "epoch": 0.14960602956566058, "grad_norm": 10.901861484426453, "learning_rate": 4.879040867119946e-05, "loss": 2.4274, "mean_token_accuracy": 0.4068965494632721, "step": 148535 }, { "epoch": 0.14961106561876475, "grad_norm": 9.739387451810444, "learning_rate": 4.879028746176741e-05, "loss": 2.3493, "mean_token_accuracy": 0.43103448748588563, "step": 148540 }, { "epoch": 0.14961610167186892, "grad_norm": 9.446004471544143, "learning_rate": 4.879016624643041e-05, "loss": 2.2514, "mean_token_accuracy": 0.45517241954803467, "step": 148545 }, { "epoch": 0.14962113772497307, "grad_norm": 10.526794921908618, "learning_rate": 4.879004502518852e-05, "loss": 2.1232, "mean_token_accuracy": 0.4784029066562653, "step": 148550 }, { "epoch": 0.14962617377807724, "grad_norm": 11.175589238857892, "learning_rate": 4.878992379804174e-05, "loss": 2.6477, "mean_token_accuracy": 0.44482759237289426, "step": 148555 }, { "epoch": 0.14963120983118142, "grad_norm": 9.543428036272735, "learning_rate": 4.878980256499013e-05, "loss": 2.2817, "mean_token_accuracy": 0.44137930274009707, "step": 148560 }, { "epoch": 0.1496362458842856, "grad_norm": 9.415113350912609, "learning_rate": 4.878968132603371e-05, "loss": 2.2891, "mean_token_accuracy": 0.42413793206214906, "step": 148565 }, { "epoch": 0.14964128193738976, "grad_norm": 11.36001779401205, "learning_rate": 4.878956008117251e-05, "loss": 2.667, "mean_token_accuracy": 0.40344828367233276, "step": 148570 }, { "epoch": 0.14964631799049394, "grad_norm": 14.881006271886568, "learning_rate": 4.8789438830406586e-05, "loss": 2.6217, "mean_token_accuracy": 0.441379314661026, "step": 148575 }, { "epoch": 0.1496513540435981, "grad_norm": 9.865572530580646, "learning_rate": 4.878931757373595e-05, "loss": 2.1517, "mean_token_accuracy": 0.443254691362381, "step": 148580 }, { "epoch": 0.14965639009670229, "grad_norm": 11.906719238054274, "learning_rate": 4.878919631116064e-05, "loss": 2.5137, "mean_token_accuracy": 0.3724137842655182, "step": 148585 }, { "epoch": 0.14966142614980646, "grad_norm": 9.502577276013145, "learning_rate": 4.878907504268069e-05, "loss": 2.2702, "mean_token_accuracy": 0.43793103098869324, "step": 148590 }, { "epoch": 0.14966646220291063, "grad_norm": 10.653360898934892, "learning_rate": 4.878895376829614e-05, "loss": 2.2837, "mean_token_accuracy": 0.4517241358757019, "step": 148595 }, { "epoch": 0.1496714982560148, "grad_norm": 10.891731410940247, "learning_rate": 4.878883248800702e-05, "loss": 2.7295, "mean_token_accuracy": 0.324137932062149, "step": 148600 }, { "epoch": 0.14967653430911898, "grad_norm": 9.086316416203646, "learning_rate": 4.8788711201813365e-05, "loss": 2.7321, "mean_token_accuracy": 0.4068965494632721, "step": 148605 }, { "epoch": 0.14968157036222315, "grad_norm": 10.601148418755962, "learning_rate": 4.878858990971521e-05, "loss": 2.6769, "mean_token_accuracy": 0.41724138259887694, "step": 148610 }, { "epoch": 0.14968660641532733, "grad_norm": 9.004200206360139, "learning_rate": 4.8788468611712566e-05, "loss": 1.7701, "mean_token_accuracy": 0.5379310190677643, "step": 148615 }, { "epoch": 0.1496916424684315, "grad_norm": 10.74539967391511, "learning_rate": 4.87883473078055e-05, "loss": 2.1114, "mean_token_accuracy": 0.45517241954803467, "step": 148620 }, { "epoch": 0.14969667852153568, "grad_norm": 10.052218926308193, "learning_rate": 4.878822599799403e-05, "loss": 2.34, "mean_token_accuracy": 0.4586206912994385, "step": 148625 }, { "epoch": 0.14970171457463985, "grad_norm": 10.529776402739493, "learning_rate": 4.8788104682278194e-05, "loss": 2.2907, "mean_token_accuracy": 0.43103448748588563, "step": 148630 }, { "epoch": 0.14970675062774402, "grad_norm": 11.32648192754511, "learning_rate": 4.878798336065802e-05, "loss": 2.508, "mean_token_accuracy": 0.42413793206214906, "step": 148635 }, { "epoch": 0.1497117866808482, "grad_norm": 10.448057935138856, "learning_rate": 4.878786203313354e-05, "loss": 2.3618, "mean_token_accuracy": 0.44482757449150084, "step": 148640 }, { "epoch": 0.14971682273395237, "grad_norm": 8.313293761136984, "learning_rate": 4.878774069970479e-05, "loss": 1.7163, "mean_token_accuracy": 0.5586206793785096, "step": 148645 }, { "epoch": 0.14972185878705654, "grad_norm": 10.557855113077451, "learning_rate": 4.878761936037182e-05, "loss": 2.2664, "mean_token_accuracy": 0.4379310369491577, "step": 148650 }, { "epoch": 0.14972689484016072, "grad_norm": 9.361030906470665, "learning_rate": 4.8787498015134644e-05, "loss": 2.1178, "mean_token_accuracy": 0.4793103516101837, "step": 148655 }, { "epoch": 0.1497319308932649, "grad_norm": 10.295678940027424, "learning_rate": 4.8787376663993296e-05, "loss": 2.6094, "mean_token_accuracy": 0.43236538767814636, "step": 148660 }, { "epoch": 0.14973696694636907, "grad_norm": 11.960768941097838, "learning_rate": 4.878725530694781e-05, "loss": 2.4738, "mean_token_accuracy": 0.39310343861579894, "step": 148665 }, { "epoch": 0.14974200299947324, "grad_norm": 9.959753009658037, "learning_rate": 4.878713394399823e-05, "loss": 2.4047, "mean_token_accuracy": 0.42068964838981626, "step": 148670 }, { "epoch": 0.1497470390525774, "grad_norm": 9.535012635855178, "learning_rate": 4.8787012575144585e-05, "loss": 2.5496, "mean_token_accuracy": 0.4310344815254211, "step": 148675 }, { "epoch": 0.1497520751056816, "grad_norm": 10.72258053360446, "learning_rate": 4.878689120038691e-05, "loss": 2.5148, "mean_token_accuracy": 0.3931034505367279, "step": 148680 }, { "epoch": 0.14975711115878576, "grad_norm": 11.355359417416457, "learning_rate": 4.878676981972523e-05, "loss": 2.313, "mean_token_accuracy": 0.422202056646347, "step": 148685 }, { "epoch": 0.1497621472118899, "grad_norm": 9.342657878267854, "learning_rate": 4.878664843315959e-05, "loss": 2.0304, "mean_token_accuracy": 0.5160314619541169, "step": 148690 }, { "epoch": 0.14976718326499408, "grad_norm": 11.44283549233883, "learning_rate": 4.8786527040690025e-05, "loss": 2.7469, "mean_token_accuracy": 0.4, "step": 148695 }, { "epoch": 0.14977221931809825, "grad_norm": 10.724944970993485, "learning_rate": 4.8786405642316554e-05, "loss": 2.0304, "mean_token_accuracy": 0.44827585220336913, "step": 148700 }, { "epoch": 0.14977725537120243, "grad_norm": 10.729697271669913, "learning_rate": 4.878628423803922e-05, "loss": 2.4441, "mean_token_accuracy": 0.4241379201412201, "step": 148705 }, { "epoch": 0.1497822914243066, "grad_norm": 11.515331184858699, "learning_rate": 4.878616282785805e-05, "loss": 2.5693, "mean_token_accuracy": 0.3965517282485962, "step": 148710 }, { "epoch": 0.14978732747741078, "grad_norm": 9.19058191607933, "learning_rate": 4.878604141177309e-05, "loss": 2.345, "mean_token_accuracy": 0.46896552443504336, "step": 148715 }, { "epoch": 0.14979236353051495, "grad_norm": 12.720433430729209, "learning_rate": 4.878591998978436e-05, "loss": 2.3439, "mean_token_accuracy": 0.4395644307136536, "step": 148720 }, { "epoch": 0.14979739958361912, "grad_norm": 9.643130474407748, "learning_rate": 4.878579856189191e-05, "loss": 2.3975, "mean_token_accuracy": 0.4103448212146759, "step": 148725 }, { "epoch": 0.1498024356367233, "grad_norm": 9.539707176881274, "learning_rate": 4.8785677128095756e-05, "loss": 2.2614, "mean_token_accuracy": 0.3896551728248596, "step": 148730 }, { "epoch": 0.14980747168982747, "grad_norm": 10.989628210174091, "learning_rate": 4.8785555688395954e-05, "loss": 2.4258, "mean_token_accuracy": 0.41379310488700866, "step": 148735 }, { "epoch": 0.14981250774293164, "grad_norm": 11.014661865481312, "learning_rate": 4.878543424279251e-05, "loss": 2.2241, "mean_token_accuracy": 0.41379310488700866, "step": 148740 }, { "epoch": 0.14981754379603582, "grad_norm": 9.115445852981367, "learning_rate": 4.878531279128548e-05, "loss": 2.8028, "mean_token_accuracy": 0.37931033968925476, "step": 148745 }, { "epoch": 0.14982257984914, "grad_norm": 9.10019966275961, "learning_rate": 4.878519133387488e-05, "loss": 1.9476, "mean_token_accuracy": 0.47586206793785096, "step": 148750 }, { "epoch": 0.14982761590224417, "grad_norm": 9.209248147045804, "learning_rate": 4.878506987056076e-05, "loss": 2.266, "mean_token_accuracy": 0.4310344815254211, "step": 148755 }, { "epoch": 0.14983265195534834, "grad_norm": 8.635951368430888, "learning_rate": 4.878494840134314e-05, "loss": 2.2878, "mean_token_accuracy": 0.39794313311576845, "step": 148760 }, { "epoch": 0.1498376880084525, "grad_norm": 11.425567283496296, "learning_rate": 4.878482692622206e-05, "loss": 2.8391, "mean_token_accuracy": 0.3896551728248596, "step": 148765 }, { "epoch": 0.1498427240615567, "grad_norm": 11.610636106879273, "learning_rate": 4.8784705445197564e-05, "loss": 2.4379, "mean_token_accuracy": 0.4896551787853241, "step": 148770 }, { "epoch": 0.14984776011466086, "grad_norm": 12.747569915168574, "learning_rate": 4.878458395826967e-05, "loss": 2.2274, "mean_token_accuracy": 0.45517241954803467, "step": 148775 }, { "epoch": 0.14985279616776503, "grad_norm": 9.773138455861067, "learning_rate": 4.878446246543842e-05, "loss": 2.2831, "mean_token_accuracy": 0.4724137902259827, "step": 148780 }, { "epoch": 0.1498578322208692, "grad_norm": 13.892887420402237, "learning_rate": 4.878434096670384e-05, "loss": 2.6799, "mean_token_accuracy": 0.4448275864124298, "step": 148785 }, { "epoch": 0.14986286827397338, "grad_norm": 11.442342608476883, "learning_rate": 4.878421946206597e-05, "loss": 2.9005, "mean_token_accuracy": 0.43103448748588563, "step": 148790 }, { "epoch": 0.14986790432707756, "grad_norm": 12.133394612618105, "learning_rate": 4.878409795152484e-05, "loss": 2.7557, "mean_token_accuracy": 0.43581366539001465, "step": 148795 }, { "epoch": 0.14987294038018173, "grad_norm": 11.272833980475172, "learning_rate": 4.878397643508049e-05, "loss": 2.4824, "mean_token_accuracy": 0.4068965494632721, "step": 148800 }, { "epoch": 0.1498779764332859, "grad_norm": 8.786581817368605, "learning_rate": 4.878385491273295e-05, "loss": 2.6901, "mean_token_accuracy": 0.43103447556495667, "step": 148805 }, { "epoch": 0.14988301248639008, "grad_norm": 10.254010243849295, "learning_rate": 4.878373338448226e-05, "loss": 2.2083, "mean_token_accuracy": 0.4413793087005615, "step": 148810 }, { "epoch": 0.14988804853949425, "grad_norm": 10.231022723134345, "learning_rate": 4.8783611850328436e-05, "loss": 2.6185, "mean_token_accuracy": 0.3931034505367279, "step": 148815 }, { "epoch": 0.14989308459259842, "grad_norm": 14.05618257685569, "learning_rate": 4.878349031027153e-05, "loss": 2.5343, "mean_token_accuracy": 0.3999999940395355, "step": 148820 }, { "epoch": 0.1498981206457026, "grad_norm": 10.071231512045767, "learning_rate": 4.878336876431156e-05, "loss": 2.2721, "mean_token_accuracy": 0.4068965494632721, "step": 148825 }, { "epoch": 0.14990315669880674, "grad_norm": 9.069083442024345, "learning_rate": 4.8783247212448575e-05, "loss": 1.8482, "mean_token_accuracy": 0.5034482657909394, "step": 148830 }, { "epoch": 0.14990819275191092, "grad_norm": 9.367930012235796, "learning_rate": 4.87831256546826e-05, "loss": 1.9385, "mean_token_accuracy": 0.5059286117553711, "step": 148835 }, { "epoch": 0.1499132288050151, "grad_norm": 11.94402885022271, "learning_rate": 4.878300409101367e-05, "loss": 2.3734, "mean_token_accuracy": 0.42413793206214906, "step": 148840 }, { "epoch": 0.14991826485811927, "grad_norm": 12.019428941739706, "learning_rate": 4.878288252144181e-05, "loss": 3.0016, "mean_token_accuracy": 0.3977011501789093, "step": 148845 }, { "epoch": 0.14992330091122344, "grad_norm": 10.197775634114915, "learning_rate": 4.878276094596708e-05, "loss": 2.5506, "mean_token_accuracy": 0.39655172228813174, "step": 148850 }, { "epoch": 0.1499283369643276, "grad_norm": 9.764477539721625, "learning_rate": 4.8782639364589485e-05, "loss": 2.2504, "mean_token_accuracy": 0.46896551847457885, "step": 148855 }, { "epoch": 0.1499333730174318, "grad_norm": 14.046883612809316, "learning_rate": 4.878251777730908e-05, "loss": 2.3325, "mean_token_accuracy": 0.46551724672317507, "step": 148860 }, { "epoch": 0.14993840907053596, "grad_norm": 9.04202385634362, "learning_rate": 4.878239618412589e-05, "loss": 2.0593, "mean_token_accuracy": 0.4517241299152374, "step": 148865 }, { "epoch": 0.14994344512364013, "grad_norm": 8.243663159017458, "learning_rate": 4.878227458503994e-05, "loss": 2.6628, "mean_token_accuracy": 0.4068965494632721, "step": 148870 }, { "epoch": 0.1499484811767443, "grad_norm": 11.426014531084967, "learning_rate": 4.878215298005127e-05, "loss": 2.5905, "mean_token_accuracy": 0.4034482777118683, "step": 148875 }, { "epoch": 0.14995351722984848, "grad_norm": 10.020135155018252, "learning_rate": 4.878203136915992e-05, "loss": 2.1685, "mean_token_accuracy": 0.46896551847457885, "step": 148880 }, { "epoch": 0.14995855328295266, "grad_norm": 11.532289313172079, "learning_rate": 4.878190975236592e-05, "loss": 2.2614, "mean_token_accuracy": 0.4551724135875702, "step": 148885 }, { "epoch": 0.14996358933605683, "grad_norm": 12.9859053630927, "learning_rate": 4.878178812966931e-05, "loss": 2.2446, "mean_token_accuracy": 0.48275862336158754, "step": 148890 }, { "epoch": 0.149968625389161, "grad_norm": 11.543856647959993, "learning_rate": 4.878166650107011e-05, "loss": 2.4612, "mean_token_accuracy": 0.4, "step": 148895 }, { "epoch": 0.14997366144226518, "grad_norm": 11.8968978190887, "learning_rate": 4.8781544866568354e-05, "loss": 2.6225, "mean_token_accuracy": 0.42758620977401735, "step": 148900 }, { "epoch": 0.14997869749536935, "grad_norm": 9.076593654845928, "learning_rate": 4.878142322616409e-05, "loss": 2.5203, "mean_token_accuracy": 0.4103448331356049, "step": 148905 }, { "epoch": 0.14998373354847352, "grad_norm": 11.38870506646091, "learning_rate": 4.878130157985734e-05, "loss": 2.1623, "mean_token_accuracy": 0.4206896543502808, "step": 148910 }, { "epoch": 0.1499887696015777, "grad_norm": 9.964757215398352, "learning_rate": 4.878117992764814e-05, "loss": 2.6638, "mean_token_accuracy": 0.42758620381355283, "step": 148915 }, { "epoch": 0.14999380565468187, "grad_norm": 11.51807322376013, "learning_rate": 4.878105826953653e-05, "loss": 2.6976, "mean_token_accuracy": 0.3793103456497192, "step": 148920 }, { "epoch": 0.14999884170778605, "grad_norm": 10.919341263421053, "learning_rate": 4.878093660552254e-05, "loss": 2.5428, "mean_token_accuracy": 0.3965517163276672, "step": 148925 }, { "epoch": 0.15000387776089022, "grad_norm": 9.707162326772812, "learning_rate": 4.87808149356062e-05, "loss": 2.3382, "mean_token_accuracy": 0.43103448748588563, "step": 148930 }, { "epoch": 0.1500089138139944, "grad_norm": 8.820594368317725, "learning_rate": 4.878069325978755e-05, "loss": 2.1495, "mean_token_accuracy": 0.4517241418361664, "step": 148935 }, { "epoch": 0.15001394986709857, "grad_norm": 9.614218955216476, "learning_rate": 4.8780571578066606e-05, "loss": 2.7179, "mean_token_accuracy": 0.42413793206214906, "step": 148940 }, { "epoch": 0.15001898592020274, "grad_norm": 7.759849633117516, "learning_rate": 4.878044989044343e-05, "loss": 2.4532, "mean_token_accuracy": 0.4379310250282288, "step": 148945 }, { "epoch": 0.15002402197330691, "grad_norm": 10.271266963285328, "learning_rate": 4.8780328196918045e-05, "loss": 2.3861, "mean_token_accuracy": 0.4448275864124298, "step": 148950 }, { "epoch": 0.1500290580264111, "grad_norm": 7.403448989981989, "learning_rate": 4.878020649749048e-05, "loss": 2.2682, "mean_token_accuracy": 0.4862069010734558, "step": 148955 }, { "epoch": 0.15003409407951526, "grad_norm": 9.291017232304474, "learning_rate": 4.8780084792160764e-05, "loss": 2.275, "mean_token_accuracy": 0.4379310250282288, "step": 148960 }, { "epoch": 0.15003913013261944, "grad_norm": 9.578114053205415, "learning_rate": 4.877996308092893e-05, "loss": 2.3154, "mean_token_accuracy": 0.42413793206214906, "step": 148965 }, { "epoch": 0.15004416618572358, "grad_norm": 9.919018049511516, "learning_rate": 4.877984136379503e-05, "loss": 2.4315, "mean_token_accuracy": 0.4413793087005615, "step": 148970 }, { "epoch": 0.15004920223882776, "grad_norm": 9.391851167547143, "learning_rate": 4.8779719640759095e-05, "loss": 2.0621, "mean_token_accuracy": 0.493103438615799, "step": 148975 }, { "epoch": 0.15005423829193193, "grad_norm": 9.44248432454044, "learning_rate": 4.877959791182113e-05, "loss": 3.3055, "mean_token_accuracy": 0.37586206793785093, "step": 148980 }, { "epoch": 0.1500592743450361, "grad_norm": 10.491930120988131, "learning_rate": 4.87794761769812e-05, "loss": 2.3414, "mean_token_accuracy": 0.39310344457626345, "step": 148985 }, { "epoch": 0.15006431039814028, "grad_norm": 10.74666710167593, "learning_rate": 4.8779354436239325e-05, "loss": 2.3343, "mean_token_accuracy": 0.4517241299152374, "step": 148990 }, { "epoch": 0.15006934645124445, "grad_norm": 9.267816190884194, "learning_rate": 4.877923268959555e-05, "loss": 2.1378, "mean_token_accuracy": 0.4862069010734558, "step": 148995 }, { "epoch": 0.15007438250434862, "grad_norm": 9.858592186187275, "learning_rate": 4.877911093704989e-05, "loss": 2.2572, "mean_token_accuracy": 0.4655172348022461, "step": 149000 }, { "epoch": 0.1500794185574528, "grad_norm": 10.757871962043437, "learning_rate": 4.87789891786024e-05, "loss": 2.1928, "mean_token_accuracy": 0.49999999403953554, "step": 149005 }, { "epoch": 0.15008445461055697, "grad_norm": 9.408616508959676, "learning_rate": 4.8778867414253096e-05, "loss": 2.4488, "mean_token_accuracy": 0.4103448212146759, "step": 149010 }, { "epoch": 0.15008949066366115, "grad_norm": 12.575984067721732, "learning_rate": 4.8778745644002014e-05, "loss": 2.5434, "mean_token_accuracy": 0.41034482717514037, "step": 149015 }, { "epoch": 0.15009452671676532, "grad_norm": 12.11592356626097, "learning_rate": 4.87786238678492e-05, "loss": 2.379, "mean_token_accuracy": 0.4172413766384125, "step": 149020 }, { "epoch": 0.1500995627698695, "grad_norm": 8.501044669073574, "learning_rate": 4.877850208579468e-05, "loss": 2.5428, "mean_token_accuracy": 0.4172413796186447, "step": 149025 }, { "epoch": 0.15010459882297367, "grad_norm": 10.395452406559038, "learning_rate": 4.877838029783848e-05, "loss": 2.3063, "mean_token_accuracy": 0.44827585816383364, "step": 149030 }, { "epoch": 0.15010963487607784, "grad_norm": 11.122363805470826, "learning_rate": 4.877825850398065e-05, "loss": 2.5179, "mean_token_accuracy": 0.42758620977401735, "step": 149035 }, { "epoch": 0.15011467092918201, "grad_norm": 11.790340827487478, "learning_rate": 4.877813670422121e-05, "loss": 2.8304, "mean_token_accuracy": 0.37241379618644715, "step": 149040 }, { "epoch": 0.1501197069822862, "grad_norm": 10.961885638719535, "learning_rate": 4.87780148985602e-05, "loss": 2.3959, "mean_token_accuracy": 0.4482758641242981, "step": 149045 }, { "epoch": 0.15012474303539036, "grad_norm": 10.516540892468106, "learning_rate": 4.877789308699766e-05, "loss": 2.0377, "mean_token_accuracy": 0.4551724076271057, "step": 149050 }, { "epoch": 0.15012977908849454, "grad_norm": 10.10650256499464, "learning_rate": 4.8777771269533615e-05, "loss": 2.4774, "mean_token_accuracy": 0.4758620738983154, "step": 149055 }, { "epoch": 0.1501348151415987, "grad_norm": 11.354013503005183, "learning_rate": 4.87776494461681e-05, "loss": 2.5554, "mean_token_accuracy": 0.37931033968925476, "step": 149060 }, { "epoch": 0.15013985119470288, "grad_norm": 10.048751204023914, "learning_rate": 4.8777527616901155e-05, "loss": 2.1382, "mean_token_accuracy": 0.44482759237289426, "step": 149065 }, { "epoch": 0.15014488724780706, "grad_norm": 12.242922317850937, "learning_rate": 4.87774057817328e-05, "loss": 2.4445, "mean_token_accuracy": 0.42068966031074523, "step": 149070 }, { "epoch": 0.15014992330091123, "grad_norm": 12.636820440373466, "learning_rate": 4.877728394066308e-05, "loss": 2.6001, "mean_token_accuracy": 0.38620689511299133, "step": 149075 }, { "epoch": 0.1501549593540154, "grad_norm": 9.973016619717942, "learning_rate": 4.877716209369202e-05, "loss": 2.1032, "mean_token_accuracy": 0.4758620738983154, "step": 149080 }, { "epoch": 0.15015999540711958, "grad_norm": 11.203408280948786, "learning_rate": 4.877704024081966e-05, "loss": 2.6174, "mean_token_accuracy": 0.3862068891525269, "step": 149085 }, { "epoch": 0.15016503146022375, "grad_norm": 10.333547039096494, "learning_rate": 4.877691838204604e-05, "loss": 2.6543, "mean_token_accuracy": 0.36551723480224607, "step": 149090 }, { "epoch": 0.15017006751332793, "grad_norm": 10.845263814813272, "learning_rate": 4.8776796517371186e-05, "loss": 2.3034, "mean_token_accuracy": 0.44652147889137267, "step": 149095 }, { "epoch": 0.1501751035664321, "grad_norm": 10.777521911190956, "learning_rate": 4.877667464679514e-05, "loss": 2.4951, "mean_token_accuracy": 0.4034482717514038, "step": 149100 }, { "epoch": 0.15018013961953627, "grad_norm": 9.960942792422149, "learning_rate": 4.877655277031792e-05, "loss": 2.5793, "mean_token_accuracy": 0.4103448301553726, "step": 149105 }, { "epoch": 0.15018517567264042, "grad_norm": 11.364894197839963, "learning_rate": 4.877643088793957e-05, "loss": 2.1251, "mean_token_accuracy": 0.41034482419490814, "step": 149110 }, { "epoch": 0.1501902117257446, "grad_norm": 13.104924108626037, "learning_rate": 4.877630899966012e-05, "loss": 3.132, "mean_token_accuracy": 0.29655172526836393, "step": 149115 }, { "epoch": 0.15019524777884877, "grad_norm": 10.310989905943005, "learning_rate": 4.877618710547961e-05, "loss": 2.5062, "mean_token_accuracy": 0.3551724135875702, "step": 149120 }, { "epoch": 0.15020028383195294, "grad_norm": 9.967542614058262, "learning_rate": 4.877606520539807e-05, "loss": 2.1026, "mean_token_accuracy": 0.4586206912994385, "step": 149125 }, { "epoch": 0.15020531988505711, "grad_norm": 10.845694508080364, "learning_rate": 4.877594329941553e-05, "loss": 2.4612, "mean_token_accuracy": 0.3896551787853241, "step": 149130 }, { "epoch": 0.1502103559381613, "grad_norm": 10.533629399924388, "learning_rate": 4.877582138753203e-05, "loss": 2.9102, "mean_token_accuracy": 0.3793103516101837, "step": 149135 }, { "epoch": 0.15021539199126546, "grad_norm": 9.404855071504524, "learning_rate": 4.8775699469747605e-05, "loss": 2.7323, "mean_token_accuracy": 0.37241379022598264, "step": 149140 }, { "epoch": 0.15022042804436964, "grad_norm": 8.008902234627529, "learning_rate": 4.877557754606227e-05, "loss": 2.2674, "mean_token_accuracy": 0.47241379618644713, "step": 149145 }, { "epoch": 0.1502254640974738, "grad_norm": 8.82197542943708, "learning_rate": 4.877545561647609e-05, "loss": 2.5263, "mean_token_accuracy": 0.4448275864124298, "step": 149150 }, { "epoch": 0.15023050015057798, "grad_norm": 8.347223630394668, "learning_rate": 4.877533368098908e-05, "loss": 2.3304, "mean_token_accuracy": 0.4329703599214554, "step": 149155 }, { "epoch": 0.15023553620368216, "grad_norm": 10.424946909484468, "learning_rate": 4.877521173960128e-05, "loss": 2.4824, "mean_token_accuracy": 0.3931034505367279, "step": 149160 }, { "epoch": 0.15024057225678633, "grad_norm": 11.320850131146, "learning_rate": 4.8775089792312725e-05, "loss": 2.5967, "mean_token_accuracy": 0.41379310488700866, "step": 149165 }, { "epoch": 0.1502456083098905, "grad_norm": 11.00300938683333, "learning_rate": 4.877496783912343e-05, "loss": 2.2146, "mean_token_accuracy": 0.4496672749519348, "step": 149170 }, { "epoch": 0.15025064436299468, "grad_norm": 10.470230954318321, "learning_rate": 4.8774845880033455e-05, "loss": 2.6855, "mean_token_accuracy": 0.42068966031074523, "step": 149175 }, { "epoch": 0.15025568041609885, "grad_norm": 13.7454723795735, "learning_rate": 4.877472391504282e-05, "loss": 2.4764, "mean_token_accuracy": 0.42758620977401735, "step": 149180 }, { "epoch": 0.15026071646920303, "grad_norm": 10.745877053318678, "learning_rate": 4.877460194415155e-05, "loss": 2.5242, "mean_token_accuracy": 0.4344827651977539, "step": 149185 }, { "epoch": 0.1502657525223072, "grad_norm": 10.848949143388364, "learning_rate": 4.87744799673597e-05, "loss": 2.3681, "mean_token_accuracy": 0.4482758641242981, "step": 149190 }, { "epoch": 0.15027078857541137, "grad_norm": 10.287839339477767, "learning_rate": 4.87743579846673e-05, "loss": 2.3982, "mean_token_accuracy": 0.458620685338974, "step": 149195 }, { "epoch": 0.15027582462851555, "grad_norm": 9.315448651543322, "learning_rate": 4.8774235996074365e-05, "loss": 2.3174, "mean_token_accuracy": 0.4275861978530884, "step": 149200 }, { "epoch": 0.15028086068161972, "grad_norm": 9.761662336587126, "learning_rate": 4.877411400158094e-05, "loss": 2.4101, "mean_token_accuracy": 0.41724138259887694, "step": 149205 }, { "epoch": 0.1502858967347239, "grad_norm": 11.528203017076272, "learning_rate": 4.877399200118707e-05, "loss": 2.3096, "mean_token_accuracy": 0.4379310429096222, "step": 149210 }, { "epoch": 0.15029093278782807, "grad_norm": 10.440056460581612, "learning_rate": 4.877386999489277e-05, "loss": 2.4287, "mean_token_accuracy": 0.3931034505367279, "step": 149215 }, { "epoch": 0.15029596884093224, "grad_norm": 8.533533323501887, "learning_rate": 4.8773747982698086e-05, "loss": 2.2564, "mean_token_accuracy": 0.4779189318418503, "step": 149220 }, { "epoch": 0.15030100489403642, "grad_norm": 10.388192434978636, "learning_rate": 4.877362596460305e-05, "loss": 2.3945, "mean_token_accuracy": 0.41034482717514037, "step": 149225 }, { "epoch": 0.1503060409471406, "grad_norm": 10.86507885932944, "learning_rate": 4.87735039406077e-05, "loss": 2.8181, "mean_token_accuracy": 0.41034482717514037, "step": 149230 }, { "epoch": 0.15031107700024476, "grad_norm": 10.066701372831748, "learning_rate": 4.877338191071205e-05, "loss": 2.9566, "mean_token_accuracy": 0.42758620977401735, "step": 149235 }, { "epoch": 0.15031611305334894, "grad_norm": 10.812436793850436, "learning_rate": 4.8773259874916165e-05, "loss": 2.5275, "mean_token_accuracy": 0.39310344457626345, "step": 149240 }, { "epoch": 0.1503211491064531, "grad_norm": 16.360855448456203, "learning_rate": 4.877313783322006e-05, "loss": 2.4582, "mean_token_accuracy": 0.4448275983333588, "step": 149245 }, { "epoch": 0.15032618515955726, "grad_norm": 11.22783409596567, "learning_rate": 4.877301578562375e-05, "loss": 2.7584, "mean_token_accuracy": 0.35862069129943847, "step": 149250 }, { "epoch": 0.15033122121266143, "grad_norm": 16.856161186586128, "learning_rate": 4.877289373212731e-05, "loss": 2.1153, "mean_token_accuracy": 0.5034482777118683, "step": 149255 }, { "epoch": 0.1503362572657656, "grad_norm": 8.961445332797537, "learning_rate": 4.877277167273075e-05, "loss": 2.3328, "mean_token_accuracy": 0.41724138259887694, "step": 149260 }, { "epoch": 0.15034129331886978, "grad_norm": 10.668934313505392, "learning_rate": 4.877264960743411e-05, "loss": 2.609, "mean_token_accuracy": 0.3827586233615875, "step": 149265 }, { "epoch": 0.15034632937197395, "grad_norm": 15.418782403901512, "learning_rate": 4.877252753623742e-05, "loss": 2.9705, "mean_token_accuracy": 0.3931034505367279, "step": 149270 }, { "epoch": 0.15035136542507813, "grad_norm": 9.610150668532713, "learning_rate": 4.87724054591407e-05, "loss": 1.9062, "mean_token_accuracy": 0.5034482657909394, "step": 149275 }, { "epoch": 0.1503564014781823, "grad_norm": 9.16564707574316, "learning_rate": 4.8772283376144015e-05, "loss": 2.26, "mean_token_accuracy": 0.4482758641242981, "step": 149280 }, { "epoch": 0.15036143753128647, "grad_norm": 9.912865727384233, "learning_rate": 4.877216128724738e-05, "loss": 2.5221, "mean_token_accuracy": 0.3896551787853241, "step": 149285 }, { "epoch": 0.15036647358439065, "grad_norm": 9.90997671815578, "learning_rate": 4.8772039192450825e-05, "loss": 2.3044, "mean_token_accuracy": 0.45015124082565305, "step": 149290 }, { "epoch": 0.15037150963749482, "grad_norm": 11.336666980570953, "learning_rate": 4.87719170917544e-05, "loss": 2.3481, "mean_token_accuracy": 0.4034482717514038, "step": 149295 }, { "epoch": 0.150376545690599, "grad_norm": 11.430705908547983, "learning_rate": 4.877179498515812e-05, "loss": 2.4726, "mean_token_accuracy": 0.40344828367233276, "step": 149300 }, { "epoch": 0.15038158174370317, "grad_norm": 11.413600274057302, "learning_rate": 4.877167287266204e-05, "loss": 2.2024, "mean_token_accuracy": 0.482758629322052, "step": 149305 }, { "epoch": 0.15038661779680734, "grad_norm": 10.866667473949668, "learning_rate": 4.877155075426617e-05, "loss": 2.4666, "mean_token_accuracy": 0.42413793206214906, "step": 149310 }, { "epoch": 0.15039165384991152, "grad_norm": 8.734110171638065, "learning_rate": 4.877142862997056e-05, "loss": 2.4321, "mean_token_accuracy": 0.4172413647174835, "step": 149315 }, { "epoch": 0.1503966899030157, "grad_norm": 9.512535632327548, "learning_rate": 4.8771306499775244e-05, "loss": 2.5164, "mean_token_accuracy": 0.35862069129943847, "step": 149320 }, { "epoch": 0.15040172595611986, "grad_norm": 8.81103727049511, "learning_rate": 4.877118436368025e-05, "loss": 2.798, "mean_token_accuracy": 0.3482758641242981, "step": 149325 }, { "epoch": 0.15040676200922404, "grad_norm": 11.204882573941534, "learning_rate": 4.877106222168562e-05, "loss": 2.4417, "mean_token_accuracy": 0.4034482777118683, "step": 149330 }, { "epoch": 0.1504117980623282, "grad_norm": 11.05799001645349, "learning_rate": 4.877094007379137e-05, "loss": 2.2249, "mean_token_accuracy": 0.4517241358757019, "step": 149335 }, { "epoch": 0.15041683411543239, "grad_norm": 10.010865410151064, "learning_rate": 4.877081791999755e-05, "loss": 2.1427, "mean_token_accuracy": 0.458620685338974, "step": 149340 }, { "epoch": 0.15042187016853656, "grad_norm": 11.702188432915667, "learning_rate": 4.8770695760304183e-05, "loss": 2.459, "mean_token_accuracy": 0.4034482717514038, "step": 149345 }, { "epoch": 0.15042690622164073, "grad_norm": 10.4820106833545, "learning_rate": 4.877057359471133e-05, "loss": 2.4436, "mean_token_accuracy": 0.3999999940395355, "step": 149350 }, { "epoch": 0.1504319422747449, "grad_norm": 11.9252868977379, "learning_rate": 4.8770451423218985e-05, "loss": 2.641, "mean_token_accuracy": 0.4206896543502808, "step": 149355 }, { "epoch": 0.15043697832784908, "grad_norm": 10.841181273833318, "learning_rate": 4.87703292458272e-05, "loss": 3.0287, "mean_token_accuracy": 0.30689655244350433, "step": 149360 }, { "epoch": 0.15044201438095325, "grad_norm": 11.468360403874234, "learning_rate": 4.8770207062536024e-05, "loss": 2.526, "mean_token_accuracy": 0.41379310488700866, "step": 149365 }, { "epoch": 0.15044705043405743, "grad_norm": 14.506096313675567, "learning_rate": 4.877008487334546e-05, "loss": 2.3074, "mean_token_accuracy": 0.42758620381355283, "step": 149370 }, { "epoch": 0.1504520864871616, "grad_norm": 10.6157349502536, "learning_rate": 4.876996267825557e-05, "loss": 2.4079, "mean_token_accuracy": 0.4330913364887238, "step": 149375 }, { "epoch": 0.15045712254026578, "grad_norm": 13.243265727732807, "learning_rate": 4.876984047726638e-05, "loss": 2.4476, "mean_token_accuracy": 0.41379310488700866, "step": 149380 }, { "epoch": 0.15046215859336995, "grad_norm": 9.444903195673113, "learning_rate": 4.876971827037791e-05, "loss": 2.2499, "mean_token_accuracy": 0.46551724672317507, "step": 149385 }, { "epoch": 0.1504671946464741, "grad_norm": 9.185852543320232, "learning_rate": 4.876959605759021e-05, "loss": 2.5892, "mean_token_accuracy": 0.37586206793785093, "step": 149390 }, { "epoch": 0.15047223069957827, "grad_norm": 10.788080496086017, "learning_rate": 4.876947383890331e-05, "loss": 2.3162, "mean_token_accuracy": 0.48620688915252686, "step": 149395 }, { "epoch": 0.15047726675268244, "grad_norm": 13.227738234343674, "learning_rate": 4.876935161431723e-05, "loss": 2.6437, "mean_token_accuracy": 0.4, "step": 149400 }, { "epoch": 0.15048230280578662, "grad_norm": 12.452748156590717, "learning_rate": 4.8769229383832024e-05, "loss": 2.7981, "mean_token_accuracy": 0.3965517163276672, "step": 149405 }, { "epoch": 0.1504873388588908, "grad_norm": 10.493279317078986, "learning_rate": 4.876910714744772e-05, "loss": 2.4059, "mean_token_accuracy": 0.41034482717514037, "step": 149410 }, { "epoch": 0.15049237491199496, "grad_norm": 9.929553696538715, "learning_rate": 4.876898490516435e-05, "loss": 2.1125, "mean_token_accuracy": 0.4517241358757019, "step": 149415 }, { "epoch": 0.15049741096509914, "grad_norm": 10.224033646320988, "learning_rate": 4.8768862656981945e-05, "loss": 2.3911, "mean_token_accuracy": 0.42758620977401735, "step": 149420 }, { "epoch": 0.1505024470182033, "grad_norm": 13.87025510305605, "learning_rate": 4.8768740402900534e-05, "loss": 2.2135, "mean_token_accuracy": 0.5160314619541169, "step": 149425 }, { "epoch": 0.15050748307130749, "grad_norm": 12.424796039630738, "learning_rate": 4.8768618142920176e-05, "loss": 2.5655, "mean_token_accuracy": 0.42068966031074523, "step": 149430 }, { "epoch": 0.15051251912441166, "grad_norm": 11.188931679050908, "learning_rate": 4.876849587704088e-05, "loss": 2.6041, "mean_token_accuracy": 0.4103448212146759, "step": 149435 }, { "epoch": 0.15051755517751583, "grad_norm": 11.335121738830493, "learning_rate": 4.876837360526268e-05, "loss": 2.4056, "mean_token_accuracy": 0.4724137902259827, "step": 149440 }, { "epoch": 0.15052259123062, "grad_norm": 9.028648895221707, "learning_rate": 4.8768251327585626e-05, "loss": 2.2789, "mean_token_accuracy": 0.4344827592372894, "step": 149445 }, { "epoch": 0.15052762728372418, "grad_norm": 12.847073322829422, "learning_rate": 4.8768129044009736e-05, "loss": 2.2681, "mean_token_accuracy": 0.47586206197738645, "step": 149450 }, { "epoch": 0.15053266333682835, "grad_norm": 9.599561657725832, "learning_rate": 4.876800675453505e-05, "loss": 2.0743, "mean_token_accuracy": 0.4810042381286621, "step": 149455 }, { "epoch": 0.15053769938993253, "grad_norm": 9.955343765867251, "learning_rate": 4.876788445916162e-05, "loss": 2.3083, "mean_token_accuracy": 0.43793103098869324, "step": 149460 }, { "epoch": 0.1505427354430367, "grad_norm": 9.338307927577622, "learning_rate": 4.876776215788945e-05, "loss": 2.3352, "mean_token_accuracy": 0.46551724672317507, "step": 149465 }, { "epoch": 0.15054777149614088, "grad_norm": 11.521789616475244, "learning_rate": 4.876763985071858e-05, "loss": 2.5044, "mean_token_accuracy": 0.4379310369491577, "step": 149470 }, { "epoch": 0.15055280754924505, "grad_norm": 9.05530161827276, "learning_rate": 4.8767517537649064e-05, "loss": 2.419, "mean_token_accuracy": 0.42758620381355283, "step": 149475 }, { "epoch": 0.15055784360234922, "grad_norm": 8.388553656012549, "learning_rate": 4.876739521868092e-05, "loss": 2.4173, "mean_token_accuracy": 0.41494253277778625, "step": 149480 }, { "epoch": 0.1505628796554534, "grad_norm": 10.079270759903098, "learning_rate": 4.876727289381418e-05, "loss": 2.3905, "mean_token_accuracy": 0.42758620381355283, "step": 149485 }, { "epoch": 0.15056791570855757, "grad_norm": 10.73941934643547, "learning_rate": 4.876715056304888e-05, "loss": 2.5387, "mean_token_accuracy": 0.3862069010734558, "step": 149490 }, { "epoch": 0.15057295176166174, "grad_norm": 10.55671533343616, "learning_rate": 4.8767028226385065e-05, "loss": 2.4625, "mean_token_accuracy": 0.42758620381355283, "step": 149495 }, { "epoch": 0.15057798781476592, "grad_norm": 10.201375510332802, "learning_rate": 4.876690588382275e-05, "loss": 2.3459, "mean_token_accuracy": 0.47241378426551817, "step": 149500 }, { "epoch": 0.1505830238678701, "grad_norm": 10.905500776732705, "learning_rate": 4.8766783535361986e-05, "loss": 2.8893, "mean_token_accuracy": 0.41379310488700866, "step": 149505 }, { "epoch": 0.15058805992097427, "grad_norm": 10.895544190398885, "learning_rate": 4.87666611810028e-05, "loss": 2.1634, "mean_token_accuracy": 0.4896551787853241, "step": 149510 }, { "epoch": 0.15059309597407844, "grad_norm": 23.294735431575923, "learning_rate": 4.876653882074523e-05, "loss": 2.5955, "mean_token_accuracy": 0.4413793087005615, "step": 149515 }, { "epoch": 0.1505981320271826, "grad_norm": 8.261015692328558, "learning_rate": 4.87664164545893e-05, "loss": 2.3241, "mean_token_accuracy": 0.4862068951129913, "step": 149520 }, { "epoch": 0.1506031680802868, "grad_norm": 13.469207705946964, "learning_rate": 4.876629408253505e-05, "loss": 2.6341, "mean_token_accuracy": 0.408771938085556, "step": 149525 }, { "epoch": 0.15060820413339093, "grad_norm": 10.451827611209271, "learning_rate": 4.876617170458252e-05, "loss": 2.6457, "mean_token_accuracy": 0.3517241358757019, "step": 149530 }, { "epoch": 0.1506132401864951, "grad_norm": 11.676499626880178, "learning_rate": 4.876604932073173e-05, "loss": 2.5571, "mean_token_accuracy": 0.3862069010734558, "step": 149535 }, { "epoch": 0.15061827623959928, "grad_norm": 15.888734576950432, "learning_rate": 4.8765926930982726e-05, "loss": 2.2606, "mean_token_accuracy": 0.48275862336158754, "step": 149540 }, { "epoch": 0.15062331229270345, "grad_norm": 10.140698797615885, "learning_rate": 4.876580453533553e-05, "loss": 2.3532, "mean_token_accuracy": 0.3965517163276672, "step": 149545 }, { "epoch": 0.15062834834580763, "grad_norm": 10.246921303055457, "learning_rate": 4.87656821337902e-05, "loss": 2.2139, "mean_token_accuracy": 0.46406533718109133, "step": 149550 }, { "epoch": 0.1506333843989118, "grad_norm": 13.345602807923415, "learning_rate": 4.876555972634675e-05, "loss": 2.5852, "mean_token_accuracy": 0.3999999940395355, "step": 149555 }, { "epoch": 0.15063842045201598, "grad_norm": 11.97479059377392, "learning_rate": 4.876543731300521e-05, "loss": 2.371, "mean_token_accuracy": 0.44482758045196535, "step": 149560 }, { "epoch": 0.15064345650512015, "grad_norm": 9.914794722641435, "learning_rate": 4.876531489376562e-05, "loss": 2.2839, "mean_token_accuracy": 0.43793103098869324, "step": 149565 }, { "epoch": 0.15064849255822432, "grad_norm": 10.36707524348504, "learning_rate": 4.8765192468628015e-05, "loss": 2.4503, "mean_token_accuracy": 0.4172413796186447, "step": 149570 }, { "epoch": 0.1506535286113285, "grad_norm": 7.952800023192369, "learning_rate": 4.876507003759244e-05, "loss": 2.3974, "mean_token_accuracy": 0.45517241954803467, "step": 149575 }, { "epoch": 0.15065856466443267, "grad_norm": 13.575020087122866, "learning_rate": 4.876494760065892e-05, "loss": 2.737, "mean_token_accuracy": 0.36896551847457887, "step": 149580 }, { "epoch": 0.15066360071753684, "grad_norm": 11.941980383149899, "learning_rate": 4.876482515782748e-05, "loss": 2.7966, "mean_token_accuracy": 0.35862069129943847, "step": 149585 }, { "epoch": 0.15066863677064102, "grad_norm": 9.96491244454929, "learning_rate": 4.876470270909816e-05, "loss": 2.2438, "mean_token_accuracy": 0.3517241388559341, "step": 149590 }, { "epoch": 0.1506736728237452, "grad_norm": 10.509370502643945, "learning_rate": 4.8764580254470994e-05, "loss": 2.3021, "mean_token_accuracy": 0.4172413766384125, "step": 149595 }, { "epoch": 0.15067870887684937, "grad_norm": 9.433669914356763, "learning_rate": 4.8764457793946024e-05, "loss": 2.5857, "mean_token_accuracy": 0.3862069010734558, "step": 149600 }, { "epoch": 0.15068374492995354, "grad_norm": 9.068087399692464, "learning_rate": 4.876433532752327e-05, "loss": 2.3172, "mean_token_accuracy": 0.43793103098869324, "step": 149605 }, { "epoch": 0.1506887809830577, "grad_norm": 9.853031068514168, "learning_rate": 4.8764212855202786e-05, "loss": 2.2964, "mean_token_accuracy": 0.44827585816383364, "step": 149610 }, { "epoch": 0.1506938170361619, "grad_norm": 8.26916054596924, "learning_rate": 4.876409037698458e-05, "loss": 2.5484, "mean_token_accuracy": 0.43781003952026365, "step": 149615 }, { "epoch": 0.15069885308926606, "grad_norm": 13.388403559134366, "learning_rate": 4.8763967892868704e-05, "loss": 2.6676, "mean_token_accuracy": 0.37241379022598264, "step": 149620 }, { "epoch": 0.15070388914237023, "grad_norm": 8.793423965927898, "learning_rate": 4.8763845402855184e-05, "loss": 2.7327, "mean_token_accuracy": 0.38965518176555636, "step": 149625 }, { "epoch": 0.1507089251954744, "grad_norm": 11.60805081152104, "learning_rate": 4.8763722906944064e-05, "loss": 2.5524, "mean_token_accuracy": 0.42413793206214906, "step": 149630 }, { "epoch": 0.15071396124857858, "grad_norm": 8.912183412341369, "learning_rate": 4.8763600405135366e-05, "loss": 2.2035, "mean_token_accuracy": 0.43559112548828127, "step": 149635 }, { "epoch": 0.15071899730168276, "grad_norm": 9.571437507568355, "learning_rate": 4.8763477897429136e-05, "loss": 2.6076, "mean_token_accuracy": 0.39655173420906065, "step": 149640 }, { "epoch": 0.15072403335478693, "grad_norm": 10.763953085530634, "learning_rate": 4.876335538382539e-05, "loss": 2.5354, "mean_token_accuracy": 0.39655172228813174, "step": 149645 }, { "epoch": 0.1507290694078911, "grad_norm": 10.4601301709877, "learning_rate": 4.876323286432418e-05, "loss": 2.179, "mean_token_accuracy": 0.43793103098869324, "step": 149650 }, { "epoch": 0.15073410546099528, "grad_norm": 19.890295078568048, "learning_rate": 4.8763110338925524e-05, "loss": 2.7891, "mean_token_accuracy": 0.4068965494632721, "step": 149655 }, { "epoch": 0.15073914151409945, "grad_norm": 12.478393594624785, "learning_rate": 4.876298780762948e-05, "loss": 2.7203, "mean_token_accuracy": 0.37241379022598264, "step": 149660 }, { "epoch": 0.15074417756720362, "grad_norm": 11.159453248831037, "learning_rate": 4.8762865270436056e-05, "loss": 2.8838, "mean_token_accuracy": 0.358620685338974, "step": 149665 }, { "epoch": 0.15074921362030777, "grad_norm": 8.064070902507428, "learning_rate": 4.87627427273453e-05, "loss": 2.2971, "mean_token_accuracy": 0.40689656138420105, "step": 149670 }, { "epoch": 0.15075424967341194, "grad_norm": 11.76727276919041, "learning_rate": 4.876262017835724e-05, "loss": 2.209, "mean_token_accuracy": 0.45862067937850953, "step": 149675 }, { "epoch": 0.15075928572651612, "grad_norm": 13.916195826948814, "learning_rate": 4.876249762347193e-05, "loss": 2.6081, "mean_token_accuracy": 0.441379314661026, "step": 149680 }, { "epoch": 0.1507643217796203, "grad_norm": 10.686591713634101, "learning_rate": 4.876237506268937e-05, "loss": 2.264, "mean_token_accuracy": 0.4379310369491577, "step": 149685 }, { "epoch": 0.15076935783272447, "grad_norm": 9.979529297950961, "learning_rate": 4.8762252496009607e-05, "loss": 2.2003, "mean_token_accuracy": 0.4862069010734558, "step": 149690 }, { "epoch": 0.15077439388582864, "grad_norm": 9.614156118623567, "learning_rate": 4.876212992343269e-05, "loss": 2.0433, "mean_token_accuracy": 0.4603448212146759, "step": 149695 }, { "epoch": 0.1507794299389328, "grad_norm": 11.738415442388206, "learning_rate": 4.876200734495864e-05, "loss": 2.2681, "mean_token_accuracy": 0.4503932178020477, "step": 149700 }, { "epoch": 0.150784465992037, "grad_norm": 11.02224262017473, "learning_rate": 4.876188476058749e-05, "loss": 2.2197, "mean_token_accuracy": 0.4655172348022461, "step": 149705 }, { "epoch": 0.15078950204514116, "grad_norm": 9.407998096189218, "learning_rate": 4.876176217031928e-05, "loss": 2.4105, "mean_token_accuracy": 0.41379310488700866, "step": 149710 }, { "epoch": 0.15079453809824533, "grad_norm": 13.314234277823637, "learning_rate": 4.876163957415404e-05, "loss": 2.6917, "mean_token_accuracy": 0.3896551728248596, "step": 149715 }, { "epoch": 0.1507995741513495, "grad_norm": 10.472740319930455, "learning_rate": 4.87615169720918e-05, "loss": 3.0116, "mean_token_accuracy": 0.4, "step": 149720 }, { "epoch": 0.15080461020445368, "grad_norm": 9.369210092147018, "learning_rate": 4.87613943641326e-05, "loss": 2.4317, "mean_token_accuracy": 0.4862068951129913, "step": 149725 }, { "epoch": 0.15080964625755786, "grad_norm": 10.097844247755216, "learning_rate": 4.876127175027648e-05, "loss": 2.7347, "mean_token_accuracy": 0.41724138259887694, "step": 149730 }, { "epoch": 0.15081468231066203, "grad_norm": 11.913214865902967, "learning_rate": 4.876114913052346e-05, "loss": 2.3606, "mean_token_accuracy": 0.4068965554237366, "step": 149735 }, { "epoch": 0.1508197183637662, "grad_norm": 10.96368015813979, "learning_rate": 4.876102650487359e-05, "loss": 2.1533, "mean_token_accuracy": 0.4620689630508423, "step": 149740 }, { "epoch": 0.15082475441687038, "grad_norm": 9.163423053906138, "learning_rate": 4.876090387332689e-05, "loss": 2.3668, "mean_token_accuracy": 0.41034482717514037, "step": 149745 }, { "epoch": 0.15082979046997455, "grad_norm": 12.09859929102039, "learning_rate": 4.876078123588339e-05, "loss": 2.5033, "mean_token_accuracy": 0.3793103456497192, "step": 149750 }, { "epoch": 0.15083482652307872, "grad_norm": 11.01466926116127, "learning_rate": 4.8760658592543146e-05, "loss": 2.0059, "mean_token_accuracy": 0.517241370677948, "step": 149755 }, { "epoch": 0.1508398625761829, "grad_norm": 9.827046095619751, "learning_rate": 4.8760535943306174e-05, "loss": 2.208, "mean_token_accuracy": 0.4310344815254211, "step": 149760 }, { "epoch": 0.15084489862928707, "grad_norm": 11.32186809045248, "learning_rate": 4.8760413288172505e-05, "loss": 2.0528, "mean_token_accuracy": 0.5, "step": 149765 }, { "epoch": 0.15084993468239125, "grad_norm": 11.874814258453007, "learning_rate": 4.87602906271422e-05, "loss": 2.3954, "mean_token_accuracy": 0.45359951853752134, "step": 149770 }, { "epoch": 0.15085497073549542, "grad_norm": 10.613733684072402, "learning_rate": 4.8760167960215256e-05, "loss": 2.5054, "mean_token_accuracy": 0.42758620977401735, "step": 149775 }, { "epoch": 0.1508600067885996, "grad_norm": 11.072250712893451, "learning_rate": 4.876004528739173e-05, "loss": 2.351, "mean_token_accuracy": 0.44827587008476255, "step": 149780 }, { "epoch": 0.15086504284170377, "grad_norm": 12.871378237029303, "learning_rate": 4.8759922608671656e-05, "loss": 2.4521, "mean_token_accuracy": 0.4379310369491577, "step": 149785 }, { "epoch": 0.15087007889480794, "grad_norm": 11.573208833020342, "learning_rate": 4.875979992405506e-05, "loss": 2.4368, "mean_token_accuracy": 0.4103448212146759, "step": 149790 }, { "epoch": 0.15087511494791211, "grad_norm": 10.250447725908355, "learning_rate": 4.875967723354198e-05, "loss": 2.3031, "mean_token_accuracy": 0.42413793206214906, "step": 149795 }, { "epoch": 0.1508801510010163, "grad_norm": 13.781562912727232, "learning_rate": 4.875955453713245e-05, "loss": 2.6232, "mean_token_accuracy": 0.3999999940395355, "step": 149800 }, { "epoch": 0.15088518705412046, "grad_norm": 9.86087302515152, "learning_rate": 4.87594318348265e-05, "loss": 2.2988, "mean_token_accuracy": 0.44827587008476255, "step": 149805 }, { "epoch": 0.1508902231072246, "grad_norm": 10.43181599833497, "learning_rate": 4.875930912662417e-05, "loss": 2.5119, "mean_token_accuracy": 0.5172413766384125, "step": 149810 }, { "epoch": 0.15089525916032878, "grad_norm": 10.752435901438233, "learning_rate": 4.875918641252549e-05, "loss": 2.7333, "mean_token_accuracy": 0.4206896543502808, "step": 149815 }, { "epoch": 0.15090029521343296, "grad_norm": 12.751389213004384, "learning_rate": 4.8759063692530496e-05, "loss": 2.243, "mean_token_accuracy": 0.44827587008476255, "step": 149820 }, { "epoch": 0.15090533126653713, "grad_norm": 11.992010897769077, "learning_rate": 4.8758940966639225e-05, "loss": 2.699, "mean_token_accuracy": 0.40490018129348754, "step": 149825 }, { "epoch": 0.1509103673196413, "grad_norm": 10.151150603179934, "learning_rate": 4.87588182348517e-05, "loss": 2.2279, "mean_token_accuracy": 0.46551724076271056, "step": 149830 }, { "epoch": 0.15091540337274548, "grad_norm": 10.177737530621329, "learning_rate": 4.8758695497167965e-05, "loss": 2.2841, "mean_token_accuracy": 0.4241379380226135, "step": 149835 }, { "epoch": 0.15092043942584965, "grad_norm": 9.183236179182716, "learning_rate": 4.875857275358806e-05, "loss": 2.2407, "mean_token_accuracy": 0.47586206793785096, "step": 149840 }, { "epoch": 0.15092547547895382, "grad_norm": 10.34303708360322, "learning_rate": 4.875845000411201e-05, "loss": 2.7097, "mean_token_accuracy": 0.38620689511299133, "step": 149845 }, { "epoch": 0.150930511532058, "grad_norm": 8.966914420759421, "learning_rate": 4.8758327248739835e-05, "loss": 2.1153, "mean_token_accuracy": 0.48275861144065857, "step": 149850 }, { "epoch": 0.15093554758516217, "grad_norm": 11.799186822662449, "learning_rate": 4.875820448747159e-05, "loss": 2.5937, "mean_token_accuracy": 0.39655172228813174, "step": 149855 }, { "epoch": 0.15094058363826635, "grad_norm": 8.60768745881344, "learning_rate": 4.87580817203073e-05, "loss": 2.072, "mean_token_accuracy": 0.47241379618644713, "step": 149860 }, { "epoch": 0.15094561969137052, "grad_norm": 13.147817082903778, "learning_rate": 4.875795894724701e-05, "loss": 2.8686, "mean_token_accuracy": 0.4344827592372894, "step": 149865 }, { "epoch": 0.1509506557444747, "grad_norm": 10.146111595814933, "learning_rate": 4.8757836168290736e-05, "loss": 2.7214, "mean_token_accuracy": 0.4172413766384125, "step": 149870 }, { "epoch": 0.15095569179757887, "grad_norm": 11.741144842181695, "learning_rate": 4.875771338343854e-05, "loss": 2.5815, "mean_token_accuracy": 0.3655172407627106, "step": 149875 }, { "epoch": 0.15096072785068304, "grad_norm": 10.74812188369407, "learning_rate": 4.875759059269042e-05, "loss": 2.5139, "mean_token_accuracy": 0.4379310429096222, "step": 149880 }, { "epoch": 0.15096576390378721, "grad_norm": 11.23036703299972, "learning_rate": 4.875746779604643e-05, "loss": 2.9112, "mean_token_accuracy": 0.35862069129943847, "step": 149885 }, { "epoch": 0.1509707999568914, "grad_norm": 10.52249335232662, "learning_rate": 4.875734499350661e-05, "loss": 2.2026, "mean_token_accuracy": 0.4551724135875702, "step": 149890 }, { "epoch": 0.15097583600999556, "grad_norm": 9.957793294884176, "learning_rate": 4.875722218507098e-05, "loss": 2.2617, "mean_token_accuracy": 0.44827585816383364, "step": 149895 }, { "epoch": 0.15098087206309974, "grad_norm": 22.814203705122754, "learning_rate": 4.875709937073958e-05, "loss": 3.1332, "mean_token_accuracy": 0.38620689511299133, "step": 149900 }, { "epoch": 0.1509859081162039, "grad_norm": 10.12616944077804, "learning_rate": 4.875697655051245e-05, "loss": 2.2424, "mean_token_accuracy": 0.46037507653236387, "step": 149905 }, { "epoch": 0.15099094416930808, "grad_norm": 11.989940470325239, "learning_rate": 4.875685372438961e-05, "loss": 2.3053, "mean_token_accuracy": 0.43678161799907683, "step": 149910 }, { "epoch": 0.15099598022241226, "grad_norm": 11.498788668223176, "learning_rate": 4.875673089237111e-05, "loss": 2.2171, "mean_token_accuracy": 0.43103448748588563, "step": 149915 }, { "epoch": 0.15100101627551643, "grad_norm": 8.686268021095499, "learning_rate": 4.875660805445697e-05, "loss": 2.2469, "mean_token_accuracy": 0.4413793087005615, "step": 149920 }, { "epoch": 0.1510060523286206, "grad_norm": 18.64208592614061, "learning_rate": 4.875648521064723e-05, "loss": 2.5046, "mean_token_accuracy": 0.39310344457626345, "step": 149925 }, { "epoch": 0.15101108838172478, "grad_norm": 10.94239788937068, "learning_rate": 4.875636236094193e-05, "loss": 2.2818, "mean_token_accuracy": 0.4310344815254211, "step": 149930 }, { "epoch": 0.15101612443482895, "grad_norm": 11.725273404964776, "learning_rate": 4.8756239505341096e-05, "loss": 2.2754, "mean_token_accuracy": 0.4448275864124298, "step": 149935 }, { "epoch": 0.15102116048793313, "grad_norm": 10.708183403675019, "learning_rate": 4.875611664384477e-05, "loss": 2.5445, "mean_token_accuracy": 0.4517241358757019, "step": 149940 }, { "epoch": 0.1510261965410373, "grad_norm": 11.75507898168539, "learning_rate": 4.875599377645297e-05, "loss": 2.2224, "mean_token_accuracy": 0.42413793206214906, "step": 149945 }, { "epoch": 0.15103123259414145, "grad_norm": 9.294074764167819, "learning_rate": 4.875587090316575e-05, "loss": 2.4482, "mean_token_accuracy": 0.4310344815254211, "step": 149950 }, { "epoch": 0.15103626864724562, "grad_norm": 10.191946578781396, "learning_rate": 4.875574802398313e-05, "loss": 2.5094, "mean_token_accuracy": 0.3981851160526276, "step": 149955 }, { "epoch": 0.1510413047003498, "grad_norm": 9.686523551946502, "learning_rate": 4.8755625138905156e-05, "loss": 2.3312, "mean_token_accuracy": 0.4896551787853241, "step": 149960 }, { "epoch": 0.15104634075345397, "grad_norm": 11.592236597962085, "learning_rate": 4.8755502247931846e-05, "loss": 2.5748, "mean_token_accuracy": 0.4503327250480652, "step": 149965 }, { "epoch": 0.15105137680655814, "grad_norm": 10.545175741716223, "learning_rate": 4.875537935106325e-05, "loss": 2.3918, "mean_token_accuracy": 0.4068965494632721, "step": 149970 }, { "epoch": 0.15105641285966231, "grad_norm": 9.994698799653452, "learning_rate": 4.8755256448299394e-05, "loss": 2.1932, "mean_token_accuracy": 0.46551724076271056, "step": 149975 }, { "epoch": 0.1510614489127665, "grad_norm": 9.878210308961304, "learning_rate": 4.875513353964031e-05, "loss": 2.4291, "mean_token_accuracy": 0.42413792610168455, "step": 149980 }, { "epoch": 0.15106648496587066, "grad_norm": 9.891552626954644, "learning_rate": 4.8755010625086026e-05, "loss": 1.9938, "mean_token_accuracy": 0.5137931048870087, "step": 149985 }, { "epoch": 0.15107152101897484, "grad_norm": 9.439133080119229, "learning_rate": 4.8754887704636606e-05, "loss": 2.626, "mean_token_accuracy": 0.42226255536079405, "step": 149990 }, { "epoch": 0.151076557072079, "grad_norm": 12.202766034641444, "learning_rate": 4.875476477829205e-05, "loss": 2.2253, "mean_token_accuracy": 0.44482757449150084, "step": 149995 }, { "epoch": 0.15108159312518318, "grad_norm": 11.572475729870138, "learning_rate": 4.875464184605241e-05, "loss": 2.3586, "mean_token_accuracy": 0.43793103098869324, "step": 150000 }, { "epoch": 0.15108662917828736, "grad_norm": 11.804893872908302, "learning_rate": 4.875451890791772e-05, "loss": 1.9869, "mean_token_accuracy": 0.5000000059604645, "step": 150005 }, { "epoch": 0.15109166523139153, "grad_norm": 8.704480657737932, "learning_rate": 4.8754395963888e-05, "loss": 2.4238, "mean_token_accuracy": 0.517241382598877, "step": 150010 }, { "epoch": 0.1510967012844957, "grad_norm": 10.845416016884803, "learning_rate": 4.87542730139633e-05, "loss": 2.5662, "mean_token_accuracy": 0.3999999940395355, "step": 150015 }, { "epoch": 0.15110173733759988, "grad_norm": 10.044540464393616, "learning_rate": 4.8754150058143646e-05, "loss": 2.6184, "mean_token_accuracy": 0.4344827592372894, "step": 150020 }, { "epoch": 0.15110677339070405, "grad_norm": 9.008269068807397, "learning_rate": 4.875402709642908e-05, "loss": 2.3114, "mean_token_accuracy": 0.4540229856967926, "step": 150025 }, { "epoch": 0.15111180944380823, "grad_norm": 9.409736321952405, "learning_rate": 4.875390412881962e-05, "loss": 2.3293, "mean_token_accuracy": 0.4275862157344818, "step": 150030 }, { "epoch": 0.1511168454969124, "grad_norm": 10.192921760209092, "learning_rate": 4.875378115531532e-05, "loss": 2.468, "mean_token_accuracy": 0.4000000059604645, "step": 150035 }, { "epoch": 0.15112188155001657, "grad_norm": 9.628749607775681, "learning_rate": 4.8753658175916206e-05, "loss": 2.3601, "mean_token_accuracy": 0.4172413766384125, "step": 150040 }, { "epoch": 0.15112691760312075, "grad_norm": 10.18243906386138, "learning_rate": 4.87535351906223e-05, "loss": 2.2478, "mean_token_accuracy": 0.4689655125141144, "step": 150045 }, { "epoch": 0.15113195365622492, "grad_norm": 10.183179336879729, "learning_rate": 4.875341219943365e-05, "loss": 2.2614, "mean_token_accuracy": 0.4482758641242981, "step": 150050 }, { "epoch": 0.1511369897093291, "grad_norm": 12.149520292714604, "learning_rate": 4.875328920235029e-05, "loss": 2.4328, "mean_token_accuracy": 0.44827585816383364, "step": 150055 }, { "epoch": 0.15114202576243327, "grad_norm": 9.441114482622565, "learning_rate": 4.875316619937225e-05, "loss": 2.0619, "mean_token_accuracy": 0.4620689630508423, "step": 150060 }, { "epoch": 0.15114706181553744, "grad_norm": 11.023222362568502, "learning_rate": 4.875304319049956e-05, "loss": 2.392, "mean_token_accuracy": 0.42413793206214906, "step": 150065 }, { "epoch": 0.15115209786864162, "grad_norm": 8.160654604508869, "learning_rate": 4.875292017573227e-05, "loss": 2.1346, "mean_token_accuracy": 0.4848154842853546, "step": 150070 }, { "epoch": 0.1511571339217458, "grad_norm": 10.713756748241426, "learning_rate": 4.87527971550704e-05, "loss": 2.2277, "mean_token_accuracy": 0.4275861978530884, "step": 150075 }, { "epoch": 0.15116216997484996, "grad_norm": 12.92849714902062, "learning_rate": 4.875267412851398e-05, "loss": 2.8905, "mean_token_accuracy": 0.3793103456497192, "step": 150080 }, { "epoch": 0.15116720602795414, "grad_norm": 10.12466140362177, "learning_rate": 4.875255109606306e-05, "loss": 2.3144, "mean_token_accuracy": 0.4379310369491577, "step": 150085 }, { "epoch": 0.15117224208105828, "grad_norm": 12.040803136784405, "learning_rate": 4.8752428057717666e-05, "loss": 2.5957, "mean_token_accuracy": 0.417241370677948, "step": 150090 }, { "epoch": 0.15117727813416246, "grad_norm": 9.918949842160067, "learning_rate": 4.875230501347782e-05, "loss": 2.3446, "mean_token_accuracy": 0.41034482717514037, "step": 150095 }, { "epoch": 0.15118231418726663, "grad_norm": 8.825961285039142, "learning_rate": 4.875218196334359e-05, "loss": 2.5387, "mean_token_accuracy": 0.4379310369491577, "step": 150100 }, { "epoch": 0.1511873502403708, "grad_norm": 10.521101926381077, "learning_rate": 4.875205890731497e-05, "loss": 2.8611, "mean_token_accuracy": 0.37241379618644715, "step": 150105 }, { "epoch": 0.15119238629347498, "grad_norm": 11.364583726901614, "learning_rate": 4.875193584539201e-05, "loss": 2.3789, "mean_token_accuracy": 0.4137930989265442, "step": 150110 }, { "epoch": 0.15119742234657915, "grad_norm": 12.29302657449953, "learning_rate": 4.875181277757475e-05, "loss": 2.9767, "mean_token_accuracy": 0.3931034505367279, "step": 150115 }, { "epoch": 0.15120245839968333, "grad_norm": 11.157772317481305, "learning_rate": 4.875168970386322e-05, "loss": 2.6389, "mean_token_accuracy": 0.3931034505367279, "step": 150120 }, { "epoch": 0.1512074944527875, "grad_norm": 9.61915305788299, "learning_rate": 4.8751566624257464e-05, "loss": 2.451, "mean_token_accuracy": 0.4206896543502808, "step": 150125 }, { "epoch": 0.15121253050589167, "grad_norm": 8.398807625686924, "learning_rate": 4.8751443538757495e-05, "loss": 2.2748, "mean_token_accuracy": 0.45862069725990295, "step": 150130 }, { "epoch": 0.15121756655899585, "grad_norm": 12.167023287774294, "learning_rate": 4.875132044736337e-05, "loss": 2.9361, "mean_token_accuracy": 0.3879612863063812, "step": 150135 }, { "epoch": 0.15122260261210002, "grad_norm": 11.881947019372472, "learning_rate": 4.8751197350075106e-05, "loss": 2.2916, "mean_token_accuracy": 0.4862068951129913, "step": 150140 }, { "epoch": 0.1512276386652042, "grad_norm": 10.945645136623938, "learning_rate": 4.8751074246892734e-05, "loss": 2.2487, "mean_token_accuracy": 0.4172413766384125, "step": 150145 }, { "epoch": 0.15123267471830837, "grad_norm": 12.90750095286037, "learning_rate": 4.875095113781631e-05, "loss": 2.3537, "mean_token_accuracy": 0.42758620977401735, "step": 150150 }, { "epoch": 0.15123771077141254, "grad_norm": 10.971493191383479, "learning_rate": 4.875082802284586e-05, "loss": 2.5497, "mean_token_accuracy": 0.4088324248790741, "step": 150155 }, { "epoch": 0.15124274682451672, "grad_norm": 9.815691802977403, "learning_rate": 4.87507049019814e-05, "loss": 1.9228, "mean_token_accuracy": 0.5241379320621491, "step": 150160 }, { "epoch": 0.1512477828776209, "grad_norm": 11.939367120445292, "learning_rate": 4.875058177522298e-05, "loss": 2.6031, "mean_token_accuracy": 0.43448275327682495, "step": 150165 }, { "epoch": 0.15125281893072506, "grad_norm": 10.768429806105537, "learning_rate": 4.875045864257064e-05, "loss": 2.2736, "mean_token_accuracy": 0.48275862336158754, "step": 150170 }, { "epoch": 0.15125785498382924, "grad_norm": 9.313215074885422, "learning_rate": 4.8750335504024396e-05, "loss": 2.2369, "mean_token_accuracy": 0.4620689630508423, "step": 150175 }, { "epoch": 0.1512628910369334, "grad_norm": 10.156919514341073, "learning_rate": 4.87502123595843e-05, "loss": 2.3537, "mean_token_accuracy": 0.44482758045196535, "step": 150180 }, { "epoch": 0.15126792709003758, "grad_norm": 11.079845153322289, "learning_rate": 4.875008920925037e-05, "loss": 2.5276, "mean_token_accuracy": 0.4068965494632721, "step": 150185 }, { "epoch": 0.15127296314314176, "grad_norm": 10.225464559437409, "learning_rate": 4.8749966053022656e-05, "loss": 2.7613, "mean_token_accuracy": 0.3793103456497192, "step": 150190 }, { "epoch": 0.15127799919624593, "grad_norm": 11.020912950327778, "learning_rate": 4.874984289090119e-05, "loss": 2.0749, "mean_token_accuracy": 0.5241379380226135, "step": 150195 }, { "epoch": 0.1512830352493501, "grad_norm": 10.386691523749422, "learning_rate": 4.874971972288599e-05, "loss": 2.515, "mean_token_accuracy": 0.4517241418361664, "step": 150200 }, { "epoch": 0.15128807130245428, "grad_norm": 8.70204916094324, "learning_rate": 4.8749596548977104e-05, "loss": 2.252, "mean_token_accuracy": 0.47931033968925474, "step": 150205 }, { "epoch": 0.15129310735555845, "grad_norm": 13.673377437268389, "learning_rate": 4.8749473369174564e-05, "loss": 2.0869, "mean_token_accuracy": 0.4735221564769745, "step": 150210 }, { "epoch": 0.15129814340866263, "grad_norm": 10.088551272021586, "learning_rate": 4.87493501834784e-05, "loss": 2.2799, "mean_token_accuracy": 0.46896551847457885, "step": 150215 }, { "epoch": 0.1513031794617668, "grad_norm": 9.158333291736222, "learning_rate": 4.874922699188866e-05, "loss": 2.0271, "mean_token_accuracy": 0.45862067937850953, "step": 150220 }, { "epoch": 0.15130821551487095, "grad_norm": 12.101635829202934, "learning_rate": 4.874910379440535e-05, "loss": 2.3815, "mean_token_accuracy": 0.42413793206214906, "step": 150225 }, { "epoch": 0.15131325156797512, "grad_norm": 12.373608911713731, "learning_rate": 4.8748980591028544e-05, "loss": 2.8683, "mean_token_accuracy": 0.420689657330513, "step": 150230 }, { "epoch": 0.1513182876210793, "grad_norm": 7.8917931854661765, "learning_rate": 4.8748857381758236e-05, "loss": 2.2322, "mean_token_accuracy": 0.4689655065536499, "step": 150235 }, { "epoch": 0.15132332367418347, "grad_norm": 12.627721669148663, "learning_rate": 4.8748734166594483e-05, "loss": 2.5509, "mean_token_accuracy": 0.42758620977401735, "step": 150240 }, { "epoch": 0.15132835972728764, "grad_norm": 10.519164249965797, "learning_rate": 4.874861094553731e-05, "loss": 2.3548, "mean_token_accuracy": 0.3896551728248596, "step": 150245 }, { "epoch": 0.15133339578039182, "grad_norm": 11.91811160381917, "learning_rate": 4.8748487718586764e-05, "loss": 2.2428, "mean_token_accuracy": 0.46551724672317507, "step": 150250 }, { "epoch": 0.151338431833496, "grad_norm": 10.283878205986278, "learning_rate": 4.874836448574287e-05, "loss": 2.2604, "mean_token_accuracy": 0.42413793206214906, "step": 150255 }, { "epoch": 0.15134346788660016, "grad_norm": 11.426007052094397, "learning_rate": 4.874824124700566e-05, "loss": 2.4846, "mean_token_accuracy": 0.42413792610168455, "step": 150260 }, { "epoch": 0.15134850393970434, "grad_norm": 9.572970064106837, "learning_rate": 4.874811800237517e-05, "loss": 2.2192, "mean_token_accuracy": 0.4275861978530884, "step": 150265 }, { "epoch": 0.1513535399928085, "grad_norm": 9.315929302011174, "learning_rate": 4.874799475185144e-05, "loss": 2.3851, "mean_token_accuracy": 0.4344827592372894, "step": 150270 }, { "epoch": 0.15135857604591268, "grad_norm": 10.571683100886027, "learning_rate": 4.874787149543449e-05, "loss": 2.2536, "mean_token_accuracy": 0.4584392011165619, "step": 150275 }, { "epoch": 0.15136361209901686, "grad_norm": 8.96391113802866, "learning_rate": 4.874774823312437e-05, "loss": 2.564, "mean_token_accuracy": 0.4344827592372894, "step": 150280 }, { "epoch": 0.15136864815212103, "grad_norm": 25.055465272034354, "learning_rate": 4.874762496492111e-05, "loss": 2.7606, "mean_token_accuracy": 0.4275861978530884, "step": 150285 }, { "epoch": 0.1513736842052252, "grad_norm": 10.58333950914564, "learning_rate": 4.874750169082474e-05, "loss": 2.4669, "mean_token_accuracy": 0.441379314661026, "step": 150290 }, { "epoch": 0.15137872025832938, "grad_norm": 8.555513739456336, "learning_rate": 4.874737841083529e-05, "loss": 2.2696, "mean_token_accuracy": 0.46206897497177124, "step": 150295 }, { "epoch": 0.15138375631143355, "grad_norm": 9.887168120500515, "learning_rate": 4.874725512495281e-05, "loss": 2.257, "mean_token_accuracy": 0.4931034564971924, "step": 150300 }, { "epoch": 0.15138879236453773, "grad_norm": 8.356905069660419, "learning_rate": 4.8747131833177315e-05, "loss": 2.1606, "mean_token_accuracy": 0.5034482657909394, "step": 150305 }, { "epoch": 0.1513938284176419, "grad_norm": 10.579210387552152, "learning_rate": 4.874700853550886e-05, "loss": 2.4557, "mean_token_accuracy": 0.4034482717514038, "step": 150310 }, { "epoch": 0.15139886447074608, "grad_norm": 9.465618154448595, "learning_rate": 4.8746885231947455e-05, "loss": 2.2165, "mean_token_accuracy": 0.44482758045196535, "step": 150315 }, { "epoch": 0.15140390052385025, "grad_norm": 8.761453061524728, "learning_rate": 4.874676192249315e-05, "loss": 2.5343, "mean_token_accuracy": 0.4000000059604645, "step": 150320 }, { "epoch": 0.15140893657695442, "grad_norm": 9.581804774231564, "learning_rate": 4.874663860714598e-05, "loss": 2.1224, "mean_token_accuracy": 0.5068965554237366, "step": 150325 }, { "epoch": 0.1514139726300586, "grad_norm": 9.945106232319056, "learning_rate": 4.874651528590599e-05, "loss": 2.3541, "mean_token_accuracy": 0.4448275864124298, "step": 150330 }, { "epoch": 0.15141900868316277, "grad_norm": 10.719467867636174, "learning_rate": 4.874639195877318e-05, "loss": 2.3878, "mean_token_accuracy": 0.44482757449150084, "step": 150335 }, { "epoch": 0.15142404473626694, "grad_norm": 12.203019194067696, "learning_rate": 4.8746268625747606e-05, "loss": 3.027, "mean_token_accuracy": 0.35317604541778563, "step": 150340 }, { "epoch": 0.15142908078937112, "grad_norm": 9.766598134636835, "learning_rate": 4.874614528682931e-05, "loss": 2.2568, "mean_token_accuracy": 0.4310344815254211, "step": 150345 }, { "epoch": 0.1514341168424753, "grad_norm": 10.817124358248575, "learning_rate": 4.87460219420183e-05, "loss": 2.2315, "mean_token_accuracy": 0.45710829496383665, "step": 150350 }, { "epoch": 0.15143915289557947, "grad_norm": 10.33077888464912, "learning_rate": 4.874589859131464e-05, "loss": 2.1726, "mean_token_accuracy": 0.44482758045196535, "step": 150355 }, { "epoch": 0.15144418894868364, "grad_norm": 9.617127992856574, "learning_rate": 4.874577523471834e-05, "loss": 2.3214, "mean_token_accuracy": 0.4379310369491577, "step": 150360 }, { "epoch": 0.15144922500178779, "grad_norm": 11.254211020887224, "learning_rate": 4.8745651872229456e-05, "loss": 2.4388, "mean_token_accuracy": 0.441379314661026, "step": 150365 }, { "epoch": 0.15145426105489196, "grad_norm": 17.469372191003337, "learning_rate": 4.874552850384801e-05, "loss": 2.1248, "mean_token_accuracy": 0.4896551609039307, "step": 150370 }, { "epoch": 0.15145929710799613, "grad_norm": 10.259845077846446, "learning_rate": 4.8745405129574035e-05, "loss": 2.1766, "mean_token_accuracy": 0.43793103098869324, "step": 150375 }, { "epoch": 0.1514643331611003, "grad_norm": 13.819621397696476, "learning_rate": 4.8745281749407566e-05, "loss": 2.9662, "mean_token_accuracy": 0.41724137365818026, "step": 150380 }, { "epoch": 0.15146936921420448, "grad_norm": 10.156105766218072, "learning_rate": 4.8745158363348634e-05, "loss": 2.4862, "mean_token_accuracy": 0.43448275327682495, "step": 150385 }, { "epoch": 0.15147440526730865, "grad_norm": 12.076582965522295, "learning_rate": 4.8745034971397294e-05, "loss": 2.3162, "mean_token_accuracy": 0.4551724135875702, "step": 150390 }, { "epoch": 0.15147944132041283, "grad_norm": 10.684685970012838, "learning_rate": 4.8744911573553546e-05, "loss": 2.7113, "mean_token_accuracy": 0.3793103456497192, "step": 150395 }, { "epoch": 0.151484477373517, "grad_norm": 12.158531400115896, "learning_rate": 4.8744788169817456e-05, "loss": 2.7915, "mean_token_accuracy": 0.44827585816383364, "step": 150400 }, { "epoch": 0.15148951342662118, "grad_norm": 9.360065953940515, "learning_rate": 4.874466476018904e-05, "loss": 2.1124, "mean_token_accuracy": 0.48275861144065857, "step": 150405 }, { "epoch": 0.15149454947972535, "grad_norm": 11.331226905079259, "learning_rate": 4.874454134466834e-05, "loss": 2.0816, "mean_token_accuracy": 0.4655172348022461, "step": 150410 }, { "epoch": 0.15149958553282952, "grad_norm": 9.679648592540959, "learning_rate": 4.874441792325538e-05, "loss": 2.3184, "mean_token_accuracy": 0.37586206793785093, "step": 150415 }, { "epoch": 0.1515046215859337, "grad_norm": 9.308470284668466, "learning_rate": 4.874429449595021e-05, "loss": 2.1625, "mean_token_accuracy": 0.45517241954803467, "step": 150420 }, { "epoch": 0.15150965763903787, "grad_norm": 9.659529000544817, "learning_rate": 4.874417106275285e-05, "loss": 2.321, "mean_token_accuracy": 0.4310344815254211, "step": 150425 }, { "epoch": 0.15151469369214204, "grad_norm": 17.556991282622622, "learning_rate": 4.874404762366334e-05, "loss": 2.564, "mean_token_accuracy": 0.4344827592372894, "step": 150430 }, { "epoch": 0.15151972974524622, "grad_norm": 10.803726976284512, "learning_rate": 4.8743924178681714e-05, "loss": 2.1544, "mean_token_accuracy": 0.4082274615764618, "step": 150435 }, { "epoch": 0.1515247657983504, "grad_norm": 9.338568646056933, "learning_rate": 4.874380072780801e-05, "loss": 2.6222, "mean_token_accuracy": 0.3793103516101837, "step": 150440 }, { "epoch": 0.15152980185145457, "grad_norm": 8.165962261337153, "learning_rate": 4.874367727104225e-05, "loss": 2.2757, "mean_token_accuracy": 0.44482759237289426, "step": 150445 }, { "epoch": 0.15153483790455874, "grad_norm": 9.43841886394729, "learning_rate": 4.8743553808384484e-05, "loss": 2.08, "mean_token_accuracy": 0.510344821214676, "step": 150450 }, { "epoch": 0.1515398739576629, "grad_norm": 14.072186571701389, "learning_rate": 4.874343033983474e-05, "loss": 2.5124, "mean_token_accuracy": 0.3896551728248596, "step": 150455 }, { "epoch": 0.1515449100107671, "grad_norm": 11.394049111164001, "learning_rate": 4.874330686539305e-05, "loss": 2.2264, "mean_token_accuracy": 0.4068965554237366, "step": 150460 }, { "epoch": 0.15154994606387126, "grad_norm": 10.346039073101949, "learning_rate": 4.874318338505945e-05, "loss": 2.3938, "mean_token_accuracy": 0.42758620381355283, "step": 150465 }, { "epoch": 0.15155498211697543, "grad_norm": 9.453980304303089, "learning_rate": 4.874305989883398e-05, "loss": 2.5905, "mean_token_accuracy": 0.4344827592372894, "step": 150470 }, { "epoch": 0.1515600181700796, "grad_norm": 13.23978943954671, "learning_rate": 4.874293640671665e-05, "loss": 2.349, "mean_token_accuracy": 0.4620689690113068, "step": 150475 }, { "epoch": 0.15156505422318378, "grad_norm": 10.258650735743732, "learning_rate": 4.874281290870753e-05, "loss": 2.4331, "mean_token_accuracy": 0.46896551847457885, "step": 150480 }, { "epoch": 0.15157009027628796, "grad_norm": 8.817047781918076, "learning_rate": 4.874268940480663e-05, "loss": 2.3779, "mean_token_accuracy": 0.42758620977401735, "step": 150485 }, { "epoch": 0.15157512632939213, "grad_norm": 8.672893836478407, "learning_rate": 4.874256589501398e-05, "loss": 2.4349, "mean_token_accuracy": 0.45517241954803467, "step": 150490 }, { "epoch": 0.1515801623824963, "grad_norm": 11.903649789741818, "learning_rate": 4.874244237932964e-05, "loss": 2.2391, "mean_token_accuracy": 0.4482758641242981, "step": 150495 }, { "epoch": 0.15158519843560048, "grad_norm": 10.746453281167769, "learning_rate": 4.8742318857753625e-05, "loss": 2.0424, "mean_token_accuracy": 0.47803992629051206, "step": 150500 }, { "epoch": 0.15159023448870462, "grad_norm": 13.562185438651952, "learning_rate": 4.874219533028598e-05, "loss": 2.3888, "mean_token_accuracy": 0.458620685338974, "step": 150505 }, { "epoch": 0.1515952705418088, "grad_norm": 9.790192076271353, "learning_rate": 4.874207179692672e-05, "loss": 2.5233, "mean_token_accuracy": 0.39655172228813174, "step": 150510 }, { "epoch": 0.15160030659491297, "grad_norm": 8.458841449175747, "learning_rate": 4.87419482576759e-05, "loss": 2.6556, "mean_token_accuracy": 0.40344828367233276, "step": 150515 }, { "epoch": 0.15160534264801714, "grad_norm": 10.722364764240057, "learning_rate": 4.874182471253355e-05, "loss": 2.4542, "mean_token_accuracy": 0.42068966031074523, "step": 150520 }, { "epoch": 0.15161037870112132, "grad_norm": 8.994659434697269, "learning_rate": 4.874170116149969e-05, "loss": 2.4861, "mean_token_accuracy": 0.4, "step": 150525 }, { "epoch": 0.1516154147542255, "grad_norm": 10.693536127125087, "learning_rate": 4.874157760457438e-05, "loss": 2.3542, "mean_token_accuracy": 0.4344827592372894, "step": 150530 }, { "epoch": 0.15162045080732967, "grad_norm": 9.007398111677427, "learning_rate": 4.874145404175762e-05, "loss": 2.2279, "mean_token_accuracy": 0.458620685338974, "step": 150535 }, { "epoch": 0.15162548686043384, "grad_norm": 9.22207650987322, "learning_rate": 4.874133047304948e-05, "loss": 2.3779, "mean_token_accuracy": 0.41379310488700866, "step": 150540 }, { "epoch": 0.151630522913538, "grad_norm": 8.807127213952073, "learning_rate": 4.874120689844997e-05, "loss": 2.2532, "mean_token_accuracy": 0.4310344815254211, "step": 150545 }, { "epoch": 0.1516355589666422, "grad_norm": 11.007965670423662, "learning_rate": 4.874108331795913e-05, "loss": 2.2826, "mean_token_accuracy": 0.4551724135875702, "step": 150550 }, { "epoch": 0.15164059501974636, "grad_norm": 10.27759008419283, "learning_rate": 4.8740959731577004e-05, "loss": 2.5944, "mean_token_accuracy": 0.4275861978530884, "step": 150555 }, { "epoch": 0.15164563107285053, "grad_norm": 8.178922110203978, "learning_rate": 4.874083613930361e-05, "loss": 2.343, "mean_token_accuracy": 0.4747731387615204, "step": 150560 }, { "epoch": 0.1516506671259547, "grad_norm": 11.133960785633366, "learning_rate": 4.8740712541139e-05, "loss": 2.4533, "mean_token_accuracy": 0.42413792610168455, "step": 150565 }, { "epoch": 0.15165570317905888, "grad_norm": 9.986264461845304, "learning_rate": 4.8740588937083194e-05, "loss": 2.6498, "mean_token_accuracy": 0.42068966031074523, "step": 150570 }, { "epoch": 0.15166073923216306, "grad_norm": 10.280098156367536, "learning_rate": 4.874046532713623e-05, "loss": 2.1286, "mean_token_accuracy": 0.475862056016922, "step": 150575 }, { "epoch": 0.15166577528526723, "grad_norm": 13.494117498302932, "learning_rate": 4.8740341711298144e-05, "loss": 2.8961, "mean_token_accuracy": 0.39310344457626345, "step": 150580 }, { "epoch": 0.1516708113383714, "grad_norm": 11.958428598222532, "learning_rate": 4.874021808956897e-05, "loss": 2.1979, "mean_token_accuracy": 0.4344827592372894, "step": 150585 }, { "epoch": 0.15167584739147558, "grad_norm": 12.82460550507251, "learning_rate": 4.8740094461948745e-05, "loss": 2.9441, "mean_token_accuracy": 0.35517241060733795, "step": 150590 }, { "epoch": 0.15168088344457975, "grad_norm": 10.587126171189466, "learning_rate": 4.87399708284375e-05, "loss": 2.0716, "mean_token_accuracy": 0.4655172348022461, "step": 150595 }, { "epoch": 0.15168591949768392, "grad_norm": 9.923382689729799, "learning_rate": 4.873984718903527e-05, "loss": 2.3636, "mean_token_accuracy": 0.4068965494632721, "step": 150600 }, { "epoch": 0.1516909555507881, "grad_norm": 13.47038608821393, "learning_rate": 4.873972354374208e-05, "loss": 2.5455, "mean_token_accuracy": 0.4344827592372894, "step": 150605 }, { "epoch": 0.15169599160389227, "grad_norm": 12.812706379878655, "learning_rate": 4.873959989255799e-05, "loss": 2.37, "mean_token_accuracy": 0.44694494009017943, "step": 150610 }, { "epoch": 0.15170102765699645, "grad_norm": 8.120043157794253, "learning_rate": 4.8739476235483004e-05, "loss": 2.5319, "mean_token_accuracy": 0.42413792908191683, "step": 150615 }, { "epoch": 0.15170606371010062, "grad_norm": 11.281864042182796, "learning_rate": 4.873935257251718e-05, "loss": 2.3366, "mean_token_accuracy": 0.46896551847457885, "step": 150620 }, { "epoch": 0.1517110997632048, "grad_norm": 12.529714941157007, "learning_rate": 4.8739228903660534e-05, "loss": 2.3125, "mean_token_accuracy": 0.47438423037528993, "step": 150625 }, { "epoch": 0.15171613581630897, "grad_norm": 8.640803934901799, "learning_rate": 4.8739105228913116e-05, "loss": 1.9594, "mean_token_accuracy": 0.47931033968925474, "step": 150630 }, { "epoch": 0.15172117186941314, "grad_norm": 10.169405481963791, "learning_rate": 4.873898154827495e-05, "loss": 2.172, "mean_token_accuracy": 0.48965516686439514, "step": 150635 }, { "epoch": 0.15172620792251731, "grad_norm": 10.757194128750637, "learning_rate": 4.873885786174607e-05, "loss": 2.2302, "mean_token_accuracy": 0.4779794216156006, "step": 150640 }, { "epoch": 0.15173124397562146, "grad_norm": 17.585003030503234, "learning_rate": 4.8738734169326515e-05, "loss": 2.0028, "mean_token_accuracy": 0.44482758045196535, "step": 150645 }, { "epoch": 0.15173628002872563, "grad_norm": 11.042385067666803, "learning_rate": 4.873861047101632e-05, "loss": 2.2474, "mean_token_accuracy": 0.4344827592372894, "step": 150650 }, { "epoch": 0.1517413160818298, "grad_norm": 8.833257855649816, "learning_rate": 4.873848676681551e-05, "loss": 2.4657, "mean_token_accuracy": 0.4344827592372894, "step": 150655 }, { "epoch": 0.15174635213493398, "grad_norm": 10.68783756984192, "learning_rate": 4.873836305672414e-05, "loss": 2.3579, "mean_token_accuracy": 0.42577131986618044, "step": 150660 }, { "epoch": 0.15175138818803816, "grad_norm": 15.986663323270122, "learning_rate": 4.8738239340742226e-05, "loss": 2.5879, "mean_token_accuracy": 0.4, "step": 150665 }, { "epoch": 0.15175642424114233, "grad_norm": 11.721582755047603, "learning_rate": 4.87381156188698e-05, "loss": 2.219, "mean_token_accuracy": 0.42068966031074523, "step": 150670 }, { "epoch": 0.1517614602942465, "grad_norm": 11.87661260623941, "learning_rate": 4.873799189110691e-05, "loss": 2.314, "mean_token_accuracy": 0.4379310250282288, "step": 150675 }, { "epoch": 0.15176649634735068, "grad_norm": 13.004632324647494, "learning_rate": 4.873786815745358e-05, "loss": 2.0984, "mean_token_accuracy": 0.5243366956710815, "step": 150680 }, { "epoch": 0.15177153240045485, "grad_norm": 9.618445936924376, "learning_rate": 4.873774441790985e-05, "loss": 2.1615, "mean_token_accuracy": 0.4773744702339172, "step": 150685 }, { "epoch": 0.15177656845355902, "grad_norm": 11.501936129113819, "learning_rate": 4.8737620672475754e-05, "loss": 2.4036, "mean_token_accuracy": 0.46551724076271056, "step": 150690 }, { "epoch": 0.1517816045066632, "grad_norm": 10.638031260349974, "learning_rate": 4.873749692115132e-05, "loss": 2.5078, "mean_token_accuracy": 0.3827586233615875, "step": 150695 }, { "epoch": 0.15178664055976737, "grad_norm": 10.828222783567954, "learning_rate": 4.87373731639366e-05, "loss": 2.4322, "mean_token_accuracy": 0.4551724135875702, "step": 150700 }, { "epoch": 0.15179167661287155, "grad_norm": 6.954232589315808, "learning_rate": 4.8737249400831595e-05, "loss": 2.1544, "mean_token_accuracy": 0.4655172348022461, "step": 150705 }, { "epoch": 0.15179671266597572, "grad_norm": 9.608329515350112, "learning_rate": 4.8737125631836366e-05, "loss": 2.4061, "mean_token_accuracy": 0.42413793206214906, "step": 150710 }, { "epoch": 0.1518017487190799, "grad_norm": 11.374239213659573, "learning_rate": 4.8737001856950945e-05, "loss": 2.3526, "mean_token_accuracy": 0.4551724135875702, "step": 150715 }, { "epoch": 0.15180678477218407, "grad_norm": 11.184258517742835, "learning_rate": 4.8736878076175364e-05, "loss": 2.3387, "mean_token_accuracy": 0.4413793087005615, "step": 150720 }, { "epoch": 0.15181182082528824, "grad_norm": 12.413705025847536, "learning_rate": 4.873675428950966e-05, "loss": 2.5459, "mean_token_accuracy": 0.4034482717514038, "step": 150725 }, { "epoch": 0.15181685687839241, "grad_norm": 10.689350988924502, "learning_rate": 4.873663049695385e-05, "loss": 2.2883, "mean_token_accuracy": 0.47749546766281126, "step": 150730 }, { "epoch": 0.1518218929314966, "grad_norm": 11.819130754000316, "learning_rate": 4.873650669850799e-05, "loss": 2.2928, "mean_token_accuracy": 0.39310344457626345, "step": 150735 }, { "epoch": 0.15182692898460076, "grad_norm": 13.166761966945314, "learning_rate": 4.873638289417211e-05, "loss": 2.5584, "mean_token_accuracy": 0.4504537105560303, "step": 150740 }, { "epoch": 0.15183196503770494, "grad_norm": 10.09324875520783, "learning_rate": 4.873625908394622e-05, "loss": 2.5139, "mean_token_accuracy": 0.43103448748588563, "step": 150745 }, { "epoch": 0.1518370010908091, "grad_norm": 9.236224476920444, "learning_rate": 4.873613526783039e-05, "loss": 2.4757, "mean_token_accuracy": 0.3793103456497192, "step": 150750 }, { "epoch": 0.15184203714391328, "grad_norm": 11.282594566292891, "learning_rate": 4.873601144582464e-05, "loss": 2.5157, "mean_token_accuracy": 0.4707199037075043, "step": 150755 }, { "epoch": 0.15184707319701746, "grad_norm": 12.387120700263967, "learning_rate": 4.873588761792899e-05, "loss": 2.6317, "mean_token_accuracy": 0.4344827592372894, "step": 150760 }, { "epoch": 0.15185210925012163, "grad_norm": 13.11763911520989, "learning_rate": 4.873576378414351e-05, "loss": 2.6105, "mean_token_accuracy": 0.441379314661026, "step": 150765 }, { "epoch": 0.1518571453032258, "grad_norm": 10.153568637920841, "learning_rate": 4.87356399444682e-05, "loss": 2.2433, "mean_token_accuracy": 0.44482759237289426, "step": 150770 }, { "epoch": 0.15186218135632998, "grad_norm": 10.007439944301805, "learning_rate": 4.8735516098903096e-05, "loss": 2.1689, "mean_token_accuracy": 0.4241379380226135, "step": 150775 }, { "epoch": 0.15186721740943415, "grad_norm": 11.40487302493936, "learning_rate": 4.8735392247448254e-05, "loss": 2.4577, "mean_token_accuracy": 0.3999999940395355, "step": 150780 }, { "epoch": 0.1518722534625383, "grad_norm": 10.81069004641041, "learning_rate": 4.873526839010369e-05, "loss": 2.3155, "mean_token_accuracy": 0.4620689690113068, "step": 150785 }, { "epoch": 0.15187728951564247, "grad_norm": 9.283858758743062, "learning_rate": 4.8735144526869456e-05, "loss": 2.6422, "mean_token_accuracy": 0.4344827592372894, "step": 150790 }, { "epoch": 0.15188232556874665, "grad_norm": 11.166442509161339, "learning_rate": 4.873502065774556e-05, "loss": 2.203, "mean_token_accuracy": 0.46551724672317507, "step": 150795 }, { "epoch": 0.15188736162185082, "grad_norm": 9.578904333460258, "learning_rate": 4.873489678273206e-05, "loss": 2.2315, "mean_token_accuracy": 0.46896551847457885, "step": 150800 }, { "epoch": 0.151892397674955, "grad_norm": 8.835839406234053, "learning_rate": 4.873477290182898e-05, "loss": 2.5571, "mean_token_accuracy": 0.4344827592372894, "step": 150805 }, { "epoch": 0.15189743372805917, "grad_norm": 12.044151655830165, "learning_rate": 4.8734649015036366e-05, "loss": 2.5559, "mean_token_accuracy": 0.4344827592372894, "step": 150810 }, { "epoch": 0.15190246978116334, "grad_norm": 9.899572861756353, "learning_rate": 4.873452512235423e-05, "loss": 2.4713, "mean_token_accuracy": 0.4379310369491577, "step": 150815 }, { "epoch": 0.15190750583426751, "grad_norm": 9.46910806652318, "learning_rate": 4.873440122378263e-05, "loss": 2.5389, "mean_token_accuracy": 0.38275861740112305, "step": 150820 }, { "epoch": 0.1519125418873717, "grad_norm": 11.552822444880906, "learning_rate": 4.873427731932158e-05, "loss": 2.2074, "mean_token_accuracy": 0.4620689690113068, "step": 150825 }, { "epoch": 0.15191757794047586, "grad_norm": 10.46456787412755, "learning_rate": 4.8734153408971125e-05, "loss": 2.3814, "mean_token_accuracy": 0.4, "step": 150830 }, { "epoch": 0.15192261399358004, "grad_norm": 10.183681970068573, "learning_rate": 4.873402949273131e-05, "loss": 2.2787, "mean_token_accuracy": 0.4931034505367279, "step": 150835 }, { "epoch": 0.1519276500466842, "grad_norm": 9.653474408396317, "learning_rate": 4.8733905570602144e-05, "loss": 2.2426, "mean_token_accuracy": 0.48965516686439514, "step": 150840 }, { "epoch": 0.15193268609978838, "grad_norm": 9.065102843908237, "learning_rate": 4.873378164258368e-05, "loss": 2.67, "mean_token_accuracy": 0.4034482777118683, "step": 150845 }, { "epoch": 0.15193772215289256, "grad_norm": 9.49958958881616, "learning_rate": 4.873365770867595e-05, "loss": 2.2855, "mean_token_accuracy": 0.4620689630508423, "step": 150850 }, { "epoch": 0.15194275820599673, "grad_norm": 10.16470988860959, "learning_rate": 4.873353376887898e-05, "loss": 2.0291, "mean_token_accuracy": 0.4620689630508423, "step": 150855 }, { "epoch": 0.1519477942591009, "grad_norm": 19.174208245390435, "learning_rate": 4.873340982319281e-05, "loss": 2.881, "mean_token_accuracy": 0.3965517163276672, "step": 150860 }, { "epoch": 0.15195283031220508, "grad_norm": 9.756136747844288, "learning_rate": 4.873328587161749e-05, "loss": 2.2686, "mean_token_accuracy": 0.4793103337287903, "step": 150865 }, { "epoch": 0.15195786636530925, "grad_norm": 9.376287546385894, "learning_rate": 4.873316191415303e-05, "loss": 2.5359, "mean_token_accuracy": 0.4172413766384125, "step": 150870 }, { "epoch": 0.15196290241841343, "grad_norm": 9.290025809661005, "learning_rate": 4.873303795079947e-05, "loss": 2.4794, "mean_token_accuracy": 0.38275861740112305, "step": 150875 }, { "epoch": 0.1519679384715176, "grad_norm": 9.66248497997102, "learning_rate": 4.873291398155685e-05, "loss": 2.1645, "mean_token_accuracy": 0.4344827651977539, "step": 150880 }, { "epoch": 0.15197297452462177, "grad_norm": 9.37619109700161, "learning_rate": 4.8732790006425193e-05, "loss": 2.0703, "mean_token_accuracy": 0.4620689690113068, "step": 150885 }, { "epoch": 0.15197801057772595, "grad_norm": 8.552158677026144, "learning_rate": 4.8732666025404546e-05, "loss": 2.4804, "mean_token_accuracy": 0.441379314661026, "step": 150890 }, { "epoch": 0.15198304663083012, "grad_norm": 11.416381163478356, "learning_rate": 4.873254203849495e-05, "loss": 2.718, "mean_token_accuracy": 0.4225045442581177, "step": 150895 }, { "epoch": 0.1519880826839343, "grad_norm": 9.7506027903717, "learning_rate": 4.873241804569642e-05, "loss": 1.977, "mean_token_accuracy": 0.42068964838981626, "step": 150900 }, { "epoch": 0.15199311873703847, "grad_norm": 9.691227767635663, "learning_rate": 4.8732294047009005e-05, "loss": 2.4689, "mean_token_accuracy": 0.4172413766384125, "step": 150905 }, { "epoch": 0.15199815479014264, "grad_norm": 8.736319444334372, "learning_rate": 4.873217004243274e-05, "loss": 2.4549, "mean_token_accuracy": 0.458620685338974, "step": 150910 }, { "epoch": 0.15200319084324682, "grad_norm": 12.25224135947201, "learning_rate": 4.873204603196764e-05, "loss": 2.7741, "mean_token_accuracy": 0.37241379618644715, "step": 150915 }, { "epoch": 0.152008226896351, "grad_norm": 9.598961387935718, "learning_rate": 4.873192201561376e-05, "loss": 2.5233, "mean_token_accuracy": 0.441379314661026, "step": 150920 }, { "epoch": 0.15201326294945514, "grad_norm": 10.988610621069487, "learning_rate": 4.873179799337113e-05, "loss": 2.524, "mean_token_accuracy": 0.4068965494632721, "step": 150925 }, { "epoch": 0.1520182990025593, "grad_norm": 8.425051200285985, "learning_rate": 4.873167396523977e-05, "loss": 2.1215, "mean_token_accuracy": 0.46551724672317507, "step": 150930 }, { "epoch": 0.15202333505566348, "grad_norm": 10.023100736401778, "learning_rate": 4.8731549931219735e-05, "loss": 2.4643, "mean_token_accuracy": 0.4103448212146759, "step": 150935 }, { "epoch": 0.15202837110876766, "grad_norm": 12.066617312862238, "learning_rate": 4.8731425891311046e-05, "loss": 2.7199, "mean_token_accuracy": 0.41379311084747317, "step": 150940 }, { "epoch": 0.15203340716187183, "grad_norm": 9.97821526886476, "learning_rate": 4.8731301845513744e-05, "loss": 2.169, "mean_token_accuracy": 0.46896551847457885, "step": 150945 }, { "epoch": 0.152038443214976, "grad_norm": 8.222836564518998, "learning_rate": 4.873117779382786e-05, "loss": 2.1493, "mean_token_accuracy": 0.4689655125141144, "step": 150950 }, { "epoch": 0.15204347926808018, "grad_norm": 12.529976976440432, "learning_rate": 4.873105373625343e-05, "loss": 2.4912, "mean_token_accuracy": 0.4137930929660797, "step": 150955 }, { "epoch": 0.15204851532118435, "grad_norm": 11.377235120576106, "learning_rate": 4.8730929672790495e-05, "loss": 2.5131, "mean_token_accuracy": 0.3999999940395355, "step": 150960 }, { "epoch": 0.15205355137428853, "grad_norm": 11.110446166584714, "learning_rate": 4.873080560343907e-05, "loss": 2.3101, "mean_token_accuracy": 0.4068965494632721, "step": 150965 }, { "epoch": 0.1520585874273927, "grad_norm": 11.856457678003226, "learning_rate": 4.8730681528199214e-05, "loss": 2.2479, "mean_token_accuracy": 0.4172413766384125, "step": 150970 }, { "epoch": 0.15206362348049687, "grad_norm": 10.168747734603619, "learning_rate": 4.873055744707094e-05, "loss": 2.0031, "mean_token_accuracy": 0.4724137902259827, "step": 150975 }, { "epoch": 0.15206865953360105, "grad_norm": 10.540583976378606, "learning_rate": 4.873043336005429e-05, "loss": 2.3692, "mean_token_accuracy": 0.43793103098869324, "step": 150980 }, { "epoch": 0.15207369558670522, "grad_norm": 9.55927376836404, "learning_rate": 4.873030926714931e-05, "loss": 2.3414, "mean_token_accuracy": 0.45862069725990295, "step": 150985 }, { "epoch": 0.1520787316398094, "grad_norm": 8.231008157035912, "learning_rate": 4.873018516835603e-05, "loss": 1.9673, "mean_token_accuracy": 0.5034482836723327, "step": 150990 }, { "epoch": 0.15208376769291357, "grad_norm": 8.640598046638395, "learning_rate": 4.8730061063674465e-05, "loss": 2.2573, "mean_token_accuracy": 0.4517241299152374, "step": 150995 }, { "epoch": 0.15208880374601774, "grad_norm": 7.973262380196957, "learning_rate": 4.872993695310466e-05, "loss": 2.0267, "mean_token_accuracy": 0.48620688915252686, "step": 151000 }, { "epoch": 0.15209383979912192, "grad_norm": 8.263058231180564, "learning_rate": 4.872981283664666e-05, "loss": 2.2512, "mean_token_accuracy": 0.441379314661026, "step": 151005 }, { "epoch": 0.1520988758522261, "grad_norm": 9.160556789446183, "learning_rate": 4.8729688714300495e-05, "loss": 2.4617, "mean_token_accuracy": 0.4206896543502808, "step": 151010 }, { "epoch": 0.15210391190533026, "grad_norm": 11.601005730049268, "learning_rate": 4.872956458606619e-05, "loss": 2.4511, "mean_token_accuracy": 0.3862068891525269, "step": 151015 }, { "epoch": 0.15210894795843444, "grad_norm": 8.937949429738923, "learning_rate": 4.87294404519438e-05, "loss": 2.4401, "mean_token_accuracy": 0.41034482717514037, "step": 151020 }, { "epoch": 0.1521139840115386, "grad_norm": 9.105207038852074, "learning_rate": 4.872931631193333e-05, "loss": 1.9159, "mean_token_accuracy": 0.5597290694713593, "step": 151025 }, { "epoch": 0.15211902006464278, "grad_norm": 19.48967502517362, "learning_rate": 4.8729192166034835e-05, "loss": 2.5433, "mean_token_accuracy": 0.4551724135875702, "step": 151030 }, { "epoch": 0.15212405611774696, "grad_norm": 10.36361148041307, "learning_rate": 4.872906801424834e-05, "loss": 2.272, "mean_token_accuracy": 0.48620688915252686, "step": 151035 }, { "epoch": 0.15212909217085113, "grad_norm": 11.153884189177703, "learning_rate": 4.872894385657389e-05, "loss": 2.3786, "mean_token_accuracy": 0.3965517282485962, "step": 151040 }, { "epoch": 0.1521341282239553, "grad_norm": 8.448549559316971, "learning_rate": 4.872881969301152e-05, "loss": 2.5053, "mean_token_accuracy": 0.417241370677948, "step": 151045 }, { "epoch": 0.15213916427705948, "grad_norm": 10.269903253532002, "learning_rate": 4.872869552356125e-05, "loss": 2.3708, "mean_token_accuracy": 0.41379310488700866, "step": 151050 }, { "epoch": 0.15214420033016365, "grad_norm": 9.53397820847752, "learning_rate": 4.872857134822312e-05, "loss": 2.2478, "mean_token_accuracy": 0.5034482717514038, "step": 151055 }, { "epoch": 0.15214923638326783, "grad_norm": 10.586792599406287, "learning_rate": 4.872844716699717e-05, "loss": 2.4764, "mean_token_accuracy": 0.4137930989265442, "step": 151060 }, { "epoch": 0.15215427243637197, "grad_norm": 11.179751580697559, "learning_rate": 4.8728322979883423e-05, "loss": 2.3748, "mean_token_accuracy": 0.441379314661026, "step": 151065 }, { "epoch": 0.15215930848947615, "grad_norm": 9.961983738649938, "learning_rate": 4.8728198786881926e-05, "loss": 2.4995, "mean_token_accuracy": 0.4137930989265442, "step": 151070 }, { "epoch": 0.15216434454258032, "grad_norm": 10.571941493260121, "learning_rate": 4.8728074587992715e-05, "loss": 2.3962, "mean_token_accuracy": 0.4896551728248596, "step": 151075 }, { "epoch": 0.1521693805956845, "grad_norm": 9.008727603390826, "learning_rate": 4.8727950383215806e-05, "loss": 2.156, "mean_token_accuracy": 0.4689655125141144, "step": 151080 }, { "epoch": 0.15217441664878867, "grad_norm": 9.753150163864436, "learning_rate": 4.872782617255125e-05, "loss": 2.2142, "mean_token_accuracy": 0.47241378426551817, "step": 151085 }, { "epoch": 0.15217945270189284, "grad_norm": 11.141523331250118, "learning_rate": 4.8727701955999095e-05, "loss": 2.4837, "mean_token_accuracy": 0.4517241418361664, "step": 151090 }, { "epoch": 0.15218448875499702, "grad_norm": 11.792727770775645, "learning_rate": 4.872757773355934e-05, "loss": 2.543, "mean_token_accuracy": 0.35862069129943847, "step": 151095 }, { "epoch": 0.1521895248081012, "grad_norm": 11.545932494197213, "learning_rate": 4.872745350523204e-05, "loss": 2.4839, "mean_token_accuracy": 0.40689654350280763, "step": 151100 }, { "epoch": 0.15219456086120536, "grad_norm": 9.85121842225319, "learning_rate": 4.872732927101723e-05, "loss": 2.5074, "mean_token_accuracy": 0.44137930274009707, "step": 151105 }, { "epoch": 0.15219959691430954, "grad_norm": 10.956718908528908, "learning_rate": 4.872720503091493e-05, "loss": 2.2291, "mean_token_accuracy": 0.48275861144065857, "step": 151110 }, { "epoch": 0.1522046329674137, "grad_norm": 11.484814754127292, "learning_rate": 4.872708078492519e-05, "loss": 2.3697, "mean_token_accuracy": 0.42758620977401735, "step": 151115 }, { "epoch": 0.15220966902051788, "grad_norm": 8.172030638190998, "learning_rate": 4.8726956533048055e-05, "loss": 2.0658, "mean_token_accuracy": 0.47241380214691164, "step": 151120 }, { "epoch": 0.15221470507362206, "grad_norm": 8.869491910141184, "learning_rate": 4.872683227528353e-05, "loss": 2.4065, "mean_token_accuracy": 0.4413793087005615, "step": 151125 }, { "epoch": 0.15221974112672623, "grad_norm": 13.832355468500554, "learning_rate": 4.8726708011631664e-05, "loss": 2.516, "mean_token_accuracy": 0.36896551847457887, "step": 151130 }, { "epoch": 0.1522247771798304, "grad_norm": 9.766679517702944, "learning_rate": 4.87265837420925e-05, "loss": 2.514, "mean_token_accuracy": 0.4137930989265442, "step": 151135 }, { "epoch": 0.15222981323293458, "grad_norm": 12.753550298149598, "learning_rate": 4.872645946666605e-05, "loss": 2.2012, "mean_token_accuracy": 0.42413792610168455, "step": 151140 }, { "epoch": 0.15223484928603875, "grad_norm": 9.571916920301375, "learning_rate": 4.8726335185352375e-05, "loss": 2.1147, "mean_token_accuracy": 0.46418632864952086, "step": 151145 }, { "epoch": 0.15223988533914293, "grad_norm": 12.480189635331628, "learning_rate": 4.872621089815149e-05, "loss": 2.4292, "mean_token_accuracy": 0.39655172228813174, "step": 151150 }, { "epoch": 0.1522449213922471, "grad_norm": 10.937704722682419, "learning_rate": 4.872608660506344e-05, "loss": 2.0482, "mean_token_accuracy": 0.4689655065536499, "step": 151155 }, { "epoch": 0.15224995744535127, "grad_norm": 11.07659338745059, "learning_rate": 4.8725962306088254e-05, "loss": 2.4514, "mean_token_accuracy": 0.4206896543502808, "step": 151160 }, { "epoch": 0.15225499349845545, "grad_norm": 11.37615556389797, "learning_rate": 4.872583800122597e-05, "loss": 2.2952, "mean_token_accuracy": 0.39310344457626345, "step": 151165 }, { "epoch": 0.15226002955155962, "grad_norm": 9.381166374574759, "learning_rate": 4.872571369047662e-05, "loss": 2.3261, "mean_token_accuracy": 0.4413793087005615, "step": 151170 }, { "epoch": 0.1522650656046638, "grad_norm": 10.054925547489749, "learning_rate": 4.872558937384024e-05, "loss": 2.4361, "mean_token_accuracy": 0.458620685338974, "step": 151175 }, { "epoch": 0.15227010165776797, "grad_norm": 11.124754873244454, "learning_rate": 4.872546505131686e-05, "loss": 2.3083, "mean_token_accuracy": 0.45862069725990295, "step": 151180 }, { "epoch": 0.15227513771087214, "grad_norm": 10.610146354795342, "learning_rate": 4.872534072290651e-05, "loss": 2.7605, "mean_token_accuracy": 0.37241379618644715, "step": 151185 }, { "epoch": 0.15228017376397632, "grad_norm": 8.559116716687027, "learning_rate": 4.872521638860925e-05, "loss": 2.2421, "mean_token_accuracy": 0.44984875321388246, "step": 151190 }, { "epoch": 0.1522852098170805, "grad_norm": 11.246755108386017, "learning_rate": 4.8725092048425094e-05, "loss": 2.2351, "mean_token_accuracy": 0.4517241358757019, "step": 151195 }, { "epoch": 0.15229024587018467, "grad_norm": 10.395795161932568, "learning_rate": 4.872496770235407e-05, "loss": 2.4224, "mean_token_accuracy": 0.42413793206214906, "step": 151200 }, { "epoch": 0.1522952819232888, "grad_norm": 8.56066293132436, "learning_rate": 4.872484335039623e-05, "loss": 2.3127, "mean_token_accuracy": 0.4344827592372894, "step": 151205 }, { "epoch": 0.15230031797639298, "grad_norm": 10.069502260910408, "learning_rate": 4.8724718992551595e-05, "loss": 2.1555, "mean_token_accuracy": 0.46721113920211793, "step": 151210 }, { "epoch": 0.15230535402949716, "grad_norm": 10.05625330771739, "learning_rate": 4.8724594628820205e-05, "loss": 2.5231, "mean_token_accuracy": 0.4517241358757019, "step": 151215 }, { "epoch": 0.15231039008260133, "grad_norm": 11.665651200013338, "learning_rate": 4.87244702592021e-05, "loss": 2.4187, "mean_token_accuracy": 0.45862067937850953, "step": 151220 }, { "epoch": 0.1523154261357055, "grad_norm": 12.054411229911091, "learning_rate": 4.8724345883697304e-05, "loss": 2.5488, "mean_token_accuracy": 0.3620689630508423, "step": 151225 }, { "epoch": 0.15232046218880968, "grad_norm": 8.949540897361521, "learning_rate": 4.872422150230585e-05, "loss": 2.3183, "mean_token_accuracy": 0.42758620381355283, "step": 151230 }, { "epoch": 0.15232549824191385, "grad_norm": 12.423064094683197, "learning_rate": 4.8724097115027794e-05, "loss": 2.7452, "mean_token_accuracy": 0.39655172228813174, "step": 151235 }, { "epoch": 0.15233053429501803, "grad_norm": 9.963564813301332, "learning_rate": 4.8723972721863155e-05, "loss": 2.6117, "mean_token_accuracy": 0.3965517282485962, "step": 151240 }, { "epoch": 0.1523355703481222, "grad_norm": 11.463395785147187, "learning_rate": 4.8723848322811955e-05, "loss": 2.4206, "mean_token_accuracy": 0.4206896543502808, "step": 151245 }, { "epoch": 0.15234060640122637, "grad_norm": 9.204748131351378, "learning_rate": 4.8723723917874255e-05, "loss": 2.3752, "mean_token_accuracy": 0.41379310488700866, "step": 151250 }, { "epoch": 0.15234564245433055, "grad_norm": 10.609299581236924, "learning_rate": 4.872359950705006e-05, "loss": 2.4623, "mean_token_accuracy": 0.4517241358757019, "step": 151255 }, { "epoch": 0.15235067850743472, "grad_norm": 9.018425755908083, "learning_rate": 4.872347509033944e-05, "loss": 2.2345, "mean_token_accuracy": 0.4709618926048279, "step": 151260 }, { "epoch": 0.1523557145605389, "grad_norm": 9.136870786321776, "learning_rate": 4.8723350667742396e-05, "loss": 2.3615, "mean_token_accuracy": 0.4172413766384125, "step": 151265 }, { "epoch": 0.15236075061364307, "grad_norm": 9.148837411963486, "learning_rate": 4.8723226239258977e-05, "loss": 2.1876, "mean_token_accuracy": 0.47241380214691164, "step": 151270 }, { "epoch": 0.15236578666674724, "grad_norm": 10.463293262802559, "learning_rate": 4.872310180488922e-05, "loss": 2.385, "mean_token_accuracy": 0.44482758045196535, "step": 151275 }, { "epoch": 0.15237082271985142, "grad_norm": 15.458488770555565, "learning_rate": 4.872297736463316e-05, "loss": 2.5012, "mean_token_accuracy": 0.4379310429096222, "step": 151280 }, { "epoch": 0.1523758587729556, "grad_norm": 10.883732114438944, "learning_rate": 4.872285291849083e-05, "loss": 2.8245, "mean_token_accuracy": 0.3896551698446274, "step": 151285 }, { "epoch": 0.15238089482605977, "grad_norm": 11.37264110961484, "learning_rate": 4.8722728466462254e-05, "loss": 2.6627, "mean_token_accuracy": 0.38275861740112305, "step": 151290 }, { "epoch": 0.15238593087916394, "grad_norm": 8.531408550975884, "learning_rate": 4.872260400854748e-05, "loss": 2.0808, "mean_token_accuracy": 0.47785844206809996, "step": 151295 }, { "epoch": 0.1523909669322681, "grad_norm": 9.829915672435417, "learning_rate": 4.8722479544746535e-05, "loss": 2.5631, "mean_token_accuracy": 0.4724137902259827, "step": 151300 }, { "epoch": 0.1523960029853723, "grad_norm": 10.240994779451803, "learning_rate": 4.8722355075059455e-05, "loss": 2.4636, "mean_token_accuracy": 0.358620685338974, "step": 151305 }, { "epoch": 0.15240103903847646, "grad_norm": 9.286018997890258, "learning_rate": 4.872223059948628e-05, "loss": 2.5043, "mean_token_accuracy": 0.4034482777118683, "step": 151310 }, { "epoch": 0.15240607509158063, "grad_norm": 7.98614348933282, "learning_rate": 4.8722106118027037e-05, "loss": 2.2528, "mean_token_accuracy": 0.3965517282485962, "step": 151315 }, { "epoch": 0.1524111111446848, "grad_norm": 10.576783960652094, "learning_rate": 4.8721981630681766e-05, "loss": 2.8685, "mean_token_accuracy": 0.3793103456497192, "step": 151320 }, { "epoch": 0.15241614719778898, "grad_norm": 9.47566822415473, "learning_rate": 4.87218571374505e-05, "loss": 2.3843, "mean_token_accuracy": 0.44482758045196535, "step": 151325 }, { "epoch": 0.15242118325089316, "grad_norm": 12.70702678932557, "learning_rate": 4.8721732638333264e-05, "loss": 2.3833, "mean_token_accuracy": 0.37586207389831544, "step": 151330 }, { "epoch": 0.15242621930399733, "grad_norm": 9.559229378472281, "learning_rate": 4.872160813333011e-05, "loss": 2.3079, "mean_token_accuracy": 0.41724138259887694, "step": 151335 }, { "epoch": 0.1524312553571015, "grad_norm": 11.21058177240816, "learning_rate": 4.872148362244107e-05, "loss": 2.6711, "mean_token_accuracy": 0.39655171036720277, "step": 151340 }, { "epoch": 0.15243629141020565, "grad_norm": 6.865840075983419, "learning_rate": 4.872135910566616e-05, "loss": 2.1928, "mean_token_accuracy": 0.45051421523094176, "step": 151345 }, { "epoch": 0.15244132746330982, "grad_norm": 8.012826219050568, "learning_rate": 4.872123458300543e-05, "loss": 1.794, "mean_token_accuracy": 0.4990385353565216, "step": 151350 }, { "epoch": 0.152446363516414, "grad_norm": 12.407393461241666, "learning_rate": 4.872111005445891e-05, "loss": 2.7145, "mean_token_accuracy": 0.3724137842655182, "step": 151355 }, { "epoch": 0.15245139956951817, "grad_norm": 10.486275789575275, "learning_rate": 4.872098552002664e-05, "loss": 2.7942, "mean_token_accuracy": 0.37586206793785093, "step": 151360 }, { "epoch": 0.15245643562262234, "grad_norm": 11.080908282816205, "learning_rate": 4.872086097970865e-05, "loss": 2.4046, "mean_token_accuracy": 0.4206896543502808, "step": 151365 }, { "epoch": 0.15246147167572652, "grad_norm": 10.730315708466055, "learning_rate": 4.872073643350498e-05, "loss": 2.3672, "mean_token_accuracy": 0.41379310488700866, "step": 151370 }, { "epoch": 0.1524665077288307, "grad_norm": 9.226592529268325, "learning_rate": 4.872061188141565e-05, "loss": 2.6013, "mean_token_accuracy": 0.4103448331356049, "step": 151375 }, { "epoch": 0.15247154378193487, "grad_norm": 9.685964780924254, "learning_rate": 4.87204873234407e-05, "loss": 2.1737, "mean_token_accuracy": 0.482758617401123, "step": 151380 }, { "epoch": 0.15247657983503904, "grad_norm": 10.196151693032158, "learning_rate": 4.872036275958018e-05, "loss": 2.3903, "mean_token_accuracy": 0.4413793087005615, "step": 151385 }, { "epoch": 0.1524816158881432, "grad_norm": 12.470920953611174, "learning_rate": 4.8720238189834116e-05, "loss": 2.4314, "mean_token_accuracy": 0.41379310488700866, "step": 151390 }, { "epoch": 0.1524866519412474, "grad_norm": 10.18988932451362, "learning_rate": 4.8720113614202527e-05, "loss": 2.4184, "mean_token_accuracy": 0.4241379380226135, "step": 151395 }, { "epoch": 0.15249168799435156, "grad_norm": 9.970728859771793, "learning_rate": 4.8719989032685474e-05, "loss": 2.6618, "mean_token_accuracy": 0.38275861740112305, "step": 151400 }, { "epoch": 0.15249672404745573, "grad_norm": 11.779190485255194, "learning_rate": 4.871986444528297e-05, "loss": 2.393, "mean_token_accuracy": 0.4448275864124298, "step": 151405 }, { "epoch": 0.1525017601005599, "grad_norm": 10.41819503820325, "learning_rate": 4.8719739851995056e-05, "loss": 2.1337, "mean_token_accuracy": 0.4551724135875702, "step": 151410 }, { "epoch": 0.15250679615366408, "grad_norm": 10.956776170226467, "learning_rate": 4.871961525282177e-05, "loss": 2.3852, "mean_token_accuracy": 0.41379310488700866, "step": 151415 }, { "epoch": 0.15251183220676826, "grad_norm": 9.908217228212681, "learning_rate": 4.871949064776315e-05, "loss": 2.6243, "mean_token_accuracy": 0.3999999910593033, "step": 151420 }, { "epoch": 0.15251686825987243, "grad_norm": 11.650979512008755, "learning_rate": 4.871936603681922e-05, "loss": 2.5913, "mean_token_accuracy": 0.39310344457626345, "step": 151425 }, { "epoch": 0.1525219043129766, "grad_norm": 10.779053234591526, "learning_rate": 4.8719241419990016e-05, "loss": 2.6788, "mean_token_accuracy": 0.4137930989265442, "step": 151430 }, { "epoch": 0.15252694036608078, "grad_norm": 9.177385145937727, "learning_rate": 4.871911679727558e-05, "loss": 2.4672, "mean_token_accuracy": 0.42758620977401735, "step": 151435 }, { "epoch": 0.15253197641918495, "grad_norm": 11.801793588825042, "learning_rate": 4.871899216867594e-05, "loss": 2.5814, "mean_token_accuracy": 0.4206896543502808, "step": 151440 }, { "epoch": 0.15253701247228912, "grad_norm": 10.252536901418951, "learning_rate": 4.8718867534191135e-05, "loss": 2.0177, "mean_token_accuracy": 0.4781609296798706, "step": 151445 }, { "epoch": 0.1525420485253933, "grad_norm": 14.765049214663536, "learning_rate": 4.8718742893821204e-05, "loss": 2.4407, "mean_token_accuracy": 0.39147005677223207, "step": 151450 }, { "epoch": 0.15254708457849747, "grad_norm": 13.904018329227313, "learning_rate": 4.871861824756617e-05, "loss": 2.3932, "mean_token_accuracy": 0.44482758045196535, "step": 151455 }, { "epoch": 0.15255212063160165, "grad_norm": 12.408712836858651, "learning_rate": 4.871849359542607e-05, "loss": 2.5695, "mean_token_accuracy": 0.38620689511299133, "step": 151460 }, { "epoch": 0.15255715668470582, "grad_norm": 9.650186047103047, "learning_rate": 4.871836893740095e-05, "loss": 2.4711, "mean_token_accuracy": 0.4103448212146759, "step": 151465 }, { "epoch": 0.15256219273781, "grad_norm": 13.717501718422508, "learning_rate": 4.871824427349083e-05, "loss": 2.4142, "mean_token_accuracy": 0.458620685338974, "step": 151470 }, { "epoch": 0.15256722879091417, "grad_norm": 11.141813123222663, "learning_rate": 4.871811960369575e-05, "loss": 2.3301, "mean_token_accuracy": 0.41379310488700866, "step": 151475 }, { "epoch": 0.15257226484401834, "grad_norm": 10.193002637409508, "learning_rate": 4.871799492801574e-05, "loss": 2.0523, "mean_token_accuracy": 0.49854809045791626, "step": 151480 }, { "epoch": 0.1525773008971225, "grad_norm": 9.021868523805702, "learning_rate": 4.871787024645085e-05, "loss": 2.3067, "mean_token_accuracy": 0.4103448212146759, "step": 151485 }, { "epoch": 0.15258233695022666, "grad_norm": 10.015896053840377, "learning_rate": 4.871774555900111e-05, "loss": 2.4058, "mean_token_accuracy": 0.4758620738983154, "step": 151490 }, { "epoch": 0.15258737300333083, "grad_norm": 10.121598657624293, "learning_rate": 4.8717620865666544e-05, "loss": 2.0974, "mean_token_accuracy": 0.46896552443504336, "step": 151495 }, { "epoch": 0.152592409056435, "grad_norm": 9.691344769911515, "learning_rate": 4.8717496166447176e-05, "loss": 2.3171, "mean_token_accuracy": 0.4068965554237366, "step": 151500 }, { "epoch": 0.15259744510953918, "grad_norm": 11.183142971157983, "learning_rate": 4.871737146134308e-05, "loss": 2.3376, "mean_token_accuracy": 0.44482759237289426, "step": 151505 }, { "epoch": 0.15260248116264336, "grad_norm": 9.510514166036772, "learning_rate": 4.871724675035425e-05, "loss": 2.1412, "mean_token_accuracy": 0.4379310369491577, "step": 151510 }, { "epoch": 0.15260751721574753, "grad_norm": 8.786903969199543, "learning_rate": 4.871712203348074e-05, "loss": 2.6494, "mean_token_accuracy": 0.4517241418361664, "step": 151515 }, { "epoch": 0.1526125532688517, "grad_norm": 10.04746745391184, "learning_rate": 4.871699731072259e-05, "loss": 2.1131, "mean_token_accuracy": 0.46551724672317507, "step": 151520 }, { "epoch": 0.15261758932195588, "grad_norm": 11.178736741802812, "learning_rate": 4.871687258207982e-05, "loss": 2.2215, "mean_token_accuracy": 0.4172413766384125, "step": 151525 }, { "epoch": 0.15262262537506005, "grad_norm": 9.301607131628963, "learning_rate": 4.871674784755247e-05, "loss": 2.639, "mean_token_accuracy": 0.36551723778247835, "step": 151530 }, { "epoch": 0.15262766142816422, "grad_norm": 9.529478469282866, "learning_rate": 4.8716623107140584e-05, "loss": 2.054, "mean_token_accuracy": 0.48275861144065857, "step": 151535 }, { "epoch": 0.1526326974812684, "grad_norm": 9.029804860064633, "learning_rate": 4.871649836084419e-05, "loss": 2.2715, "mean_token_accuracy": 0.43103447258472444, "step": 151540 }, { "epoch": 0.15263773353437257, "grad_norm": 11.3498084336476, "learning_rate": 4.871637360866331e-05, "loss": 2.4037, "mean_token_accuracy": 0.4068965494632721, "step": 151545 }, { "epoch": 0.15264276958747675, "grad_norm": 12.478862459655357, "learning_rate": 4.871624885059799e-05, "loss": 2.1751, "mean_token_accuracy": 0.441379314661026, "step": 151550 }, { "epoch": 0.15264780564058092, "grad_norm": 9.739873710484195, "learning_rate": 4.871612408664827e-05, "loss": 2.0861, "mean_token_accuracy": 0.4517241418361664, "step": 151555 }, { "epoch": 0.1526528416936851, "grad_norm": 12.934551084407365, "learning_rate": 4.871599931681418e-05, "loss": 2.7575, "mean_token_accuracy": 0.4034482717514038, "step": 151560 }, { "epoch": 0.15265787774678927, "grad_norm": 11.334478729623061, "learning_rate": 4.871587454109575e-05, "loss": 2.4468, "mean_token_accuracy": 0.4172413766384125, "step": 151565 }, { "epoch": 0.15266291379989344, "grad_norm": 12.927983394788198, "learning_rate": 4.871574975949302e-05, "loss": 2.3165, "mean_token_accuracy": 0.43793103098869324, "step": 151570 }, { "epoch": 0.15266794985299761, "grad_norm": 6.886275198753878, "learning_rate": 4.871562497200602e-05, "loss": 1.9703, "mean_token_accuracy": 0.5232304811477662, "step": 151575 }, { "epoch": 0.1526729859061018, "grad_norm": 13.185844511468096, "learning_rate": 4.871550017863479e-05, "loss": 2.2847, "mean_token_accuracy": 0.4517241418361664, "step": 151580 }, { "epoch": 0.15267802195920596, "grad_norm": 9.347999062056264, "learning_rate": 4.871537537937936e-05, "loss": 2.1035, "mean_token_accuracy": 0.47586206793785096, "step": 151585 }, { "epoch": 0.15268305801231014, "grad_norm": 10.548593734462095, "learning_rate": 4.871525057423976e-05, "loss": 2.4976, "mean_token_accuracy": 0.43103448748588563, "step": 151590 }, { "epoch": 0.1526880940654143, "grad_norm": 10.416995581507157, "learning_rate": 4.871512576321604e-05, "loss": 2.2351, "mean_token_accuracy": 0.4379310250282288, "step": 151595 }, { "epoch": 0.15269313011851848, "grad_norm": 11.387295603028846, "learning_rate": 4.8715000946308235e-05, "loss": 2.5229, "mean_token_accuracy": 0.38620689511299133, "step": 151600 }, { "epoch": 0.15269816617162266, "grad_norm": 11.667339614137786, "learning_rate": 4.871487612351635e-05, "loss": 2.0419, "mean_token_accuracy": 0.5241379380226135, "step": 151605 }, { "epoch": 0.15270320222472683, "grad_norm": 9.405809663694358, "learning_rate": 4.871475129484045e-05, "loss": 2.5109, "mean_token_accuracy": 0.441379314661026, "step": 151610 }, { "epoch": 0.152708238277831, "grad_norm": 11.530587124899066, "learning_rate": 4.871462646028056e-05, "loss": 2.3764, "mean_token_accuracy": 0.38275861740112305, "step": 151615 }, { "epoch": 0.15271327433093518, "grad_norm": 9.513767262623915, "learning_rate": 4.8714501619836714e-05, "loss": 2.181, "mean_token_accuracy": 0.4586207032203674, "step": 151620 }, { "epoch": 0.15271831038403932, "grad_norm": 9.29868485534484, "learning_rate": 4.871437677350894e-05, "loss": 2.4578, "mean_token_accuracy": 0.39999999701976774, "step": 151625 }, { "epoch": 0.1527233464371435, "grad_norm": 10.265234570587927, "learning_rate": 4.8714251921297294e-05, "loss": 2.5704, "mean_token_accuracy": 0.3827586114406586, "step": 151630 }, { "epoch": 0.15272838249024767, "grad_norm": 9.709357851501359, "learning_rate": 4.871412706320178e-05, "loss": 2.2481, "mean_token_accuracy": 0.441379314661026, "step": 151635 }, { "epoch": 0.15273341854335185, "grad_norm": 8.000562747492522, "learning_rate": 4.871400219922246e-05, "loss": 2.036, "mean_token_accuracy": 0.5091349124908447, "step": 151640 }, { "epoch": 0.15273845459645602, "grad_norm": 12.837839701697442, "learning_rate": 4.8713877329359354e-05, "loss": 2.5302, "mean_token_accuracy": 0.3999999940395355, "step": 151645 }, { "epoch": 0.1527434906495602, "grad_norm": 10.61756748720087, "learning_rate": 4.8713752453612506e-05, "loss": 2.7302, "mean_token_accuracy": 0.39655172228813174, "step": 151650 }, { "epoch": 0.15274852670266437, "grad_norm": 7.246577094775593, "learning_rate": 4.871362757198193e-05, "loss": 2.3921, "mean_token_accuracy": 0.4206896543502808, "step": 151655 }, { "epoch": 0.15275356275576854, "grad_norm": 10.725187477502589, "learning_rate": 4.8713502684467686e-05, "loss": 2.5052, "mean_token_accuracy": 0.4172413766384125, "step": 151660 }, { "epoch": 0.15275859880887271, "grad_norm": 10.952326455724945, "learning_rate": 4.871337779106979e-05, "loss": 2.3419, "mean_token_accuracy": 0.4620689690113068, "step": 151665 }, { "epoch": 0.1527636348619769, "grad_norm": 11.94202973151706, "learning_rate": 4.87132528917883e-05, "loss": 2.8008, "mean_token_accuracy": 0.4206896543502808, "step": 151670 }, { "epoch": 0.15276867091508106, "grad_norm": 9.051026554822105, "learning_rate": 4.871312798662322e-05, "loss": 2.1124, "mean_token_accuracy": 0.4793103516101837, "step": 151675 }, { "epoch": 0.15277370696818524, "grad_norm": 10.633870038916406, "learning_rate": 4.8713003075574605e-05, "loss": 3.0386, "mean_token_accuracy": 0.4068965494632721, "step": 151680 }, { "epoch": 0.1527787430212894, "grad_norm": 11.788718539534917, "learning_rate": 4.871287815864248e-05, "loss": 2.2269, "mean_token_accuracy": 0.4620689690113068, "step": 151685 }, { "epoch": 0.15278377907439358, "grad_norm": 11.175608296512673, "learning_rate": 4.871275323582689e-05, "loss": 2.3596, "mean_token_accuracy": 0.432667875289917, "step": 151690 }, { "epoch": 0.15278881512749776, "grad_norm": 12.892527846812667, "learning_rate": 4.871262830712786e-05, "loss": 2.3894, "mean_token_accuracy": 0.4464004814624786, "step": 151695 }, { "epoch": 0.15279385118060193, "grad_norm": 9.815879617138426, "learning_rate": 4.871250337254543e-05, "loss": 2.2856, "mean_token_accuracy": 0.42413793206214906, "step": 151700 }, { "epoch": 0.1527988872337061, "grad_norm": 9.977656180084065, "learning_rate": 4.871237843207964e-05, "loss": 2.2053, "mean_token_accuracy": 0.42413793206214906, "step": 151705 }, { "epoch": 0.15280392328681028, "grad_norm": 9.166777625200304, "learning_rate": 4.871225348573051e-05, "loss": 2.1497, "mean_token_accuracy": 0.5068965435028077, "step": 151710 }, { "epoch": 0.15280895933991445, "grad_norm": 9.282678263058903, "learning_rate": 4.871212853349808e-05, "loss": 1.9627, "mean_token_accuracy": 0.47586206793785096, "step": 151715 }, { "epoch": 0.15281399539301863, "grad_norm": 11.166826866246458, "learning_rate": 4.871200357538239e-05, "loss": 2.4416, "mean_token_accuracy": 0.42413792908191683, "step": 151720 }, { "epoch": 0.1528190314461228, "grad_norm": 13.727064013421689, "learning_rate": 4.871187861138348e-05, "loss": 3.2137, "mean_token_accuracy": 0.3862069070339203, "step": 151725 }, { "epoch": 0.15282406749922697, "grad_norm": 8.18788724744751, "learning_rate": 4.8711753641501366e-05, "loss": 2.2336, "mean_token_accuracy": 0.46551724076271056, "step": 151730 }, { "epoch": 0.15282910355233115, "grad_norm": 21.35450454651548, "learning_rate": 4.8711628665736095e-05, "loss": 2.5557, "mean_token_accuracy": 0.4413792997598648, "step": 151735 }, { "epoch": 0.15283413960543532, "grad_norm": 10.299272229502446, "learning_rate": 4.87115036840877e-05, "loss": 2.4002, "mean_token_accuracy": 0.4103448212146759, "step": 151740 }, { "epoch": 0.1528391756585395, "grad_norm": 11.849195906220306, "learning_rate": 4.871137869655621e-05, "loss": 2.2997, "mean_token_accuracy": 0.4464609742164612, "step": 151745 }, { "epoch": 0.15284421171164367, "grad_norm": 10.549046259772638, "learning_rate": 4.8711253703141674e-05, "loss": 2.4816, "mean_token_accuracy": 0.44482758045196535, "step": 151750 }, { "epoch": 0.15284924776474784, "grad_norm": 7.691927733556517, "learning_rate": 4.871112870384411e-05, "loss": 2.6441, "mean_token_accuracy": 0.42758620977401735, "step": 151755 }, { "epoch": 0.15285428381785202, "grad_norm": 8.574594418168173, "learning_rate": 4.871100369866357e-05, "loss": 2.386, "mean_token_accuracy": 0.4310344815254211, "step": 151760 }, { "epoch": 0.15285931987095616, "grad_norm": 9.111693549388136, "learning_rate": 4.871087868760008e-05, "loss": 2.5299, "mean_token_accuracy": 0.39310344457626345, "step": 151765 }, { "epoch": 0.15286435592406034, "grad_norm": 8.608842666669304, "learning_rate": 4.8710753670653656e-05, "loss": 2.4598, "mean_token_accuracy": 0.38965516686439516, "step": 151770 }, { "epoch": 0.1528693919771645, "grad_norm": 8.51081389050932, "learning_rate": 4.871062864782437e-05, "loss": 2.0984, "mean_token_accuracy": 0.4987900912761688, "step": 151775 }, { "epoch": 0.15287442803026868, "grad_norm": 10.581574312767009, "learning_rate": 4.871050361911223e-05, "loss": 2.5342, "mean_token_accuracy": 0.38965516686439516, "step": 151780 }, { "epoch": 0.15287946408337286, "grad_norm": 10.184842033287048, "learning_rate": 4.871037858451727e-05, "loss": 2.7637, "mean_token_accuracy": 0.3793103456497192, "step": 151785 }, { "epoch": 0.15288450013647703, "grad_norm": 8.89618753294905, "learning_rate": 4.871025354403954e-05, "loss": 2.2985, "mean_token_accuracy": 0.506896561384201, "step": 151790 }, { "epoch": 0.1528895361895812, "grad_norm": 11.40291612077752, "learning_rate": 4.871012849767906e-05, "loss": 2.8271, "mean_token_accuracy": 0.36896551549434664, "step": 151795 }, { "epoch": 0.15289457224268538, "grad_norm": 10.671323069098571, "learning_rate": 4.8710003445435885e-05, "loss": 2.2116, "mean_token_accuracy": 0.5050816655158996, "step": 151800 }, { "epoch": 0.15289960829578955, "grad_norm": 11.042671145085544, "learning_rate": 4.870987838731003e-05, "loss": 2.5248, "mean_token_accuracy": 0.42413793206214906, "step": 151805 }, { "epoch": 0.15290464434889373, "grad_norm": 11.431738991206663, "learning_rate": 4.8709753323301536e-05, "loss": 2.3597, "mean_token_accuracy": 0.47586206197738645, "step": 151810 }, { "epoch": 0.1529096804019979, "grad_norm": 9.870418688214734, "learning_rate": 4.8709628253410433e-05, "loss": 2.4948, "mean_token_accuracy": 0.4586206912994385, "step": 151815 }, { "epoch": 0.15291471645510207, "grad_norm": 10.262012387535739, "learning_rate": 4.870950317763677e-05, "loss": 2.4061, "mean_token_accuracy": 0.41034482717514037, "step": 151820 }, { "epoch": 0.15291975250820625, "grad_norm": 9.90876377918367, "learning_rate": 4.870937809598056e-05, "loss": 2.3683, "mean_token_accuracy": 0.4620689570903778, "step": 151825 }, { "epoch": 0.15292478856131042, "grad_norm": 10.241089527551376, "learning_rate": 4.870925300844186e-05, "loss": 2.5011, "mean_token_accuracy": 0.41905626058578493, "step": 151830 }, { "epoch": 0.1529298246144146, "grad_norm": 13.773002869600292, "learning_rate": 4.870912791502069e-05, "loss": 2.4792, "mean_token_accuracy": 0.41379310488700866, "step": 151835 }, { "epoch": 0.15293486066751877, "grad_norm": 10.888111416290574, "learning_rate": 4.8709002815717085e-05, "loss": 2.3794, "mean_token_accuracy": 0.41724138259887694, "step": 151840 }, { "epoch": 0.15293989672062294, "grad_norm": 10.939115903693443, "learning_rate": 4.8708877710531095e-05, "loss": 2.3927, "mean_token_accuracy": 0.46061705946922304, "step": 151845 }, { "epoch": 0.15294493277372712, "grad_norm": 10.748434161322365, "learning_rate": 4.870875259946274e-05, "loss": 2.5535, "mean_token_accuracy": 0.38275861740112305, "step": 151850 }, { "epoch": 0.1529499688268313, "grad_norm": 9.78189905866777, "learning_rate": 4.870862748251205e-05, "loss": 2.2492, "mean_token_accuracy": 0.46896552443504336, "step": 151855 }, { "epoch": 0.15295500487993546, "grad_norm": 12.131439172240118, "learning_rate": 4.870850235967908e-05, "loss": 2.0645, "mean_token_accuracy": 0.5103448152542114, "step": 151860 }, { "epoch": 0.15296004093303964, "grad_norm": 11.884658880384821, "learning_rate": 4.870837723096385e-05, "loss": 2.7462, "mean_token_accuracy": 0.3655172407627106, "step": 151865 }, { "epoch": 0.1529650769861438, "grad_norm": 10.468612699040388, "learning_rate": 4.870825209636639e-05, "loss": 2.3633, "mean_token_accuracy": 0.4482758641242981, "step": 151870 }, { "epoch": 0.15297011303924798, "grad_norm": 9.603526905565396, "learning_rate": 4.8708126955886744e-05, "loss": 2.062, "mean_token_accuracy": 0.48275862336158754, "step": 151875 }, { "epoch": 0.15297514909235216, "grad_norm": 11.437606666558919, "learning_rate": 4.870800180952495e-05, "loss": 2.7419, "mean_token_accuracy": 0.379310342669487, "step": 151880 }, { "epoch": 0.15298018514545633, "grad_norm": 8.09579107786234, "learning_rate": 4.870787665728103e-05, "loss": 2.2339, "mean_token_accuracy": 0.4, "step": 151885 }, { "epoch": 0.1529852211985605, "grad_norm": 10.162420187244974, "learning_rate": 4.870775149915503e-05, "loss": 2.4522, "mean_token_accuracy": 0.4206896543502808, "step": 151890 }, { "epoch": 0.15299025725166468, "grad_norm": 10.801585831118802, "learning_rate": 4.870762633514699e-05, "loss": 2.4288, "mean_token_accuracy": 0.44482758045196535, "step": 151895 }, { "epoch": 0.15299529330476885, "grad_norm": 11.21144892146897, "learning_rate": 4.870750116525693e-05, "loss": 2.3875, "mean_token_accuracy": 0.42413793206214906, "step": 151900 }, { "epoch": 0.153000329357873, "grad_norm": 9.285829202834849, "learning_rate": 4.870737598948488e-05, "loss": 2.3471, "mean_token_accuracy": 0.441379314661026, "step": 151905 }, { "epoch": 0.15300536541097717, "grad_norm": 8.504852708628952, "learning_rate": 4.8707250807830904e-05, "loss": 2.2572, "mean_token_accuracy": 0.4620689630508423, "step": 151910 }, { "epoch": 0.15301040146408135, "grad_norm": 8.427991066045669, "learning_rate": 4.8707125620295e-05, "loss": 2.4162, "mean_token_accuracy": 0.42413792610168455, "step": 151915 }, { "epoch": 0.15301543751718552, "grad_norm": 15.864187315162367, "learning_rate": 4.870700042687723e-05, "loss": 2.6463, "mean_token_accuracy": 0.3984271019697189, "step": 151920 }, { "epoch": 0.1530204735702897, "grad_norm": 10.667663313154023, "learning_rate": 4.870687522757761e-05, "loss": 2.3237, "mean_token_accuracy": 0.4586206912994385, "step": 151925 }, { "epoch": 0.15302550962339387, "grad_norm": 9.73024947682569, "learning_rate": 4.87067500223962e-05, "loss": 2.1833, "mean_token_accuracy": 0.43103448748588563, "step": 151930 }, { "epoch": 0.15303054567649804, "grad_norm": 11.319871254495444, "learning_rate": 4.8706624811333014e-05, "loss": 2.3869, "mean_token_accuracy": 0.43448275327682495, "step": 151935 }, { "epoch": 0.15303558172960222, "grad_norm": 8.559160719969734, "learning_rate": 4.870649959438808e-05, "loss": 2.0895, "mean_token_accuracy": 0.5088324248790741, "step": 151940 }, { "epoch": 0.1530406177827064, "grad_norm": 22.24676302890798, "learning_rate": 4.870637437156145e-05, "loss": 2.6164, "mean_token_accuracy": 0.40344828367233276, "step": 151945 }, { "epoch": 0.15304565383581056, "grad_norm": 9.468613409389205, "learning_rate": 4.8706249142853156e-05, "loss": 1.9933, "mean_token_accuracy": 0.4655172526836395, "step": 151950 }, { "epoch": 0.15305068988891474, "grad_norm": 12.216585862813039, "learning_rate": 4.870612390826323e-05, "loss": 2.8181, "mean_token_accuracy": 0.4586206912994385, "step": 151955 }, { "epoch": 0.1530557259420189, "grad_norm": 10.829345589024582, "learning_rate": 4.87059986677917e-05, "loss": 2.1054, "mean_token_accuracy": 0.5137930989265442, "step": 151960 }, { "epoch": 0.15306076199512308, "grad_norm": 9.20775051237787, "learning_rate": 4.870587342143861e-05, "loss": 2.5326, "mean_token_accuracy": 0.47586206793785096, "step": 151965 }, { "epoch": 0.15306579804822726, "grad_norm": 11.45067653964252, "learning_rate": 4.8705748169204e-05, "loss": 2.179, "mean_token_accuracy": 0.4819116771221161, "step": 151970 }, { "epoch": 0.15307083410133143, "grad_norm": 10.047627535541823, "learning_rate": 4.870562291108789e-05, "loss": 2.0545, "mean_token_accuracy": 0.44482758045196535, "step": 151975 }, { "epoch": 0.1530758701544356, "grad_norm": 10.258015097278069, "learning_rate": 4.870549764709033e-05, "loss": 2.5542, "mean_token_accuracy": 0.38275861740112305, "step": 151980 }, { "epoch": 0.15308090620753978, "grad_norm": 11.667519297988106, "learning_rate": 4.870537237721133e-05, "loss": 2.69, "mean_token_accuracy": 0.36551723480224607, "step": 151985 }, { "epoch": 0.15308594226064395, "grad_norm": 9.342234785050787, "learning_rate": 4.870524710145095e-05, "loss": 2.5104, "mean_token_accuracy": 0.43448275327682495, "step": 151990 }, { "epoch": 0.15309097831374813, "grad_norm": 11.208457899320862, "learning_rate": 4.870512181980922e-05, "loss": 3.254, "mean_token_accuracy": 0.40550513863563536, "step": 151995 }, { "epoch": 0.1530960143668523, "grad_norm": 12.167625315276945, "learning_rate": 4.870499653228617e-05, "loss": 3.1626, "mean_token_accuracy": 0.3275862067937851, "step": 152000 }, { "epoch": 0.15310105041995647, "grad_norm": 11.197789786158246, "learning_rate": 4.870487123888182e-05, "loss": 2.3781, "mean_token_accuracy": 0.4551724135875702, "step": 152005 }, { "epoch": 0.15310608647306065, "grad_norm": 11.009016812366454, "learning_rate": 4.870474593959623e-05, "loss": 2.616, "mean_token_accuracy": 0.4172413766384125, "step": 152010 }, { "epoch": 0.15311112252616482, "grad_norm": 9.40820274933862, "learning_rate": 4.870462063442943e-05, "loss": 1.9719, "mean_token_accuracy": 0.5491228044033051, "step": 152015 }, { "epoch": 0.153116158579269, "grad_norm": 10.726778665828386, "learning_rate": 4.870449532338144e-05, "loss": 2.4751, "mean_token_accuracy": 0.4586206912994385, "step": 152020 }, { "epoch": 0.15312119463237317, "grad_norm": 11.481276961778825, "learning_rate": 4.870437000645231e-05, "loss": 2.861, "mean_token_accuracy": 0.39655172228813174, "step": 152025 }, { "epoch": 0.15312623068547734, "grad_norm": 10.022286534324389, "learning_rate": 4.870424468364206e-05, "loss": 2.2702, "mean_token_accuracy": 0.4862068951129913, "step": 152030 }, { "epoch": 0.15313126673858152, "grad_norm": 11.400913619322484, "learning_rate": 4.870411935495075e-05, "loss": 2.4162, "mean_token_accuracy": 0.4551724135875702, "step": 152035 }, { "epoch": 0.1531363027916857, "grad_norm": 9.920515834152631, "learning_rate": 4.870399402037838e-05, "loss": 2.4407, "mean_token_accuracy": 0.42068966031074523, "step": 152040 }, { "epoch": 0.15314133884478984, "grad_norm": 8.676121970435092, "learning_rate": 4.870386867992502e-05, "loss": 2.4858, "mean_token_accuracy": 0.441379314661026, "step": 152045 }, { "epoch": 0.153146374897894, "grad_norm": 12.247268065948628, "learning_rate": 4.870374333359068e-05, "loss": 2.4024, "mean_token_accuracy": 0.4379310369491577, "step": 152050 }, { "epoch": 0.15315141095099818, "grad_norm": 10.583086772042593, "learning_rate": 4.87036179813754e-05, "loss": 2.7123, "mean_token_accuracy": 0.36896551847457887, "step": 152055 }, { "epoch": 0.15315644700410236, "grad_norm": 20.987639824287957, "learning_rate": 4.8703492623279216e-05, "loss": 2.6867, "mean_token_accuracy": 0.4241379201412201, "step": 152060 }, { "epoch": 0.15316148305720653, "grad_norm": 10.593143472784043, "learning_rate": 4.870336725930217e-05, "loss": 2.5997, "mean_token_accuracy": 0.3931034505367279, "step": 152065 }, { "epoch": 0.1531665191103107, "grad_norm": 9.389721465870304, "learning_rate": 4.87032418894443e-05, "loss": 2.8018, "mean_token_accuracy": 0.4103448212146759, "step": 152070 }, { "epoch": 0.15317155516341488, "grad_norm": 14.340618558359449, "learning_rate": 4.870311651370561e-05, "loss": 2.6436, "mean_token_accuracy": 0.4137930989265442, "step": 152075 }, { "epoch": 0.15317659121651905, "grad_norm": 10.887951600550396, "learning_rate": 4.8702991132086175e-05, "loss": 2.3393, "mean_token_accuracy": 0.4137930989265442, "step": 152080 }, { "epoch": 0.15318162726962323, "grad_norm": 9.179446741229333, "learning_rate": 4.8702865744586e-05, "loss": 2.7852, "mean_token_accuracy": 0.4965517222881317, "step": 152085 }, { "epoch": 0.1531866633227274, "grad_norm": 12.785418896397752, "learning_rate": 4.8702740351205135e-05, "loss": 2.5041, "mean_token_accuracy": 0.4189352720975876, "step": 152090 }, { "epoch": 0.15319169937583157, "grad_norm": 10.168351661429892, "learning_rate": 4.870261495194361e-05, "loss": 2.3832, "mean_token_accuracy": 0.43103448748588563, "step": 152095 }, { "epoch": 0.15319673542893575, "grad_norm": 9.741432202081342, "learning_rate": 4.870248954680146e-05, "loss": 2.3221, "mean_token_accuracy": 0.4896551728248596, "step": 152100 }, { "epoch": 0.15320177148203992, "grad_norm": 9.075597757831215, "learning_rate": 4.870236413577873e-05, "loss": 2.363, "mean_token_accuracy": 0.4068965554237366, "step": 152105 }, { "epoch": 0.1532068075351441, "grad_norm": 9.398835256050344, "learning_rate": 4.870223871887543e-05, "loss": 2.5737, "mean_token_accuracy": 0.42758620381355283, "step": 152110 }, { "epoch": 0.15321184358824827, "grad_norm": 13.762839246389571, "learning_rate": 4.870211329609162e-05, "loss": 2.451, "mean_token_accuracy": 0.41379311084747317, "step": 152115 }, { "epoch": 0.15321687964135244, "grad_norm": 10.456048215453778, "learning_rate": 4.8701987867427325e-05, "loss": 2.2277, "mean_token_accuracy": 0.46551724672317507, "step": 152120 }, { "epoch": 0.15322191569445662, "grad_norm": 13.652380420118053, "learning_rate": 4.870186243288257e-05, "loss": 2.5853, "mean_token_accuracy": 0.4103448331356049, "step": 152125 }, { "epoch": 0.1532269517475608, "grad_norm": 9.178461136234572, "learning_rate": 4.870173699245741e-05, "loss": 2.5205, "mean_token_accuracy": 0.42413793206214906, "step": 152130 }, { "epoch": 0.15323198780066496, "grad_norm": 9.548187819708897, "learning_rate": 4.870161154615187e-05, "loss": 1.8927, "mean_token_accuracy": 0.48965516686439514, "step": 152135 }, { "epoch": 0.15323702385376914, "grad_norm": 10.156394428915025, "learning_rate": 4.8701486093965976e-05, "loss": 2.2144, "mean_token_accuracy": 0.4068965494632721, "step": 152140 }, { "epoch": 0.1532420599068733, "grad_norm": 12.787612252849316, "learning_rate": 4.8701360635899765e-05, "loss": 2.1762, "mean_token_accuracy": 0.4879007816314697, "step": 152145 }, { "epoch": 0.1532470959599775, "grad_norm": 8.952782330368324, "learning_rate": 4.8701235171953295e-05, "loss": 2.3771, "mean_token_accuracy": 0.4482758641242981, "step": 152150 }, { "epoch": 0.15325213201308166, "grad_norm": 10.151318056025973, "learning_rate": 4.8701109702126574e-05, "loss": 2.3134, "mean_token_accuracy": 0.4, "step": 152155 }, { "epoch": 0.15325716806618583, "grad_norm": 10.935120848655497, "learning_rate": 4.870098422641964e-05, "loss": 2.4237, "mean_token_accuracy": 0.3931034505367279, "step": 152160 }, { "epoch": 0.15326220411929, "grad_norm": 11.7656470658916, "learning_rate": 4.8700858744832545e-05, "loss": 2.5806, "mean_token_accuracy": 0.4, "step": 152165 }, { "epoch": 0.15326724017239418, "grad_norm": 10.255123983177315, "learning_rate": 4.870073325736531e-05, "loss": 2.381, "mean_token_accuracy": 0.42758620977401735, "step": 152170 }, { "epoch": 0.15327227622549836, "grad_norm": 10.36466229374279, "learning_rate": 4.870060776401797e-05, "loss": 2.3818, "mean_token_accuracy": 0.4379310369491577, "step": 152175 }, { "epoch": 0.15327731227860253, "grad_norm": 9.550894712758543, "learning_rate": 4.870048226479056e-05, "loss": 1.9145, "mean_token_accuracy": 0.45320196747779845, "step": 152180 }, { "epoch": 0.15328234833170667, "grad_norm": 10.255987789739978, "learning_rate": 4.870035675968313e-05, "loss": 2.3522, "mean_token_accuracy": 0.44827587008476255, "step": 152185 }, { "epoch": 0.15328738438481085, "grad_norm": 9.95569579867558, "learning_rate": 4.870023124869569e-05, "loss": 2.2056, "mean_token_accuracy": 0.43448275327682495, "step": 152190 }, { "epoch": 0.15329242043791502, "grad_norm": 8.818753028987645, "learning_rate": 4.8700105731828286e-05, "loss": 2.1763, "mean_token_accuracy": 0.45741077959537507, "step": 152195 }, { "epoch": 0.1532974564910192, "grad_norm": 11.142784617952936, "learning_rate": 4.869998020908095e-05, "loss": 1.9844, "mean_token_accuracy": 0.48275862336158754, "step": 152200 }, { "epoch": 0.15330249254412337, "grad_norm": 9.150038123525317, "learning_rate": 4.869985468045373e-05, "loss": 2.4487, "mean_token_accuracy": 0.4586206912994385, "step": 152205 }, { "epoch": 0.15330752859722754, "grad_norm": 12.007596270976562, "learning_rate": 4.869972914594665e-05, "loss": 2.5659, "mean_token_accuracy": 0.3999999940395355, "step": 152210 }, { "epoch": 0.15331256465033172, "grad_norm": 9.242306005622929, "learning_rate": 4.869960360555974e-05, "loss": 1.9762, "mean_token_accuracy": 0.4896551787853241, "step": 152215 }, { "epoch": 0.1533176007034359, "grad_norm": 11.780202907530558, "learning_rate": 4.8699478059293055e-05, "loss": 2.5922, "mean_token_accuracy": 0.37241379022598264, "step": 152220 }, { "epoch": 0.15332263675654006, "grad_norm": 11.240414962369448, "learning_rate": 4.86993525071466e-05, "loss": 2.3012, "mean_token_accuracy": 0.45517241954803467, "step": 152225 }, { "epoch": 0.15332767280964424, "grad_norm": 8.947508511221614, "learning_rate": 4.869922694912044e-05, "loss": 2.2212, "mean_token_accuracy": 0.43793103098869324, "step": 152230 }, { "epoch": 0.1533327088627484, "grad_norm": 10.289988593945553, "learning_rate": 4.8699101385214584e-05, "loss": 2.3248, "mean_token_accuracy": 0.4310344815254211, "step": 152235 }, { "epoch": 0.1533377449158526, "grad_norm": 9.24756338131609, "learning_rate": 4.869897581542908e-05, "loss": 2.4678, "mean_token_accuracy": 0.4379310429096222, "step": 152240 }, { "epoch": 0.15334278096895676, "grad_norm": 10.167413129838256, "learning_rate": 4.869885023976396e-05, "loss": 2.2453, "mean_token_accuracy": 0.4551724135875702, "step": 152245 }, { "epoch": 0.15334781702206093, "grad_norm": 10.324287830962248, "learning_rate": 4.869872465821927e-05, "loss": 2.4204, "mean_token_accuracy": 0.42413793206214906, "step": 152250 }, { "epoch": 0.1533528530751651, "grad_norm": 10.911024951241847, "learning_rate": 4.869859907079501e-05, "loss": 2.1514, "mean_token_accuracy": 0.48475499749183654, "step": 152255 }, { "epoch": 0.15335788912826928, "grad_norm": 9.945288471955433, "learning_rate": 4.869847347749127e-05, "loss": 2.0171, "mean_token_accuracy": 0.4551724076271057, "step": 152260 }, { "epoch": 0.15336292518137346, "grad_norm": 10.274558276005083, "learning_rate": 4.8698347878308036e-05, "loss": 2.4321, "mean_token_accuracy": 0.4289776086807251, "step": 152265 }, { "epoch": 0.15336796123447763, "grad_norm": 12.1782921480261, "learning_rate": 4.869822227324536e-05, "loss": 2.8446, "mean_token_accuracy": 0.4137930989265442, "step": 152270 }, { "epoch": 0.1533729972875818, "grad_norm": 18.565229509773516, "learning_rate": 4.869809666230329e-05, "loss": 2.5896, "mean_token_accuracy": 0.4379310250282288, "step": 152275 }, { "epoch": 0.15337803334068598, "grad_norm": 9.744418386786226, "learning_rate": 4.869797104548184e-05, "loss": 2.3869, "mean_token_accuracy": 0.4344827592372894, "step": 152280 }, { "epoch": 0.15338306939379015, "grad_norm": 11.295196495161099, "learning_rate": 4.8697845422781056e-05, "loss": 2.2613, "mean_token_accuracy": 0.4361766457557678, "step": 152285 }, { "epoch": 0.15338810544689432, "grad_norm": 10.025867934039303, "learning_rate": 4.869771979420097e-05, "loss": 2.5179, "mean_token_accuracy": 0.4310344815254211, "step": 152290 }, { "epoch": 0.1533931414999985, "grad_norm": 13.330858239719818, "learning_rate": 4.8697594159741615e-05, "loss": 2.7727, "mean_token_accuracy": 0.3999999940395355, "step": 152295 }, { "epoch": 0.15339817755310267, "grad_norm": 7.797643653476521, "learning_rate": 4.8697468519403034e-05, "loss": 1.9521, "mean_token_accuracy": 0.5505142211914062, "step": 152300 }, { "epoch": 0.15340321360620685, "grad_norm": 10.075622810958173, "learning_rate": 4.869734287318525e-05, "loss": 2.1191, "mean_token_accuracy": 0.4448275864124298, "step": 152305 }, { "epoch": 0.15340824965931102, "grad_norm": 9.390315734396285, "learning_rate": 4.869721722108831e-05, "loss": 2.2317, "mean_token_accuracy": 0.48620688915252686, "step": 152310 }, { "epoch": 0.1534132857124152, "grad_norm": 8.566844396187701, "learning_rate": 4.869709156311224e-05, "loss": 2.1328, "mean_token_accuracy": 0.42758620977401735, "step": 152315 }, { "epoch": 0.15341832176551937, "grad_norm": 9.24133103355712, "learning_rate": 4.869696589925708e-05, "loss": 2.4533, "mean_token_accuracy": 0.42068964838981626, "step": 152320 }, { "epoch": 0.1534233578186235, "grad_norm": 8.391797864011842, "learning_rate": 4.8696840229522856e-05, "loss": 2.1019, "mean_token_accuracy": 0.48275861144065857, "step": 152325 }, { "epoch": 0.1534283938717277, "grad_norm": 8.234138451324727, "learning_rate": 4.8696714553909616e-05, "loss": 2.3108, "mean_token_accuracy": 0.4379310429096222, "step": 152330 }, { "epoch": 0.15343342992483186, "grad_norm": 9.849756764974499, "learning_rate": 4.8696588872417384e-05, "loss": 2.4796, "mean_token_accuracy": 0.42413793206214906, "step": 152335 }, { "epoch": 0.15343846597793603, "grad_norm": 8.894894113432109, "learning_rate": 4.86964631850462e-05, "loss": 2.2184, "mean_token_accuracy": 0.49836660623550416, "step": 152340 }, { "epoch": 0.1534435020310402, "grad_norm": 11.793901180383713, "learning_rate": 4.869633749179609e-05, "loss": 2.7683, "mean_token_accuracy": 0.3551724135875702, "step": 152345 }, { "epoch": 0.15344853808414438, "grad_norm": 11.082730339311247, "learning_rate": 4.8696211792667106e-05, "loss": 2.6311, "mean_token_accuracy": 0.441379314661026, "step": 152350 }, { "epoch": 0.15345357413724856, "grad_norm": 9.583236600994754, "learning_rate": 4.8696086087659275e-05, "loss": 2.4572, "mean_token_accuracy": 0.45359950661659243, "step": 152355 }, { "epoch": 0.15345861019035273, "grad_norm": 20.854441692991973, "learning_rate": 4.869596037677263e-05, "loss": 2.8506, "mean_token_accuracy": 0.40000000298023225, "step": 152360 }, { "epoch": 0.1534636462434569, "grad_norm": 10.36072750664657, "learning_rate": 4.86958346600072e-05, "loss": 2.8644, "mean_token_accuracy": 0.38620689511299133, "step": 152365 }, { "epoch": 0.15346868229656108, "grad_norm": 11.308472932885493, "learning_rate": 4.869570893736303e-05, "loss": 2.3472, "mean_token_accuracy": 0.44646098613739016, "step": 152370 }, { "epoch": 0.15347371834966525, "grad_norm": 9.414311120096766, "learning_rate": 4.869558320884015e-05, "loss": 2.4039, "mean_token_accuracy": 0.441379314661026, "step": 152375 }, { "epoch": 0.15347875440276942, "grad_norm": 12.742787954971101, "learning_rate": 4.86954574744386e-05, "loss": 2.5986, "mean_token_accuracy": 0.3862069010734558, "step": 152380 }, { "epoch": 0.1534837904558736, "grad_norm": 10.168780960808778, "learning_rate": 4.869533173415841e-05, "loss": 2.1009, "mean_token_accuracy": 0.45686630010604856, "step": 152385 }, { "epoch": 0.15348882650897777, "grad_norm": 10.078630702096303, "learning_rate": 4.8695205987999614e-05, "loss": 2.3917, "mean_token_accuracy": 0.4172413766384125, "step": 152390 }, { "epoch": 0.15349386256208195, "grad_norm": 9.863909397321065, "learning_rate": 4.869508023596224e-05, "loss": 2.5403, "mean_token_accuracy": 0.42068966031074523, "step": 152395 }, { "epoch": 0.15349889861518612, "grad_norm": 10.669192221008938, "learning_rate": 4.869495447804634e-05, "loss": 2.2612, "mean_token_accuracy": 0.4896551787853241, "step": 152400 }, { "epoch": 0.1535039346682903, "grad_norm": 11.188126843425506, "learning_rate": 4.869482871425194e-05, "loss": 2.0641, "mean_token_accuracy": 0.4699507415294647, "step": 152405 }, { "epoch": 0.15350897072139447, "grad_norm": 10.829038445867432, "learning_rate": 4.8694702944579076e-05, "loss": 2.6204, "mean_token_accuracy": 0.36896551847457887, "step": 152410 }, { "epoch": 0.15351400677449864, "grad_norm": 10.089142619140494, "learning_rate": 4.8694577169027786e-05, "loss": 2.0657, "mean_token_accuracy": 0.46896551847457885, "step": 152415 }, { "epoch": 0.15351904282760281, "grad_norm": 9.504665646203092, "learning_rate": 4.8694451387598094e-05, "loss": 2.3952, "mean_token_accuracy": 0.4551724076271057, "step": 152420 }, { "epoch": 0.153524078880707, "grad_norm": 8.25398137991359, "learning_rate": 4.869432560029004e-05, "loss": 2.2875, "mean_token_accuracy": 0.48620688915252686, "step": 152425 }, { "epoch": 0.15352911493381116, "grad_norm": 8.733203931560963, "learning_rate": 4.8694199807103675e-05, "loss": 2.2285, "mean_token_accuracy": 0.4137930929660797, "step": 152430 }, { "epoch": 0.15353415098691534, "grad_norm": 11.304386946503627, "learning_rate": 4.8694074008039e-05, "loss": 2.2537, "mean_token_accuracy": 0.47241378426551817, "step": 152435 }, { "epoch": 0.1535391870400195, "grad_norm": 10.675690583665103, "learning_rate": 4.869394820309609e-05, "loss": 2.404, "mean_token_accuracy": 0.41379310488700866, "step": 152440 }, { "epoch": 0.15354422309312368, "grad_norm": 9.103331158613084, "learning_rate": 4.869382239227494e-05, "loss": 2.455, "mean_token_accuracy": 0.4034482777118683, "step": 152445 }, { "epoch": 0.15354925914622786, "grad_norm": 9.924707943636589, "learning_rate": 4.869369657557561e-05, "loss": 2.3058, "mean_token_accuracy": 0.48451300859451296, "step": 152450 }, { "epoch": 0.15355429519933203, "grad_norm": 10.28258156921134, "learning_rate": 4.8693570752998137e-05, "loss": 2.3101, "mean_token_accuracy": 0.4517241358757019, "step": 152455 }, { "epoch": 0.1535593312524362, "grad_norm": 9.38510570628702, "learning_rate": 4.8693444924542544e-05, "loss": 2.3647, "mean_token_accuracy": 0.42758620977401735, "step": 152460 }, { "epoch": 0.15356436730554035, "grad_norm": 9.419868744950376, "learning_rate": 4.869331909020886e-05, "loss": 2.3943, "mean_token_accuracy": 0.4793103516101837, "step": 152465 }, { "epoch": 0.15356940335864452, "grad_norm": 9.43359204304903, "learning_rate": 4.8693193249997144e-05, "loss": 2.6575, "mean_token_accuracy": 0.42607380747795104, "step": 152470 }, { "epoch": 0.1535744394117487, "grad_norm": 9.171541503348404, "learning_rate": 4.869306740390741e-05, "loss": 2.0849, "mean_token_accuracy": 0.4413793087005615, "step": 152475 }, { "epoch": 0.15357947546485287, "grad_norm": 9.651402615198204, "learning_rate": 4.86929415519397e-05, "loss": 2.3366, "mean_token_accuracy": 0.43629764318466185, "step": 152480 }, { "epoch": 0.15358451151795705, "grad_norm": 9.652629114021362, "learning_rate": 4.8692815694094044e-05, "loss": 2.4831, "mean_token_accuracy": 0.4344827592372894, "step": 152485 }, { "epoch": 0.15358954757106122, "grad_norm": 8.241141436201081, "learning_rate": 4.869268983037049e-05, "loss": 2.4756, "mean_token_accuracy": 0.4275861978530884, "step": 152490 }, { "epoch": 0.1535945836241654, "grad_norm": 9.494076878201364, "learning_rate": 4.869256396076906e-05, "loss": 2.2705, "mean_token_accuracy": 0.4413793087005615, "step": 152495 }, { "epoch": 0.15359961967726957, "grad_norm": 10.374822030694109, "learning_rate": 4.86924380852898e-05, "loss": 2.3682, "mean_token_accuracy": 0.38965516686439516, "step": 152500 }, { "epoch": 0.15360465573037374, "grad_norm": 9.93031576148973, "learning_rate": 4.869231220393273e-05, "loss": 2.3026, "mean_token_accuracy": 0.4517241299152374, "step": 152505 }, { "epoch": 0.15360969178347791, "grad_norm": 9.856929197692208, "learning_rate": 4.86921863166979e-05, "loss": 2.5612, "mean_token_accuracy": 0.441379314661026, "step": 152510 }, { "epoch": 0.1536147278365821, "grad_norm": 11.199658291288697, "learning_rate": 4.869206042358533e-05, "loss": 2.5062, "mean_token_accuracy": 0.43611615896224976, "step": 152515 }, { "epoch": 0.15361976388968626, "grad_norm": 11.68128292572958, "learning_rate": 4.869193452459507e-05, "loss": 2.1489, "mean_token_accuracy": 0.458620685338974, "step": 152520 }, { "epoch": 0.15362479994279044, "grad_norm": 10.499709392686029, "learning_rate": 4.8691808619727145e-05, "loss": 2.1773, "mean_token_accuracy": 0.4758620738983154, "step": 152525 }, { "epoch": 0.1536298359958946, "grad_norm": 16.147385061442417, "learning_rate": 4.869168270898159e-05, "loss": 2.494, "mean_token_accuracy": 0.4222625494003296, "step": 152530 }, { "epoch": 0.15363487204899878, "grad_norm": 12.015926411021631, "learning_rate": 4.869155679235845e-05, "loss": 2.6662, "mean_token_accuracy": 0.38965516090393065, "step": 152535 }, { "epoch": 0.15363990810210296, "grad_norm": 8.673566776580444, "learning_rate": 4.869143086985775e-05, "loss": 2.4433, "mean_token_accuracy": 0.4448275864124298, "step": 152540 }, { "epoch": 0.15364494415520713, "grad_norm": 10.714984107264097, "learning_rate": 4.869130494147952e-05, "loss": 2.4263, "mean_token_accuracy": 0.41724138259887694, "step": 152545 }, { "epoch": 0.1536499802083113, "grad_norm": 12.028512055636313, "learning_rate": 4.869117900722382e-05, "loss": 2.4154, "mean_token_accuracy": 0.41034482717514037, "step": 152550 }, { "epoch": 0.15365501626141548, "grad_norm": 9.3523473085196, "learning_rate": 4.8691053067090644e-05, "loss": 2.2619, "mean_token_accuracy": 0.44137930274009707, "step": 152555 }, { "epoch": 0.15366005231451965, "grad_norm": 8.842601992176446, "learning_rate": 4.8690927121080065e-05, "loss": 2.3186, "mean_token_accuracy": 0.4620689570903778, "step": 152560 }, { "epoch": 0.15366508836762383, "grad_norm": 9.489756434870225, "learning_rate": 4.86908011691921e-05, "loss": 2.2818, "mean_token_accuracy": 0.4517241358757019, "step": 152565 }, { "epoch": 0.153670124420728, "grad_norm": 10.603547930469745, "learning_rate": 4.869067521142679e-05, "loss": 2.2164, "mean_token_accuracy": 0.47023593783378603, "step": 152570 }, { "epoch": 0.15367516047383217, "grad_norm": 9.965384134069337, "learning_rate": 4.8690549247784166e-05, "loss": 2.8012, "mean_token_accuracy": 0.4034482717514038, "step": 152575 }, { "epoch": 0.15368019652693635, "grad_norm": 11.217262137012245, "learning_rate": 4.869042327826426e-05, "loss": 2.5843, "mean_token_accuracy": 0.4000000059604645, "step": 152580 }, { "epoch": 0.15368523258004052, "grad_norm": 9.858179119866252, "learning_rate": 4.8690297302867115e-05, "loss": 2.2966, "mean_token_accuracy": 0.46896551847457885, "step": 152585 }, { "epoch": 0.1536902686331447, "grad_norm": 12.404693301576899, "learning_rate": 4.8690171321592766e-05, "loss": 2.3784, "mean_token_accuracy": 0.4137930989265442, "step": 152590 }, { "epoch": 0.15369530468624887, "grad_norm": 10.284968920607268, "learning_rate": 4.8690045334441235e-05, "loss": 2.3856, "mean_token_accuracy": 0.4620689570903778, "step": 152595 }, { "epoch": 0.15370034073935304, "grad_norm": 9.776008089751452, "learning_rate": 4.8689919341412575e-05, "loss": 2.5279, "mean_token_accuracy": 0.41560798287391665, "step": 152600 }, { "epoch": 0.1537053767924572, "grad_norm": 8.931952165722855, "learning_rate": 4.868979334250681e-05, "loss": 2.0005, "mean_token_accuracy": 0.5034482777118683, "step": 152605 }, { "epoch": 0.15371041284556136, "grad_norm": 11.407742997655527, "learning_rate": 4.868966733772397e-05, "loss": 2.6623, "mean_token_accuracy": 0.42413792610168455, "step": 152610 }, { "epoch": 0.15371544889866554, "grad_norm": 9.301814285796274, "learning_rate": 4.86895413270641e-05, "loss": 1.9567, "mean_token_accuracy": 0.4689655125141144, "step": 152615 }, { "epoch": 0.1537204849517697, "grad_norm": 9.466167801782776, "learning_rate": 4.868941531052723e-05, "loss": 2.2248, "mean_token_accuracy": 0.47586206793785096, "step": 152620 }, { "epoch": 0.15372552100487388, "grad_norm": 10.341738617332876, "learning_rate": 4.8689289288113396e-05, "loss": 2.3502, "mean_token_accuracy": 0.41379311084747317, "step": 152625 }, { "epoch": 0.15373055705797806, "grad_norm": 8.762419214356752, "learning_rate": 4.868916325982264e-05, "loss": 2.4355, "mean_token_accuracy": 0.4310344815254211, "step": 152630 }, { "epoch": 0.15373559311108223, "grad_norm": 11.003419269371657, "learning_rate": 4.868903722565499e-05, "loss": 2.6732, "mean_token_accuracy": 0.40514216423034666, "step": 152635 }, { "epoch": 0.1537406291641864, "grad_norm": 10.187576999514388, "learning_rate": 4.8688911185610476e-05, "loss": 2.4305, "mean_token_accuracy": 0.3827586233615875, "step": 152640 }, { "epoch": 0.15374566521729058, "grad_norm": 9.901928476456222, "learning_rate": 4.8688785139689136e-05, "loss": 2.7794, "mean_token_accuracy": 0.4068965554237366, "step": 152645 }, { "epoch": 0.15375070127039475, "grad_norm": 9.43769074776903, "learning_rate": 4.868865908789101e-05, "loss": 2.6365, "mean_token_accuracy": 0.3931034505367279, "step": 152650 }, { "epoch": 0.15375573732349893, "grad_norm": 11.635205249043453, "learning_rate": 4.8688533030216135e-05, "loss": 2.5248, "mean_token_accuracy": 0.417241370677948, "step": 152655 }, { "epoch": 0.1537607733766031, "grad_norm": 10.615360249817746, "learning_rate": 4.868840696666454e-05, "loss": 2.6765, "mean_token_accuracy": 0.4172413766384125, "step": 152660 }, { "epoch": 0.15376580942970727, "grad_norm": 9.41859958477984, "learning_rate": 4.868828089723626e-05, "loss": 2.141, "mean_token_accuracy": 0.4517241358757019, "step": 152665 }, { "epoch": 0.15377084548281145, "grad_norm": 12.239592175748397, "learning_rate": 4.868815482193133e-05, "loss": 2.26, "mean_token_accuracy": 0.4517241358757019, "step": 152670 }, { "epoch": 0.15377588153591562, "grad_norm": 9.621273044316686, "learning_rate": 4.868802874074979e-05, "loss": 2.4762, "mean_token_accuracy": 0.4137931138277054, "step": 152675 }, { "epoch": 0.1537809175890198, "grad_norm": 15.60293971843449, "learning_rate": 4.8687902653691664e-05, "loss": 2.8578, "mean_token_accuracy": 0.4068965524435043, "step": 152680 }, { "epoch": 0.15378595364212397, "grad_norm": 10.757539670076417, "learning_rate": 4.8687776560757007e-05, "loss": 2.5652, "mean_token_accuracy": 0.3999999940395355, "step": 152685 }, { "epoch": 0.15379098969522814, "grad_norm": 11.638909722825764, "learning_rate": 4.8687650461945824e-05, "loss": 2.1667, "mean_token_accuracy": 0.4896551787853241, "step": 152690 }, { "epoch": 0.15379602574833232, "grad_norm": 12.017498981453661, "learning_rate": 4.868752435725818e-05, "loss": 2.1646, "mean_token_accuracy": 0.46896551847457885, "step": 152695 }, { "epoch": 0.1538010618014365, "grad_norm": 11.106545452130167, "learning_rate": 4.8687398246694096e-05, "loss": 2.45, "mean_token_accuracy": 0.41379310488700866, "step": 152700 }, { "epoch": 0.15380609785454066, "grad_norm": 11.697980917622623, "learning_rate": 4.8687272130253603e-05, "loss": 2.2245, "mean_token_accuracy": 0.41724138259887694, "step": 152705 }, { "epoch": 0.15381113390764484, "grad_norm": 8.418259308360632, "learning_rate": 4.868714600793675e-05, "loss": 2.3768, "mean_token_accuracy": 0.44827587008476255, "step": 152710 }, { "epoch": 0.153816169960749, "grad_norm": 9.492873589694012, "learning_rate": 4.868701987974355e-05, "loss": 2.3504, "mean_token_accuracy": 0.4034482777118683, "step": 152715 }, { "epoch": 0.15382120601385318, "grad_norm": 8.892302044764094, "learning_rate": 4.8686893745674055e-05, "loss": 2.2816, "mean_token_accuracy": 0.42758620381355283, "step": 152720 }, { "epoch": 0.15382624206695736, "grad_norm": 10.67497047776544, "learning_rate": 4.8686767605728296e-05, "loss": 2.5715, "mean_token_accuracy": 0.4, "step": 152725 }, { "epoch": 0.15383127812006153, "grad_norm": 9.75752235937745, "learning_rate": 4.868664145990631e-05, "loss": 2.7417, "mean_token_accuracy": 0.4000000059604645, "step": 152730 }, { "epoch": 0.1538363141731657, "grad_norm": 9.128268119289027, "learning_rate": 4.8686515308208133e-05, "loss": 1.8361, "mean_token_accuracy": 0.5448275864124298, "step": 152735 }, { "epoch": 0.15384135022626988, "grad_norm": 9.693243609545528, "learning_rate": 4.868638915063379e-05, "loss": 2.2404, "mean_token_accuracy": 0.47241379618644713, "step": 152740 }, { "epoch": 0.15384638627937403, "grad_norm": 10.447568259107445, "learning_rate": 4.8686262987183335e-05, "loss": 2.4169, "mean_token_accuracy": 0.4344827592372894, "step": 152745 }, { "epoch": 0.1538514223324782, "grad_norm": 9.82108706086436, "learning_rate": 4.868613681785678e-05, "loss": 2.3081, "mean_token_accuracy": 0.46551724076271056, "step": 152750 }, { "epoch": 0.15385645838558237, "grad_norm": 10.268381460801237, "learning_rate": 4.868601064265417e-05, "loss": 2.5582, "mean_token_accuracy": 0.4551724135875702, "step": 152755 }, { "epoch": 0.15386149443868655, "grad_norm": 12.650362365762081, "learning_rate": 4.868588446157555e-05, "loss": 3.0257, "mean_token_accuracy": 0.34482758641242983, "step": 152760 }, { "epoch": 0.15386653049179072, "grad_norm": 7.7637909385303985, "learning_rate": 4.868575827462094e-05, "loss": 2.3698, "mean_token_accuracy": 0.45862067937850953, "step": 152765 }, { "epoch": 0.1538715665448949, "grad_norm": 10.666801660482077, "learning_rate": 4.868563208179038e-05, "loss": 2.304, "mean_token_accuracy": 0.42758620977401735, "step": 152770 }, { "epoch": 0.15387660259799907, "grad_norm": 11.625868395731853, "learning_rate": 4.868550588308391e-05, "loss": 2.2665, "mean_token_accuracy": 0.4448275864124298, "step": 152775 }, { "epoch": 0.15388163865110324, "grad_norm": 14.713889818267283, "learning_rate": 4.8685379678501555e-05, "loss": 2.5167, "mean_token_accuracy": 0.4620689690113068, "step": 152780 }, { "epoch": 0.15388667470420742, "grad_norm": 11.527496518353503, "learning_rate": 4.868525346804336e-05, "loss": 2.2512, "mean_token_accuracy": 0.43103447556495667, "step": 152785 }, { "epoch": 0.1538917107573116, "grad_norm": 11.332514694420142, "learning_rate": 4.868512725170936e-05, "loss": 2.369, "mean_token_accuracy": 0.382758629322052, "step": 152790 }, { "epoch": 0.15389674681041576, "grad_norm": 16.29999719625112, "learning_rate": 4.868500102949958e-05, "loss": 2.173, "mean_token_accuracy": 0.40689654648303986, "step": 152795 }, { "epoch": 0.15390178286351994, "grad_norm": 8.796989125108604, "learning_rate": 4.868487480141406e-05, "loss": 2.0905, "mean_token_accuracy": 0.46896551847457885, "step": 152800 }, { "epoch": 0.1539068189166241, "grad_norm": 10.951496891262385, "learning_rate": 4.868474856745284e-05, "loss": 2.3651, "mean_token_accuracy": 0.47241379618644713, "step": 152805 }, { "epoch": 0.15391185496972828, "grad_norm": 9.474121639424538, "learning_rate": 4.8684622327615944e-05, "loss": 2.2049, "mean_token_accuracy": 0.510344821214676, "step": 152810 }, { "epoch": 0.15391689102283246, "grad_norm": 13.570609595405084, "learning_rate": 4.868449608190342e-05, "loss": 2.312, "mean_token_accuracy": 0.49999999403953554, "step": 152815 }, { "epoch": 0.15392192707593663, "grad_norm": 12.135781499796394, "learning_rate": 4.8684369830315295e-05, "loss": 2.6659, "mean_token_accuracy": 0.38275861740112305, "step": 152820 }, { "epoch": 0.1539269631290408, "grad_norm": 15.067170887591697, "learning_rate": 4.868424357285161e-05, "loss": 2.2609, "mean_token_accuracy": 0.4344827592372894, "step": 152825 }, { "epoch": 0.15393199918214498, "grad_norm": 16.406711908220593, "learning_rate": 4.8684117309512395e-05, "loss": 2.7508, "mean_token_accuracy": 0.38275861740112305, "step": 152830 }, { "epoch": 0.15393703523524915, "grad_norm": 8.737784514812487, "learning_rate": 4.8683991040297686e-05, "loss": 2.2995, "mean_token_accuracy": 0.41034482717514037, "step": 152835 }, { "epoch": 0.15394207128835333, "grad_norm": 9.350975427811166, "learning_rate": 4.868386476520751e-05, "loss": 1.9568, "mean_token_accuracy": 0.49999999403953554, "step": 152840 }, { "epoch": 0.1539471073414575, "grad_norm": 11.035349567187088, "learning_rate": 4.868373848424191e-05, "loss": 2.6929, "mean_token_accuracy": 0.37586206793785093, "step": 152845 }, { "epoch": 0.15395214339456167, "grad_norm": 12.079743759526588, "learning_rate": 4.8683612197400936e-05, "loss": 2.3261, "mean_token_accuracy": 0.43103448748588563, "step": 152850 }, { "epoch": 0.15395717944766585, "grad_norm": 9.01501846629349, "learning_rate": 4.868348590468459e-05, "loss": 2.2341, "mean_token_accuracy": 0.4896551787853241, "step": 152855 }, { "epoch": 0.15396221550077002, "grad_norm": 12.127958586624136, "learning_rate": 4.8683359606092945e-05, "loss": 2.3126, "mean_token_accuracy": 0.4344827592372894, "step": 152860 }, { "epoch": 0.1539672515538742, "grad_norm": 10.334268476539346, "learning_rate": 4.8683233301626005e-05, "loss": 2.3268, "mean_token_accuracy": 0.4448275864124298, "step": 152865 }, { "epoch": 0.15397228760697837, "grad_norm": 11.005755890460767, "learning_rate": 4.8683106991283815e-05, "loss": 2.2639, "mean_token_accuracy": 0.4379310429096222, "step": 152870 }, { "epoch": 0.15397732366008254, "grad_norm": 9.102217154011607, "learning_rate": 4.8682980675066414e-05, "loss": 2.0473, "mean_token_accuracy": 0.517241370677948, "step": 152875 }, { "epoch": 0.15398235971318672, "grad_norm": 9.966187980874091, "learning_rate": 4.868285435297383e-05, "loss": 2.5421, "mean_token_accuracy": 0.42758620977401735, "step": 152880 }, { "epoch": 0.15398739576629086, "grad_norm": 10.82624557080725, "learning_rate": 4.868272802500611e-05, "loss": 2.3233, "mean_token_accuracy": 0.4620689690113068, "step": 152885 }, { "epoch": 0.15399243181939504, "grad_norm": 11.412644996767245, "learning_rate": 4.8682601691163276e-05, "loss": 2.3123, "mean_token_accuracy": 0.4206896543502808, "step": 152890 }, { "epoch": 0.1539974678724992, "grad_norm": 11.551899775696123, "learning_rate": 4.8682475351445366e-05, "loss": 2.5273, "mean_token_accuracy": 0.38965516686439516, "step": 152895 }, { "epoch": 0.15400250392560338, "grad_norm": 10.24828059053627, "learning_rate": 4.868234900585242e-05, "loss": 2.3307, "mean_token_accuracy": 0.42413792610168455, "step": 152900 }, { "epoch": 0.15400753997870756, "grad_norm": 9.266559997527391, "learning_rate": 4.868222265438447e-05, "loss": 2.3876, "mean_token_accuracy": 0.47931034564971925, "step": 152905 }, { "epoch": 0.15401257603181173, "grad_norm": 9.33744208582358, "learning_rate": 4.868209629704156e-05, "loss": 2.1558, "mean_token_accuracy": 0.43448275327682495, "step": 152910 }, { "epoch": 0.1540176120849159, "grad_norm": 12.2049612651845, "learning_rate": 4.868196993382371e-05, "loss": 2.244, "mean_token_accuracy": 0.4601935833692551, "step": 152915 }, { "epoch": 0.15402264813802008, "grad_norm": 9.968679309644138, "learning_rate": 4.8681843564730964e-05, "loss": 2.1679, "mean_token_accuracy": 0.4809437394142151, "step": 152920 }, { "epoch": 0.15402768419112425, "grad_norm": 11.445931883220927, "learning_rate": 4.868171718976335e-05, "loss": 2.4562, "mean_token_accuracy": 0.441379314661026, "step": 152925 }, { "epoch": 0.15403272024422843, "grad_norm": 13.253691668134923, "learning_rate": 4.8681590808920914e-05, "loss": 2.882, "mean_token_accuracy": 0.34482757449150087, "step": 152930 }, { "epoch": 0.1540377562973326, "grad_norm": 12.277239813837044, "learning_rate": 4.8681464422203674e-05, "loss": 2.4737, "mean_token_accuracy": 0.41379310488700866, "step": 152935 }, { "epoch": 0.15404279235043677, "grad_norm": 11.189818215633903, "learning_rate": 4.8681338029611684e-05, "loss": 2.4426, "mean_token_accuracy": 0.44827585816383364, "step": 152940 }, { "epoch": 0.15404782840354095, "grad_norm": 11.866710622062639, "learning_rate": 4.868121163114497e-05, "loss": 2.2985, "mean_token_accuracy": 0.4034482777118683, "step": 152945 }, { "epoch": 0.15405286445664512, "grad_norm": 10.561943385692002, "learning_rate": 4.8681085226803564e-05, "loss": 2.2836, "mean_token_accuracy": 0.42758620977401735, "step": 152950 }, { "epoch": 0.1540579005097493, "grad_norm": 11.644118161482014, "learning_rate": 4.868095881658751e-05, "loss": 2.0437, "mean_token_accuracy": 0.5137931108474731, "step": 152955 }, { "epoch": 0.15406293656285347, "grad_norm": 12.155367878415287, "learning_rate": 4.868083240049683e-05, "loss": 2.3164, "mean_token_accuracy": 0.4344827473163605, "step": 152960 }, { "epoch": 0.15406797261595764, "grad_norm": 11.531993944287722, "learning_rate": 4.868070597853158e-05, "loss": 2.3746, "mean_token_accuracy": 0.4379310369491577, "step": 152965 }, { "epoch": 0.15407300866906182, "grad_norm": 12.077769907358492, "learning_rate": 4.8680579550691774e-05, "loss": 2.3574, "mean_token_accuracy": 0.43986691236495973, "step": 152970 }, { "epoch": 0.154078044722166, "grad_norm": 11.647663613278928, "learning_rate": 4.868045311697746e-05, "loss": 2.3383, "mean_token_accuracy": 0.4655172348022461, "step": 152975 }, { "epoch": 0.15408308077527016, "grad_norm": 10.795617295775722, "learning_rate": 4.868032667738866e-05, "loss": 2.112, "mean_token_accuracy": 0.4724138081073761, "step": 152980 }, { "epoch": 0.15408811682837434, "grad_norm": 12.275849309866002, "learning_rate": 4.868020023192542e-05, "loss": 2.1301, "mean_token_accuracy": 0.4551724135875702, "step": 152985 }, { "epoch": 0.1540931528814785, "grad_norm": 9.635435076566564, "learning_rate": 4.8680073780587774e-05, "loss": 1.9659, "mean_token_accuracy": 0.4862068951129913, "step": 152990 }, { "epoch": 0.15409818893458269, "grad_norm": 37.426696374871334, "learning_rate": 4.8679947323375756e-05, "loss": 2.8081, "mean_token_accuracy": 0.4655172348022461, "step": 152995 }, { "epoch": 0.15410322498768686, "grad_norm": 8.485265836781963, "learning_rate": 4.86798208602894e-05, "loss": 2.2371, "mean_token_accuracy": 0.493103438615799, "step": 153000 }, { "epoch": 0.15410322498768686, "step": 153000, "total_flos": 3941084160000.0, "train_loss": 0.0, "train_runtime": 2.3821, "train_samples_per_second": 4198.041, "train_steps_per_second": 16792.163 } ], "logging_steps": 5, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3941084160000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }